Merge tag 'mt76-for-kvalo-2022-12-09' of https://github.com/nbd168/wireless
author Kalle Valo <kvalo@kernel.org>
Wed, 21 Dec 2022 18:21:36 +0000 (20:21 +0200)
committer Kalle Valo <kvalo@kernel.org>
Wed, 21 Dec 2022 18:21:36 +0000 (20:21 +0200)
mt76 patches for 6.2

- fixes
- per-PHY LED support

716 files changed:
.clang-format
Documentation/bpf/bpf_iterators.rst [new file with mode: 0644]
Documentation/bpf/index.rst
Documentation/bpf/instruction-set.rst
Documentation/bpf/kfuncs.rst
Documentation/bpf/map_sk_storage.rst [new file with mode: 0644]
Documentation/devicetree/bindings/clock/samsung,exynosautov9-clock.yaml
Documentation/devicetree/bindings/net/bluetooth.txt [deleted file]
Documentation/devicetree/bindings/net/bluetooth/bluetooth-controller.yaml [new file with mode: 0644]
Documentation/devicetree/bindings/net/bluetooth/brcm,bcm4377-bluetooth.yaml [new file with mode: 0644]
Documentation/devicetree/bindings/net/bluetooth/qualcomm-bluetooth.yaml [moved from Documentation/devicetree/bindings/net/qualcomm-bluetooth.yaml with 96% similarity]
Documentation/devicetree/bindings/net/broadcom-bluetooth.yaml
Documentation/devicetree/bindings/net/can/fsl,flexcan.yaml
Documentation/devicetree/bindings/net/can/renesas,rcar-canfd.yaml
Documentation/devicetree/bindings/net/dsa/hirschmann,hellcreek.yaml
Documentation/devicetree/bindings/net/qcom,ipa.yaml
Documentation/devicetree/bindings/net/realtek-bluetooth.yaml
Documentation/devicetree/bindings/net/socionext,synquacer-netsec.yaml [new file with mode: 0644]
Documentation/devicetree/bindings/net/socionext-netsec.txt [deleted file]
Documentation/devicetree/bindings/soc/qcom/qcom,wcnss.yaml
Documentation/loongarch/booting.rst [new file with mode: 0644]
Documentation/loongarch/index.rst
Documentation/networking/device_drivers/ethernet/mellanox/mlx5.rst
Documentation/networking/devlink/devlink-info.rst
Documentation/networking/devlink/devlink-port.rst
Documentation/networking/devlink/etas_es58x.rst [new file with mode: 0644]
Documentation/networking/ethtool-netlink.rst
Documentation/networking/ipvs-sysctl.rst
Documentation/networking/timestamping.rst
Documentation/networking/xfrm_device.rst
Documentation/translations/zh_CN/loongarch/booting.rst [new file with mode: 0644]
Documentation/translations/zh_CN/loongarch/index.rst
Documentation/virt/kvm/api.rst
Documentation/virt/kvm/halt-polling.rst [moved from Documentation/virt/kvm/x86/halt-polling.rst with 92% similarity]
Documentation/virt/kvm/index.rst
Documentation/virt/kvm/x86/index.rst
MAINTAINERS
Makefile
arch/arm/boot/dts/at91rm9200.dtsi
arch/arm/mach-at91/sama5.c
arch/arm64/boot/dts/apple/t8103-j274.dts
arch/arm64/boot/dts/apple/t8103-j293.dts
arch/arm64/boot/dts/apple/t8103-j313.dts
arch/arm64/boot/dts/apple/t8103-j456.dts
arch/arm64/boot/dts/apple/t8103-j457.dts
arch/arm64/boot/dts/apple/t8103-jxxx.dtsi
arch/arm64/include/asm/efi.h
arch/arm64/kernel/efi-rt-wrapper.S
arch/arm64/kernel/efi.c
arch/arm64/mm/dma-mapping.c
arch/arm64/mm/fault.c
arch/loongarch/include/asm/pgtable.h
arch/loongarch/include/asm/smp.h
arch/loongarch/kernel/smp.c
arch/loongarch/mm/tlbex.S
arch/mips/include/asm/pgtable.h
arch/powerpc/include/asm/interrupt.h
arch/powerpc/net/bpf_jit_comp32.c
arch/riscv/Kconfig
arch/riscv/include/asm/asm.h
arch/riscv/include/asm/efi.h
arch/riscv/include/asm/pgalloc.h
arch/riscv/include/asm/pgtable.h
arch/riscv/include/asm/smp.h
arch/riscv/kernel/entry.S
arch/riscv/kernel/machine_kexec.c
arch/riscv/kernel/setup.c
arch/riscv/kernel/smp.c
arch/riscv/kernel/traps.c
arch/riscv/kernel/vdso/Makefile
arch/riscv/net/bpf_jit_comp64.c
arch/s390/include/asm/pgtable.h
arch/s390/kvm/vsie.c
arch/sparc/include/asm/pgtable_64.h
arch/x86/include/asm/nospec-branch.h
arch/x86/include/asm/pgtable.h
arch/x86/kernel/cpu/bugs.c
arch/x86/kernel/process.c
arch/x86/kvm/x86.c
drivers/acpi/numa/hmat.c
drivers/ata/libahci_platform.c
drivers/bluetooth/Kconfig
drivers/bluetooth/Makefile
drivers/bluetooth/btintel.c
drivers/bluetooth/btrtl.c
drivers/bluetooth/btrtl.h
drivers/bluetooth/btusb.c
drivers/bluetooth/hci_bcm.c
drivers/bluetooth/hci_bcm4377.c [new file with mode: 0644]
drivers/bluetooth/hci_bcsp.c
drivers/bluetooth/hci_h5.c
drivers/bluetooth/hci_ll.c
drivers/bluetooth/hci_qca.c
drivers/bluetooth/virtio_bt.c
drivers/char/tpm/tpm-interface.c
drivers/clk/at91/at91rm9200.c
drivers/clk/qcom/gcc-sc8280xp.c
drivers/clk/qcom/gdsc.c
drivers/clk/qcom/gdsc.h
drivers/clk/samsung/clk-exynos-clkout.c
drivers/clk/samsung/clk-exynos7885.c
drivers/clocksource/timer-riscv.c
drivers/dax/hmem/device.c
drivers/gpio/gpio-amd8111.c
drivers/gpio/gpio-rockchip.c
drivers/gpio/gpiolib.c
drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
drivers/gpu/drm/amd/display/Kconfig
drivers/gpu/drm/i915/display/intel_display.c
drivers/gpu/drm/i915/gt/intel_gt.c
drivers/gpu/drm/i915/gt/intel_gt_requests.c
drivers/gpu/drm/i915/intel_dram.c
drivers/hid/hid-core.c
drivers/hid/hid-ids.h
drivers/hid/hid-ite.c
drivers/hid/hid-lg4ff.c
drivers/hid/hid-logitech-hidpp.c
drivers/hid/hid-quirks.c
drivers/hid/hid-uclogic-core.c
drivers/hid/hid-uclogic-rdesc.c
drivers/hid/i2c-hid/Kconfig
drivers/hwmon/asus-ec-sensors.c
drivers/hwmon/coretemp.c
drivers/hwmon/i5500_temp.c
drivers/hwmon/ibmpex.c
drivers/hwmon/ina3221.c
drivers/hwmon/ltc2947-core.c
drivers/i2c/busses/i2c-cadence.c
drivers/i2c/busses/i2c-imx.c
drivers/i2c/busses/i2c-npcm7xx.c
drivers/i2c/busses/i2c-qcom-geni.c
drivers/i2c/i2c-core-base.c
drivers/input/touchscreen/raydium_i2c_ts.c
drivers/iommu/intel/dmar.c
drivers/iommu/intel/iommu.c
drivers/iommu/intel/iommu.h
drivers/iommu/intel/svm.c
drivers/media/common/videobuf2/frame_vector.c
drivers/media/common/videobuf2/videobuf2-core.c
drivers/mmc/core/core.c
drivers/mmc/core/mmc_test.c
drivers/mmc/host/mtk-sd.c
drivers/mmc/host/sdhci-esdhc-imx.c
drivers/mmc/host/sdhci-sprd.c
drivers/mmc/host/sdhci.c
drivers/mmc/host/sdhci.h
drivers/net/bonding/bond_main.c
drivers/net/can/c_can/c_can_platform.c
drivers/net/can/can327.c
drivers/net/can/ctucanfd/Kconfig
drivers/net/can/flexcan/flexcan-core.c
drivers/net/can/flexcan/flexcan.h
drivers/net/can/m_can/m_can.c
drivers/net/can/m_can/m_can.h
drivers/net/can/m_can/m_can_platform.c
drivers/net/can/m_can/tcan4x5x-core.c
drivers/net/can/m_can/tcan4x5x-regmap.c
drivers/net/can/rcar/rcar_canfd.c
drivers/net/can/slcan/slcan-core.c
drivers/net/can/usb/Kconfig
drivers/net/can/usb/esd_usb.c
drivers/net/can/usb/etas_es58x/Makefile
drivers/net/can/usb/etas_es58x/es581_4.c
drivers/net/can/usb/etas_es58x/es58x_core.c
drivers/net/can/usb/etas_es58x/es58x_core.h
drivers/net/can/usb/etas_es58x/es58x_devlink.c [new file with mode: 0644]
drivers/net/can/usb/etas_es58x/es58x_fd.c
drivers/net/can/usb/gs_usb.c
drivers/net/can/usb/kvaser_usb/kvaser_usb_core.c
drivers/net/can/usb/ucan.c
drivers/net/dsa/microchip/ksz8.h
drivers/net/dsa/microchip/ksz8795.c
drivers/net/dsa/microchip/ksz8795_reg.h
drivers/net/dsa/microchip/ksz9477.c
drivers/net/dsa/microchip/ksz9477.h
drivers/net/dsa/microchip/ksz9477_reg.h
drivers/net/dsa/microchip/ksz_common.c
drivers/net/dsa/microchip/ksz_common.h
drivers/net/dsa/mv88e6xxx/Makefile
drivers/net/dsa/mv88e6xxx/chip.c
drivers/net/dsa/mv88e6xxx/global1_atu.c
drivers/net/dsa/mv88e6xxx/global1_vtu.c
drivers/net/dsa/mv88e6xxx/trace.c [new file with mode: 0644]
drivers/net/dsa/mv88e6xxx/trace.h [new file with mode: 0644]
drivers/net/dsa/sja1105/sja1105_devlink.c
drivers/net/dsa/sja1105/sja1105_main.c
drivers/net/ethernet/aeroflex/greth.c
drivers/net/ethernet/broadcom/Kconfig
drivers/net/ethernet/broadcom/bnx2.c
drivers/net/ethernet/broadcom/bnxt/bnxt.c
drivers/net/ethernet/broadcom/genet/bcmgenet.c
drivers/net/ethernet/cavium/liquidio/octeon_console.c
drivers/net/ethernet/cavium/thunder/nicvf_main.c
drivers/net/ethernet/chelsio/inline_crypto/ch_ipsec/chcr_ipsec.c
drivers/net/ethernet/freescale/dpaa2/dpaa2-switch-flower.c
drivers/net/ethernet/freescale/fec_main.c
drivers/net/ethernet/hisilicon/hisi_femac.c
drivers/net/ethernet/hisilicon/hix5hd2_gmac.c
drivers/net/ethernet/hisilicon/hns/hns_ethtool.c
drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c
drivers/net/ethernet/intel/e1000e/netdev.c
drivers/net/ethernet/intel/i40e/i40e_adminq_cmd.h
drivers/net/ethernet/intel/i40e/i40e_common.c
drivers/net/ethernet/intel/i40e/i40e_ethtool.c
drivers/net/ethernet/intel/i40e/i40e_main.c
drivers/net/ethernet/intel/i40e/i40e_prototype.h
drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c
drivers/net/ethernet/intel/ice/ice_main.c
drivers/net/ethernet/intel/ice/ice_ptp.c
drivers/net/ethernet/intel/ice/ice_ptp.h
drivers/net/ethernet/intel/ice/ice_ptp_hw.c
drivers/net/ethernet/intel/ice/ice_ptp_hw.h
drivers/net/ethernet/intel/igb/igb_ethtool.c
drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c
drivers/net/ethernet/intel/ixgbevf/ipsec.c
drivers/net/ethernet/marvell/mvneta.c
drivers/net/ethernet/marvell/octeontx2/af/cgx.c
drivers/net/ethernet/marvell/octeontx2/af/cgx.h
drivers/net/ethernet/marvell/octeontx2/af/lmac_common.h
drivers/net/ethernet/marvell/octeontx2/af/rpm.c
drivers/net/ethernet/marvell/octeontx2/af/rpm.h
drivers/net/ethernet/marvell/octeontx2/af/rvu.h
drivers/net/ethernet/marvell/octeontx2/af/rvu_cgx.c
drivers/net/ethernet/marvell/octeontx2/af/rvu_debugfs.c
drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c
drivers/net/ethernet/marvell/octeontx2/af/rvu_npc_hash.c
drivers/net/ethernet/marvell/octeontx2/nic/otx2_ethtool.c
drivers/net/ethernet/marvell/octeontx2/nic/otx2_tc.c
drivers/net/ethernet/mediatek/mtk_eth_soc.c
drivers/net/ethernet/mediatek/mtk_wed.c
drivers/net/ethernet/mediatek/mtk_wed_mcu.c
drivers/net/ethernet/mediatek/mtk_wed_wo.c
drivers/net/ethernet/mellanox/mlx4/en_tx.c
drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
drivers/net/ethernet/mellanox/mlx5/core/Makefile
drivers/net/ethernet/mellanox/mlx5/core/devlink.c
drivers/net/ethernet/mellanox/mlx5/core/diag/fs_tracepoint.c
drivers/net/ethernet/mellanox/mlx5/core/en.h
drivers/net/ethernet/mellanox/mlx5/core/en/fs.h
drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/accept.c
drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/act.c
drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/act.h
drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/drop.c
drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/goto.c
drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/mirred.c
drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/mirred_nic.c
drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/police.c
drivers/net/ethernet/mellanox/mlx5/core/en/tc/meter.c
drivers/net/ethernet/mellanox/mlx5/core/en/tc/meter.h
drivers/net/ethernet/mellanox/mlx5/core/en/tc/post_meter.c
drivers/net/ethernet/mellanox/mlx5/core/en/tc/post_meter.h
drivers/net/ethernet/mellanox/mlx5/core/en/tc_priv.h
drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.c
drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.h
drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_fs.c
drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_offload.c
drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_rxtx.c
drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_stats.c
drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
drivers/net/ethernet/mellanox/mlx5/core/en_stats.c
drivers/net/ethernet/mellanox/mlx5/core/en_stats.h
drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
drivers/net/ethernet/mellanox/mlx5/core/en_tc.h
drivers/net/ethernet/mellanox/mlx5/core/eq.c
drivers/net/ethernet/mellanox/mlx5/core/esw/debugfs.c
drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
drivers/net/ethernet/mellanox/mlx5/core/fs_core.h
drivers/net/ethernet/mellanox/mlx5/core/lag/mp.c
drivers/net/ethernet/mellanox/mlx5/core/lib/aso.c
drivers/net/ethernet/mellanox/mlx5/core/lib/aso.h
drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h
drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c
drivers/net/ethernet/mellanox/mlx5/core/steering/dr_action.c
drivers/net/ethernet/mellanox/mlx5/core/steering/dr_cmd.c
drivers/net/ethernet/mellanox/mlx5/core/steering/dr_dbg.c
drivers/net/ethernet/mellanox/mlx5/core/steering/dr_definer.c [new file with mode: 0644]
drivers/net/ethernet/mellanox/mlx5/core/steering/dr_domain.c
drivers/net/ethernet/mellanox/mlx5/core/steering/dr_rule.c
drivers/net/ethernet/mellanox/mlx5/core/steering/dr_ste.c
drivers/net/ethernet/mellanox/mlx5/core/steering/dr_ste.h
drivers/net/ethernet/mellanox/mlx5/core/steering/dr_ste_v1.c
drivers/net/ethernet/mellanox/mlx5/core/steering/dr_ste_v1.h
drivers/net/ethernet/mellanox/mlx5/core/steering/dr_ste_v2.c
drivers/net/ethernet/mellanox/mlx5/core/steering/dr_types.h
drivers/net/ethernet/mellanox/mlx5/core/steering/fs_dr.c
drivers/net/ethernet/mellanox/mlx5/core/steering/mlx5_ifc_dr_ste_v1.h
drivers/net/ethernet/mellanox/mlx5/core/steering/mlx5dr.h
drivers/net/ethernet/mellanox/mlx5/core/vport.c
drivers/net/ethernet/mellanox/mlxsw/spectrum_ipip.c
drivers/net/ethernet/mellanox/mlxsw/spectrum_ipip.h
drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
drivers/net/ethernet/mellanox/mlxsw/spectrum_router.h
drivers/net/ethernet/microchip/encx24j600-regmap.c
drivers/net/ethernet/microchip/lan966x/lan966x_main.c
drivers/net/ethernet/microchip/lan966x/lan966x_main.h
drivers/net/ethernet/microchip/lan966x/lan966x_ptp.c
drivers/net/ethernet/microchip/lan966x/lan966x_tc_flower.c
drivers/net/ethernet/microchip/lan966x/lan966x_vcap_impl.c
drivers/net/ethernet/microchip/sparx5/sparx5_fdma.c
drivers/net/ethernet/microchip/sparx5/sparx5_main.c
drivers/net/ethernet/microchip/sparx5/sparx5_packet.c
drivers/net/ethernet/microchip/vcap/vcap_api.c
drivers/net/ethernet/microchip/vcap/vcap_api_client.h
drivers/net/ethernet/microchip/vcap/vcap_api_debugfs.c
drivers/net/ethernet/microchip/vcap/vcap_api_private.h
drivers/net/ethernet/microsoft/mana/mana_en.c
drivers/net/ethernet/myricom/myri10ge/myri10ge.c
drivers/net/ethernet/netronome/nfp/ccm_mbox.c
drivers/net/ethernet/netronome/nfp/crypto/ipsec.c
drivers/net/ethernet/netronome/nfp/nfdk/dp.c
drivers/net/ethernet/netronome/nfp/nfp_net.h
drivers/net/ethernet/netronome/nfp/nfp_net_common.c
drivers/net/ethernet/netronome/nfp/nfp_net_ctrl.h
drivers/net/ethernet/qlogic/qed/qed_ll2.c
drivers/net/ethernet/qlogic/qlcnic/qlcnic_ethtool.c
drivers/net/ethernet/realtek/r8169_main.c
drivers/net/ethernet/renesas/ravb_main.c
drivers/net/ethernet/sfc/efx_common.c
drivers/net/ethernet/sfc/siena/efx_common.c
drivers/net/ethernet/stmicro/stmmac/Kconfig
drivers/net/ethernet/stmicro/stmmac/Makefile
drivers/net/ethernet/stmicro/stmmac/dwmac-tegra.c [new file with mode: 0644]
drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c
drivers/net/ethernet/stmicro/stmmac/stmmac_tc.c
drivers/net/ethernet/ti/am65-cpsw-nuss.c
drivers/net/ieee802154/ca8210.c
drivers/net/ieee802154/cc2520.c
drivers/net/ipa/Makefile
drivers/net/ipa/data/ipa_data-v4.7.c [new file with mode: 0644]
drivers/net/ipa/ipa_data.h
drivers/net/ipa/ipa_main.c
drivers/net/ipa/ipa_reg.c
drivers/net/ipa/ipa_reg.h
drivers/net/ipa/ipa_sysfs.c
drivers/net/ipa/ipa_version.h
drivers/net/ipa/reg/ipa_reg-v4.7.c [new file with mode: 0644]
drivers/net/macsec.c
drivers/net/mdio/fwnode_mdio.c
drivers/net/mdio/of_mdio.c
drivers/net/netdevsim/ipsec.c
drivers/net/phy/Kconfig
drivers/net/phy/mdio_device.c
drivers/net/phy/mxl-gpy.c
drivers/net/phy/sfp.c
drivers/net/plip/plip.c
drivers/net/tap.c
drivers/net/team/team.c
drivers/net/thunderbolt.c
drivers/net/tun.c
drivers/net/usb/asix_devices.c
drivers/net/virtio_net.c
drivers/net/vmxnet3/vmxnet3_drv.c
drivers/net/wireless/intel/iwlegacy/common.c
drivers/net/wireless/marvell/libertas/main.c
drivers/net/wireless/mediatek/mt76/dma.c
drivers/net/wireless/realtek/rtl8xxxu/rtl8xxxu_8188f.c
drivers/net/wireless/realtek/rtl8xxxu/rtl8xxxu_core.c
drivers/net/wireless/realtek/rtlwifi/rtl8188ee/hw.c
drivers/net/wireless/realtek/rtlwifi/rtl8723be/hw.c
drivers/net/wireless/realtek/rtlwifi/rtl8821ae/hw.c
drivers/net/wireless/realtek/rtlwifi/rtl8821ae/phy.c
drivers/net/wireless/realtek/rtw89/core.c
drivers/net/wireless/realtek/rtw89/core.h
drivers/net/wireless/realtek/rtw89/fw.c
drivers/net/wireless/realtek/rtw89/fw.h
drivers/net/wireless/realtek/rtw89/phy.c
drivers/net/wireless/realtek/rtw89/reg.h
drivers/net/wireless/realtek/rtw89/rtw8852c_rfk.c
drivers/net/wireless/realtek/rtw89/txrx.h
drivers/net/wireless/rsi/rsi_91x_coex.c
drivers/net/wwan/iosm/iosm_ipc_mux.c
drivers/net/xen-netback/common.h
drivers/net/xen-netback/interface.c
drivers/net/xen-netback/netback.c
drivers/net/xen-netback/rx.c
drivers/net/xen-netfront.c
drivers/nvme/host/core.c
drivers/nvme/host/multipath.c
drivers/nvme/host/pci.c
drivers/pinctrl/intel/pinctrl-intel.c
drivers/pinctrl/mediatek/mtk-eint.c
drivers/pinctrl/pinctrl-single.c
drivers/platform/x86/amd/pmc.c
drivers/s390/net/qeth_l2_main.c
drivers/usb/core/message.c
drivers/usb/core/usb.h
fs/afs/server.c
fs/fscache/cookie.c
fs/nilfs2/dat.c
include/asm-generic/tlb.h
include/linux/bpf.h
include/linux/bpf_lsm.h
include/linux/bpf_verifier.h
include/linux/btf.h
include/linux/btf_ids.h
include/linux/cgroup.h
include/linux/gfp.h
include/linux/license.h
include/linux/mlx5/fs.h
include/linux/mlx5/mlx5_ifc.h
include/linux/mlx5/vport.h
include/linux/mm.h
include/linux/mmc/mmc.h
include/linux/netdevice.h
include/linux/netfilter/ipset/ip_set.h
include/linux/pgtable.h
include/linux/rhashtable.h
include/linux/skbuff.h
include/linux/skmsg.h
include/linux/soc/mediatek/mtk_wed.h
include/linux/stmmac.h
include/linux/usb.h
include/linux/virtio_net.h
include/net/act_api.h
include/net/af_rxrpc.h
include/net/bluetooth/hci.h
include/net/bluetooth/hci_core.h
include/net/cfg802154.h
include/net/devlink.h
include/net/dst_metadata.h
include/net/ip_vs.h
include/net/ipv6.h
include/net/mana/gdma.h
include/net/netfilter/nf_conntrack_core.h
include/net/netfilter/nf_nat.h
include/net/netns/xdp.h
include/net/nl802154.h
include/net/ping.h
include/net/sock.h
include/net/tc_wrapper.h [new file with mode: 0644]
include/net/tcp.h
include/net/tso.h
include/net/xfrm.h
include/trace/events/fscache.h
include/trace/events/rxrpc.h
include/uapi/linux/bpf.h
include/uapi/linux/devlink.h
include/uapi/linux/ethtool_netlink.h
include/uapi/linux/if_bridge.h
include/uapi/linux/if_tun.h
include/uapi/linux/net_tstamp.h
include/uapi/linux/netfilter/ipset/ip_set.h
include/uapi/linux/netfilter/nf_conntrack_sctp.h
include/uapi/linux/netfilter/nfnetlink_cttimeout.h
include/uapi/linux/openvswitch.h
include/uapi/linux/virtio_bt.h
include/uapi/linux/virtio_net.h
include/uapi/linux/xfrm.h
ipc/sem.c
kernel/bpf/bpf_cgrp_storage.c
kernel/bpf/bpf_inode_storage.c
kernel/bpf/bpf_lsm.c
kernel/bpf/bpf_task_storage.c
kernel/bpf/btf.c
kernel/bpf/helpers.c
kernel/bpf/memalloc.c
kernel/bpf/verifier.c
kernel/cgroup/cgroup-internal.h
kernel/events/core.c
kernel/sysctl.c
lib/Kconfig
lib/Kconfig.debug
lib/packing.c
lib/rhashtable.c
mm/compaction.c
mm/damon/sysfs.c
mm/hugetlb.c
mm/khugepaged.c
mm/madvise.c
mm/memcontrol.c
mm/memory.c
mm/mmap.c
mm/mmu_gather.c
mm/vmscan.c
net/bluetooth/6lowpan.c
net/bluetooth/Kconfig
net/bluetooth/af_bluetooth.c
net/bluetooth/hci_codec.c
net/bluetooth/hci_conn.c
net/bluetooth/hci_core.c
net/bluetooth/hci_debugfs.c
net/bluetooth/hci_event.c
net/bluetooth/hci_request.c
net/bluetooth/hci_sync.c
net/bluetooth/iso.c
net/bluetooth/l2cap_core.c
net/bluetooth/lib.c
net/bluetooth/mgmt.c
net/bluetooth/rfcomm/core.c
net/bpf/test_run.c
net/bridge/br_mdb.c
net/bridge/br_multicast.c
net/bridge/br_private.h
net/bridge/netfilter/nf_conntrack_bridge.c
net/can/af_can.c
net/can/raw.c
net/core/bpf_sk_storage.c
net/core/dev.c
net/core/devlink.c
net/core/dst.c
net/core/failover.c
net/core/filter.c
net/core/skbuff.c
net/core/skmsg.c
net/core/sock.c
net/core/sock_map.c
net/core/tso.c
net/dsa/tag.c
net/dsa/tag_hellcreek.c
net/dsa/tag_ksz.c
net/dsa/tag_sja1105.c
net/ethernet/eth.c
net/ethtool/Makefile
net/ethtool/common.c
net/ethtool/netlink.c
net/ethtool/netlink.h
net/ethtool/rss.c [new file with mode: 0644]
net/ieee802154/nl802154.c
net/ieee802154/nl802154.h
net/ipv4/fib_frontend.c
net/ipv4/fib_semantics.c
net/ipv4/ip_gre.c
net/ipv4/ping.c
net/ipv4/tcp_bpf.c
net/ipv4/udp_offload.c
net/ipv6/addrconf.c
net/ipv6/ip6_offload.c
net/ipv6/ip6_output.c
net/ipv6/udp_offload.c
net/mac802154/iface.c
net/mac802154/main.c
net/mac802154/rx.c
net/mac802154/trace.h
net/mptcp/pm_netlink.c
net/mptcp/sockopt.c
net/ncsi/ncsi-cmd.c
net/netfilter/Kconfig
net/netfilter/Makefile
net/netfilter/ipset/ip_set_hash_gen.h
net/netfilter/ipset/ip_set_hash_ip.c
net/netfilter/ipset/ip_set_hash_ipport.c
net/netfilter/ipset/ip_set_hash_netnet.c
net/netfilter/ipvs/ip_vs_core.c
net/netfilter/ipvs/ip_vs_ctl.c
net/netfilter/ipvs/ip_vs_est.c
net/netfilter/nf_conntrack_core.c
net/netfilter/nf_conntrack_netlink.c
net/netfilter/nf_conntrack_proto.c
net/netfilter/nf_conntrack_proto_icmpv6.c
net/netfilter/nf_conntrack_proto_sctp.c
net/netfilter/nf_conntrack_standalone.c
net/netfilter/nf_flow_table_ip.c
net/netfilter/nf_flow_table_offload.c
net/netfilter/nf_nat_ovs.c [new file with mode: 0644]
net/netfilter/nf_tables_api.c
net/netfilter/nft_set_pipapo.c
net/nfc/nci/ntf.c
net/openvswitch/Kconfig
net/openvswitch/conntrack.c
net/openvswitch/datapath.c
net/openvswitch/vport.c
net/openvswitch/vport.h
net/rxrpc/Kconfig
net/rxrpc/Makefile
net/rxrpc/af_rxrpc.c
net/rxrpc/ar-internal.h
net/rxrpc/call_accept.c
net/rxrpc/call_event.c
net/rxrpc/call_object.c
net/rxrpc/conn_client.c
net/rxrpc/conn_event.c
net/rxrpc/conn_object.c
net/rxrpc/conn_service.c
net/rxrpc/input.c
net/rxrpc/io_thread.c [new file with mode: 0644]
net/rxrpc/key.c
net/rxrpc/local_event.c
net/rxrpc/local_object.c
net/rxrpc/net_ns.c
net/rxrpc/output.c
net/rxrpc/peer_event.c
net/rxrpc/peer_object.c
net/rxrpc/proc.c
net/rxrpc/recvmsg.c
net/rxrpc/rxkad.c
net/rxrpc/rxperf.c [new file with mode: 0644]
net/rxrpc/security.c
net/rxrpc/sendmsg.c
net/rxrpc/server_key.c
net/rxrpc/skbuff.c
net/rxrpc/txbuf.c
net/sched/Kconfig
net/sched/act_api.c
net/sched/act_bpf.c
net/sched/act_connmark.c
net/sched/act_csum.c
net/sched/act_ct.c
net/sched/act_ctinfo.c
net/sched/act_gact.c
net/sched/act_gate.c
net/sched/act_ife.c
net/sched/act_ipt.c
net/sched/act_mirred.c
net/sched/act_mpls.c
net/sched/act_nat.c
net/sched/act_pedit.c
net/sched/act_police.c
net/sched/act_sample.c
net/sched/act_simple.c
net/sched/act_skbedit.c
net/sched/act_skbmod.c
net/sched/act_tunnel_key.c
net/sched/act_vlan.c
net/sched/cls_api.c
net/sched/cls_basic.c
net/sched/cls_bpf.c
net/sched/cls_cgroup.c
net/sched/cls_flow.c
net/sched/cls_flower.c
net/sched/cls_fw.c
net/sched/cls_matchall.c
net/sched/cls_route.c
net/sched/cls_rsvp.c
net/sched/cls_rsvp.h
net/sched/cls_rsvp6.c
net/sched/cls_tcindex.c
net/sched/cls_u32.c
net/sched/sch_api.c
net/sctp/sysctl.c
net/tipc/link.c
net/tipc/node.c
net/tls/tls_sw.c
net/unix/diag.c
net/xfrm/Makefile
net/xfrm/xfrm_device.c
net/xfrm/xfrm_interface_bpf.c [new file with mode: 0644]
net/xfrm/xfrm_interface_core.c [moved from net/xfrm/xfrm_interface.c with 98% similarity]
net/xfrm/xfrm_output.c
net/xfrm/xfrm_policy.c
net/xfrm/xfrm_state.c
net/xfrm/xfrm_user.c
scripts/bpf_doc.py
sound/firewire/dice/dice-stream.c
sound/soc/codecs/cs42l51.c
sound/soc/codecs/tlv320adc3xxx.c
sound/soc/fsl/fsl_micfil.c
sound/soc/soc-ops.c
tools/bpf/bpftool/common.c
tools/include/uapi/linux/bpf.h
tools/include/uapi/linux/if_link.h
tools/lib/bpf/Makefile
tools/lib/bpf/bpf.h
tools/lib/bpf/usdt.c
tools/testing/selftests/bpf/DENYLIST.aarch64
tools/testing/selftests/bpf/DENYLIST.s390x
tools/testing/selftests/bpf/Makefile
tools/testing/selftests/bpf/bpf_legacy.h
tools/testing/selftests/bpf/config
tools/testing/selftests/bpf/network_helpers.c
tools/testing/selftests/bpf/prog_tests/btf_skc_cls_ingress.c
tools/testing/selftests/bpf/prog_tests/cgrp_local_storage.c
tools/testing/selftests/bpf/prog_tests/dynptr.c
tools/testing/selftests/bpf/prog_tests/empty_skb.c
tools/testing/selftests/bpf/prog_tests/kfunc_dynptr_param.c
tools/testing/selftests/bpf/prog_tests/map_kptr.c
tools/testing/selftests/bpf/prog_tests/task_kfunc.c
tools/testing/selftests/bpf/prog_tests/tc_redirect.c
tools/testing/selftests/bpf/prog_tests/test_tunnel.c
tools/testing/selftests/bpf/prog_tests/user_ringbuf.c
tools/testing/selftests/bpf/prog_tests/xdp_do_redirect.c
tools/testing/selftests/bpf/prog_tests/xdp_synproxy.c
tools/testing/selftests/bpf/prog_tests/xfrm_info.c [new file with mode: 0644]
tools/testing/selftests/bpf/progs/bpf_iter_ksym.c
tools/testing/selftests/bpf/progs/bpf_misc.h
tools/testing/selftests/bpf/progs/bpf_tracing_net.h
tools/testing/selftests/bpf/progs/btf_type_tag_percpu.c
tools/testing/selftests/bpf/progs/cgrp_ls_sleepable.c [new file with mode: 0644]
tools/testing/selftests/bpf/progs/dynptr_fail.c
tools/testing/selftests/bpf/progs/dynptr_success.c
tools/testing/selftests/bpf/progs/linked_list.c
tools/testing/selftests/bpf/progs/map_kptr_fail.c
tools/testing/selftests/bpf/progs/rcu_read_lock.c
tools/testing/selftests/bpf/progs/task_kfunc_failure.c
tools/testing/selftests/bpf/progs/task_kfunc_success.c
tools/testing/selftests/bpf/progs/test_kfunc_dynptr_param.c
tools/testing/selftests/bpf/progs/user_ringbuf_fail.c
tools/testing/selftests/bpf/progs/xfrm_info.c [new file with mode: 0644]
tools/testing/selftests/bpf/test_cpp.cpp
tools/testing/selftests/bpf/test_loader.c [new file with mode: 0644]
tools/testing/selftests/bpf/test_offload.py
tools/testing/selftests/bpf/test_progs.h
tools/testing/selftests/bpf/test_sockmap.c
tools/testing/selftests/bpf/verifier/calls.c
tools/testing/selftests/bpf/verifier/direct_packet_access.c
tools/testing/selftests/bpf/verifier/map_ptr.c
tools/testing/selftests/bpf/verifier/ringbuf.c
tools/testing/selftests/bpf/verifier/spin_lock.c
tools/testing/selftests/bpf/verifier/value_or_null.c
tools/testing/selftests/drivers/net/mlxsw/devlink_trap_tunnel_ipip6.sh [moved from tools/testing/selftests/drivers/net/mlxsw/spectrum-2/devlink_trap_tunnel_ipip6.sh with 99% similarity]
tools/testing/selftests/net/.gitignore
tools/testing/selftests/net/af_unix/Makefile
tools/testing/selftests/net/af_unix/diag_uid.c [new file with mode: 0644]
tools/testing/selftests/net/bpf/Makefile
tools/testing/selftests/net/config
tools/testing/selftests/net/fib_tests.sh
tools/testing/selftests/net/forwarding/Makefile
tools/testing/selftests/net/forwarding/bridge_mdb.sh
tools/testing/selftests/net/forwarding/bridge_mdb_host.sh [new file with mode: 0755]
tools/testing/selftests/net/rtnetlink.sh
tools/testing/selftests/net/toeplitz.sh
tools/testing/selftests/netfilter/conntrack_icmp_related.sh
tools/vm/slabinfo-gnuplot.sh

index 1247d54..8d01225 100644 (file)
@@ -535,6 +535,7 @@ ForEachMacros:
   - 'perf_hpp_list__for_each_sort_list_safe'
   - 'perf_pmu__for_each_hybrid_pmu'
   - 'ping_portaddr_for_each_entry'
+  - 'ping_portaddr_for_each_entry_rcu'
   - 'plist_for_each'
   - 'plist_for_each_continue'
   - 'plist_for_each_entry'
diff --git a/Documentation/bpf/bpf_iterators.rst b/Documentation/bpf/bpf_iterators.rst
new file mode 100644 (file)
index 0000000..6d77707
--- /dev/null
@@ -0,0 +1,485 @@
+=============
+BPF Iterators
+=============
+
+
+----------
+Motivation
+----------
+
+There are a few existing ways to dump kernel data into user space. The most
+popular one is the ``/proc`` system. For example, ``cat /proc/net/tcp6`` dumps
+all tcp6 sockets in the system, and ``cat /proc/net/netlink`` dumps all netlink
+sockets in the system. However, their output format tends to be fixed, and if
+users want more information about these sockets, they have to patch the kernel,
+which often takes time to land upstream and reach a release. The same is true
+for popular tools like `ss <https://man7.org/linux/man-pages/man8/ss.8.html>`_,
+where exposing any additional information requires a kernel patch.
+
+To solve this problem, the `drgn
+<https://www.kernel.org/doc/html/latest/bpf/drgn.html>`_ tool is often used to
+dig out the kernel data with no kernel change. However, the main drawback for
+drgn is performance, as it cannot do pointer tracing inside the kernel. In
+addition, drgn cannot validate a pointer value and may read invalid data if the
+pointer becomes invalid inside the kernel.
+
+The BPF iterator solves the above problem by providing flexibility on what data
+(e.g., tasks, bpf_maps, etc.) to collect by calling BPF programs for each kernel
+data object.
+
+----------------------
+How BPF Iterators Work
+----------------------
+
+A BPF iterator is a type of BPF program that allows users to iterate over
+specific types of kernel objects. Unlike traditional BPF tracing programs that
+allow users to define callbacks that are invoked at particular points of
+execution in the kernel, BPF iterators allow users to define callbacks that
+should be executed for every entry in a variety of kernel data structures.
+
+For example, users can define a BPF iterator that iterates over every task on
+the system and dumps the total amount of CPU runtime currently used by each of
+them. Another BPF task iterator may instead dump the cgroup information for each
+task. Such flexibility is the core value of BPF iterators.
+
+A BPF program is always loaded into the kernel at the behest of a user space
+process. A user space process loads a BPF program by opening and initializing
+the program skeleton as required and then invoking a syscall to have the BPF
+program verified and loaded by the kernel.
+
+In traditional tracing programs, a program is activated by having user space
+obtain a ``bpf_link`` to the program with ``bpf_program__attach()``. Once
+activated, the program callback will be invoked whenever the tracepoint is
+triggered in the main kernel. For BPF iterator programs, a ``bpf_link`` to the
+program is obtained using ``bpf_link_create()``, and the program callback is
+invoked by issuing system calls from user space.
+
+Next, let us see how you can use the iterators to iterate over kernel objects
+and read their data.
+
+------------------------
+How to Use BPF iterators
+------------------------
+
+BPF selftests are a great resource for illustrating how to use the iterators.
+In this section, we’ll walk through a BPF selftest that shows how to load and
+use a BPF iterator program. To begin, we’ll look at `bpf_iter.c
+<https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git/tree/tools/testing/selftests/bpf/prog_tests/bpf_iter.c>`_,
+which illustrates how to load and trigger BPF iterators on the user space side.
+Later, we’ll look at a BPF program that runs in kernel space.
+
+Loading a BPF iterator in the kernel from user space typically involves the
+following steps:
+
+* The BPF program is loaded into the kernel through ``libbpf``. Once the kernel
+  has verified and loaded the program, it returns a file descriptor (fd) to user
+  space.
+* Obtain a ``link_fd`` to the BPF program by calling ``bpf_link_create()``,
+  passing in the BPF program file descriptor received from the kernel.
+* Next, obtain a BPF iterator file descriptor (``bpf_iter_fd``) by calling
+  ``bpf_iter_create()``, passing in the ``link_fd`` obtained in the previous
+  step.
+* Trigger the iteration by calling ``read(bpf_iter_fd)`` until no data is
+  available.
+* Close the iterator fd using ``close(bpf_iter_fd)``.
+* To reread the data, obtain a new ``bpf_iter_fd`` and repeat the reads (see
+  the sketch below).
+
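+Putting these steps together, a minimal user space sketch might look like the
+following (hypothetical: it assumes ``prog_fd`` is the file descriptor of an
+already-loaded iterator program and condenses error handling):
+
+::
+
+  #include <unistd.h>
+  #include <bpf/bpf.h>
+  #include <bpf/libbpf.h>
+
+  static int run_iter(int prog_fd)
+  {
+        char buf[4096];
+        int link_fd, iter_fd, len;
+
+        /* Step 2: obtain a link_fd from the program fd. */
+        link_fd = bpf_link_create(prog_fd, 0, BPF_TRACE_ITER, NULL);
+        if (link_fd < 0)
+                return link_fd;
+
+        /* Step 3: obtain a bpf_iter_fd from the link_fd. */
+        iter_fd = bpf_iter_create(link_fd);
+        if (iter_fd < 0) {
+                close(link_fd);
+                return iter_fd;
+        }
+
+        /* Step 4: each read() triggers the BPF iterator program. */
+        while ((len = read(iter_fd, buf, sizeof(buf))) > 0)
+                write(STDOUT_FILENO, buf, len);
+
+        /* Step 5: close the fds. */
+        close(iter_fd);
+        close(link_fd);
+        return 0;
+  }
+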
+The following are a few examples of selftest BPF iterator programs:
+
+* `bpf_iter_tcp4.c <https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git/tree/tools/testing/selftests/bpf/progs/bpf_iter_tcp4.c>`_
+* `bpf_iter_task_vma.c <https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git/tree/tools/testing/selftests/bpf/progs/bpf_iter_task_vma.c>`_
+* `bpf_iter_task_file.c <https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git/tree/tools/testing/selftests/bpf/progs/bpf_iter_task_file.c>`_
+
+Let us look at ``bpf_iter_task_file.c``, which runs in kernel space:
+
+Here is the definition of ``bpf_iter__task_file`` in `vmlinux.h
+<https://facebookmicrosites.github.io/bpf/blog/2020/02/19/bpf-portability-and-co-re.html#btf>`_.
+Any struct name in ``vmlinux.h`` in the format ``bpf_iter__<iter_name>``
+represents a BPF iterator. The suffix ``<iter_name>`` represents the type of
+iterator.
+
+::
+
+    struct bpf_iter__task_file {
+            union {
+                struct bpf_iter_meta *meta;
+            };
+            union {
+                struct task_struct *task;
+            };
+            u32 fd;
+            union {
+                struct file *file;
+            };
+    };
+
+In the above code, the field 'meta' contains the metadata, which is the same for
+all BPF iterator programs. The rest of the fields are specific to different
+iterators. For example, for task_file iterators, the kernel layer provides the
+'task', 'fd' and 'file' field values. The 'task' and 'file' are `reference
+counted
+<https://facebookmicrosites.github.io/bpf/blog/2018/08/31/object-lifetime.html#file-descriptors-and-reference-counters>`_,
+so they won't go away when the BPF program runs.
+
+Here is a snippet from the ``bpf_iter_task_file.c`` file (the counters used
+below are global variables defined near the top of the full selftest):
+
+::
+
+  int count = 0;
+  int tgid = 0;
+  int last_tgid = 0;
+  int unique_tgid_count = 0;
+
+  SEC("iter/task_file")
+  int dump_task_file(struct bpf_iter__task_file *ctx)
+  {
+    struct seq_file *seq = ctx->meta->seq;
+    struct task_struct *task = ctx->task;
+    struct file *file = ctx->file;
+    __u32 fd = ctx->fd;
+
+    if (task == NULL || file == NULL)
+      return 0;
+
+    if (ctx->meta->seq_num == 0) {
+      count = 0;
+      BPF_SEQ_PRINTF(seq, "    tgid      pid       fd      file\n");
+    }
+
+    if (tgid == task->tgid && task->tgid != task->pid)
+      count++;
+
+    if (last_tgid != task->tgid) {
+      last_tgid = task->tgid;
+      unique_tgid_count++;
+    }
+
+    BPF_SEQ_PRINTF(seq, "%8d %8d %8d %lx\n", task->tgid, task->pid, fd,
+            (long)file->f_op);
+    return 0;
+  }
+
+In the above example, the section name ``SEC("iter/task_file")`` indicates that
+the program is a BPF iterator program that iterates over all files of all
+tasks. The context of the program is the ``bpf_iter__task_file`` struct.
+
+The user space program invokes the BPF iterator program running in the kernel
+by issuing a ``read()`` syscall. Once invoked, the BPF
+program can export data to user space using a variety of BPF helper functions.
+You can use either ``bpf_seq_printf()`` (and the ``BPF_SEQ_PRINTF`` helper
+macro) or ``bpf_seq_write()``, depending on whether you need formatted output
+or just binary data. For binary-encoded data, user space applications can
+process the data from ``bpf_seq_write()`` as needed. For formatted data, you
+can use ``cat <path>`` to print the results, similarly to ``cat
+/proc/net/netlink``, after pinning the BPF iterator to the bpffs mount. Later,
+use ``rm -f <path>`` to remove the pinned iterator.
+
+For example, you can use the following command to create a BPF iterator from the
+``bpf_iter_ipv6_route.o`` object file and pin it to the ``/sys/fs/bpf/my_route``
+path:
+
+::
+
+  $ bpftool iter pin ./bpf_iter_ipv6_route.o  /sys/fs/bpf/my_route
+
+And then print out the results using the following command:
+
+::
+
+  $ cat /sys/fs/bpf/my_route
+
+
+-------------------------------------------------------
+Implement Kernel Support for BPF Iterator Program Types
+-------------------------------------------------------
+
+To implement a BPF iterator in the kernel, the developer must make a one-time
+change to the following key data structure defined in the `bpf.h
+<https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git/tree/include/linux/bpf.h>`_
+file.
+
+::
+
+  struct bpf_iter_reg {
+            const char *target;
+            bpf_iter_attach_target_t attach_target;
+            bpf_iter_detach_target_t detach_target;
+            bpf_iter_show_fdinfo_t show_fdinfo;
+            bpf_iter_fill_link_info_t fill_link_info;
+            bpf_iter_get_func_proto_t get_func_proto;
+            u32 ctx_arg_info_size;
+            u32 feature;
+            struct bpf_ctx_arg_aux ctx_arg_info[BPF_ITER_CTX_ARG_MAX];
+            const struct bpf_iter_seq_info *seq_info;
+  };
+
+After filling the data structure fields, call ``bpf_iter_reg_target()`` to
+register the iterator to the main BPF iterator subsystem.
+
+The following is a breakdown of each field in ``struct bpf_iter_reg``.
+
+.. list-table::
+   :widths: 25 50
+   :header-rows: 1
+
+   * - Fields
+     - Description
+   * - target
+     - Specifies the name of the BPF iterator. For example: ``bpf_map``,
+       ``bpf_map_elem``. The name should be different from other ``bpf_iter`` target names in the kernel.
+   * - attach_target and detach_target
+     - Allows for a target-specific ``link_create`` action, since some targets
+       may need special processing. Called during the user space link_create stage.
+   * - show_fdinfo and fill_link_info
+     - Called to fill target-specific information when the user tries to get
+       link info associated with the iterator.
+   * - get_func_proto
+     - Permits a BPF iterator to access BPF helpers specific to the iterator.
+   * - ctx_arg_info_size and ctx_arg_info
+     - Specifies the verifier states for BPF program arguments associated with
+       the bpf iterator.
+   * - feature
+     - Specifies certain action requests in the kernel BPF iterator
+       infrastructure. Currently, only BPF_ITER_RESCHED is supported. This means
+       that the kernel function cond_resched() is called to avoid other kernel
+       subsystems (e.g., rcu) misbehaving.
+   * - seq_info
+     - Specifies the set of seq operations for the BPF iterator and helpers to
+       initialize/free the private data for the corresponding ``seq_file``.
+
+
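+With these fields in mind, registering a target boils down to something like
+the following condensed sketch (hypothetical: it is loosely modeled on the
+in-kernel ``task_file`` target, with the seq operations and the attach/detach
+callbacks elided):
+
+::
+
+  /* seq_ops and private-data helpers omitted for brevity. */
+  static const struct bpf_iter_seq_info task_file_seq_info;
+
+  static struct bpf_iter_reg task_file_reg_info = {
+          .target            = "task_file",
+          .feature           = BPF_ITER_RESCHED,
+          .ctx_arg_info_size = 2,
+          .ctx_arg_info      = {
+                  { offsetof(struct bpf_iter__task_file, task),
+                    PTR_TO_BTF_ID_OR_NULL },
+                  { offsetof(struct bpf_iter__task_file, file),
+                    PTR_TO_BTF_ID_OR_NULL },
+          },
+          .seq_info          = &task_file_seq_info,
+  };
+
+  static int __init task_file_iter_init(void)
+  {
+          return bpf_iter_reg_target(&task_file_reg_info);
+  }
+  late_initcall(task_file_iter_init);
+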
+`Click here
+<https://lore.kernel.org/bpf/20210212183107.50963-2-songliubraving@fb.com/>`_
+to see an implementation of the ``task_vma`` BPF iterator in the kernel.
+
+---------------------------------
+Parameterizing BPF Task Iterators
+---------------------------------
+
+By default, BPF iterators walk through all the objects of the specified types
+(processes, cgroups, maps, etc.) across the entire system to read relevant
+kernel data. But often, there are cases where we only care about a much smaller
+subset of iterable kernel objects, such as only iterating tasks within a
+specific process. Therefore, BPF iterator programs support filtering out objects
+from iteration by allowing user space to configure the iterator program when it
+is attached.
+
+--------------------------
+BPF Task Iterator Program
+--------------------------
+
+The following code is a BPF iterator program that prints file and task
+information through the ``seq_file`` of the iterator. It is a standard BPF
+iterator program that visits every file the iterator yields. We will use this
+BPF program in our example later.
+
+::
+
+  #include <vmlinux.h>
+  #include <bpf/bpf_helpers.h>
+
+  char _license[] SEC("license") = "GPL";
+
+  SEC("iter/task_file")
+  int dump_task_file(struct bpf_iter__task_file *ctx)
+  {
+        struct seq_file *seq = ctx->meta->seq;
+        struct task_struct *task = ctx->task;
+        struct file *file = ctx->file;
+        __u32 fd = ctx->fd;
+        if (task == NULL || file == NULL)
+                return 0;
+        if (ctx->meta->seq_num == 0) {
+                BPF_SEQ_PRINTF(seq, "    tgid      pid       fd      file\n");
+        }
+        BPF_SEQ_PRINTF(seq, "%8d %8d %8d %lx\n", task->tgid, task->pid, fd,
+                        (long)file->f_op);
+        return 0;
+  }
+
+----------------------------------------
+Creating a File Iterator with Parameters
+----------------------------------------
+
+Now, let us look at how to create an iterator that includes only files of a
+process.
+
+First, fill the ``bpf_iter_attach_opts`` struct as shown below:
+
+::
+
+  LIBBPF_OPTS(bpf_iter_attach_opts, opts);
+  union bpf_iter_link_info linfo;
+  memset(&linfo, 0, sizeof(linfo));
+  linfo.task.pid = getpid();
+  opts.link_info = &linfo;
+  opts.link_info_len = sizeof(linfo);
+
+``linfo.task.pid``, if it is non-zero, directs the kernel to create an iterator
+that only includes opened files for the process with the specified ``pid``. In
+this example, we will only be iterating files for our process. If
+``linfo.task.pid`` is zero, the iterator will visit every opened file of every
+process. Similarly, ``linfo.task.tid`` directs the kernel to create an iterator
+that visits opened files of a specific thread, not a process. In this example,
+``linfo.task.tid`` is different from ``linfo.task.pid`` only if the thread has a
+separate file descriptor table. In most circumstances, all process threads share
+a single file descriptor table.
+
+Now, in the user space program, pass a pointer to this struct to
+``bpf_program__attach_iter()``.
+
+::
+
+  link = bpf_program__attach_iter(prog, &opts);
+  iter_fd = bpf_iter_create(bpf_link__fd(link));
+
+If both *tid* and *pid* are zero, an iterator created from this struct
+``bpf_iter_attach_opts`` will include every opened file of every task in the
+system (in the current *pid* namespace, actually). It is the same as passing
+NULL as the second argument to ``bpf_program__attach_iter()``.
+
+The whole program looks like the following code:
+
+::
+
+  #include <stdio.h>
+  #include <unistd.h>
+  #include <bpf/bpf.h>
+  #include <bpf/libbpf.h>
+  #include "bpf_iter_task_ex.skel.h"
+
+  static int do_read_opts(struct bpf_program *prog, struct bpf_iter_attach_opts *opts)
+  {
+        struct bpf_link *link;
+        char buf[16] = {};
+        int iter_fd = -1, len;
+        int ret = 0;
+
+        link = bpf_program__attach_iter(prog, opts);
+        if (!link) {
+                fprintf(stderr, "bpf_program__attach_iter() fails\n");
+                return -1;
+        }
+        iter_fd = bpf_iter_create(bpf_link__fd(link));
+        if (iter_fd < 0) {
+                fprintf(stderr, "bpf_iter_create() fails\n");
+                ret = -1;
+                goto free_link;
+        }
+        /* Don't check the contents, just ensure read() ends without error. */
+        while ((len = read(iter_fd, buf, sizeof(buf) - 1)) > 0) {
+                buf[len] = 0;
+                printf("%s", buf);
+        }
+        printf("\n");
+  free_link:
+        if (iter_fd >= 0)
+                close(iter_fd);
+        bpf_link__destroy(link);
+        return ret;
+  }
+
+  static void test_task_file(void)
+  {
+        LIBBPF_OPTS(bpf_iter_attach_opts, opts);
+        struct bpf_iter_task_ex *skel;
+        union bpf_iter_link_info linfo;
+        skel = bpf_iter_task_ex__open_and_load();
+        if (skel == NULL)
+                return;
+        memset(&linfo, 0, sizeof(linfo));
+        linfo.task.pid = getpid();
+        opts.link_info = &linfo;
+        opts.link_info_len = sizeof(linfo);
+        printf("PID %d\n", getpid());
+        do_read_opts(skel->progs.dump_task_file, &opts);
+        bpf_iter_task_ex__destroy(skel);
+  }
+
+  int main(int argc, const char * const * argv)
+  {
+        test_task_file();
+        return 0;
+  }
+
+The following lines are the output of the program::
+
+  PID 1859
+
+     tgid      pid       fd      file
+     1859     1859        0 ffffffff82270aa0
+     1859     1859        1 ffffffff82270aa0
+     1859     1859        2 ffffffff82270aa0
+     1859     1859        3 ffffffff82272980
+     1859     1859        4 ffffffff8225e120
+     1859     1859        5 ffffffff82255120
+     1859     1859        6 ffffffff82254f00
+     1859     1859        7 ffffffff82254d80
+     1859     1859        8 ffffffff8225abe0
+
+------------------
+Without Parameters
+------------------
+
+Let us look at how a BPF iterator without parameters can skip files of other
+processes in the system. In this case, the BPF program has to check the pid or
+the tid of tasks, or it will receive every opened file in the system (in the
+current *pid* namespace, actually). So, we usually add a global variable to the
+BPF program through which user space passes in a target *pid*.
+
+The BPF program would look like the following block.
+
+  ::
+
+    ......
+    int target_pid = 0;
+
+    SEC("iter/task_file")
+    int dump_task_file(struct bpf_iter__task_file *ctx)
+    {
+          ......
+          if (task->tgid != target_pid) /* Check task->pid instead to check thread IDs */
+                  return 0;
+          BPF_SEQ_PRINTF(seq, "%8d %8d %8d %lx\n", task->tgid, task->pid, fd,
+                          (long)file->f_op);
+          return 0;
+    }
+
+The user space program would look like the following block:
+
+  ::
+
+    ......
+    static void test_task_file(void)
+    {
+          ......
+          skel = bpf_iter_task_ex__open_and_load();
+          if (skel == NULL)
+                  return;
+          skel->bss->target_pid = getpid(); /* process ID; for thread IDs, use gettid() */
+          memset(&linfo, 0, sizeof(linfo));
+          linfo.task.pid = getpid();
+          opts.link_info = &linfo;
+          opts.link_info_len = sizeof(linfo);
+          ......
+    }
+
+``target_pid`` is a global variable in the BPF program. The user space program
+should initialize the variable with a process ID to skip opened files of other
+processes in the BPF program. When you parametrize a BPF iterator, the iterator
+calls the BPF program fewer times, which can save significant resources.
+
+---------------------------
+Parametrizing VMA Iterators
+---------------------------
+
+By default, a BPF VMA iterator includes every VMA in every process. However,
+you can still specify a process or a thread to include only its VMAs. Unlike
+files, a thread cannot have a separate address space (since Linux
+2.6.0-test6), so here using *tid* makes no difference from using *pid*.
+
+----------------------------
+Parametrizing Task Iterators
+----------------------------
+
+A BPF task iterator with *pid* includes all tasks (threads) of a process. The
+BPF program receives these tasks one after another. You can specify a BPF task
+iterator with the *tid* parameter to include only the tasks that match the
+given *tid*, as sketched below.
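+
+A minimal sketch of that parametrization (it reuses the ``opts`` and ``linfo``
+setup from the earlier example; ``gettid()`` is assumed to be available, e.g.
+from glibc 2.30 or later):
+
+::
+
+  memset(&linfo, 0, sizeof(linfo));
+  linfo.task.tid = gettid();
+  opts.link_info = &linfo;
+  opts.link_info_len = sizeof(linfo);
+  link = bpf_program__attach_iter(prog, &opts);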
index 1088d44..b81533d 100644 (file)
@@ -24,6 +24,7 @@ that goes into great technical depth about the BPF Architecture.
    maps
    bpf_prog_run
    classic_vs_extended.rst
+   bpf_iterators
    bpf_licensing
    test_debug
    clang-notes
index 5d79843..e672d5e 100644 (file)
@@ -122,11 +122,11 @@ BPF_END   0xd0   byte swap operations (see `Byte swap instructions`_ below)
 
 ``BPF_XOR | BPF_K | BPF_ALU`` means::
 
-  src_reg = (u32) src_reg ^ (u32) imm32
+  dst_reg = (u32) dst_reg ^ (u32) imm32
 
 ``BPF_XOR | BPF_K | BPF_ALU64`` means::
 
-  src_reg = src_reg ^ imm32
+  dst_reg = dst_reg ^ imm32
 
 
 Byte swap instructions
index 9077447..9fd7fb5 100644 (file)
@@ -191,6 +191,15 @@ rebooting or panicking. Due to this additional restrictions apply to these
 calls. At the moment they only require CAP_SYS_BOOT capability, but more can be
 added later.
 
+2.4.8 KF_RCU flag
+-----------------
+
+The KF_RCU flag is used for kfuncs which take an RCU-protected pointer as
+their argument. When used together with KF_ACQUIRE, it indicates that the
+kfunc should have a single argument which must be a trusted argument or a
+MEM_RCU pointer. The argument may have a reference count of 0, and the kfunc
+must take this into consideration.
+
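+For example, a kfunc taking an RCU-protected pointer could be registered with
+both flags set (``my_obj_acquire`` is a hypothetical kfunc, shown only for
+illustration; see section 2.5 below for the full registration flow):
+
+::
+
+  BTF_ID_FLAGS(func, my_obj_acquire, KF_ACQUIRE | KF_RCU)
+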
 2.5 Registering the kfuncs
 --------------------------
 
@@ -213,3 +222,201 @@ type. An example is shown below::
                 return register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &bpf_task_kfunc_set);
         }
         late_initcall(init_subsystem);
+
+3. Core kfuncs
+==============
+
+The BPF subsystem provides a number of "core" kfuncs that are potentially
+applicable to a wide variety of use cases and programs. Those kfuncs are
+documented here.
+
+3.1 struct task_struct * kfuncs
+-------------------------------
+
+There are a number of kfuncs that allow ``struct task_struct *`` objects to be
+used as kptrs:
+
+.. kernel-doc:: kernel/bpf/helpers.c
+   :identifiers: bpf_task_acquire bpf_task_release
+
+These kfuncs are useful when you want to acquire or release a reference to a
+``struct task_struct *`` that was passed as e.g. a tracepoint arg, or a
+struct_ops callback arg. For example:
+
+.. code-block:: c
+
+       /**
+        * A trivial example tracepoint program that shows how to
+        * acquire and release a struct task_struct * pointer.
+        */
+       SEC("tp_btf/task_newtask")
+       int BPF_PROG(task_acquire_release_example, struct task_struct *task, u64 clone_flags)
+       {
+               struct task_struct *acquired;
+
+               acquired = bpf_task_acquire(task);
+
+               /*
+                * In a typical program you'd do something like store
+                * the task in a map, and the map will automatically
+                * release it later. Here, we release it manually.
+                */
+               bpf_task_release(acquired);
+               return 0;
+       }
+
+----
+
+A BPF program can also look up a task from a pid. This can be useful if the
+caller doesn't have a trusted pointer to a ``struct task_struct *`` object that
+it can acquire a reference on with bpf_task_acquire().
+
+.. kernel-doc:: kernel/bpf/helpers.c
+   :identifiers: bpf_task_from_pid
+
+Here is an example of it being used:
+
+.. code-block:: c
+
+       SEC("tp_btf/task_newtask")
+       int BPF_PROG(task_get_pid_example, struct task_struct *task, u64 clone_flags)
+       {
+               struct task_struct *lookup;
+
+               lookup = bpf_task_from_pid(task->pid);
+               if (!lookup)
+                       /* A task should always be found, as %task is a tracepoint arg. */
+                       return -ENOENT;
+
+               if (lookup->pid != task->pid) {
+                       /* bpf_task_from_pid() looks up the task via its
+                        * globally-unique pid from the init_pid_ns. Thus,
+                        * the pid of the lookup task should always be the
+                        * same as the input task.
+                        */
+                       bpf_task_release(lookup);
+                       return -EINVAL;
+               }
+
+               /* bpf_task_from_pid() returns an acquired reference,
+                * so it must be dropped before returning from the
+                * tracepoint handler.
+                */
+               bpf_task_release(lookup);
+               return 0;
+       }
+
+3.2 struct cgroup * kfuncs
+--------------------------
+
+``struct cgroup *`` objects also have acquire and release functions:
+
+.. kernel-doc:: kernel/bpf/helpers.c
+   :identifiers: bpf_cgroup_acquire bpf_cgroup_release
+
+These kfuncs are used in exactly the same manner as bpf_task_acquire() and
+bpf_task_release() respectively, so we won't provide examples for them.
+
+----
+
+You may also acquire a reference to a ``struct cgroup`` kptr that's already
+stored in a map using bpf_cgroup_kptr_get():
+
+.. kernel-doc:: kernel/bpf/helpers.c
+   :identifiers: bpf_cgroup_kptr_get
+
+Here's an example of how it can be used:
+
+.. code-block:: c
+
+       /* struct containing the struct cgroup kptr which is actually stored in the map. */
+       struct __cgroups_kfunc_map_value {
+               struct cgroup __kptr_ref *cgroup;
+       };
+
+       /* The map containing struct __cgroups_kfunc_map_value entries. */
+       struct {
+               __uint(type, BPF_MAP_TYPE_HASH);
+               __type(key, int);
+               __type(value, struct __cgroups_kfunc_map_value);
+               __uint(max_entries, 1);
+       } __cgroups_kfunc_map SEC(".maps");
+
+       /* ... */
+
+       /**
+        * A simple example tracepoint program showing how a
+        * struct cgroup kptr that is stored in a map can
+        * be acquired using the bpf_cgroup_kptr_get() kfunc.
+        */
+        SEC("tp_btf/cgroup_mkdir")
+        int BPF_PROG(cgroup_kptr_get_example, struct cgroup *cgrp, const char *path)
+        {
+               struct cgroup *kptr;
+               struct __cgroups_kfunc_map_value *v;
+               s32 id = cgrp->self.id;
+
+               /* Assume a cgroup kptr was previously stored in the map. */
+               v = bpf_map_lookup_elem(&__cgroups_kfunc_map, &id);
+               if (!v)
+                       return -ENOENT;
+
+               /* Acquire a reference to the cgroup kptr that's already stored in the map. */
+               kptr = bpf_cgroup_kptr_get(&v->cgroup);
+               if (!kptr)
+                       /* If no cgroup was present in the map, it's because
+                        * we're racing with another CPU that removed it with
+                        * bpf_kptr_xchg() between the bpf_map_lookup_elem()
+                        * above, and our call to bpf_cgroup_kptr_get().
+                        * bpf_cgroup_kptr_get() internally safely handles this
+                        * race, and will return NULL if the cgroup is no longer
+                        * present in the map by the time we invoke the kfunc.
+                        */
+                       return -EBUSY;
+
+               /* Free the reference we just took above. Note that the
+                * original struct cgroup kptr is still in the map. It will
+                * be freed either at a later time if another context deletes
+                * it from the map, or automatically by the BPF subsystem if
+                * it's still present when the map is destroyed.
+                */
+               bpf_cgroup_release(kptr);
+
+               return 0;
+        }
+
+----
+
+Another kfunc available for interacting with ``struct cgroup *`` objects is
+bpf_cgroup_ancestor(). This allows callers to access the ancestor of a cgroup
+and have it returned as a cgroup kptr.
+
+.. kernel-doc:: kernel/bpf/helpers.c
+   :identifiers: bpf_cgroup_ancestor
+
+Eventually, BPF should be updated to allow this to happen with a normal memory
+load in the program itself. This is currently not possible without more work in
+the verifier. bpf_cgroup_ancestor() can be used as follows:
+
+.. code-block:: c
+
+       /**
+        * Simple tracepoint example that illustrates how a cgroup's
+        * ancestor can be accessed using bpf_cgroup_ancestor().
+        */
+       SEC("tp_btf/cgroup_mkdir")
+       int BPF_PROG(cgrp_ancestor_example, struct cgroup *cgrp, const char *path)
+       {
+               struct cgroup *parent;
+
+               /* The parent cgroup resides at the level before the current cgroup's level. */
+               parent = bpf_cgroup_ancestor(cgrp, cgrp->level - 1);
+               if (!parent)
+                       return -ENOENT;
+
+               bpf_printk("Parent id is %d", parent->self.id);
+
+               /* Return the parent cgroup that was acquired above. */
+               bpf_cgroup_release(parent);
+               return 0;
+       }
diff --git a/Documentation/bpf/map_sk_storage.rst b/Documentation/bpf/map_sk_storage.rst
new file mode 100644 (file)
index 0000000..047e16c
--- /dev/null
@@ -0,0 +1,155 @@
+.. SPDX-License-Identifier: GPL-2.0-only
+.. Copyright (C) 2022 Red Hat, Inc.
+
+=======================
+BPF_MAP_TYPE_SK_STORAGE
+=======================
+
+.. note::
+   - ``BPF_MAP_TYPE_SK_STORAGE`` was introduced in kernel version 5.2
+
+``BPF_MAP_TYPE_SK_STORAGE`` is used to provide socket-local storage for BPF
+programs. A map of type ``BPF_MAP_TYPE_SK_STORAGE`` declares the type of storage
+to be provided and acts as the handle for accessing the socket-local
+storage. The values for maps of type ``BPF_MAP_TYPE_SK_STORAGE`` are stored
+locally with each socket instead of with the map. The kernel is responsible for
+allocating storage for a socket when requested and for freeing the storage when
+either the map or the socket is deleted.
+
+.. note::
+  - The key type must be ``int`` and ``max_entries`` must be set to ``0``.
+  - The ``BPF_F_NO_PREALLOC`` flag must be used when creating a map for
+    socket-local storage.
+
+Usage
+=====
+
+Kernel BPF
+----------
+
+bpf_sk_storage_get()
+~~~~~~~~~~~~~~~~~~~~
+
+.. code-block:: c
+
+   void *bpf_sk_storage_get(struct bpf_map *map, void *sk, void *value, u64 flags)
+
+Socket-local storage can be retrieved using the ``bpf_sk_storage_get()``
+helper. The helper gets the storage from ``sk`` that is associated with ``map``.
+If the ``BPF_LOCAL_STORAGE_GET_F_CREATE`` flag is used then
+``bpf_sk_storage_get()`` will create the storage for ``sk`` if it does not
+already exist. ``value`` can be used together with
+``BPF_LOCAL_STORAGE_GET_F_CREATE`` to initialize the storage value, otherwise it
+will be zero initialized. Returns a pointer to the storage on success, or
+``NULL`` in case of failure.
+
+.. note::
+   - ``sk`` is a kernel ``struct sock`` pointer for LSM or tracing programs.
+   - ``sk`` is a ``struct bpf_sock`` pointer for other program types.
+
+bpf_sk_storage_delete()
+~~~~~~~~~~~~~~~~~~~~~~~
+
+.. code-block:: c
+
+   long bpf_sk_storage_delete(struct bpf_map *map, void *sk)
+
+Socket-local storage can be deleted using the ``bpf_sk_storage_delete()``
+helper. The helper deletes the storage from ``sk`` that is identified by
+``map``. Returns ``0`` on success, or negative error in case of failure.
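+
+As a minimal sketch (the program name is illustrative and the
+``socket_storage`` map is the one declared under Examples below), deleting
+storage from a BPF program could look like:
+
+.. code-block:: c
+
+    SEC("sockops")
+    int _sockops_release(struct bpf_sock_ops *ctx)
+    {
+            struct bpf_sock *sk;
+
+            sk = ctx->sk;
+            if (!sk)
+                    return 1;
+
+            /* Drop this socket's storage, if any; errors are ignored here. */
+            bpf_sk_storage_delete(&socket_storage, sk);
+            return 1;
+    }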
+
+User space
+----------
+
+bpf_map_update_elem()
+~~~~~~~~~~~~~~~~~~~~~
+
+.. code-block:: c
+
+   int bpf_map_update_elem(int map_fd, const void *key, const void *value, __u64 flags)
+
+Socket-local storage for the socket identified by ``key`` belonging to
+``map_fd`` can be added or updated using the ``bpf_map_update_elem()`` libbpf
+function. ``key`` must be a pointer to a valid ``fd`` in the user space
+program. The ``flags`` parameter can be used to control the update behaviour:
+
+- ``BPF_ANY`` will create storage for ``fd`` or update existing storage.
+- ``BPF_NOEXIST`` will create storage for ``fd`` only if it did not already
+  exist, otherwise the call will fail with ``-EEXIST``.
+- ``BPF_EXIST`` will update existing storage for ``fd`` if it already exists,
+  otherwise the call will fail with ``-ENOENT``.
+
+Returns ``0`` on success, or negative error in case of failure.
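+
+A minimal user space sketch; ``map_fd``, ``sock_fd`` and ``struct my_storage``
+are assumptions matching the declaration shown under Examples below:
+
+.. code-block:: c
+
+    struct my_storage value = {};
+    int sock_fd;    /* fd of a socket owned by this process */
+    int err;
+
+    /* Create storage for sock_fd; fails with -EEXIST if it already exists. */
+    err = bpf_map_update_elem(map_fd, &sock_fd, &value, BPF_NOEXIST);
+    if (err)
+            fprintf(stderr, "sk_storage update: %d\n", err);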
+
+bpf_map_lookup_elem()
+~~~~~~~~~~~~~~~~~~~~~
+
+.. code-block:: c
+
+   int bpf_map_lookup_elem(int map_fd, const void *key, void *value)
+
+Socket-local storage for the socket identified by ``key`` belonging to
+``map_fd`` can be retrieved using the ``bpf_map_lookup_elem()`` libbpf
+function. ``key`` must be a pointer to a valid ``fd`` in the user space
+program. Returns ``0`` on success, or negative error in case of failure.
+
+bpf_map_delete_elem()
+~~~~~~~~~~~~~~~~~~~~~
+
+.. code-block:: c
+
+   int bpf_map_delete_elem(int map_fd, const void *key)
+
+Socket-local storage for the socket identified by ``key`` belonging to
+``map_fd`` can be deleted using the ``bpf_map_delete_elem()`` libbpf
+function. Returns ``0`` on success, or negative error in case of failure.
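+
+Continuing the sketch from the update example above (same assumed ``map_fd``
+and ``sock_fd``), the stored value can be read back and then removed:
+
+.. code-block:: c
+
+    struct my_storage value;
+
+    /* Copy the socket-local storage for sock_fd into value. */
+    if (!bpf_map_lookup_elem(map_fd, &sock_fd, &value)) {
+            /* Use value here. */
+    }
+
+    /* Remove the storage; later lookups will fail with -ENOENT. */
+    bpf_map_delete_elem(map_fd, &sock_fd);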
+
+Examples
+========
+
+Kernel BPF
+----------
+
+This snippet shows how to declare socket-local storage in a BPF program:
+
+.. code-block:: c
+
+    struct {
+            __uint(type, BPF_MAP_TYPE_SK_STORAGE);
+            __uint(map_flags, BPF_F_NO_PREALLOC);
+            __type(key, int);
+            __type(value, struct my_storage);
+    } socket_storage SEC(".maps");
+
+This snippet shows how to retrieve socket-local storage in a BPF program:
+
+.. code-block:: c
+
+    SEC("sockops")
+    int _sockops(struct bpf_sock_ops *ctx)
+    {
+            struct my_storage *storage;
+            struct bpf_sock *sk;
+
+            sk = ctx->sk;
+            if (!sk)
+                    return 1;
+
+            storage = bpf_sk_storage_get(&socket_storage, sk, 0,
+                                         BPF_LOCAL_STORAGE_GET_F_CREATE);
+            if (!storage)
+                    return 1;
+
+            /* Use 'storage' here */
+
+            return 1;
+    }
+
+
+Please see the ``tools/testing/selftests/bpf`` directory for functional
+examples.
+
+References
+==========
+
+https://lwn.net/ml/netdev/20190426171103.61892-1-kafai@fb.com/
index 2ab4642..55c4f94 100644 (file)
@@ -148,7 +148,7 @@ allOf:
           items:
             - const: oscclk
             - const: dout_clkcmu_fsys1_bus
-            - const: dout_clkcmu_fsys1_mmc_card
+            - const: gout_clkcmu_fsys1_mmc_card
             - const: dout_clkcmu_fsys1_usbdrd
 
   - if:
diff --git a/Documentation/devicetree/bindings/net/bluetooth.txt b/Documentation/devicetree/bindings/net/bluetooth.txt
deleted file mode 100644 (file)
index 94797df..0000000
+++ /dev/null
@@ -1,5 +0,0 @@
-The following properties are common to the Bluetooth controllers:
-
-- local-bd-address: array of 6 bytes, specifies the BD address that was
-  uniquely assigned to the Bluetooth device, formatted with least significant
-  byte first (little-endian).
diff --git a/Documentation/devicetree/bindings/net/bluetooth/bluetooth-controller.yaml b/Documentation/devicetree/bindings/net/bluetooth/bluetooth-controller.yaml
new file mode 100644 (file)
index 0000000..9309dc4
--- /dev/null
@@ -0,0 +1,29 @@
+# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/net/bluetooth/bluetooth-controller.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Bluetooth Controller Generic Binding
+
+maintainers:
+  - Marcel Holtmann <marcel@holtmann.org>
+  - Johan Hedberg <johan.hedberg@gmail.com>
+  - Luiz Augusto von Dentz <luiz.dentz@gmail.com>
+
+properties:
+  $nodename:
+    pattern: "^bluetooth(@.*)?$"
+
+  local-bd-address:
+    $ref: /schemas/types.yaml#/definitions/uint8-array
+    maxItems: 6
+    description:
+      Specifies the BD address that was uniquely assigned to the Bluetooth
+      device. Formatted with least significant byte first (little-endian), e.g.
+      in order to assign the address 00:11:22:33:44:55 this property must have
+      the value [55 44 33 22 11 00].
+
+additionalProperties: true
+
+...
diff --git a/Documentation/devicetree/bindings/net/bluetooth/brcm,bcm4377-bluetooth.yaml b/Documentation/devicetree/bindings/net/bluetooth/brcm,bcm4377-bluetooth.yaml
new file mode 100644 (file)
index 0000000..37cb39a
--- /dev/null
@@ -0,0 +1,81 @@
+# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/net/bluetooth/brcm,bcm4377-bluetooth.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Broadcom BCM4377 family PCIe Bluetooth Chips
+
+maintainers:
+  - Sven Peter <sven@svenpeter.dev>
+
+description:
+  This binding describes Broadcom BCM4377 family PCIe-attached Bluetooth chips
+  usually found in Apple machines. The Wi-Fi part of the chip is described in
+  bindings/net/wireless/brcm,bcm4329-fmac.yaml.
+
+allOf:
+  - $ref: bluetooth-controller.yaml#
+
+properties:
+  compatible:
+    enum:
+      - pci14e4,5fa0 # BCM4377
+      - pci14e4,5f69 # BCM4378
+      - pci14e4,5f71 # BCM4387
+
+  reg:
+    maxItems: 1
+
+  brcm,board-type:
+    $ref: /schemas/types.yaml#/definitions/string
+    description: Board type of the Bluetooth chip. This is used to decouple
+      the overall system board from the Bluetooth module and is used to construct
+      firmware and calibration data filenames.
+      On Apple platforms, this should be the Apple module-instance codename
+      prefixed by "apple,", e.g. "apple,atlantisb".
+    pattern: '^apple,.*'
+
+  brcm,taurus-cal-blob:
+    $ref: /schemas/types.yaml#/definitions/uint8-array
+    description: A per-device calibration blob for the Bluetooth radio. This
+      should be filled in by the bootloader from platform configuration
+      data, if necessary, and will be uploaded to the device.
+      This blob is used if the chip stepping of the Bluetooth module does not
+      support beamforming.
+
+  brcm,taurus-bf-cal-blob:
+    $ref: /schemas/types.yaml#/definitions/uint8-array
+    description: A per-device calibration blob for the Bluetooth radio. This
+      should be filled in by the bootloader from platform configuration
+      data, if necessary, and will be uploaded to the device.
+      This blob is used if the chip stepping of the Bluetooth module supports
+      beamforming.
+
+  local-bd-address: true
+
+required:
+  - compatible
+  - reg
+  - local-bd-address
+  - brcm,board-type
+
+additionalProperties: false
+
+examples:
+  - |
+    pcie@a0000000 {
+      #address-cells = <3>;
+      #size-cells = <2>;
+      reg = <0xa0000000 0x1000000>;
+      device_type = "pci";
+      ranges = <0x43000000 0x6 0xa0000000 0xa0000000 0x0 0x20000000>;
+
+      bluetooth@0,1 {
+        compatible = "pci14e4,5f69";
+        reg = <0x100 0x0 0x0 0x0 0x0>;
+        brcm,board-type = "apple,honshu";
+        /* To be filled by the bootloader */
+        local-bd-address = [00 00 00 00 00 00];
+      };
+    };
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause)
 %YAML 1.2
 ---
-$id: http://devicetree.org/schemas/net/qualcomm-bluetooth.yaml#
+$id: http://devicetree.org/schemas/net/bluetooth/qualcomm-bluetooth.yaml#
 $schema: http://devicetree.org/meta-schemas/core.yaml#
 
 title: Qualcomm Bluetooth Chips
@@ -79,8 +79,7 @@ properties:
   firmware-name:
     description: specify the name of nvm firmware to load
 
-  local-bd-address:
-    description: see Documentation/devicetree/bindings/net/bluetooth.txt
+  local-bd-address: true
 
 
 required:
@@ -89,6 +88,7 @@ required:
 additionalProperties: false
 
 allOf:
+  - $ref: bluetooth-controller.yaml#
   - if:
       properties:
         compatible:
index 445b2a5..b964c7d 100644 (file)
@@ -19,11 +19,14 @@ properties:
       - brcm,bcm4329-bt
       - brcm,bcm4330-bt
       - brcm,bcm4334-bt
+      - brcm,bcm43430a0-bt
+      - brcm,bcm43430a1-bt
       - brcm,bcm43438-bt
       - brcm,bcm4345c5
       - brcm,bcm43540-bt
       - brcm,bcm4335a0
       - brcm,bcm4349-bt
+      - cypress,cyw4373a0-bt
       - infineon,cyw55572-bt
 
   shutdown-gpios:
index e52db84..6e59bd2 100644 (file)
@@ -17,6 +17,7 @@ properties:
   compatible:
     oneOf:
       - enum:
+          - fsl,imx93-flexcan
           - fsl,imx8qm-flexcan
           - fsl,imx8mp-flexcan
           - fsl,imx6q-flexcan
index 6f71fc9..1eb98c9 100644 (file)
@@ -9,9 +9,6 @@ title: Renesas R-Car CAN FD Controller
 maintainers:
   - Fabrizio Castro <fabrizio.castro.jz@renesas.com>
 
-allOf:
-  - $ref: can-controller.yaml#
-
 properties:
   compatible:
     oneOf:
@@ -33,7 +30,7 @@ properties:
 
       - items:
           - enum:
-              - renesas,r9a07g043-canfd    # RZ/G2UL
+              - renesas,r9a07g043-canfd    # RZ/G2UL and RZ/Five
               - renesas,r9a07g044-canfd    # RZ/G2{L,LC}
               - renesas,r9a07g054-canfd    # RZ/V2L
           - const: renesas,rzg2l-canfd     # RZ/G2L family
@@ -77,12 +74,13 @@ properties:
     description: Maximum frequency of the CANFD clock.
 
 patternProperties:
-  "^channel[01]$":
+  "^channel[0-7]$":
     type: object
     description:
-      The controller supports two channels and each is represented as a child
-      node.  Each child node supports the "status" property only, which
-      is used to enable/disable the respective channel.
+      The controller supports multiple channels and each is represented as a
+      child node.  Each channel can be enabled/disabled individually.
+
+    additionalProperties: false
 
 required:
   - compatible
@@ -98,60 +96,73 @@ required:
   - channel0
   - channel1
 
-if:
-  properties:
-    compatible:
-      contains:
-        enum:
-          - renesas,rzg2l-canfd
-then:
-  properties:
-    interrupts:
-      items:
-        - description: CAN global error interrupt
-        - description: CAN receive FIFO interrupt
-        - description: CAN0 error interrupt
-        - description: CAN0 transmit interrupt
-        - description: CAN0 transmit/receive FIFO receive completion interrupt
-        - description: CAN1 error interrupt
-        - description: CAN1 transmit interrupt
-        - description: CAN1 transmit/receive FIFO receive completion interrupt
-
-    interrupt-names:
-      items:
-        - const: g_err
-        - const: g_recc
-        - const: ch0_err
-        - const: ch0_rec
-        - const: ch0_trx
-        - const: ch1_err
-        - const: ch1_rec
-        - const: ch1_trx
-
-    resets:
-      maxItems: 2
-
-    reset-names:
-      items:
-        - const: rstp_n
-        - const: rstc_n
-
-  required:
-    - reset-names
-else:
-  properties:
-    interrupts:
-      items:
-        - description: Channel interrupt
-        - description: Global interrupt
-
-    interrupt-names:
-      items:
-        - const: ch_int
-        - const: g_int
-
-    resets:
-      maxItems: 1
+allOf:
+  - $ref: can-controller.yaml#
+
+  - if:
+      properties:
+        compatible:
+          contains:
+            enum:
+              - renesas,rzg2l-canfd
+    then:
+      properties:
+        interrupts:
+          items:
+            - description: CAN global error interrupt
+            - description: CAN receive FIFO interrupt
+            - description: CAN0 error interrupt
+            - description: CAN0 transmit interrupt
+            - description: CAN0 transmit/receive FIFO receive completion interrupt
+            - description: CAN1 error interrupt
+            - description: CAN1 transmit interrupt
+            - description: CAN1 transmit/receive FIFO receive completion interrupt
+
+        interrupt-names:
+          items:
+            - const: g_err
+            - const: g_recc
+            - const: ch0_err
+            - const: ch0_rec
+            - const: ch0_trx
+            - const: ch1_err
+            - const: ch1_rec
+            - const: ch1_trx
+
+        resets:
+          maxItems: 2
+
+        reset-names:
+          items:
+            - const: rstp_n
+            - const: rstc_n
+
+      required:
+        - reset-names
+    else:
+      properties:
+        interrupts:
+          items:
+            - description: Channel interrupt
+            - description: Global interrupt
+
+        interrupt-names:
+          items:
+            - const: ch_int
+            - const: g_int
+
+        resets:
+          maxItems: 1
+
+  - if:
+      not:
+        properties:
+          compatible:
+            contains:
+              const: renesas,r8a779a0-canfd
+    then:
+      patternProperties:
+        "^channel[2-7]$": false
 
 unevaluatedProperties: false
 
index 73b774e..1d7dab3 100644 (file)
@@ -12,7 +12,7 @@ allOf:
 maintainers:
   - Andrew Lunn <andrew@lunn.ch>
   - Florian Fainelli <f.fainelli@gmail.com>
-  - Vivien Didelot <vivien.didelot@gmail.com>
+  - Vladimir Oltean <olteanv@gmail.com>
   - Kurt Kanzenbach <kurt@linutronix.de>
 
 description:
index 9e81b9e..4aeda37 100644 (file)
@@ -49,6 +49,7 @@ properties:
       - qcom,sc7280-ipa
       - qcom,sdm845-ipa
       - qcom,sdx55-ipa
+      - qcom,sm6350-ipa
       - qcom,sm8350-ipa
 
   reg:
index e329ef0..143b566 100644 (file)
@@ -20,6 +20,7 @@ properties:
     enum:
       - realtek,rtl8723bs-bt
       - realtek,rtl8723cs-bt
+      - realtek,rtl8723ds-bt
       - realtek,rtl8822cs-bt
 
   device-wake-gpios:
diff --git a/Documentation/devicetree/bindings/net/socionext,synquacer-netsec.yaml b/Documentation/devicetree/bindings/net/socionext,synquacer-netsec.yaml
new file mode 100644 (file)
index 0000000..a65e6aa
--- /dev/null
@@ -0,0 +1,73 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/net/socionext,synquacer-netsec.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Socionext NetSec Ethernet Controller IP
+
+maintainers:
+  - Jassi Brar <jaswinder.singh@linaro.org>
+  - Ilias Apalodimas <ilias.apalodimas@linaro.org>
+
+allOf:
+  - $ref: ethernet-controller.yaml#
+
+properties:
+  compatible:
+    const: socionext,synquacer-netsec
+
+  reg:
+    items:
+      - description: control register area
+      - description: EEPROM holding the MAC address and microengine firmware
+
+  clocks:
+    maxItems: 1
+
+  clock-names:
+    const: phy_ref_clk
+
+  dma-coherent: true
+
+  interrupts:
+    maxItems: 1
+
+  mdio:
+    $ref: mdio.yaml#
+
+required:
+  - compatible
+  - reg
+  - clocks
+  - clock-names
+  - interrupts
+  - mdio
+
+unevaluatedProperties: false
+
+examples:
+  - |
+    #include <dt-bindings/interrupt-controller/arm-gic.h>
+
+    ethernet@522d0000 {
+        compatible = "socionext,synquacer-netsec";
+        reg = <0x522d0000 0x10000>, <0x10000000 0x10000>;
+        interrupts = <GIC_SPI 176 IRQ_TYPE_LEVEL_HIGH>;
+        clocks = <&clk_netsec>;
+        clock-names = "phy_ref_clk";
+        phy-mode = "rgmii";
+        max-speed = <1000>;
+        max-frame-size = <9000>;
+        phy-handle = <&phy1>;
+
+        mdio {
+            #address-cells = <1>;
+            #size-cells = <0>;
+            phy1: ethernet-phy@1 {
+                compatible = "ethernet-phy-ieee802.3-c22";
+                reg = <1>;
+            };
+        };
+    };
+...
diff --git a/Documentation/devicetree/bindings/net/socionext-netsec.txt b/Documentation/devicetree/bindings/net/socionext-netsec.txt
deleted file mode 100644 (file)
index a3c1dff..0000000
+++ /dev/null
@@ -1,56 +0,0 @@
-* Socionext NetSec Ethernet Controller IP
-
-Required properties:
-- compatible: Should be "socionext,synquacer-netsec"
-- reg: Address and length of the control register area, followed by the
-       address and length of the EEPROM holding the MAC address and
-       microengine firmware
-- interrupts: Should contain ethernet controller interrupt
-- clocks: phandle to the PHY reference clock
-- clock-names: Should be "phy_ref_clk"
-- phy-mode: See ethernet.txt file in the same directory
-- phy-handle: See ethernet.txt in the same directory.
-
-- mdio device tree subnode: When the Netsec has a phy connected to its local
-               mdio, there must be device tree subnode with the following
-               required properties:
-
-       - #address-cells: Must be <1>.
-       - #size-cells: Must be <0>.
-
-       For each phy on the mdio bus, there must be a node with the following
-       fields:
-       - compatible: Refer to phy.txt
-       - reg: phy id used to communicate to phy.
-
-Optional properties: (See ethernet.txt file in the same directory)
-- dma-coherent: Boolean property, must only be present if memory
-       accesses performed by the device are cache coherent.
-- max-speed: See ethernet.txt in the same directory.
-- max-frame-size: See ethernet.txt in the same directory.
-
-The MAC address will be determined using the optional properties
-defined in ethernet.txt. The 'phy-mode' property is required, but may
-be set to the empty string if the PHY configuration is programmed by
-the firmware or set by hardware straps, and needs to be preserved.
-
-Example:
-       eth0: ethernet@522d0000 {
-               compatible = "socionext,synquacer-netsec";
-               reg = <0 0x522d0000 0x0 0x10000>, <0 0x10000000 0x0 0x10000>;
-               interrupts = <GIC_SPI 176 IRQ_TYPE_LEVEL_HIGH>;
-               clocks = <&clk_netsec>;
-               clock-names = "phy_ref_clk";
-               phy-mode = "rgmii";
-               max-speed = <1000>;
-               max-frame-size = <9000>;
-               phy-handle = <&phy1>;
-
-               mdio {
-                       #address-cells = <1>;
-                       #size-cells = <0>;
-                       phy1: ethernet-phy@1 {
-                               compatible = "ethernet-phy-ieee802.3-c22";
-                               reg = <1>;
-                       };
-               };
index 5320504..0e6fd57 100644 (file)
@@ -42,15 +42,13 @@ properties:
   bluetooth:
     type: object
     additionalProperties: false
+    allOf:
+      - $ref: /schemas/net/bluetooth/bluetooth-controller.yaml#
     properties:
       compatible:
         const: qcom,wcnss-bt
 
-      local-bd-address:
-        $ref: /schemas/types.yaml#/definitions/uint8-array
-        maxItems: 6
-        description:
-          See Documentation/devicetree/bindings/net/bluetooth.txt
+      local-bd-address: true
 
     required:
       - compatible
diff --git a/Documentation/loongarch/booting.rst b/Documentation/loongarch/booting.rst
new file mode 100644 (file)
index 0000000..91eccd4
--- /dev/null
@@ -0,0 +1,42 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+=======================
+Booting Linux/LoongArch
+=======================
+
+:Author: Yanteng Si <siyanteng@loongson.cn>
+:Date:   18 Nov 2022
+
+Information passed from BootLoader to kernel
+============================================
+
+LoongArch supports ACPI and FDT. The information that needs to be passed
+to the kernel includes the memmap, the initrd, the command line, optionally
+the ACPI/FDT tables, and so on.
+
+The kernel is passed the following arguments on `kernel_entry` :
+
+      - a0 = efi_boot: `efi_boot` is a flag indicating whether
+        this boot environment is fully UEFI-compliant.
+
+      - a1 = cmdline: `cmdline` is a pointer to the kernel command line.
+
+      - a2 = systemtable: `systemtable` points to the EFI system table.
+        All pointers involved at this stage are physical addresses.
+
+Header of Linux/LoongArch kernel images
+=======================================
+
+Linux/LoongArch kernel images are EFI images. Being PE files, they have
+a 64-byte header structured like::
+
+       u32     MZ_MAGIC                /* "MZ", MS-DOS header */
+       u32     res0 = 0                /* Reserved */
+       u64     kernel_entry            /* Kernel entry point */
+       u64     _end - _text            /* Kernel image effective size */
+       u64     load_offset             /* Kernel image load offset from start of RAM */
+       u64     res1 = 0                /* Reserved */
+       u64     res2 = 0                /* Reserved */
+       u64     res3 = 0                /* Reserved */
+       u32     LINUX_PE_MAGIC          /* Magic number */
+       u32     pe_header - _head       /* Offset to the PE header */
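+
+For illustration only, the same 64-byte layout could be mirrored by a C
+structure; the field names below are descriptive rather than taken from the
+kernel sources::
+
+       struct image_header {
+               u32     mz_magic;               /* "MZ" */
+               u32     res0;
+               u64     kernel_entry;
+               u64     image_size;             /* _end - _text */
+               u64     load_offset;
+               u64     res1, res2, res3;
+               u32     pe_magic;               /* LINUX_PE_MAGIC */
+               u32     pe_header_offset;       /* pe_header - _head */
+       };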
index aaba648..c779bfa 100644 (file)
@@ -9,6 +9,7 @@ LoongArch Architecture
    :numbered:
 
    introduction
+   booting
    irq-chip-model
 
    features
index e8fa7ac..6969652 100644 (file)
@@ -351,42 +351,26 @@ driver.
 
 MAC address setup
 -----------------
-mlx5 driver provides mechanism to setup the MAC address of the PCI VF/SF.
+The mlx5 driver supports the devlink port function attribute mechanism to set
+up the MAC address (refer to Documentation/networking/devlink/devlink-port.rst).
 
-The configured MAC address of the PCI VF/SF will be used by netdevice and rdma
-device created for the PCI VF/SF.
+RoCE capability setup
+---------------------
+Not all mlx5 PCI devices/SFs require RoCE capability.
 
-- Get the MAC address of the VF identified by its unique devlink port index::
+When RoCE capability is disabled, it saves 1 Mbyte of system memory per
+PCI device/SF.
 
-    $ devlink port show pci/0000:06:00.0/2
-    pci/0000:06:00.0/2: type eth netdev enp6s0pf0vf1 flavour pcivf pfnum 0 vfnum 1
-      function:
-        hw_addr 00:00:00:00:00:00
-
-- Set the MAC address of the VF identified by its unique devlink port index::
-
-    $ devlink port function set pci/0000:06:00.0/2 hw_addr 00:11:22:33:44:55
-
-    $ devlink port show pci/0000:06:00.0/2
-    pci/0000:06:00.0/2: type eth netdev enp6s0pf0vf1 flavour pcivf pfnum 0 vfnum 1
-      function:
-        hw_addr 00:11:22:33:44:55
-
-- Get the MAC address of the SF identified by its unique devlink port index::
-
-    $ devlink port show pci/0000:06:00.0/32768
-    pci/0000:06:00.0/32768: type eth netdev enp6s0pf0sf88 flavour pcisf pfnum 0 sfnum 88
-      function:
-        hw_addr 00:00:00:00:00:00
-
-- Set the MAC address of the SF identified by its unique devlink port index::
+The mlx5 driver supports the devlink port function attribute mechanism to set
+up the RoCE capability (refer to
+Documentation/networking/devlink/devlink-port.rst).
 
-    $ devlink port function set pci/0000:06:00.0/32768 hw_addr 00:00:00:00:88:88
+migratable capability setup
+---------------------------
+Users who want mlx5 PCI VFs to be able to perform live migration need to
+explicitly enable the VF migratable capability.
 
-    $ devlink port show pci/0000:06:00.0/32768
-    pci/0000:06:00.0/32768: type eth netdev enp6s0pf0sf88 flavour pcisf pfnum 0 sfnum 88
-      function:
-        hw_addr 00:00:00:00:88:88
+The mlx5 driver supports the devlink port function attribute mechanism to set
+up the migratable capability (refer to
+Documentation/networking/devlink/devlink-port.rst).
 
 SF state setup
 --------------
index 7572bf6..1242b0e 100644 (file)
@@ -198,6 +198,11 @@ fw.bundle_id
 
 Unique identifier of the entire firmware bundle.
 
+fw.bootloader
+-------------
+
+Version of the bootloader.
+
 Future work
 ===========
 
index 98557c2..3da5909 100644 (file)
@@ -110,7 +110,7 @@ devlink ports for both the controllers.
 Function configuration
 ======================
 
-A user can configure the function attribute before enumerating the PCI
+Users can configure one or more function attributes before enumerating the PCI
 function. Usually it means, user should configure function attribute
 before a bus specific device for the function is created. However, when
 SRIOV is enabled, virtual function devices are created on the PCI bus.
@@ -119,9 +119,127 @@ function device to the driver. For subfunctions, this means user should
 configure port function attribute before activating the port function.
 
 A user may set the hardware address of the function using
-'devlink port function set hw_addr' command. For Ethernet port function
+`devlink port function set hw_addr` command. For Ethernet port function
 this means a MAC address.
 
+Users may also set the RoCE capability of the function using the
+`devlink port function set roce` command.
+
+Users may also set the function as migratable using the
+`devlink port function set migratable` command.
+
+Function attributes
+===================
+
+MAC address setup
+-----------------
+The configured MAC address of the PCI VF/SF will be used by netdevice and rdma
+device created for the PCI VF/SF.
+
+- Get the MAC address of the VF identified by its unique devlink port index::
+
+    $ devlink port show pci/0000:06:00.0/2
+    pci/0000:06:00.0/2: type eth netdev enp6s0pf0vf1 flavour pcivf pfnum 0 vfnum 1
+      function:
+        hw_addr 00:00:00:00:00:00
+
+- Set the MAC address of the VF identified by its unique devlink port index::
+
+    $ devlink port function set pci/0000:06:00.0/2 hw_addr 00:11:22:33:44:55
+
+    $ devlink port show pci/0000:06:00.0/2
+    pci/0000:06:00.0/2: type eth netdev enp6s0pf0vf1 flavour pcivf pfnum 0 vfnum 1
+      function:
+        hw_addr 00:11:22:33:44:55
+
+- Get the MAC address of the SF identified by its unique devlink port index::
+
+    $ devlink port show pci/0000:06:00.0/32768
+    pci/0000:06:00.0/32768: type eth netdev enp6s0pf0sf88 flavour pcisf pfnum 0 sfnum 88
+      function:
+        hw_addr 00:00:00:00:00:00
+
+- Set the MAC address of the SF identified by its unique devlink port index::
+
+    $ devlink port function set pci/0000:06:00.0/32768 hw_addr 00:00:00:00:88:88
+
+    $ devlink port show pci/0000:06:00.0/32768
+    pci/0000:06:00.0/32768: type eth netdev enp6s0pf0sf88 flavour pcisf pfnum 0 sfnum 88
+      function:
+        hw_addr 00:00:00:00:88:88
+
+RoCE capability setup
+---------------------
+Not all PCI VFs/SFs require RoCE capability.
+
+Disabling the RoCE capability saves system memory for each PCI VF/SF.
+
+When the user disables RoCE capability for a VF/SF, user applications cannot
+send or receive any RoCE packets through this VF/SF, and the RoCE GID table
+for this PCI VF/SF will be empty.
+
+When RoCE capability is disabled in the device using the port function
+attribute, the VF/SF driver cannot override it.
+
+- Get RoCE capability of the VF device::
+
+    $ devlink port show pci/0000:06:00.0/2
+    pci/0000:06:00.0/2: type eth netdev enp6s0pf0vf1 flavour pcivf pfnum 0 vfnum 1
+        function:
+            hw_addr 00:00:00:00:00:00 roce enable
+
+- Set RoCE capability of the VF device::
+
+    $ devlink port function set pci/0000:06:00.0/2 roce disable
+
+    $ devlink port show pci/0000:06:00.0/2
+    pci/0000:06:00.0/2: type eth netdev enp6s0pf0vf1 flavour pcivf pfnum 0 vfnum 1
+        function:
+            hw_addr 00:00:00:00:00:00 roce disable
+
+migratable capability setup
+---------------------------
+Live migration is the process of transferring a live virtual machine
+from one physical host to another without disrupting its normal
+operation.
+
+Users who want PCI VFs to be able to perform live migration need to
+explicitly enable the VF migratable capability.
+
+When the user enables the migratable capability for a VF, and the hypervisor
+binds the VF to a VFIO driver with migration support, the user can migrate
+the VM with this VF from one hypervisor to a different one.
+
+However, when the migratable capability is enabled, the device will disable
+features which cannot be migrated. The migratable capability can thus impose
+limitations on a VF, so the decision is left to the user.
+
+Example of live migration with migratable function configuration:
+
+- Get migratable capability of the VF device::
+
+    $ devlink port show pci/0000:06:00.0/2
+    pci/0000:06:00.0/2: type eth netdev enp6s0pf0vf1 flavour pcivf pfnum 0 vfnum 1
+        function:
+            hw_addr 00:00:00:00:00:00 migratable disable
+
+- Set migratable capability of the VF device::
+
+    $ devlink port function set pci/0000:06:00.0/2 migratable enable
+
+    $ devlink port show pci/0000:06:00.0/2
+    pci/0000:06:00.0/2: type eth netdev enp6s0pf0vf1 flavour pcivf pfnum 0 vfnum 1
+        function:
+            hw_addr 00:00:00:00:00:00 migratable enable
+
+- Bind VF to VFIO driver with migration support::
+
+    $ echo <pci_id> > /sys/bus/pci/devices/0000:08:00.0/driver/unbind
+    $ echo mlx5_vfio_pci > /sys/bus/pci/devices/0000:08:00.0/driver_override
+    $ echo <pci_id> > /sys/bus/pci/devices/0000:08:00.0/driver/bind
+
+Finally, attach the VF to the VM, start the VM and perform the live migration.
+
 Subfunction
 ============
 
diff --git a/Documentation/networking/devlink/etas_es58x.rst b/Documentation/networking/devlink/etas_es58x.rst
new file mode 100644 (file)
index 0000000..3b857d8
--- /dev/null
@@ -0,0 +1,36 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+==========================
+etas_es58x devlink support
+==========================
+
+This document describes the devlink features implemented by the
+``etas_es58x`` device driver.
+
+Info versions
+=============
+
+The ``etas_es58x`` driver reports the following versions:
+
+.. list-table:: devlink info versions implemented
+   :widths: 5 5 90
+
+   * - Name
+     - Type
+     - Description
+   * - ``fw``
+     - running
+     - Version of the firmware running on the device. Also available
+       through ``ethtool -i`` as the first member of the
+       ``firmware-version``.
+   * - ``fw.bootloader``
+     - running
+     - Version of the bootloader running on the device. Also available
+       through ``ethtool -i`` as the second member of the
+       ``firmware-version``.
+   * - ``board.rev``
+     - fixed
+     - The hardware revision of the device.
+   * - ``serial_number``
+     - fixed
+     - The USB serial number. Also available through ``lsusb -v``.
index bede24e..f10f8eb 100644 (file)
@@ -222,6 +222,7 @@ Userspace to kernel:
   ``ETHTOOL_MSG_MODULE_GET``            get transceiver module parameters
   ``ETHTOOL_MSG_PSE_SET``               set PSE parameters
   ``ETHTOOL_MSG_PSE_GET``               get PSE parameters
+  ``ETHTOOL_MSG_RSS_GET``               get RSS settings
   ===================================== =================================
 
 Kernel to userspace:
@@ -263,6 +264,7 @@ Kernel to userspace:
   ``ETHTOOL_MSG_PHC_VCLOCKS_GET_REPLY``    PHC virtual clocks info
   ``ETHTOOL_MSG_MODULE_GET_REPLY``         transceiver module parameters
   ``ETHTOOL_MSG_PSE_GET_REPLY``            PSE parameters
+  ``ETHTOOL_MSG_RSS_GET_REPLY``            RSS settings
   ======================================== =================================
 
 ``GET`` requests are sent by userspace applications to retrieve device
@@ -1687,6 +1689,33 @@ to control PoDL PSE Admin functions. This option is implementing
 ``IEEE 802.3-2018`` 30.15.1.2.1 acPoDLPSEAdminControl. See
 ``ETHTOOL_A_PODL_PSE_ADMIN_STATE`` for supported values.
 
+RSS_GET
+=======
+
+Get the indirection table, hash key and hash function info associated with an
+RSS context of an interface, similar to the ``ETHTOOL_GRSSH`` ioctl request.
+
+Request contents:
+
+=====================================  ======  ==========================
+  ``ETHTOOL_A_RSS_HEADER``             nested  request header
+  ``ETHTOOL_A_RSS_CONTEXT``            u32     context number
+=====================================  ======  ==========================
+
+Kernel response contents:
+
+=====================================  ======  ==========================
+  ``ETHTOOL_A_RSS_HEADER``             nested  reply header
+  ``ETHTOOL_A_RSS_HFUNC``              u32     RSS hash func
+  ``ETHTOOL_A_RSS_INDIR``              binary  Indir table bytes
+  ``ETHTOOL_A_RSS_HKEY``               binary  Hash key bytes
+=====================================  ======  ==========================
+
+The ``ETHTOOL_A_RSS_HFUNC`` attribute is a bitmap indicating the hash function
+in use. Currently supported options are toeplitz, xor and crc32. The
+``ETHTOOL_A_RSS_INDIR`` attribute returns the RSS indirection table, where
+each byte indicates a queue number.
+
 Request translation
 ===================
 
@@ -1768,7 +1797,7 @@ are netlink only.
   ``ETHTOOL_GMODULEEEPROM``           ``ETHTOOL_MSG_MODULE_EEPROM_GET``
   ``ETHTOOL_GEEE``                    ``ETHTOOL_MSG_EEE_GET``
   ``ETHTOOL_SEEE``                    ``ETHTOOL_MSG_EEE_SET``
-  ``ETHTOOL_GRSSH``                   n/a
+  ``ETHTOOL_GRSSH``                   ``ETHTOOL_MSG_RSS_GET``
   ``ETHTOOL_SRSSH``                   n/a
   ``ETHTOOL_GTUNABLE``                n/a
   ``ETHTOOL_STUNABLE``                n/a
index 387fda8..3fb5fa1 100644 (file)
@@ -129,6 +129,26 @@ drop_packet - INTEGER
        threshold. When the mode 3 is set, the always mode drop rate
        is controlled by the /proc/sys/net/ipv4/vs/am_droprate.
 
+est_cpulist - CPULIST
+       Allowed CPUs for estimation kthreads
+
+       Syntax: standard cpulist format
+       empty list - stop kthread tasks and estimation
+       default - the system's housekeeping CPUs for kthreads
+
+       Example:
+       "all": all possible CPUs
+       "0-N": all possible CPUs, N denotes last CPU number
+       "0,1-N:1/2": first and all CPUs with odd number
+       "": empty list
+
+est_nice - INTEGER
+       default 0
+       Valid range: -20 (more favorable) .. 19 (less favorable)
+
+       Niceness value to use for the estimation kthreads (scheduling
+       priority)
+
 expire_nodest_conn - BOOLEAN
        - 0 - disabled (default)
        - not 0 - enabled
@@ -304,8 +324,8 @@ run_estimation - BOOLEAN
        0 - disabled
        not 0 - enabled (default)
 
-       If disabled, the estimation will be stop, and you can't see
-       any update on speed estimation data.
+       If disabled, the estimation will be suspended and kthread tasks
+       stopped.
 
        You can always re-enable estimation by setting this value to 1.
        But be careful, the first estimation after re-enable is not
index be4eb12..f17c018 100644 (file)
@@ -179,7 +179,8 @@ SOF_TIMESTAMPING_OPT_ID:
   identifier and returns that along with the timestamp. The identifier
   is derived from a per-socket u32 counter (that wraps). For datagram
   sockets, the counter increments with each sent packet. For stream
-  sockets, it increments with every byte.
+  sockets, it increments with every byte. For stream sockets, also set
+  SOF_TIMESTAMPING_OPT_ID_TCP, see the section below.
 
   The counter starts at zero. It is initialized the first time that
   the socket option is enabled. It is reset each time the option is
@@ -192,6 +193,35 @@ SOF_TIMESTAMPING_OPT_ID:
   among all possibly concurrently outstanding timestamp requests for
   that socket.
 
+SOF_TIMESTAMPING_OPT_ID_TCP:
+  Pass this modifier along with SOF_TIMESTAMPING_OPT_ID for new TCP
+  timestamping applications. SOF_TIMESTAMPING_OPT_ID defines how the
+  counter increments for stream sockets, but its starting point is
+  not entirely trivial. This option fixes that.
+
+  For stream sockets, if SOF_TIMESTAMPING_OPT_ID is set, this should
+  always be set too. On datagram sockets the option has no effect.
+
+  A reasonable expectation is that the counter is reset to zero with
+  the system call, so that a subsequent write() of N bytes generates
+  a timestamp with counter N-1. SOF_TIMESTAMPING_OPT_ID_TCP
+  implements this behavior under all conditions.
+
+  SOF_TIMESTAMPING_OPT_ID without modifier often reports the same,
+  especially when the socket option is set when no data is in
+  transmission. If data is being transmitted, it may be off by the
+  length of the output queue (SIOCOUTQ).
+
+  The difference is due to being based on snd_una versus write_seq.
+  snd_una is the offset in the stream acknowledged by the peer. This
+  depends on factors outside of process control, such as network RTT.
+  write_seq is the last byte written by the process. This offset is
+  not affected by external inputs.
+
+  The difference is subtle and unlikely to be noticed when configured
+  at initial socket creation, when no data is queued or sent. But
+  SOF_TIMESTAMPING_OPT_ID_TCP behavior is more robust regardless of
+  when the socket option is set.
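+
+  A minimal sketch, assuming fd is a TCP socket (constants from
+  linux/net_tstamp.h)::
+
+    int val = SOF_TIMESTAMPING_TX_SOFTWARE |
+              SOF_TIMESTAMPING_SOFTWARE |
+              SOF_TIMESTAMPING_OPT_ID |
+              SOF_TIMESTAMPING_OPT_ID_TCP;
+
+    if (setsockopt(fd, SOL_SOCKET, SO_TIMESTAMPING, &val, sizeof(val)) < 0)
+            perror("setsockopt SO_TIMESTAMPING");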
 
 SOF_TIMESTAMPING_OPT_CMSG:
   Support recv() cmsg for all timestamped packets. Control messages
index 01391df..c43ace7 100644 (file)
@@ -5,6 +5,7 @@ XFRM device - offloading the IPsec computations
 ===============================================
 
 Shannon Nelson <shannon.nelson@oracle.com>
+Leon Romanovsky <leonro@nvidia.com>
 
 
 Overview
@@ -18,10 +19,21 @@ can radically increase throughput and decrease CPU utilization.  The XFRM
 Device interface allows NIC drivers to offer to the stack access to the
 hardware offload.
 
+Right now, there are two types of hardware offload that the kernel supports.
+
+ * IPsec crypto offload:
+
+   * NIC performs encrypt/decrypt
+   * Kernel does everything else
+
+ * IPsec packet offload:
+
+   * NIC performs encrypt/decrypt
+   * NIC does encapsulation
+   * Kernel and NIC keep the SA and policies in sync
+   * NIC handles the SA and policies states
+   * The kernel talks to the key manager
+
 Userland access to the offload is typically through a system such as
 libreswan or KAME/raccoon, but the iproute2 'ip xfrm' command set can
 be handy when experimenting.  An example command might look something
-like this::
+like this for crypto offload::
 
   ip x s add proto esp dst 14.0.0.70 src 14.0.0.52 spi 0x07 mode transport \
      reqid 0x07 replay-window 32 \
@@ -29,6 +41,17 @@ like this::
      sel src 14.0.0.52/24 dst 14.0.0.70/24 proto tcp \
      offload dev eth4 dir in
 
+and like this for packet offload::
+
+  ip x s add proto esp dst 14.0.0.70 src 14.0.0.52 spi 0x07 mode transport \
+     reqid 0x07 replay-window 32 \
+     aead 'rfc4106(gcm(aes))' 0x44434241343332312423222114131211f4f3f2f1 128 \
+     sel src 14.0.0.52/24 dst 14.0.0.70/24 proto tcp \
+     offload packet dev eth4 dir in
+
+  ip x p add src 14.0.0.70 dst 14.0.0.52 offload packet dev eth4 dir in
+  tmpl src 14.0.0.70 dst 14.0.0.52 proto esp reqid 10000 mode transport
+
 Yes, that's ugly, but that's what shell scripts and/or libreswan are for.
 
 
@@ -40,17 +63,24 @@ Callbacks to implement
 
   /* from include/linux/netdevice.h */
   struct xfrmdev_ops {
+        /* Crypto and Packet offload callbacks */
        int     (*xdo_dev_state_add) (struct xfrm_state *x);
        void    (*xdo_dev_state_delete) (struct xfrm_state *x);
        void    (*xdo_dev_state_free) (struct xfrm_state *x);
        bool    (*xdo_dev_offload_ok) (struct sk_buff *skb,
                                       struct xfrm_state *x);
        void    (*xdo_dev_state_advance_esn) (struct xfrm_state *x);
+
+        /* Solely packet offload callbacks */
+       void    (*xdo_dev_state_update_curlft) (struct xfrm_state *x);
+       int     (*xdo_dev_policy_add) (struct xfrm_policy *x);
+       void    (*xdo_dev_policy_delete) (struct xfrm_policy *x);
+       void    (*xdo_dev_policy_free) (struct xfrm_policy *x);
   };
 
-The NIC driver offering ipsec offload will need to implement these
-callbacks to make the offload available to the network stack's
-XFRM subsystem.  Additionally, the feature bits NETIF_F_HW_ESP and
+The NIC driver offering ipsec offload will need to implement the callbacks
+relevant to the supported offload to make the offload available to the network
+stack's XFRM subsystem. Additionally, the feature bits NETIF_F_HW_ESP and
 NETIF_F_HW_ESP_TX_CSUM will signal the availability of the offload.
 
 
@@ -79,7 +109,8 @@ and an indication of whether it is for Rx or Tx.  The driver should
 
                ===========   ===================================
                0             success
-               -EOPNETSUPP   offload not supported, try SW IPsec
+               -EOPNOTSUPP   offload not supported, try SW IPsec,
+                              not applicable for packet offload mode
                other         fail the request
                ===========   ===================================
 
@@ -96,6 +127,7 @@ will serviceable.  This can check the packet information to be sure the
 offload can be supported (e.g. IPv4 or IPv6, no IPv4 options, etc) and
 return true of false to signify its support.
 
+Crypto offload mode:
 When ready to send, the driver needs to inspect the Tx packet for the
 offload information, including the opaque context, and set up the packet
 send accordingly::
@@ -139,13 +171,25 @@ the stack in xfrm_input().
 In ESN mode, xdo_dev_state_advance_esn() is called from xfrm_replay_advance_esn().
 Driver will check packet seq number and update HW ESN state machine if needed.
 
+Packet offload mode:
+HW adds and deletes XFRM headers. So in the RX path, the XFRM stack is
+bypassed if the HW reported success. In the TX path, the packet leaves the
+kernel without the extra header and unencrypted; the HW is responsible for
+adding the header and performing the encryption.
+
 When the SA is removed by the user, the driver's xdo_dev_state_delete()
-is asked to disable the offload.  Later, xdo_dev_state_free() is called
-from a garbage collection routine after all reference counts to the state
+and xdo_dev_policy_delete() are asked to disable the offload.  Later,
+xdo_dev_state_free() and xdo_dev_policy_free() are called from a garbage
+collection routine after all reference counts to the state and policy
 have been removed and any remaining resources can be cleared for the
 offload state.  How these are used by the driver will depend on specific
 hardware needs.
 
 As a netdev is set to DOWN the XFRM stack's netdev listener will call
-xdo_dev_state_delete() and xdo_dev_state_free() on any remaining offloaded
-states.
+xdo_dev_state_delete(), xdo_dev_policy_delete(), xdo_dev_state_free() and
+xdo_dev_policy_free() on any remaining offloaded states and policies.
+
+Since the HW handles the packets, the XFRM core cannot account the hard and
+soft limits itself. The HW/driver is responsible for tracking them and for
+providing accurate data when xdo_dev_state_update_curlft() is called. In case
+one of these limits is reached, the driver needs to call
+xfrm_state_check_expire() to make sure that XFRM performs the rekeying
+sequence.
diff --git a/Documentation/translations/zh_CN/loongarch/booting.rst b/Documentation/translations/zh_CN/loongarch/booting.rst
new file mode 100644 (file)
index 0000000..fb6440c
--- /dev/null
@@ -0,0 +1,48 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+.. include:: ../disclaimer-zh_CN.rst
+
+:Original: Documentation/loongarch/booting.rst
+
+:翻译:
+
+ 司延腾 Yanteng Si <siyanteng@loongson.cn>
+
+====================
+启动 Linux/LoongArch
+====================
+
+:作者: 司延腾 <siyanteng@loongson.cn>
+:日期: 2022年11月18日
+
+BootLoader传递给内核的信息
+==========================
+
+LoongArch支持ACPI和FDT启动,需要传递给内核的信息包括memmap、initrd、cmdline、可
+选的ACPI/FDT表等。
+
+内核在 `kernel_entry` 入口处被传递以下参数:
+
+      - a0 = efi_boot: `efi_boot` 是一个标志,表示这个启动环境是否完全符合UEFI
+        的要求。
+
+      - a1 = cmdline: `cmdline` 是一个指向内核命令行的指针。
+
+      - a2 = systemtable: `systemtable` 指向EFI的系统表,在这个阶段涉及的所有
+        指针都是物理地址。
+
+Linux/LoongArch内核镜像文件头
+=============================
+
+内核镜像是EFI镜像。作为PE文件,它们有一个64字节的头部结构体,如下所示::
+
+       u32     MZ_MAGIC                /* "MZ", MS-DOS 头 */
+       u32     res0 = 0                /* 保留 */
+       u64     kernel_entry            /* 内核入口点 */
+       u64     _end - _text            /* 内核镜像有效大小 */
+       u64     load_offset             /* 加载内核镜像相对内存起始地址的偏移量 */
+       u64     res1 = 0                /* 保留 */
+       u64     res2 = 0                /* 保留 */
+       u64     res3 = 0                /* 保留 */
+       u32     LINUX_PE_MAGIC          /* 魔术数 */
+       u32     pe_header - _head       /* 到PE头的偏移量 */
index 7d23eb7..0273a08 100644 (file)
@@ -14,6 +14,7 @@ LoongArch体系结构
    :numbered:
 
    introduction
+   booting
    irq-chip-model
 
    features
index eee9f85..896914e 100644 (file)
@@ -7213,14 +7213,13 @@ veto the transition.
 :Parameters: args[0] is the maximum poll time in nanoseconds
 :Returns: 0 on success; -1 on error
 
-This capability overrides the kvm module parameter halt_poll_ns for the
-target VM.
-
-VCPU polling allows a VCPU to poll for wakeup events instead of immediately
-scheduling during guest halts. The maximum time a VCPU can spend polling is
-controlled by the kvm module parameter halt_poll_ns. This capability allows
-the maximum halt time to specified on a per-VM basis, effectively overriding
-the module parameter for the target VM.
+KVM_CAP_HALT_POLL overrides the kvm.halt_poll_ns module parameter to set the
+maximum halt-polling time for all vCPUs in the target VM. This capability can
+be invoked at any time and any number of times to dynamically change the
+maximum halt-polling time.
+
+See Documentation/virt/kvm/halt-polling.rst for more information on halt
+polling.
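+
+For example, a VMM could cap halt polling for a VM at 100 microseconds with a
+sketch like the one below, where vm_fd is assumed to be an open VM file
+descriptor::
+
+    struct kvm_enable_cap cap = {
+            .cap = KVM_CAP_HALT_POLL,
+            .args[0] = 100000,      /* maximum poll time in nanoseconds */
+    };
+
+    ioctl(vm_fd, KVM_ENABLE_CAP, &cap);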
 
 7.21 KVM_CAP_X86_USER_SPACE_MSR
 -------------------------------
similarity index 92%
rename from Documentation/virt/kvm/x86/halt-polling.rst
rename to Documentation/virt/kvm/halt-polling.rst
index 4922e4a..3fae39b 100644 (file)
@@ -119,6 +119,19 @@ These module parameters can be set from the debugfs files in:
 Note: that these module parameters are system wide values and are not able to
       be tuned on a per vm basis.
 
+Any changes to these parameters will be picked up by new and existing vCPUs the
+next time they halt, with the notable exception of VMs using KVM_CAP_HALT_POLL
+(see next section).
+
+KVM_CAP_HALT_POLL
+=================
+
+KVM_CAP_HALT_POLL is a VM capability that allows userspace to override halt_poll_ns
+on a per-VM basis. VMs using KVM_CAP_HALT_POLL ignore halt_poll_ns completely (but
+still obey halt_poll_ns_grow, halt_poll_ns_grow_start, and halt_poll_ns_shrink).
+
+See Documentation/virt/kvm/api.rst for more information on this capability.
+
 Further Notes
 =============
 
index e0a2c74..ad13ec5 100644 (file)
@@ -17,4 +17,5 @@ KVM
 
    locking
    vcpu-requests
+   halt-polling
    review-checklist
index 7ff5888..9ece6b8 100644 (file)
@@ -10,7 +10,6 @@ KVM for x86 systems
    amd-memory-encryption
    cpuid
    errata
-   halt-polling
    hypercalls
    mmu
    msr
index 955c1be..32dd415 100644 (file)
@@ -1903,6 +1903,7 @@ F:        Documentation/devicetree/bindings/interrupt-controller/apple,*
 F:     Documentation/devicetree/bindings/iommu/apple,dart.yaml
 F:     Documentation/devicetree/bindings/iommu/apple,sart.yaml
 F:     Documentation/devicetree/bindings/mailbox/apple,mailbox.yaml
+F:     Documentation/devicetree/bindings/net/bluetooth/brcm,bcm4377-bluetooth.yaml
 F:     Documentation/devicetree/bindings/nvme/apple,nvme-ans.yaml
 F:     Documentation/devicetree/bindings/nvmem/apple,efuses.yaml
 F:     Documentation/devicetree/bindings/pci/apple,pcie.yaml
@@ -1910,6 +1911,7 @@ F:        Documentation/devicetree/bindings/pinctrl/apple,pinctrl.yaml
 F:     Documentation/devicetree/bindings/power/apple*
 F:     Documentation/devicetree/bindings/watchdog/apple,wdt.yaml
 F:     arch/arm64/boot/dts/apple/
+F:     drivers/bluetooth/hci_bcm4377.c
 F:     drivers/clk/clk-apple-nco.c
 F:     drivers/dma/apple-admac.c
 F:     drivers/i2c/busses/i2c-pasemi-core.c
@@ -7682,6 +7684,7 @@ ETAS ES58X CAN/USB DRIVER
 M:     Vincent Mailhol <mailhol.vincent@wanadoo.fr>
 L:     linux-can@vger.kernel.org
 S:     Maintained
+F:     Documentation/networking/devlink/etas_es58x.rst
 F:     drivers/net/can/usb/etas_es58x/
 
 ETHERNET BRIDGE
@@ -12365,7 +12368,7 @@ F:      Documentation/networking/device_drivers/ethernet/marvell/octeontx2.rst
 F:     drivers/net/ethernet/marvell/octeontx2/af/
 
 MARVELL PRESTERA ETHERNET SWITCH DRIVER
-M:     Taras Chornyi <tchornyi@marvell.com>
+M:     Taras Chornyi <taras.chornyi@plvision.eu>
 S:     Supported
 W:     https://github.com/Marvell-switching/switchdev-prestera
 F:     drivers/net/ethernet/marvell/prestera/
@@ -19065,7 +19068,7 @@ M:      Jassi Brar <jaswinder.singh@linaro.org>
 M:     Ilias Apalodimas <ilias.apalodimas@linaro.org>
 L:     netdev@vger.kernel.org
 S:     Maintained
-F:     Documentation/devicetree/bindings/net/socionext-netsec.txt
+F:     Documentation/devicetree/bindings/net/socionext,synquacer-netsec.yaml
 F:     drivers/net/ethernet/socionext/netsec.c
 
 SOCIONEXT (SNI) Synquacer SPI DRIVER
index 78525eb..0992f82 100644 (file)
--- a/Makefile
+++ b/Makefile
@@ -2,7 +2,7 @@
 VERSION = 6
 PATCHLEVEL = 1
 SUBLEVEL = 0
-EXTRAVERSION = -rc7
+EXTRAVERSION = -rc8
 NAME = Hurr durr I'ma ninja sloth
 
 # *DOCUMENTATION*
index 7a11332..6f9004e 100644 (file)
                                compatible = "atmel,at91rm9200-udc";
                                reg = <0xfffb0000 0x4000>;
                                interrupts = <11 IRQ_TYPE_LEVEL_HIGH 2>;
-                               clocks = <&pmc PMC_TYPE_PERIPHERAL 11>, <&pmc PMC_TYPE_SYSTEM 2>;
+                               clocks = <&pmc PMC_TYPE_PERIPHERAL 11>, <&pmc PMC_TYPE_SYSTEM 1>;
                                clock-names = "pclk", "hclk";
                                status = "disabled";
                        };
index 67ed68f..bf2b5c6 100644 (file)
@@ -26,7 +26,7 @@ static void sama5_l2c310_write_sec(unsigned long val, unsigned reg)
 static void __init sama5_secure_cache_init(void)
 {
        sam_secure_init();
-       if (sam_linux_is_optee_available())
+       if (IS_ENABLED(CONFIG_OUTER_CACHE) && sam_linux_is_optee_available())
                outer_cache.write_sec = sama5_l2c310_write_sec;
 }
 
index c1f3ba9..b52ddc4 100644 (file)
        };
 };
 
+&bluetooth0 {
+       brcm,board-type = "apple,atlantisb";
+};
+
 &wifi0 {
        brcm,board-type = "apple,atlantisb";
 };
index ecb10d2..1510741 100644 (file)
        model = "Apple MacBook Pro (13-inch, M1, 2020)";
 };
 
+&bluetooth0 {
+       brcm,board-type = "apple,honshu";
+};
+
 &wifi0 {
        brcm,board-type = "apple,honshu";
 };
index df74173..bc1f865 100644 (file)
        model = "Apple MacBook Air (M1, 2020)";
 };
 
+&bluetooth0 {
+       brcm,board-type = "apple,shikoku";
+};
+
 &wifi0 {
        brcm,board-type = "apple,shikoku";
 };
index 8c6bf95..7ea2745 100644 (file)
        };
 };
 
+&bluetooth0 {
+       brcm,board-type = "apple,capri";
+};
+
 &wifi0 {
        brcm,board-type = "apple,capri";
 };
index fe7c0aa..8ee0ac8 100644 (file)
        };
 };
 
+&bluetooth0 {
+       brcm,board-type = "apple,santorini";
+};
+
 &wifi0 {
        brcm,board-type = "apple,santorini";
 };
index 3d15b8e..cc2e040 100644 (file)
@@ -11,6 +11,7 @@
 
 / {
        aliases {
+               bluetooth0 = &bluetooth0;
                serial0 = &serial0;
                serial2 = &serial2;
                wifi0 = &wifi0;
                local-mac-address = [00 00 00 00 00 00];
                apple,antenna-sku = "XX";
        };
+
+       bluetooth0: bluetooth@0,1 {
+               compatible = "pci14e4,5f69";
+               reg = <0x10100 0x0 0x0 0x0 0x0>;
+               /* To be filled by the loader */
+               local-bd-address = [00 00 00 00 00 00];
+       };
 };
index d6cf535..439e2bc 100644 (file)
 
 #ifdef CONFIG_EFI
 extern void efi_init(void);
-
-bool efi_runtime_fixup_exception(struct pt_regs *regs, const char *msg);
 #else
 #define efi_init()
-
-static inline
-bool efi_runtime_fixup_exception(struct pt_regs *regs, const char *msg)
-{
-       return false;
-}
 #endif
 
 int efi_create_mapping(struct mm_struct *mm, efi_memory_desc_t *md);
index 67babd5..75691a2 100644 (file)
@@ -6,7 +6,7 @@
 #include <linux/linkage.h>
 
 SYM_FUNC_START(__efi_rt_asm_wrapper)
-       stp     x29, x30, [sp, #-112]!
+       stp     x29, x30, [sp, #-32]!
        mov     x29, sp
 
        /*
@@ -17,20 +17,6 @@ SYM_FUNC_START(__efi_rt_asm_wrapper)
        stp     x1, x18, [sp, #16]
 
        /*
-        * Preserve all callee saved registers and record the stack pointer
-        * value in a per-CPU variable so we can recover from synchronous
-        * exceptions occurring while running the firmware routines.
-        */
-       stp     x19, x20, [sp, #32]
-       stp     x21, x22, [sp, #48]
-       stp     x23, x24, [sp, #64]
-       stp     x25, x26, [sp, #80]
-       stp     x27, x28, [sp, #96]
-
-       adr_this_cpu    x8, __efi_rt_asm_recover_sp, x9
-       str             x29, [x8]
-
-       /*
         * We are lucky enough that no EFI runtime services take more than
         * 5 arguments, so all are passed in registers rather than via the
         * stack.
@@ -45,7 +31,7 @@ SYM_FUNC_START(__efi_rt_asm_wrapper)
 
        ldp     x1, x2, [sp, #16]
        cmp     x2, x18
-       ldp     x29, x30, [sp], #112
+       ldp     x29, x30, [sp], #32
        b.ne    0f
        ret
 0:
@@ -59,18 +45,3 @@ SYM_FUNC_START(__efi_rt_asm_wrapper)
        mov     x18, x2
        b       efi_handle_corrupted_x18        // tail call
 SYM_FUNC_END(__efi_rt_asm_wrapper)
-
-SYM_FUNC_START(__efi_rt_asm_recover)
-       ldr_this_cpu    x8, __efi_rt_asm_recover_sp, x9
-       mov             sp, x8
-
-       ldp     x0,  x18, [sp, #16]
-       ldp     x19, x20, [sp, #32]
-       ldp     x21, x22, [sp, #48]
-       ldp     x23, x24, [sp, #64]
-       ldp     x25, x26, [sp, #80]
-       ldp     x27, x28, [sp, #96]
-       ldp     x29, x30, [sp], #112
-
-       b       efi_handle_runtime_exception
-SYM_FUNC_END(__efi_rt_asm_recover)
index ee53f2a..a908a37 100644 (file)
@@ -9,7 +9,6 @@
 
 #include <linux/efi.h>
 #include <linux/init.h>
-#include <linux/percpu.h>
 
 #include <asm/efi.h>
 
@@ -145,28 +144,3 @@ asmlinkage efi_status_t efi_handle_corrupted_x18(efi_status_t s, const char *f)
        pr_err_ratelimited(FW_BUG "register x18 corrupted by EFI %s\n", f);
        return s;
 }
-
-asmlinkage DEFINE_PER_CPU(u64, __efi_rt_asm_recover_sp);
-
-asmlinkage efi_status_t __efi_rt_asm_recover(void);
-
-asmlinkage efi_status_t efi_handle_runtime_exception(const char *f)
-{
-       pr_err(FW_BUG "Synchronous exception occurred in EFI runtime service %s()\n", f);
-       clear_bit(EFI_RUNTIME_SERVICES, &efi.flags);
-       return EFI_ABORTED;
-}
-
-bool efi_runtime_fixup_exception(struct pt_regs *regs, const char *msg)
-{
-        /* Check whether the exception occurred while running the firmware */
-       if (current_work() != &efi_rts_work.work || regs->pc >= TASK_SIZE_64)
-               return false;
-
-       pr_err(FW_BUG "Unable to handle %s in EFI runtime service\n", msg);
-       add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
-       dump_stack();
-
-       regs->pc = (u64)__efi_rt_asm_recover;
-       return true;
-}
index 3cb101e..5240f6a 100644 (file)
@@ -36,7 +36,22 @@ void arch_dma_prep_coherent(struct page *page, size_t size)
 {
        unsigned long start = (unsigned long)page_address(page);
 
-       dcache_clean_poc(start, start + size);
+       /*
+        * The architecture only requires a clean to the PoC here in order to
+        * meet the requirements of the DMA API. However, some vendors (i.e.
+        * Qualcomm) abuse the DMA API for transferring buffers from the
+        * non-secure to the secure world, resetting the system if a non-secure
+        * access shows up after the buffer has been transferred:
+        *
+        * https://lore.kernel.org/r/20221114110329.68413-1-manivannan.sadhasivam@linaro.org
+        *
+        * Using clean+invalidate appears to make this issue less likely, but
+        * the drivers themselves still need fixing as the CPU could issue a
+        * speculative read from the buffer via the linear mapping irrespective
+        * of the cache maintenance we use. Once the drivers are fixed, we can
+        * relax this to a clean operation.
+        */
+       dcache_clean_inval_poc(start, start + size);
 }
 
 #ifdef CONFIG_IOMMU_DMA
index 3e9cf98..5b39149 100644 (file)
@@ -30,7 +30,6 @@
 #include <asm/bug.h>
 #include <asm/cmpxchg.h>
 #include <asm/cpufeature.h>
-#include <asm/efi.h>
 #include <asm/exception.h>
 #include <asm/daifflags.h>
 #include <asm/debug-monitors.h>
@@ -392,9 +391,6 @@ static void __do_kernel_fault(unsigned long addr, unsigned long esr,
                msg = "paging request";
        }
 
-       if (efi_runtime_fixup_exception(regs, msg))
-               return;
-
        die_kernel_fault(msg, addr, esr, regs);
 }
 
index aa0e0e0..79d5bfd 100644 (file)
@@ -490,6 +490,7 @@ static inline pmd_t pmd_mkdirty(pmd_t pmd)
        return pmd;
 }
 
+#define pmd_young pmd_young
 static inline int pmd_young(pmd_t pmd)
 {
        return !!(pmd_val(pmd) & _PAGE_ACCESSED);
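
This `#define pmd_young pmd_young` self-define is an idiom that recurs in the MIPS, RISC-V, s390, sparc and x86 hunks below: the architecture defines the macro to its own name so generic code can detect the override at preprocess time. A hedged sketch of the consumer side, approximately what the generic pgtable header does:

	#ifndef pmd_young
	/* Generic fallback when the architecture provides no pmd_young(). */
	static inline int pmd_young(pmd_t pmd)
	{
		return 0;
	}
	#endif
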
index 3dd172d..d826873 100644 (file)
@@ -78,16 +78,6 @@ extern void calculate_cpu_foreign_map(void);
  */
 extern void show_ipi_list(struct seq_file *p, int prec);
 
-/*
- * This function sends a 'reschedule' IPI to another CPU.
- * it goes straight through and wastes no time serializing
- * anything. Worst case is that we lose a reschedule ...
- */
-static inline void smp_send_reschedule(int cpu)
-{
-       loongson_send_ipi_single(cpu, SMP_RESCHEDULE);
-}
-
 static inline void arch_send_call_function_single_ipi(int cpu)
 {
        loongson_send_ipi_single(cpu, SMP_CALL_FUNCTION);
index 6ed72f7..14508d4 100644 (file)
@@ -149,6 +149,17 @@ void loongson_send_ipi_mask(const struct cpumask *mask, unsigned int action)
                ipi_write_action(cpu_logical_map(i), (u32)action);
 }
 
+/*
+ * This function sends a 'reschedule' IPI to another CPU.
+ * It goes straight through and wastes no time serializing
+ * anything. Worst case is that we lose a reschedule ...
+ */
+void smp_send_reschedule(int cpu)
+{
+       loongson_send_ipi_single(cpu, SMP_RESCHEDULE);
+}
+EXPORT_SYMBOL_GPL(smp_send_reschedule);
+
 irqreturn_t loongson_ipi_interrupt(int irq, void *dev)
 {
        unsigned int action;
index d8ee8fb..58781c6 100644 (file)
@@ -10,6 +10,8 @@
 #include <asm/regdef.h>
 #include <asm/stackframe.h>
 
+#define INVTLB_ADDR_GFALSE_AND_ASID    5
+
 #define PTRS_PER_PGD_BITS      (PAGE_SHIFT - 3)
 #define PTRS_PER_PUD_BITS      (PAGE_SHIFT - 3)
 #define PTRS_PER_PMD_BITS      (PAGE_SHIFT - 3)
@@ -136,13 +138,10 @@ tlb_huge_update_load:
        ori             t0, ra, _PAGE_VALID
        st.d            t0, t1, 0
 #endif
-       tlbsrch
-       addu16i.d       t1, zero, -(CSR_TLBIDX_EHINV >> 16)
-       addi.d          ra, t1, 0
-       csrxchg         ra, t1, LOONGARCH_CSR_TLBIDX
-       tlbwr
-
-       csrxchg         zero, t1, LOONGARCH_CSR_TLBIDX
+       csrrd           ra, LOONGARCH_CSR_ASID
+       csrrd           t1, LOONGARCH_CSR_BADV
+       andi            ra, ra, CSR_ASID_ASID
+       invtlb          INVTLB_ADDR_GFALSE_AND_ASID, ra, t1
 
        /*
         * A huge PTE describes an area the size of the
@@ -287,13 +286,11 @@ tlb_huge_update_store:
        ori             t0, ra, (_PAGE_VALID | _PAGE_DIRTY | _PAGE_MODIFIED)
        st.d            t0, t1, 0
 #endif
-       tlbsrch
-       addu16i.d       t1, zero, -(CSR_TLBIDX_EHINV >> 16)
-       addi.d          ra, t1, 0
-       csrxchg         ra, t1, LOONGARCH_CSR_TLBIDX
-       tlbwr
+       csrrd           ra, LOONGARCH_CSR_ASID
+       csrrd           t1, LOONGARCH_CSR_BADV
+       andi            ra, ra, CSR_ASID_ASID
+       invtlb          INVTLB_ADDR_GFALSE_AND_ASID, ra, t1
 
-       csrxchg         zero, t1, LOONGARCH_CSR_TLBIDX
        /*
         * A huge PTE describes an area the size of the
         * configured huge page size. This is twice the
@@ -436,6 +433,11 @@ tlb_huge_update_modify:
        ori             t0, ra, (_PAGE_VALID | _PAGE_DIRTY | _PAGE_MODIFIED)
        st.d            t0, t1, 0
 #endif
+       csrrd           ra, LOONGARCH_CSR_ASID
+       csrrd           t1, LOONGARCH_CSR_BADV
+       andi            ra, ra, CSR_ASID_ASID
+       invtlb          INVTLB_ADDR_GFALSE_AND_ASID, ra, t1
+
        /*
         * A huge PTE describes an area the size of the
         * configured huge page size. This is twice the
@@ -466,7 +468,7 @@ tlb_huge_update_modify:
        addu16i.d       t1, zero, (PS_HUGE_SIZE << (CSR_TLBIDX_PS_SHIFT - 16))
        csrxchg         t1, t0, LOONGARCH_CSR_TLBIDX
 
-       tlbwr
+       tlbfill
 
        /* Reset default page size */
        addu16i.d       t0, zero, (CSR_TLBIDX_PS >> 16)
index 6caec38..4678627 100644 (file)
@@ -622,6 +622,7 @@ static inline pmd_t pmd_mkdirty(pmd_t pmd)
        return pmd;
 }
 
+#define pmd_young pmd_young
 static inline int pmd_young(pmd_t pmd)
 {
        return !!(pmd_val(pmd) & _PAGE_ACCESSED);
index 4745bb9..6d8492b 100644 (file)
@@ -602,6 +602,7 @@ ____##func(struct pt_regs *regs)
 /* kernel/traps.c */
 DECLARE_INTERRUPT_HANDLER_NMI(system_reset_exception);
 #ifdef CONFIG_PPC_BOOK3S_64
+DECLARE_INTERRUPT_HANDLER_RAW(machine_check_early_boot);
 DECLARE_INTERRUPT_HANDLER_ASYNC(machine_check_exception_async);
 #endif
 DECLARE_INTERRUPT_HANDLER_NMI(machine_check_exception);
index 43f1c76..a379b0c 100644 (file)
@@ -113,23 +113,19 @@ void bpf_jit_build_prologue(u32 *image, struct codegen_context *ctx)
 {
        int i;
 
-       /* First arg comes in as a 32 bits pointer. */
-       EMIT(PPC_RAW_MR(bpf_to_ppc(BPF_REG_1), _R3));
-       EMIT(PPC_RAW_LI(bpf_to_ppc(BPF_REG_1) - 1, 0));
+       /* Initialize tail_call_cnt, to be skipped if we do tail calls. */
+       EMIT(PPC_RAW_LI(_R4, 0));
+
+#define BPF_TAILCALL_PROLOGUE_SIZE     4
+
        EMIT(PPC_RAW_STWU(_R1, _R1, -BPF_PPC_STACKFRAME(ctx)));
 
-       /*
-        * Initialize tail_call_cnt in stack frame if we do tail calls.
-        * Otherwise, put in NOPs so that it can be skipped when we are
-        * invoked through a tail call.
-        */
        if (ctx->seen & SEEN_TAILCALL)
-               EMIT(PPC_RAW_STW(bpf_to_ppc(BPF_REG_1) - 1, _R1,
-                                bpf_jit_stack_offsetof(ctx, BPF_PPC_TC)));
-       else
-               EMIT(PPC_RAW_NOP());
+               EMIT(PPC_RAW_STW(_R4, _R1, bpf_jit_stack_offsetof(ctx, BPF_PPC_TC)));
 
-#define BPF_TAILCALL_PROLOGUE_SIZE     16
+       /* First arg comes in as a 32 bits pointer. */
+       EMIT(PPC_RAW_MR(bpf_to_ppc(BPF_REG_1), _R3));
+       EMIT(PPC_RAW_LI(bpf_to_ppc(BPF_REG_1) - 1, 0));
 
        /*
         * We need a stack frame, but we don't necessarily need to
@@ -170,24 +166,24 @@ static void bpf_jit_emit_common_epilogue(u32 *image, struct codegen_context *ctx
        for (i = BPF_PPC_NVR_MIN; i <= 31; i++)
                if (bpf_is_seen_register(ctx, i))
                        EMIT(PPC_RAW_LWZ(i, _R1, bpf_jit_stack_offsetof(ctx, i)));
-}
-
-void bpf_jit_build_epilogue(u32 *image, struct codegen_context *ctx)
-{
-       EMIT(PPC_RAW_MR(_R3, bpf_to_ppc(BPF_REG_0)));
-
-       bpf_jit_emit_common_epilogue(image, ctx);
-
-       /* Tear down our stack frame */
 
        if (ctx->seen & SEEN_FUNC)
                EMIT(PPC_RAW_LWZ(_R0, _R1, BPF_PPC_STACKFRAME(ctx) + PPC_LR_STKOFF));
 
+       /* Tear down our stack frame */
        EMIT(PPC_RAW_ADDI(_R1, _R1, BPF_PPC_STACKFRAME(ctx)));
 
        if (ctx->seen & SEEN_FUNC)
                EMIT(PPC_RAW_MTLR(_R0));
 
+}
+
+void bpf_jit_build_epilogue(u32 *image, struct codegen_context *ctx)
+{
+       EMIT(PPC_RAW_MR(_R3, bpf_to_ppc(BPF_REG_0)));
+
+       bpf_jit_emit_common_epilogue(image, ctx);
+
        EMIT(PPC_RAW_BLR());
 }
 
@@ -244,7 +240,6 @@ static int bpf_jit_emit_tail_call(u32 *image, struct codegen_context *ctx, u32 o
        EMIT(PPC_RAW_RLWINM(_R3, b2p_index, 2, 0, 29));
        EMIT(PPC_RAW_ADD(_R3, _R3, b2p_bpf_array));
        EMIT(PPC_RAW_LWZ(_R3, _R3, offsetof(struct bpf_array, ptrs)));
-       EMIT(PPC_RAW_STW(_R0, _R1, bpf_jit_stack_offsetof(ctx, BPF_PPC_TC)));
 
        /*
         * if (prog == NULL)
@@ -255,19 +250,14 @@ static int bpf_jit_emit_tail_call(u32 *image, struct codegen_context *ctx, u32 o
 
        /* goto *(prog->bpf_func + prologue_size); */
        EMIT(PPC_RAW_LWZ(_R3, _R3, offsetof(struct bpf_prog, bpf_func)));
-
-       if (ctx->seen & SEEN_FUNC)
-               EMIT(PPC_RAW_LWZ(_R0, _R1, BPF_PPC_STACKFRAME(ctx) + PPC_LR_STKOFF));
-
        EMIT(PPC_RAW_ADDIC(_R3, _R3, BPF_TAILCALL_PROLOGUE_SIZE));
-
-       if (ctx->seen & SEEN_FUNC)
-               EMIT(PPC_RAW_MTLR(_R0));
-
        EMIT(PPC_RAW_MTCTR(_R3));
 
        EMIT(PPC_RAW_MR(_R3, bpf_to_ppc(BPF_REG_1)));
 
+       /* Put tail_call_cnt in r4 */
+       EMIT(PPC_RAW_MR(_R4, _R0));
+
        /* tear restore NVRs, ... */
        bpf_jit_emit_common_epilogue(image, ctx);
 
index fa78595..593cf09 100644 (file)
@@ -317,9 +317,9 @@ config SMP
 config NR_CPUS
        int "Maximum number of CPUs (2-512)"
        depends on SMP
-       range 2 512 if !SBI_V01
-       range 2 32 if SBI_V01 && 32BIT
-       range 2 64 if SBI_V01 && 64BIT
+       range 2 512 if !RISCV_SBI_V01
+       range 2 32 if RISCV_SBI_V01 && 32BIT
+       range 2 64 if RISCV_SBI_V01 && 64BIT
        default "32" if 32BIT
        default "64" if 64BIT
 
index 1b471ff..816e753 100644 (file)
@@ -23,6 +23,7 @@
 #define REG_L          __REG_SEL(ld, lw)
 #define REG_S          __REG_SEL(sd, sw)
 #define REG_SC         __REG_SEL(sc.d, sc.w)
+#define REG_AMOSWAP_AQ __REG_SEL(amoswap.d.aq, amoswap.w.aq)
 #define REG_ASM                __REG_SEL(.dword, .word)
 #define SZREG          __REG_SEL(8, 4)
 #define LGREG          __REG_SEL(3, 2)
index f74879a..e229d7b 100644 (file)
@@ -10,6 +10,7 @@
 #include <asm/mmu_context.h>
 #include <asm/ptrace.h>
 #include <asm/tlbflush.h>
+#include <asm/pgalloc.h>
 
 #ifdef CONFIG_EFI
 extern void efi_init(void);
@@ -20,7 +21,10 @@ extern void efi_init(void);
 int efi_create_mapping(struct mm_struct *mm, efi_memory_desc_t *md);
 int efi_set_mapping_permissions(struct mm_struct *mm, efi_memory_desc_t *md);
 
-#define arch_efi_call_virt_setup()      efi_virtmap_load()
+#define arch_efi_call_virt_setup()      ({             \
+               sync_kernel_mappings(efi_mm.pgd);       \
+               efi_virtmap_load();                     \
+       })
 #define arch_efi_call_virt_teardown()   efi_virtmap_unload()
 
 #define ARCH_EFI_IRQ_FLAGS_MASK (SR_IE | SR_SPIE)
index 947f23d..59dc12b 100644 (file)
@@ -127,6 +127,13 @@ static inline void p4d_free(struct mm_struct *mm, p4d_t *p4d)
 #define __p4d_free_tlb(tlb, p4d, addr)  p4d_free((tlb)->mm, p4d)
 #endif /* __PAGETABLE_PMD_FOLDED */
 
+static inline void sync_kernel_mappings(pgd_t *pgd)
+{
+       memcpy(pgd + USER_PTRS_PER_PGD,
+              init_mm.pgd + USER_PTRS_PER_PGD,
+              (PTRS_PER_PGD - USER_PTRS_PER_PGD) * sizeof(pgd_t));
+}
+
 static inline pgd_t *pgd_alloc(struct mm_struct *mm)
 {
        pgd_t *pgd;
@@ -135,9 +142,7 @@ static inline pgd_t *pgd_alloc(struct mm_struct *mm)
        if (likely(pgd != NULL)) {
                memset(pgd, 0, USER_PTRS_PER_PGD * sizeof(pgd_t));
                /* Copy kernel mappings */
-               memcpy(pgd + USER_PTRS_PER_PGD,
-                       init_mm.pgd + USER_PTRS_PER_PGD,
-                       (PTRS_PER_PGD - USER_PTRS_PER_PGD) * sizeof(pgd_t));
+               sync_kernel_mappings(pgd);
        }
        return pgd;
 }
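
A hedged sketch of why the EFI path wants this helper: efi_mm's PGD is populated once, so kernel mappings created afterwards would be missing while the firmware runs. Copying the kernel half of the PGD immediately before entering the EFI page tables, as arch_efi_call_virt_setup() above now does, closes that window. switch_mm() below is only a stand-in for what efi_virtmap_load() does internally:

	/* Sketch: refresh efi_mm's kernel mappings, then enter its tables. */
	static void efi_virt_call_prepare_sketch(void)
	{
		sync_kernel_mappings(efi_mm.pgd);	/* copy kernel PGD entries */
		switch_mm(NULL, &efi_mm, NULL);		/* stand-in for efi_virtmap_load() */
	}
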
index 7ec9369..92ec2d9 100644 (file)
@@ -600,6 +600,7 @@ static inline int pmd_dirty(pmd_t pmd)
        return pte_dirty(pmd_pte(pmd));
 }
 
+#define pmd_young pmd_young
 static inline int pmd_young(pmd_t pmd)
 {
        return pte_young(pmd_pte(pmd));
index d3443be..3831b63 100644 (file)
@@ -50,6 +50,9 @@ void riscv_set_ipi_ops(const struct riscv_ipi_ops *ops);
 /* Clear IPI for current CPU */
 void riscv_clear_ipi(void);
 
+/* Check whether the other CPUs have stopped */
+bool smp_crash_stop_failed(void);
+
 /* Secondary hart entry */
 asmlinkage void smp_callin(void);
 
index b9eda3f..186abd1 100644 (file)
@@ -404,6 +404,19 @@ handle_syscall_trace_exit:
 
 #ifdef CONFIG_VMAP_STACK
 handle_kernel_stack_overflow:
+       /*
+        * Takes the pseudo-spinlock for the shadow stack, in case multiple
+        * harts are concurrently overflowing their kernel stacks.  We could
+        * store any value here, but since we're overflowing the kernel stack
+        * already, we only have SP to use as a scratch register.  So we just
+        * swap in the address of the spinlock, as that's definitely non-zero.
+        *
+        * Pairs with a store_release in handle_bad_stack().
+        */
+1:     la sp, spin_shadow_stack
+       REG_AMOSWAP_AQ sp, sp, (sp)
+       bnez sp, 1b
+
        la sp, shadow_stack
        addi sp, sp, SHADOW_OVERFLOW_STACK_SIZE
 
index ee79e68..2d139b7 100644 (file)
@@ -15,6 +15,8 @@
 #include <linux/compiler.h>    /* For unreachable() */
 #include <linux/cpu.h>         /* For cpu_down() */
 #include <linux/reboot.h>
+#include <linux/interrupt.h>
+#include <linux/irq.h>
 
 /*
  * kexec_image_info - Print received image details
@@ -138,20 +140,35 @@ void machine_shutdown(void)
 #endif
 }
 
-/* Override the weak function in kernel/panic.c */
-void crash_smp_send_stop(void)
+static void machine_kexec_mask_interrupts(void)
 {
-       static int cpus_stopped;
+       unsigned int i;
+       struct irq_desc *desc;
 
-       /*
-        * This function can be called twice in panic path, but obviously
-        * we execute this only once.
-        */
-       if (cpus_stopped)
-               return;
+       for_each_irq_desc(i, desc) {
+               struct irq_chip *chip;
+               int ret;
+
+               chip = irq_desc_get_chip(desc);
+               if (!chip)
+                       continue;
+
+               /*
+                * First try to remove the active state. If this
+                * fails, try to EOI the interrupt.
+                */
+               ret = irq_set_irqchip_state(i, IRQCHIP_STATE_ACTIVE, false);
+
+               if (ret && irqd_irq_inprogress(&desc->irq_data) &&
+                   chip->irq_eoi)
+                       chip->irq_eoi(&desc->irq_data);
 
-       smp_send_stop();
-       cpus_stopped = 1;
+               if (chip->irq_mask)
+                       chip->irq_mask(&desc->irq_data);
+
+               if (chip->irq_disable && !irqd_irq_disabled(&desc->irq_data))
+                       chip->irq_disable(&desc->irq_data);
+       }
 }
 
 /*
@@ -169,6 +186,8 @@ machine_crash_shutdown(struct pt_regs *regs)
        crash_smp_send_stop();
 
        crash_save_cpu(regs, smp_processor_id());
+       machine_kexec_mask_interrupts();
+
        pr_info("Starting crashdump kernel...\n");
 }
 
@@ -195,6 +214,11 @@ machine_kexec(struct kimage *image)
        void *control_code_buffer = page_address(image->control_code_page);
        riscv_kexec_method kexec_method = NULL;
 
+#ifdef CONFIG_SMP
+       WARN(smp_crash_stop_failed(),
+               "Some CPUs may be stale, kdump will be unreliable.\n");
+#endif
+
        if (image->type != KEXEC_TYPE_CRASH)
                kexec_method = control_code_buffer;
        else
index 67ec1fa..86acd69 100644 (file)
@@ -322,10 +322,11 @@ subsys_initcall(topology_init);
 
 void free_initmem(void)
 {
-       if (IS_ENABLED(CONFIG_STRICT_KERNEL_RWX))
-               set_kernel_memory(lm_alias(__init_begin), lm_alias(__init_end),
-                                 IS_ENABLED(CONFIG_64BIT) ?
-                                       set_memory_rw : set_memory_rw_nx);
+       if (IS_ENABLED(CONFIG_STRICT_KERNEL_RWX)) {
+               set_kernel_memory(lm_alias(__init_begin), lm_alias(__init_end), set_memory_rw_nx);
+               if (IS_ENABLED(CONFIG_64BIT))
+                       set_kernel_memory(__init_begin, __init_end, set_memory_nx);
+       }
 
        free_initmem_default(POISON_FREE_INITMEM);
 }
index 760a645..8c3b59f 100644 (file)
@@ -12,6 +12,7 @@
 #include <linux/clockchips.h>
 #include <linux/interrupt.h>
 #include <linux/module.h>
+#include <linux/kexec.h>
 #include <linux/profile.h>
 #include <linux/smp.h>
 #include <linux/sched.h>
 #include <asm/sbi.h>
 #include <asm/tlbflush.h>
 #include <asm/cacheflush.h>
+#include <asm/cpu_ops.h>
 
 enum ipi_message_type {
        IPI_RESCHEDULE,
        IPI_CALL_FUNC,
        IPI_CPU_STOP,
+       IPI_CPU_CRASH_STOP,
        IPI_IRQ_WORK,
        IPI_TIMER,
        IPI_MAX
@@ -71,6 +74,32 @@ static void ipi_stop(void)
                wait_for_interrupt();
 }
 
+#ifdef CONFIG_KEXEC_CORE
+static atomic_t waiting_for_crash_ipi = ATOMIC_INIT(0);
+
+static inline void ipi_cpu_crash_stop(unsigned int cpu, struct pt_regs *regs)
+{
+       crash_save_cpu(regs, cpu);
+
+       atomic_dec(&waiting_for_crash_ipi);
+
+       local_irq_disable();
+
+#ifdef CONFIG_HOTPLUG_CPU
+       if (cpu_has_hotplug(cpu))
+               cpu_ops[cpu]->cpu_stop();
+#endif
+
+       for (;;)
+               wait_for_interrupt();
+}
+#else
+static inline void ipi_cpu_crash_stop(unsigned int cpu, struct pt_regs *regs)
+{
+       unreachable();
+}
+#endif
+
 static const struct riscv_ipi_ops *ipi_ops __ro_after_init;
 
 void riscv_set_ipi_ops(const struct riscv_ipi_ops *ops)
@@ -124,8 +153,9 @@ void arch_irq_work_raise(void)
 
 void handle_IPI(struct pt_regs *regs)
 {
-       unsigned long *pending_ipis = &ipi_data[smp_processor_id()].bits;
-       unsigned long *stats = ipi_data[smp_processor_id()].stats;
+       unsigned int cpu = smp_processor_id();
+       unsigned long *pending_ipis = &ipi_data[cpu].bits;
+       unsigned long *stats = ipi_data[cpu].stats;
 
        riscv_clear_ipi();
 
@@ -154,6 +184,10 @@ void handle_IPI(struct pt_regs *regs)
                        ipi_stop();
                }
 
+               if (ops & (1 << IPI_CPU_CRASH_STOP)) {
+                       ipi_cpu_crash_stop(cpu, get_irq_regs());
+               }
+
                if (ops & (1 << IPI_IRQ_WORK)) {
                        stats[IPI_IRQ_WORK]++;
                        irq_work_run();
@@ -176,6 +210,7 @@ static const char * const ipi_names[] = {
        [IPI_RESCHEDULE]        = "Rescheduling interrupts",
        [IPI_CALL_FUNC]         = "Function call interrupts",
        [IPI_CPU_STOP]          = "CPU stop interrupts",
+       [IPI_CPU_CRASH_STOP]    = "CPU stop (for crash dump) interrupts",
        [IPI_IRQ_WORK]          = "IRQ work interrupts",
        [IPI_TIMER]             = "Timer broadcast interrupts",
 };
@@ -235,6 +270,64 @@ void smp_send_stop(void)
                           cpumask_pr_args(cpu_online_mask));
 }
 
+#ifdef CONFIG_KEXEC_CORE
+/*
+ * The number of CPUs online, not counting this CPU (which may not be
+ * fully online and so not counted in num_online_cpus()).
+ */
+static inline unsigned int num_other_online_cpus(void)
+{
+       unsigned int this_cpu_online = cpu_online(smp_processor_id());
+
+       return num_online_cpus() - this_cpu_online;
+}
+
+void crash_smp_send_stop(void)
+{
+       static int cpus_stopped;
+       cpumask_t mask;
+       unsigned long timeout;
+
+       /*
+        * This function can be called twice in the panic path, but obviously
+        * we execute this only once.
+        */
+       if (cpus_stopped)
+               return;
+
+       cpus_stopped = 1;
+
+       /*
+        * If this cpu is the only one alive at this point in time, online or
+        * not, there are no stop messages to be sent around, so just back out.
+        */
+       if (num_other_online_cpus() == 0)
+               return;
+
+       cpumask_copy(&mask, cpu_online_mask);
+       cpumask_clear_cpu(smp_processor_id(), &mask);
+
+       atomic_set(&waiting_for_crash_ipi, num_other_online_cpus());
+
+       pr_crit("SMP: stopping secondary CPUs\n");
+       send_ipi_mask(&mask, IPI_CPU_CRASH_STOP);
+
+       /* Wait up to one second for other CPUs to stop */
+       timeout = USEC_PER_SEC;
+       while ((atomic_read(&waiting_for_crash_ipi) > 0) && timeout--)
+               udelay(1);
+
+       if (atomic_read(&waiting_for_crash_ipi) > 0)
+               pr_warn("SMP: failed to stop secondary CPUs %*pbl\n",
+                       cpumask_pr_args(&mask));
+}
+
+bool smp_crash_stop_failed(void)
+{
+       return (atomic_read(&waiting_for_crash_ipi) > 0);
+}
+#endif
+
 void smp_send_reschedule(int cpu)
 {
        send_ipi_single(cpu, IPI_RESCHEDULE);
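
The crash-stop path above follows a common countdown idiom: arm an atomic counter, fire the IPIs, then poll with a bounded microsecond budget while each stopped hart decrements the counter. A self-contained sketch under assumed names:

	static atomic_t pending;

	/* Each stopped CPU runs atomic_dec(&pending) from its IPI handler. */
	static bool wait_for_stop_sketch(unsigned int nr_cpus,
					 unsigned long budget_us)
	{
		atomic_set(&pending, nr_cpus);
		/* ... send_ipi_mask(&mask, IPI_CPU_CRASH_STOP) here ... */
		while (atomic_read(&pending) > 0 && budget_us--)
			udelay(1);
		return atomic_read(&pending) == 0;
	}

smp_crash_stop_failed() is then just a final read of the counter, which is why machine_kexec() can still consult it long after the timeout expired.
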
index f3e96d6..7abd8e4 100644 (file)
@@ -221,11 +221,29 @@ asmlinkage unsigned long get_overflow_stack(void)
                OVERFLOW_STACK_SIZE;
 }
 
+/*
+ * A pseudo spinlock to protect the shadow stack from being used by multiple
+ * harts concurrently.  This isn't a real spinlock because the lock side must
+ * be taken without a valid stack and with only a single register; it's only
+ * taken while in the process of panicking anyway, so the performance and
+ * error checking a proper spinlock gives us don't matter.
+ */
+unsigned long spin_shadow_stack;
+
 asmlinkage void handle_bad_stack(struct pt_regs *regs)
 {
        unsigned long tsk_stk = (unsigned long)current->stack;
        unsigned long ovf_stk = (unsigned long)this_cpu_ptr(overflow_stack);
 
+       /*
+        * We're done with the shadow stack by this point, as we're on the
+        * overflow stack.  Tell any other concurrent overflowing harts that
+        * they can proceed with panicking by releasing the pseudo-spinlock.
+        *
+        * This pairs with an amoswap.aq in handle_kernel_stack_overflow.
+        */
+       smp_store_release(&spin_shadow_stack, 0);
+
        console_verbose();
 
        pr_emerg("Insufficient stack space to handle exception!\n");
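
For reference, here is the lock/unlock pairing expressed in plain C; the real acquire side lives in the entry assembly above because no stack or scratch register is available there. A hedged sketch, with only spin_shadow_stack taken from the patch:

	/* 0 == unlocked; any non-zero value means some hart owns the shadow
	 * stack.  The address of the lock itself is a handy non-zero token. */
	static void shadow_stack_acquire_sketch(void)
	{
		while (xchg_acquire(&spin_shadow_stack,
				    (unsigned long)&spin_shadow_stack))
			cpu_relax();
	}

	static void shadow_stack_release_sketch(void)
	{
		smp_store_release(&spin_shadow_stack, 0);	/* as in handle_bad_stack() */
	}
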
index db65485..06e6b27 100644 (file)
@@ -17,6 +17,7 @@ vdso-syms += flush_icache
 obj-vdso = $(patsubst %, %.o, $(vdso-syms)) note.o
 
 ccflags-y := -fno-stack-protector
+ccflags-y += -DDISABLE_BRANCH_PROFILING
 
 ifneq ($(c-gettimeofday-y),)
   CFLAGS_vgettimeofday.o += -fPIC -include $(c-gettimeofday-y)
index 00df3a8..f2417ac 100644 (file)
@@ -136,6 +136,25 @@ static bool in_auipc_jalr_range(s64 val)
                val < ((1L << 31) - (1L << 11));
 }
 
+/* Emit fixed-length instructions for address */
+static int emit_addr(u8 rd, u64 addr, bool extra_pass, struct rv_jit_context *ctx)
+{
+       u64 ip = (u64)(ctx->insns + ctx->ninsns);
+       s64 off = addr - ip;
+       s64 upper = (off + (1 << 11)) >> 12;
+       s64 lower = off & 0xfff;
+
+       if (extra_pass && !in_auipc_jalr_range(off)) {
+               pr_err("bpf-jit: target offset 0x%llx is out of range\n", off);
+               return -ERANGE;
+       }
+
+       emit(rv_auipc(rd, upper), ctx);
+       emit(rv_addi(rd, rd, lower), ctx);
+       return 0;
+}
+
+/* Emit variable-length instructions for 32-bit and 64-bit imm */
 static void emit_imm(u8 rd, s64 val, struct rv_jit_context *ctx)
 {
        /* Note that the immediate from the add is sign-extended,
@@ -1050,7 +1069,15 @@ out_be:
                u64 imm64;
 
                imm64 = (u64)insn1.imm << 32 | (u32)imm;
-               emit_imm(rd, imm64, ctx);
+               if (bpf_pseudo_func(insn)) {
+                       /* fixed-length insns for extra jit pass */
+                       ret = emit_addr(rd, imm64, extra_pass, ctx);
+                       if (ret)
+                               return ret;
+               } else {
+                       emit_imm(rd, imm64, ctx);
+               }
+
                return 1;
        }
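
A worked check of the auipc/addi split in emit_addr() above: rounding the offset by +2^11 before the shift keeps the pair correct even when the low 12 bits sign-extend.

	s64 off   = 0x12345fff;
	s64 upper = (off + (1 << 11)) >> 12;	/* 0x12346 */
	s64 lower = off & 0xfff;		/* 0xfff, which addi sees as -1 */
	/* auipc rd, upper  => rd = pc + 0x12346000
	 * addi  rd, rd, -1 => rd = pc + 0x12345fff = pc + off */

Because the sequence is always exactly two instructions, addresses computed in the extra JIT pass stay valid, unlike the variable-length emit_imm() path.
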
 
index f1cb939..11e9012 100644 (file)
@@ -763,6 +763,7 @@ static inline int pmd_dirty(pmd_t pmd)
        return (pmd_val(pmd) & _SEGMENT_ENTRY_DIRTY) != 0;
 }
 
+#define pmd_young pmd_young
 static inline int pmd_young(pmd_t pmd)
 {
        return (pmd_val(pmd) & _SEGMENT_ENTRY_YOUNG) != 0;
index 94138f8..ace2541 100644 (file)
@@ -546,8 +546,10 @@ static int shadow_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
        if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_CEI))
                scb_s->eca |= scb_o->eca & ECA_CEI;
        /* Epoch Extension */
-       if (test_kvm_facility(vcpu->kvm, 139))
+       if (test_kvm_facility(vcpu->kvm, 139)) {
                scb_s->ecd |= scb_o->ecd & ECD_MEF;
+               scb_s->epdx = scb_o->epdx;
+       }
 
        /* etoken */
        if (test_kvm_facility(vcpu->kvm, 156))
index a779418..3bc9736 100644 (file)
@@ -693,6 +693,7 @@ static inline unsigned long pmd_dirty(pmd_t pmd)
        return pte_dirty(pte);
 }
 
+#define pmd_young pmd_young
 static inline unsigned long pmd_young(pmd_t pmd)
 {
        pte_t pte = __pte(pmd_val(pmd));
index c936ce9..dfdb103 100644 (file)
@@ -321,7 +321,7 @@ static inline void indirect_branch_prediction_barrier(void)
 /* The Intel SPEC CTRL MSR base value cache */
 extern u64 x86_spec_ctrl_base;
 DECLARE_PER_CPU(u64, x86_spec_ctrl_current);
-extern void write_spec_ctrl_current(u64 val, bool force);
+extern void update_spec_ctrl_cond(u64 val);
 extern u64 spec_ctrl_current(void);
 
 /*
index 5059799..286a718 100644 (file)
@@ -139,6 +139,7 @@ static inline int pmd_dirty(pmd_t pmd)
        return pmd_flags(pmd) & _PAGE_DIRTY;
 }
 
+#define pmd_young pmd_young
 static inline int pmd_young(pmd_t pmd)
 {
        return pmd_flags(pmd) & _PAGE_ACCESSED;
@@ -1438,6 +1439,14 @@ static inline bool arch_has_hw_pte_young(void)
        return true;
 }
 
+#ifdef CONFIG_XEN_PV
+#define arch_has_hw_nonleaf_pmd_young arch_has_hw_nonleaf_pmd_young
+static inline bool arch_has_hw_nonleaf_pmd_young(void)
+{
+       return !cpu_feature_enabled(X86_FEATURE_XENPV);
+}
+#endif
+
 #ifdef CONFIG_PAGE_TABLE_CHECK
 static inline bool pte_user_accessible_page(pte_t pte)
 {
index 3e3230c..6daf842 100644 (file)
@@ -60,11 +60,18 @@ EXPORT_SYMBOL_GPL(x86_spec_ctrl_current);
 
 static DEFINE_MUTEX(spec_ctrl_mutex);
 
+/* Update SPEC_CTRL MSR and its cached copy unconditionally */
+static void update_spec_ctrl(u64 val)
+{
+       this_cpu_write(x86_spec_ctrl_current, val);
+       wrmsrl(MSR_IA32_SPEC_CTRL, val);
+}
+
 /*
  * Keep track of the SPEC_CTRL MSR value for the current task, which may differ
  * from x86_spec_ctrl_base due to STIBP/SSB in __speculation_ctrl_update().
  */
-void write_spec_ctrl_current(u64 val, bool force)
+void update_spec_ctrl_cond(u64 val)
 {
        if (this_cpu_read(x86_spec_ctrl_current) == val)
                return;
@@ -75,7 +82,7 @@ void write_spec_ctrl_current(u64 val, bool force)
         * When KERNEL_IBRS this MSR is written on return-to-user, unless
         * forced the update can be delayed until that time.
         */
-       if (force || !cpu_feature_enabled(X86_FEATURE_KERNEL_IBRS))
+       if (!cpu_feature_enabled(X86_FEATURE_KERNEL_IBRS))
                wrmsrl(MSR_IA32_SPEC_CTRL, val);
 }
 
@@ -1328,7 +1335,7 @@ static void __init spec_ctrl_disable_kernel_rrsba(void)
 
        if (ia32_cap & ARCH_CAP_RRSBA) {
                x86_spec_ctrl_base |= SPEC_CTRL_RRSBA_DIS_S;
-               write_spec_ctrl_current(x86_spec_ctrl_base, true);
+               update_spec_ctrl(x86_spec_ctrl_base);
        }
 }
 
@@ -1450,7 +1457,7 @@ static void __init spectre_v2_select_mitigation(void)
 
        if (spectre_v2_in_ibrs_mode(mode)) {
                x86_spec_ctrl_base |= SPEC_CTRL_IBRS;
-               write_spec_ctrl_current(x86_spec_ctrl_base, true);
+               update_spec_ctrl(x86_spec_ctrl_base);
        }
 
        switch (mode) {
@@ -1564,7 +1571,7 @@ static void __init spectre_v2_select_mitigation(void)
 static void update_stibp_msr(void * __unused)
 {
        u64 val = spec_ctrl_current() | (x86_spec_ctrl_base & SPEC_CTRL_STIBP);
-       write_spec_ctrl_current(val, true);
+       update_spec_ctrl(val);
 }
 
 /* Update x86_spec_ctrl_base in case SMT state changed. */
@@ -1797,7 +1804,7 @@ static enum ssb_mitigation __init __ssb_select_mitigation(void)
                        x86_amd_ssb_disable();
                } else {
                        x86_spec_ctrl_base |= SPEC_CTRL_SSBD;
-                       write_spec_ctrl_current(x86_spec_ctrl_base, true);
+                       update_spec_ctrl(x86_spec_ctrl_base);
                }
        }
 
@@ -2048,7 +2055,7 @@ int arch_prctl_spec_ctrl_get(struct task_struct *task, unsigned long which)
 void x86_spec_ctrl_setup_ap(void)
 {
        if (boot_cpu_has(X86_FEATURE_MSR_SPEC_CTRL))
-               write_spec_ctrl_current(x86_spec_ctrl_base, true);
+               update_spec_ctrl(x86_spec_ctrl_base);
 
        if (ssb_mode == SPEC_STORE_BYPASS_DISABLE)
                x86_amd_ssb_disable();
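
In short, the rename splits the two call patterns that the old bool argument multiplexed; a hedged summary of the call sites shown above:

	/* One-off setup paths: hit the MSR unconditionally. */
	update_spec_ctrl(x86_spec_ctrl_base);

	/* Context-switch path: update the cached copy, and skip the
	 * wrmsrl() when KERNEL_IBRS rewrites SPEC_CTRL on return to
	 * user anyway. */
	update_spec_ctrl_cond(msr);
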
index c21b734..e436c9c 100644 (file)
@@ -600,7 +600,7 @@ static __always_inline void __speculation_ctrl_update(unsigned long tifp,
        }
 
        if (updmsr)
-               write_spec_ctrl_current(msr, false);
+               update_spec_ctrl_cond(msr);
 }
 
 static unsigned long speculation_ctrl_update_tif(struct task_struct *tsk)
index 2835bd7..69227f7 100644 (file)
@@ -10574,8 +10574,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
                                vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN;
                                vcpu->mmio_needed = 0;
                                r = 0;
+                               goto out;
                        }
-                       goto out;
                }
                if (kvm_check_request(KVM_REQ_APF_HALT, vcpu)) {
                        /* Page is swapped out. Do synthetic halt */
index 23f49a2..6cceca6 100644 (file)
@@ -562,17 +562,26 @@ static int initiator_cmp(void *priv, const struct list_head *a,
 {
        struct memory_initiator *ia;
        struct memory_initiator *ib;
-       unsigned long *p_nodes = priv;
 
        ia = list_entry(a, struct memory_initiator, node);
        ib = list_entry(b, struct memory_initiator, node);
 
-       set_bit(ia->processor_pxm, p_nodes);
-       set_bit(ib->processor_pxm, p_nodes);
-
        return ia->processor_pxm - ib->processor_pxm;
 }
 
+static int initiators_to_nodemask(unsigned long *p_nodes)
+{
+       struct memory_initiator *initiator;
+
+       if (list_empty(&initiators))
+               return -ENXIO;
+
+       list_for_each_entry(initiator, &initiators, node)
+               set_bit(initiator->processor_pxm, p_nodes);
+
+       return 0;
+}
+
 static void hmat_register_target_initiators(struct memory_target *target)
 {
        static DECLARE_BITMAP(p_nodes, MAX_NUMNODES);
@@ -609,7 +618,10 @@ static void hmat_register_target_initiators(struct memory_target *target)
         * initiators.
         */
        bitmap_zero(p_nodes, MAX_NUMNODES);
-       list_sort(p_nodes, &initiators, initiator_cmp);
+       list_sort(NULL, &initiators, initiator_cmp);
+       if (initiators_to_nodemask(p_nodes) < 0)
+               return;
+
        if (!access0done) {
                for (i = WRITE_LATENCY; i <= READ_BANDWIDTH; i++) {
                        loc = localities_types[i];
@@ -643,8 +655,9 @@ static void hmat_register_target_initiators(struct memory_target *target)
 
        /* Access 1 ignores Generic Initiators */
        bitmap_zero(p_nodes, MAX_NUMNODES);
-       list_sort(p_nodes, &initiators, initiator_cmp);
-       best = 0;
+       if (initiators_to_nodemask(p_nodes) < 0)
+               return;
+
        for (i = WRITE_LATENCY; i <= READ_BANDWIDTH; i++) {
                loc = localities_types[i];
                if (!loc)
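
The reason the nodemask build had to move out of the comparator: list_sort() returns early for zero- or one-element lists and only invokes cmp() when it has pairs to compare, so side effects in initiator_cmp() could be skipped entirely. A hedged illustration:

	/* With a single initiator, list_sort() performs no comparisons,
	 * so a set_bit() inside initiator_cmp() would never run. */
	LIST_HEAD(single);
	list_add(&init0.node, &single);			/* init0: hypothetical entry */
	list_sort(NULL, &single, initiator_cmp);	/* cmp() never called */
	/* initiators_to_nodemask() walks the list directly instead. */
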
index ddf17e2..b9e336b 100644 (file)
@@ -109,7 +109,7 @@ struct clk *ahci_platform_find_clk(struct ahci_host_priv *hpriv, const char *con
        int i;
 
        for (i = 0; i < hpriv->n_clks; i++) {
-               if (!strcmp(hpriv->clks[i].id, con_id))
+               if (hpriv->clks[i].id && !strcmp(hpriv->clks[i].id, con_id))
                        return hpriv->clks[i].clk;
        }
 
index e307074..5a1a7be 100644 (file)
@@ -45,6 +45,17 @@ config BT_HCIBTUSB_AUTOSUSPEND
          This can be overridden by passing btusb.enable_autosuspend=[y|n]
          on the kernel commandline.
 
+config BT_HCIBTUSB_POLL_SYNC
+       bool "Enable USB poll_sync for Bluetooth USB devices by default"
+       depends on BT_HCIBTUSB
+       default y
+       help
+         poll_sync synchronizes the USB data and event endpoints by
+         prioritizing the latter.
+
+         Say Y here to enable USB poll_sync for Bluetooth USB devices by
+         default.
+
 config BT_HCIBTUSB_BCM
        bool "Broadcom protocol support"
        depends on BT_HCIBTUSB
@@ -274,6 +285,18 @@ config BT_HCIBCM203X
          Say Y here to compile support for HCI BCM203x devices into the
          kernel or say M to compile it as module (bcm203x).
 
+
+config BT_HCIBCM4377
+       tristate "HCI BCM4377/4378/4387 PCIe driver"
+       depends on PCI
+       select FW_LOADER
+       help
+         Support for Broadcom BCM4377/4378/4387 Bluetooth chipsets attached via
+         PCIe. These are usually found in Apple machines.
+
+         Say Y here to compile support for HCI BCM4377 family devices into the
+         kernel or say M to compile it as module (hci_bcm4377).
+
 config BT_HCIBPA10X
        tristate "HCI BPA10x USB driver"
        depends on USB
index 3321a8a..e0b261f 100644 (file)
@@ -6,6 +6,7 @@
 obj-$(CONFIG_BT_HCIVHCI)       += hci_vhci.o
 obj-$(CONFIG_BT_HCIUART)       += hci_uart.o
 obj-$(CONFIG_BT_HCIBCM203X)    += bcm203x.o
+obj-$(CONFIG_BT_HCIBCM4377)    += hci_bcm4377.o
 obj-$(CONFIG_BT_HCIBPA10X)     += bpa10x.o
 obj-$(CONFIG_BT_HCIBFUSB)      += bfusb.o
 obj-$(CONFIG_BT_HCIDTL1)       += dtl1_cs.o
index a657e9a..d4e2cb9 100644 (file)
@@ -26,7 +26,7 @@
 
 #define CMD_WRITE_BOOT_PARAMS  0xfc0e
 struct cmd_write_boot_params {
-       u32 boot_addr;
+       __le32 boot_addr;
        u8  fw_build_num;
        u8  fw_build_ww;
        u8  fw_build_yy;
@@ -1783,19 +1783,19 @@ static int btintel_get_fw_name(struct intel_version *ver,
        case 0x0b:      /* SfP */
        case 0x0c:      /* WsP */
                snprintf(fw_name, len, "intel/ibt-%u-%u.%s",
-                       le16_to_cpu(ver->hw_variant),
-                       le16_to_cpu(params->dev_revid),
-                       suffix);
+                        ver->hw_variant,
+                        le16_to_cpu(params->dev_revid),
+                        suffix);
                break;
        case 0x11:      /* JfP */
        case 0x12:      /* ThP */
        case 0x13:      /* HrP */
        case 0x14:      /* CcP */
                snprintf(fw_name, len, "intel/ibt-%u-%u-%u.%s",
-                       le16_to_cpu(ver->hw_variant),
-                       le16_to_cpu(ver->hw_revision),
-                       le16_to_cpu(ver->fw_revision),
-                       suffix);
+                        ver->hw_variant,
+                        ver->hw_revision,
+                        ver->fw_revision,
+                        suffix);
                break;
        default:
                return -EINVAL;
@@ -2524,7 +2524,7 @@ static int btintel_setup_combined(struct hci_dev *hdev)
                 */
                err = btintel_read_version(hdev, &ver);
                if (err)
-                       return err;
+                       break;
 
                /* Apply the device specific HCI quirks
                 *
@@ -2566,7 +2566,8 @@ static int btintel_setup_combined(struct hci_dev *hdev)
        default:
                bt_dev_err(hdev, "Unsupported Intel hw variant (%u)",
                           INTEL_HW_VARIANT(ver_tlv.cnvi_bt));
-               return -EINVAL;
+               err = -EINVAL;
+               break;
        }
 
 exit_error:
index fb52313..69c3fe6 100644 (file)
@@ -781,6 +781,13 @@ void btrtl_set_quirks(struct hci_dev *hdev, struct btrtl_device_info *btrtl_dev)
        case CHIP_ID_8852C:
                set_bit(HCI_QUIRK_VALID_LE_STATES, &hdev->quirks);
                set_bit(HCI_QUIRK_WIDEBAND_SPEECH_SUPPORTED, &hdev->quirks);
+
+               /* RTL8852C is an ALT 6 capable chip that needs to transmit
+                * mSBC data continuously, without zero-length USB packets
+                */
+               if (btrtl_dev->project_id == CHIP_ID_8852C)
+                       btrealtek_set_flag(hdev, REALTEK_ALT6_CONTINUOUS_TX_CHIP);
+
                hci_set_aosp_capable(hdev);
                break;
        default:
index 2c441bd..ebf0101 100644 (file)
@@ -47,6 +47,27 @@ struct rtl_vendor_config {
        struct rtl_vendor_config_entry entry[];
 } __packed;
 
+enum {
+       REALTEK_ALT6_CONTINUOUS_TX_CHIP,
+
+       __REALTEK_NUM_FLAGS,
+};
+
+struct btrealtek_data {
+       DECLARE_BITMAP(flags, __REALTEK_NUM_FLAGS);
+};
+
+#define btrealtek_set_flag(hdev, nr)                                   \
+       do {                                                            \
+               struct btrealtek_data *realtek = hci_get_priv((hdev));  \
+               set_bit((nr), realtek->flags);                          \
+       } while (0)
+
+#define btrealtek_get_flag(hdev)                                       \
+       (((struct btrealtek_data *)hci_get_priv(hdev))->flags)
+
+#define btrealtek_test_flag(hdev, nr)  test_bit((nr), btrealtek_get_flag(hdev))
+
 #if IS_ENABLED(CONFIG_BT_RTL)
 
 struct btrtl_device_info *btrtl_initialize(struct hci_dev *hdev,
index 2719638..2ad4efd 100644 (file)
@@ -32,6 +32,7 @@
 static bool disable_scofix;
 static bool force_scofix;
 static bool enable_autosuspend = IS_ENABLED(CONFIG_BT_HCIBTUSB_AUTOSUSPEND);
+static bool enable_poll_sync = IS_ENABLED(CONFIG_BT_HCIBTUSB_POLL_SYNC);
 static bool reset = true;
 
 static struct usb_driver btusb_driver;
@@ -316,6 +317,90 @@ static const struct usb_device_id blacklist_table[] = {
        { USB_DEVICE(0x0489, 0xe0d0), .driver_info = BTUSB_QCA_WCN6855 |
                                                     BTUSB_WIDEBAND_SPEECH |
                                                     BTUSB_VALID_LE_STATES },
+       { USB_DEVICE(0x10ab, 0x9108), .driver_info = BTUSB_QCA_WCN6855 |
+                                                    BTUSB_WIDEBAND_SPEECH |
+                                                    BTUSB_VALID_LE_STATES },
+       { USB_DEVICE(0x10ab, 0x9109), .driver_info = BTUSB_QCA_WCN6855 |
+                                                    BTUSB_WIDEBAND_SPEECH |
+                                                    BTUSB_VALID_LE_STATES },
+       { USB_DEVICE(0x10ab, 0x9208), .driver_info = BTUSB_QCA_WCN6855 |
+                                                    BTUSB_WIDEBAND_SPEECH |
+                                                    BTUSB_VALID_LE_STATES },
+       { USB_DEVICE(0x10ab, 0x9209), .driver_info = BTUSB_QCA_WCN6855 |
+                                                    BTUSB_WIDEBAND_SPEECH |
+                                                    BTUSB_VALID_LE_STATES },
+       { USB_DEVICE(0x10ab, 0x9308), .driver_info = BTUSB_QCA_WCN6855 |
+                                                    BTUSB_WIDEBAND_SPEECH |
+                                                    BTUSB_VALID_LE_STATES },
+       { USB_DEVICE(0x10ab, 0x9408), .driver_info = BTUSB_QCA_WCN6855 |
+                                                    BTUSB_WIDEBAND_SPEECH |
+                                                    BTUSB_VALID_LE_STATES },
+       { USB_DEVICE(0x10ab, 0x9508), .driver_info = BTUSB_QCA_WCN6855 |
+                                                    BTUSB_WIDEBAND_SPEECH |
+                                                    BTUSB_VALID_LE_STATES },
+       { USB_DEVICE(0x10ab, 0x9509), .driver_info = BTUSB_QCA_WCN6855 |
+                                                    BTUSB_WIDEBAND_SPEECH |
+                                                    BTUSB_VALID_LE_STATES },
+       { USB_DEVICE(0x10ab, 0x9608), .driver_info = BTUSB_QCA_WCN6855 |
+                                                    BTUSB_WIDEBAND_SPEECH |
+                                                    BTUSB_VALID_LE_STATES },
+       { USB_DEVICE(0x10ab, 0x9609), .driver_info = BTUSB_QCA_WCN6855 |
+                                                    BTUSB_WIDEBAND_SPEECH |
+                                                    BTUSB_VALID_LE_STATES },
+       { USB_DEVICE(0x10ab, 0x9f09), .driver_info = BTUSB_QCA_WCN6855 |
+                                                    BTUSB_WIDEBAND_SPEECH |
+                                                    BTUSB_VALID_LE_STATES },
+       { USB_DEVICE(0x04ca, 0x3022), .driver_info = BTUSB_QCA_WCN6855 |
+                                                    BTUSB_WIDEBAND_SPEECH |
+                                                    BTUSB_VALID_LE_STATES },
+       { USB_DEVICE(0x0489, 0xe0c7), .driver_info = BTUSB_QCA_WCN6855 |
+                                                    BTUSB_WIDEBAND_SPEECH |
+                                                    BTUSB_VALID_LE_STATES },
+       { USB_DEVICE(0x0489, 0xe0c9), .driver_info = BTUSB_QCA_WCN6855 |
+                                                    BTUSB_WIDEBAND_SPEECH |
+                                                    BTUSB_VALID_LE_STATES },
+       { USB_DEVICE(0x0489, 0xe0ca), .driver_info = BTUSB_QCA_WCN6855 |
+                                                    BTUSB_WIDEBAND_SPEECH |
+                                                    BTUSB_VALID_LE_STATES },
+       { USB_DEVICE(0x0489, 0xe0cb), .driver_info = BTUSB_QCA_WCN6855 |
+                                                    BTUSB_WIDEBAND_SPEECH |
+                                                    BTUSB_VALID_LE_STATES },
+       { USB_DEVICE(0x0489, 0xe0ce), .driver_info = BTUSB_QCA_WCN6855 |
+                                                    BTUSB_WIDEBAND_SPEECH |
+                                                    BTUSB_VALID_LE_STATES },
+       { USB_DEVICE(0x0489, 0xe0de), .driver_info = BTUSB_QCA_WCN6855 |
+                                                    BTUSB_WIDEBAND_SPEECH |
+                                                    BTUSB_VALID_LE_STATES },
+       { USB_DEVICE(0x0489, 0xe0df), .driver_info = BTUSB_QCA_WCN6855 |
+                                                    BTUSB_WIDEBAND_SPEECH |
+                                                    BTUSB_VALID_LE_STATES },
+       { USB_DEVICE(0x0489, 0xe0e1), .driver_info = BTUSB_QCA_WCN6855 |
+                                                    BTUSB_WIDEBAND_SPEECH |
+                                                    BTUSB_VALID_LE_STATES },
+       { USB_DEVICE(0x0489, 0xe0ea), .driver_info = BTUSB_QCA_WCN6855 |
+                                                    BTUSB_WIDEBAND_SPEECH |
+                                                    BTUSB_VALID_LE_STATES },
+       { USB_DEVICE(0x0489, 0xe0ec), .driver_info = BTUSB_QCA_WCN6855 |
+                                                    BTUSB_WIDEBAND_SPEECH |
+                                                    BTUSB_VALID_LE_STATES },
+       { USB_DEVICE(0x04ca, 0x3023), .driver_info = BTUSB_QCA_WCN6855 |
+                                                    BTUSB_WIDEBAND_SPEECH |
+                                                    BTUSB_VALID_LE_STATES },
+       { USB_DEVICE(0x04ca, 0x3024), .driver_info = BTUSB_QCA_WCN6855 |
+                                                    BTUSB_WIDEBAND_SPEECH |
+                                                    BTUSB_VALID_LE_STATES },
+       { USB_DEVICE(0x04ca, 0x3a22), .driver_info = BTUSB_QCA_WCN6855 |
+                                                    BTUSB_WIDEBAND_SPEECH |
+                                                    BTUSB_VALID_LE_STATES },
+       { USB_DEVICE(0x04ca, 0x3a24), .driver_info = BTUSB_QCA_WCN6855 |
+                                                    BTUSB_WIDEBAND_SPEECH |
+                                                    BTUSB_VALID_LE_STATES },
+       { USB_DEVICE(0x04ca, 0x3a26), .driver_info = BTUSB_QCA_WCN6855 |
+                                                    BTUSB_WIDEBAND_SPEECH |
+                                                    BTUSB_VALID_LE_STATES },
+       { USB_DEVICE(0x04ca, 0x3a27), .driver_info = BTUSB_QCA_WCN6855 |
+                                                    BTUSB_WIDEBAND_SPEECH |
+                                                    BTUSB_VALID_LE_STATES },
 
        /* QCA WCN785x chipset */
        { USB_DEVICE(0x0cf3, 0xe700), .driver_info = BTUSB_QCA_WCN6855 |
@@ -413,6 +498,10 @@ static const struct usb_device_id blacklist_table[] = {
        { USB_DEVICE(0x0bda, 0xc822), .driver_info = BTUSB_REALTEK |
                                                     BTUSB_WIDEBAND_SPEECH },
 
+       /* Realtek 8822CU Bluetooth devices */
+       { USB_DEVICE(0x13d3, 0x3549), .driver_info = BTUSB_REALTEK |
+                                                    BTUSB_WIDEBAND_SPEECH },
+
        /* Realtek 8852AE Bluetooth devices */
        { USB_DEVICE(0x0bda, 0x2852), .driver_info = BTUSB_REALTEK |
                                                     BTUSB_WIDEBAND_SPEECH },
@@ -443,6 +532,10 @@ static const struct usb_device_id blacklist_table[] = {
        { USB_DEVICE(0x13d3, 0x3592), .driver_info = BTUSB_REALTEK |
                                                     BTUSB_WIDEBAND_SPEECH },
 
+       /* Realtek 8852BE Bluetooth devices */
+       { USB_DEVICE(0x0cb8, 0xc559), .driver_info = BTUSB_REALTEK |
+                                                    BTUSB_WIDEBAND_SPEECH },
+
        /* Realtek Bluetooth devices */
        { USB_VENDOR_AND_INTERFACE_INFO(0x0bda, 0xe0, 0x01, 0x01),
          .driver_info = BTUSB_REALTEK },
@@ -511,6 +604,9 @@ static const struct usb_device_id blacklist_table[] = {
        { USB_DEVICE(0x0489, 0xe0e2), .driver_info = BTUSB_MEDIATEK |
                                                     BTUSB_WIDEBAND_SPEECH |
                                                     BTUSB_VALID_LE_STATES },
+       { USB_DEVICE(0x0489, 0xe0f2), .driver_info = BTUSB_MEDIATEK |
+                                                    BTUSB_WIDEBAND_SPEECH |
+                                                    BTUSB_VALID_LE_STATES },
 
        /* Additional Realtek 8723AE Bluetooth devices */
        { USB_DEVICE(0x0930, 0x021d), .driver_info = BTUSB_REALTEK },
@@ -543,6 +639,8 @@ static const struct usb_device_id blacklist_table[] = {
                                                     BTUSB_WIDEBAND_SPEECH },
        { USB_DEVICE(0x7392, 0xc611), .driver_info = BTUSB_REALTEK |
                                                     BTUSB_WIDEBAND_SPEECH },
+       { USB_DEVICE(0x2b89, 0x8761), .driver_info = BTUSB_REALTEK |
+                                                    BTUSB_WIDEBAND_SPEECH },
 
        /* Additional Realtek 8821AE Bluetooth devices */
        { USB_DEVICE(0x0b05, 0x17dc), .driver_info = BTUSB_REALTEK },
@@ -632,6 +730,7 @@ static const struct dmi_system_id btusb_needs_reset_resume_table[] = {
 #define BTUSB_TX_WAIT_VND_EVT  13
 #define BTUSB_WAKEUP_AUTOSUSPEND       14
 #define BTUSB_USE_ALT3_FOR_WBS 15
+#define BTUSB_ALT6_CONTINUOUS_TX       16
 
 struct btusb_data {
        struct hci_dev       *hdev;
@@ -696,6 +795,28 @@ struct btusb_data {
        unsigned cmd_timeout_cnt;
 };
 
+static void btusb_reset(struct hci_dev *hdev)
+{
+       struct btusb_data *data;
+       int err;
+
+       if (hdev->reset) {
+               hdev->reset(hdev);
+               return;
+       }
+
+       data = hci_get_drvdata(hdev);
+       /* This is not an unbalanced PM reference since the device will reset */
+       err = usb_autopm_get_interface(data->intf);
+       if (err) {
+               bt_dev_err(hdev, "Failed usb_autopm_get_interface: %d", err);
+               return;
+       }
+
+       bt_dev_err(hdev, "Resetting usb device.");
+       usb_queue_reset_device(data->intf);
+}
+
 static void btusb_intel_cmd_timeout(struct hci_dev *hdev)
 {
        struct btusb_data *data = hci_get_drvdata(hdev);
@@ -705,7 +826,7 @@ static void btusb_intel_cmd_timeout(struct hci_dev *hdev)
                return;
 
        if (!reset_gpio) {
-               bt_dev_err(hdev, "No way to reset. Ignoring and continuing");
+               btusb_reset(hdev);
                return;
        }
 
@@ -736,7 +857,7 @@ static void btusb_rtl_cmd_timeout(struct hci_dev *hdev)
                return;
 
        if (!reset_gpio) {
-               bt_dev_err(hdev, "No gpio to reset Realtek device, ignoring");
+               btusb_reset(hdev);
                return;
        }
 
@@ -761,7 +882,6 @@ static void btusb_qca_cmd_timeout(struct hci_dev *hdev)
 {
        struct btusb_data *data = hci_get_drvdata(hdev);
        struct gpio_desc *reset_gpio = data->reset_gpio;
-       int err;
 
        if (++data->cmd_timeout_cnt < 5)
                return;
@@ -787,13 +907,7 @@ static void btusb_qca_cmd_timeout(struct hci_dev *hdev)
                return;
        }
 
-       bt_dev_err(hdev, "Multiple cmd timeouts seen. Resetting usb device.");
-       /* This is not an unbalanced PM reference since the device will reset */
-       err = usb_autopm_get_interface(data->intf);
-       if (!err)
-               usb_queue_reset_device(data->intf);
-       else
-               bt_dev_err(hdev, "Failed usb_autopm_get_interface with %d", err);
+       btusb_reset(hdev);
 }
 
 static inline void btusb_free_frags(struct btusb_data *data)
@@ -802,13 +916,13 @@ static inline void btusb_free_frags(struct btusb_data *data)
 
        spin_lock_irqsave(&data->rxlock, flags);
 
-       kfree_skb(data->evt_skb);
+       dev_kfree_skb_irq(data->evt_skb);
        data->evt_skb = NULL;
 
-       kfree_skb(data->acl_skb);
+       dev_kfree_skb_irq(data->acl_skb);
        data->acl_skb = NULL;
 
-       kfree_skb(data->sco_skb);
+       dev_kfree_skb_irq(data->sco_skb);
        data->sco_skb = NULL;
 
        spin_unlock_irqrestore(&data->rxlock, flags);
@@ -962,6 +1076,34 @@ static int btusb_recv_bulk(struct btusb_data *data, void *buffer, int count)
        return err;
 }
 
+static bool btusb_validate_sco_handle(struct hci_dev *hdev,
+                                     struct hci_sco_hdr *hdr)
+{
+       __u16 handle;
+
+       if (hci_dev_test_flag(hdev, HCI_USER_CHANNEL))
+               /* Can't validate; userspace controls everything. */
+               return true;
+
+       /*
+        * USB isochronous transfers are not designed to be reliable and may
+        * lose fragments.  When this happens, the next first fragment
+        * encountered might actually be a continuation fragment.
+        * Validate the handle to detect it and drop it, or else the upper
+        * layer will get garbage for a while.
+        */
+
+       handle = hci_handle(__le16_to_cpu(hdr->handle));
+
+       switch (hci_conn_lookup_type(hdev, handle)) {
+       case SCO_LINK:
+       case ESCO_LINK:
+               return true;
+       default:
+               return false;
+       }
+}
+
 static int btusb_recv_isoc(struct btusb_data *data, void *buffer, int count)
 {
        struct sk_buff *skb;
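
A hedged arithmetic note on btusb_validate_sco_handle() above: the 16-bit handle field in the SCO header carries a 12-bit connection handle plus flag bits, and hci_handle() masks the flags off before the lookup.

	__u16 raw    = __le16_to_cpu(hdr->handle);	/* e.g. 0x3001 on the wire */
	__u16 handle = raw & 0x0fff;			/* hci_handle(raw) == 0x0001 */
	/* hci_conn_lookup_type(hdev, handle) then classifies the link. */
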
@@ -994,9 +1136,12 @@ static int btusb_recv_isoc(struct btusb_data *data, void *buffer, int count)
 
                if (skb->len == HCI_SCO_HDR_SIZE) {
                        /* Complete SCO header */
-                       hci_skb_expect(skb) = hci_sco_hdr(skb)->dlen;
+                       struct hci_sco_hdr *hdr = hci_sco_hdr(skb);
 
-                       if (skb_tailroom(skb) < hci_skb_expect(skb)) {
+                       hci_skb_expect(skb) = hdr->dlen;
+
+                       if (skb_tailroom(skb) < hci_skb_expect(skb) ||
+                           !btusb_validate_sco_handle(data->hdev, hdr)) {
                                kfree_skb(skb);
                                skb = NULL;
 
@@ -1276,11 +1421,17 @@ static void btusb_isoc_complete(struct urb *urb)
 static inline void __fill_isoc_descriptor_msbc(struct urb *urb, int len,
                                               int mtu, struct btusb_data *data)
 {
-       int i, offset = 0;
+       int i = 0, offset = 0;
        unsigned int interval;
 
        BT_DBG("len %d mtu %d", len, mtu);
 
+       /* With the mSBC ALT 6 setting, some chips need to transmit the data
+        * continuously, without zero-length USB packets.
+        */
+       if (test_bit(BTUSB_ALT6_CONTINUOUS_TX, &data->flags))
+               goto ignore_usb_alt6_packet_flow;
+
        /* For mSBC ALT 6 setting the host will send the packet at continuous
         * flow. As per core spec 5, vol 4, part B, table 2.1. For ALT setting
         * 6 the HCI PACKET INTERVAL should be 7.5ms for every usb packets.
@@ -1300,6 +1451,7 @@ static inline void __fill_isoc_descriptor_msbc(struct urb *urb, int len,
                urb->iso_frame_desc[i].length = offset;
        }
 
+ignore_usb_alt6_packet_flow:
        if (len && i < BTUSB_MAX_ISOC_FRAMES) {
                urb->iso_frame_desc[i].offset = offset;
                urb->iso_frame_desc[i].length = len;
@@ -1981,10 +2133,11 @@ static void btusb_work(struct work_struct *work)
                if (btusb_switch_alt_setting(hdev, new_alts) < 0)
                        bt_dev_err(hdev, "set USB alt:(%d) failed!", new_alts);
        } else {
-               clear_bit(BTUSB_ISOC_RUNNING, &data->flags);
                usb_kill_anchored_urbs(&data->isoc_anchor);
 
-               __set_isoc_interface(hdev, 0);
+               if (test_and_clear_bit(BTUSB_ISOC_RUNNING, &data->flags))
+                       __set_isoc_interface(hdev, 0);
+
                if (test_and_clear_bit(BTUSB_DID_ISO_RESUME, &data->flags))
                        usb_autopm_put_interface(data->isoc ? data->isoc : data->intf);
        }
@@ -2048,13 +2201,19 @@ static int btusb_setup_csr(struct hci_dev *hdev)
                return err;
        }
 
-       if (skb->len != sizeof(struct hci_rp_read_local_version)) {
+       rp = skb_pull_data(skb, sizeof(*rp));
+       if (!rp) {
                bt_dev_err(hdev, "CSR: Local version length mismatch");
                kfree_skb(skb);
                return -EIO;
        }
 
-       rp = (struct hci_rp_read_local_version *)skb->data;
+       bt_dev_info(hdev, "CSR: Setting up dongle with HCI ver=%u rev=%04x",
+                   rp->hci_ver, le16_to_cpu(rp->hci_rev));
+
+       bt_dev_info(hdev, "LMP ver=%u subver=%04x; manufacturer=%u",
+                   rp->lmp_ver, le16_to_cpu(rp->lmp_subver),
+                   le16_to_cpu(rp->manufacturer));
 
        /* Detect a wide host of Chinese controllers that aren't CSR.
         *
@@ -2084,29 +2243,29 @@ static int btusb_setup_csr(struct hci_dev *hdev)
         *      third-party BT 4.0 dongle reuses it.
         */
        else if (le16_to_cpu(rp->lmp_subver) <= 0x034e &&
-                le16_to_cpu(rp->hci_ver) > BLUETOOTH_VER_1_1)
+                rp->hci_ver > BLUETOOTH_VER_1_1)
                is_fake = true;
 
        else if (le16_to_cpu(rp->lmp_subver) <= 0x0529 &&
-                le16_to_cpu(rp->hci_ver) > BLUETOOTH_VER_1_2)
+                rp->hci_ver > BLUETOOTH_VER_1_2)
                is_fake = true;
 
        else if (le16_to_cpu(rp->lmp_subver) <= 0x0c5c &&
-                le16_to_cpu(rp->hci_ver) > BLUETOOTH_VER_2_0)
+                rp->hci_ver > BLUETOOTH_VER_2_0)
                is_fake = true;
 
        else if (le16_to_cpu(rp->lmp_subver) <= 0x1899 &&
-                le16_to_cpu(rp->hci_ver) > BLUETOOTH_VER_2_1)
+                rp->hci_ver > BLUETOOTH_VER_2_1)
                is_fake = true;
 
        else if (le16_to_cpu(rp->lmp_subver) <= 0x22bb &&
-                le16_to_cpu(rp->hci_ver) > BLUETOOTH_VER_4_0)
+                rp->hci_ver > BLUETOOTH_VER_4_0)
                is_fake = true;
 
        /* Other clones which beat all the above checks */
        else if (bcdDevice == 0x0134 &&
                 le16_to_cpu(rp->lmp_subver) == 0x0c5c &&
-                le16_to_cpu(rp->hci_ver) == BLUETOOTH_VER_2_0)
+                rp->hci_ver == BLUETOOTH_VER_2_0)
                is_fake = true;
 
        if (is_fake) {
@@ -2118,6 +2277,7 @@ static int btusb_setup_csr(struct hci_dev *hdev)
                 * without these the controller will lock up.
                 */
                set_bit(HCI_QUIRK_BROKEN_STORED_LINK_KEY, &hdev->quirks);
+               set_bit(HCI_QUIRK_BROKEN_ERR_DATA_REPORTING, &hdev->quirks);
                set_bit(HCI_QUIRK_BROKEN_FILTER_CLEAR_ALL, &hdev->quirks);
                set_bit(HCI_QUIRK_NO_SUSPEND_NOTIFIER, &hdev->quirks);
 
@@ -2315,6 +2475,19 @@ static int btusb_send_frame_intel(struct hci_dev *hdev, struct sk_buff *skb)
        return -EILSEQ;
 }
 
+static int btusb_setup_realtek(struct hci_dev *hdev)
+{
+       struct btusb_data *data = hci_get_drvdata(hdev);
+       int ret;
+
+       ret = btrtl_setup_realtek(hdev);
+
+       if (btrealtek_test_flag(data->hdev, REALTEK_ALT6_CONTINUOUS_TX_CHIP))
+               set_bit(BTUSB_ALT6_CONTINUOUS_TX, &data->flags);
+
+       return ret;
+}
+
 /* UHW CR mapping */
 #define MTK_BT_MISC            0x70002510
 #define MTK_BT_SUBSYS_RST      0x70002610
@@ -3250,7 +3423,7 @@ static int btusb_setup_qca_load_rampatch(struct hci_dev *hdev,
 
        if (ver_rom & ~0xffffU) {
                rver_rom_high = le16_to_cpu(rver->rom_version_high);
-               rver_rom = le32_to_cpu(rver_rom_high << 16 | rver_rom_low);
+               rver_rom = rver_rom_high << 16 | rver_rom_low;
        } else {
                rver_rom = rver_rom_low;
        }
@@ -3755,6 +3928,9 @@ static int btusb_probe(struct usb_interface *intf,
                /* Override the rx handlers */
                data->recv_event = btusb_recv_event_intel;
                data->recv_bulk = btusb_recv_bulk_intel;
+       } else if (id->driver_info & BTUSB_REALTEK) {
+               /* Allocate extra space for Realtek device */
+               priv_size += sizeof(struct btrealtek_data);
        }
 
        data->recv_acl = hci_recv_frame;
@@ -3913,7 +4089,7 @@ static int btusb_probe(struct usb_interface *intf,
 
        if (IS_ENABLED(CONFIG_BT_HCIBTUSB_RTL) &&
            (id->driver_info & BTUSB_REALTEK)) {
-               hdev->setup = btrtl_setup_realtek;
+               hdev->setup = btusb_setup_realtek;
                hdev->shutdown = btrtl_shutdown_realtek;
                hdev->cmd_timeout = btusb_rtl_cmd_timeout;
 
@@ -3998,6 +4174,8 @@ static int btusb_probe(struct usb_interface *intf,
        if (enable_autosuspend)
                usb_enable_autosuspend(data->udev);
 
+       data->poll_sync = enable_poll_sync;
+
        err = hci_register_dev(hdev);
        if (err < 0)
                goto out_free_dev;
index d7e0b75..2b6c0e1 100644 (file)
  * struct bcm_device_data - device specific data
  * @no_early_set_baudrate: Disallow setting the baudrate before driver setup()
  * @drive_rts_on_open: drive RTS signal on ->open() when platform requires it
+ * @no_uart_clock_set: UART clock set command for >3Mbps mode is unavailable
  * @max_autobaud_speed: max baudrate supported by device in autobaud mode
  */
 struct bcm_device_data {
        bool    no_early_set_baudrate;
        bool    drive_rts_on_open;
+       bool    no_uart_clock_set;
        u32     max_autobaud_speed;
 };
 
@@ -100,6 +102,7 @@ struct bcm_device_data {
  * @is_suspended: whether flow control is currently disabled
  * @no_early_set_baudrate: don't set_baudrate before setup()
  * @drive_rts_on_open: drive RTS signal on ->open() when platform requires it
+ * @no_uart_clock_set: UART clock set command for >3Mbps mode is unavailable
  * @pcm_int_params: keep the initial PCM configuration
  * @use_autobaud_mode: start Bluetooth device in autobaud mode
  * @max_autobaud_speed: max baudrate supported by device in autobaud mode
@@ -140,6 +143,7 @@ struct bcm_device {
 #endif
        bool                    no_early_set_baudrate;
        bool                    drive_rts_on_open;
+       bool                    no_uart_clock_set;
        bool                    use_autobaud_mode;
        u8                      pcm_int_params[5];
        u32                     max_autobaud_speed;
@@ -172,10 +176,11 @@ static inline void host_set_baudrate(struct hci_uart *hu, unsigned int speed)
 static int bcm_set_baudrate(struct hci_uart *hu, unsigned int speed)
 {
        struct hci_dev *hdev = hu->hdev;
+       struct bcm_data *bcm = hu->priv;
        struct sk_buff *skb;
        struct bcm_update_uart_baud_rate param;
 
-       if (speed > 3000000) {
+       if (speed > 3000000 && !bcm->dev->no_uart_clock_set) {
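+               /*
+                * Above 3 Mbps the controller first needs its UART clock
+                * raised to 48 MHz; parts that lack this vendor command
+                * (no_uart_clock_set, e.g. CYW4373A0) skip it and only get
+                * the baudrate update below.
+                */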
                struct bcm_write_uart_clock_setting clock;
 
                clock.type = BCM_UART_CLOCK_48MHZ;
@@ -1529,6 +1534,7 @@ static int bcm_serdev_probe(struct serdev_device *serdev)
                bcmdev->max_autobaud_speed = data->max_autobaud_speed;
                bcmdev->no_early_set_baudrate = data->no_early_set_baudrate;
                bcmdev->drive_rts_on_open = data->drive_rts_on_open;
+               bcmdev->no_uart_clock_set = data->no_uart_clock_set;
        }
 
        return hci_uart_register_device(&bcmdev->serdev_hu, &bcm_proto);
@@ -1550,6 +1556,10 @@ static struct bcm_device_data bcm43438_device_data = {
        .drive_rts_on_open = true,
 };
 
+static struct bcm_device_data cyw4373a0_device_data = {
+       .no_uart_clock_set = true,
+};
+
 static struct bcm_device_data cyw55572_device_data = {
        .max_autobaud_speed = 921600,
 };
@@ -1566,6 +1576,7 @@ static const struct of_device_id bcm_bluetooth_of_match[] = {
        { .compatible = "brcm,bcm4349-bt", .data = &bcm43438_device_data },
        { .compatible = "brcm,bcm43540-bt", .data = &bcm4354_device_data },
        { .compatible = "brcm,bcm4335a0" },
+       { .compatible = "cypress,cyw4373a0-bt", .data = &cyw4373a0_device_data },
        { .compatible = "infineon,cyw55572-bt", .data = &cyw55572_device_data },
        { },
 };
diff --git a/drivers/bluetooth/hci_bcm4377.c b/drivers/bluetooth/hci_bcm4377.c
new file mode 100644 (file)
index 0000000..19ad0e7
--- /dev/null
@@ -0,0 +1,2514 @@
+// SPDX-License-Identifier: GPL-2.0-only OR MIT
+/*
+ * Bluetooth HCI driver for Broadcom 4377/4378/4387 devices attached via PCIe
+ *
+ * Copyright (C) The Asahi Linux Contributors
+ */
+
+#include <linux/async.h>
+#include <linux/bitfield.h>
+#include <linux/completion.h>
+#include <linux/dma-mapping.h>
+#include <linux/dmi.h>
+#include <linux/firmware.h>
+#include <linux/module.h>
+#include <linux/msi.h>
+#include <linux/of.h>
+#include <linux/pci.h>
+#include <linux/printk.h>
+
+#include <asm/unaligned.h>
+
+#include <net/bluetooth/bluetooth.h>
+#include <net/bluetooth/hci_core.h>
+
+enum bcm4377_chip {
+       BCM4377 = 0,
+       BCM4378,
+       BCM4387,
+};
+
+#define BCM4377_DEVICE_ID 0x5fa0
+#define BCM4378_DEVICE_ID 0x5f69
+#define BCM4387_DEVICE_ID 0x5f71
+
+#define BCM4377_TIMEOUT 1000
+
+/*
+ * These devices only support DMA transactions inside a 32bit window
+ * (possibly to avoid 64 bit arithmetic). The window size cannot exceed
+ * 0xffffffff but is always aligned down to the previous 0x200 byte boundary
+ * which effectively limits the window to [start, start+0xfffffe00].
+ * We just limit the DMA window to [0, 0xfffffe00] to make sure we don't
+ * run into this limitation.
+ */
+#define BCM4377_DMA_MASK 0xfffffe00
+
+#define BCM4377_PCIECFG_BAR0_WINDOW1      0x80
+#define BCM4377_PCIECFG_BAR0_WINDOW2      0x70
+#define BCM4377_PCIECFG_BAR0_CORE2_WINDOW1 0x74
+#define BCM4377_PCIECFG_BAR0_CORE2_WINDOW2 0x78
+#define BCM4377_PCIECFG_BAR2_WINDOW       0x84
+
+#define BCM4377_PCIECFG_BAR0_CORE2_WINDOW1_DEFAULT 0x18011000
+#define BCM4377_PCIECFG_BAR2_WINDOW_DEFAULT       0x19000000
+
+#define BCM4377_PCIECFG_SUBSYSTEM_CTRL 0x88
+
+#define BCM4377_BAR0_FW_DOORBELL 0x140
+#define BCM4377_BAR0_RTI_CONTROL 0x144
+
+#define BCM4377_BAR0_SLEEP_CONTROL           0x150
+#define BCM4377_BAR0_SLEEP_CONTROL_UNQUIESCE  0
+#define BCM4377_BAR0_SLEEP_CONTROL_AWAKE      2
+#define BCM4377_BAR0_SLEEP_CONTROL_QUIESCE    3
+
+#define BCM4377_BAR0_DOORBELL      0x174
+#define BCM4377_BAR0_DOORBELL_VALUE GENMASK(31, 16)
+#define BCM4377_BAR0_DOORBELL_IDX   GENMASK(15, 8)
+#define BCM4377_BAR0_DOORBELL_RING  BIT(5)
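+
+/*
+ * A doorbell write packs the posted value (usually the new ring head) into
+ * bits 31:16, the doorbell index into bits 15:8 and sets the "ring" bit 5;
+ * e.g. posting value 7 to doorbell #3 writes 0x00070320
+ * (7 << 16 | 3 << 8 | BIT(5)).
+ */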
+
+#define BCM4377_BAR0_HOST_WINDOW_LO   0x590
+#define BCM4377_BAR0_HOST_WINDOW_HI   0x594
+#define BCM4377_BAR0_HOST_WINDOW_SIZE 0x598
+
+#define BCM4377_BAR2_BOOTSTAGE 0x200454
+
+#define BCM4377_BAR2_FW_LO   0x200478
+#define BCM4377_BAR2_FW_HI   0x20047c
+#define BCM4377_BAR2_FW_SIZE 0x200480
+
+#define BCM4377_BAR2_CONTEXT_ADDR_LO 0x20048c
+#define BCM4377_BAR2_CONTEXT_ADDR_HI 0x200450
+
+#define BCM4377_BAR2_RTI_STATUS             0x20045c
+#define BCM4377_BAR2_RTI_WINDOW_LO   0x200494
+#define BCM4377_BAR2_RTI_WINDOW_HI   0x200498
+#define BCM4377_BAR2_RTI_WINDOW_SIZE 0x20049c
+
+#define BCM4377_OTP_SIZE         0xe0
+#define BCM4377_OTP_SYS_VENDOR   0x15
+#define BCM4377_OTP_CIS                  0x80
+#define BCM4377_OTP_VENDOR_HDR   0x00000008
+#define BCM4377_OTP_MAX_PARAM_LEN 16
+
+#define BCM4377_N_TRANSFER_RINGS   9
+#define BCM4377_N_COMPLETION_RINGS 6
+
+#define BCM4377_MAX_RING_SIZE 256
+
+#define BCM4377_MSGID_GENERATION GENMASK(15, 8)
+#define BCM4377_MSGID_ID        GENMASK(7, 0)
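+
+/*
+ * Message ids in ring entries combine a per-ring generation counter
+ * (bits 15:8) with the entry's slot id (bits 7:0); the generation is
+ * bumped each time a transfer ring is created so stale completions can
+ * be detected and rejected.
+ */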
+
+#define BCM4377_RING_N_ENTRIES 128
+
+#define BCM4377_CONTROL_MSG_SIZE                  0x34
+#define BCM4377_XFER_RING_MAX_INPLACE_PAYLOAD_SIZE (4 * 0xff)
+
+#define MAX_ACL_PAYLOAD_SIZE   (HCI_MAX_FRAME_SIZE + HCI_ACL_HDR_SIZE)
+#define MAX_SCO_PAYLOAD_SIZE   (HCI_MAX_SCO_SIZE + HCI_SCO_HDR_SIZE)
+#define MAX_EVENT_PAYLOAD_SIZE (HCI_MAX_EVENT_SIZE + HCI_EVENT_HDR_SIZE)
+
+enum bcm4377_otp_params_type {
+       BCM4377_OTP_BOARD_PARAMS,
+       BCM4377_OTP_CHIP_PARAMS
+};
+
+enum bcm4377_transfer_ring_id {
+       BCM4377_XFER_RING_CONTROL = 0,
+       BCM4377_XFER_RING_HCI_H2D = 1,
+       BCM4377_XFER_RING_HCI_D2H = 2,
+       BCM4377_XFER_RING_SCO_H2D = 3,
+       BCM4377_XFER_RING_SCO_D2H = 4,
+       BCM4377_XFER_RING_ACL_H2D = 5,
+       BCM4377_XFER_RING_ACL_D2H = 6,
+};
+
+enum bcm4377_completion_ring_id {
+       BCM4377_ACK_RING_CONTROL = 0,
+       BCM4377_ACK_RING_HCI_ACL = 1,
+       BCM4377_EVENT_RING_HCI_ACL = 2,
+       BCM4377_ACK_RING_SCO = 3,
+       BCM4377_EVENT_RING_SCO = 4,
+};
+
+enum bcm4377_doorbell {
+       BCM4377_DOORBELL_CONTROL = 0,
+       BCM4377_DOORBELL_HCI_H2D = 1,
+       BCM4377_DOORBELL_HCI_D2H = 2,
+       BCM4377_DOORBELL_ACL_H2D = 3,
+       BCM4377_DOORBELL_ACL_D2H = 4,
+       BCM4377_DOORBELL_SCO = 6,
+};
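+
+/*
+ * Note: unlike HCI and ACL there is only a single SCO doorbell (index 6),
+ * apparently shared by both directions, and index 5 is left unused.
+ */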
+
+/*
+ * Transfer ring entry
+ *
+ * flags: Flags to indicate if the payload is appended or mapped
+ * len: Payload length
+ * payload: Optional payload DMA address
+ * id: Message id to recognize the answer in the completion ring entry
+ */
+struct bcm4377_xfer_ring_entry {
+#define BCM4377_XFER_RING_FLAG_PAYLOAD_MAPPED   BIT(0)
+#define BCM4377_XFER_RING_FLAG_PAYLOAD_IN_FOOTER BIT(1)
+       u8 flags;
+       __le16 len;
+       u8 _unk0;
+       __le64 payload;
+       __le16 id;
+       u8 _unk1[2];
+} __packed;
+static_assert(sizeof(struct bcm4377_xfer_ring_entry) == 0x10);
+
+/*
+ * Completion ring entry
+ *
+ * flags: Flags to indicate if the payload is appended or mapped. If the payload
+ *        is mapped it can be found in the buffer of the corresponding transfer
+ *        ring message.
+ * ring_id: ID of the transfer ring this completion entry refers to
+ * msg_id: Message ID specified in transfer ring entry
+ * len: Payload length
+ */
+struct bcm4377_completion_ring_entry {
+       u8 flags;
+       u8 _unk0;
+       __le16 ring_id;
+       __le16 msg_id;
+       __le32 len;
+       u8 _unk1[6];
+} __packed;
+static_assert(sizeof(struct bcm4377_completion_ring_entry) == 0x10);
+
+enum bcm4377_control_message_type {
+       BCM4377_CONTROL_MSG_CREATE_XFER_RING = 1,
+       BCM4377_CONTROL_MSG_CREATE_COMPLETION_RING = 2,
+       BCM4377_CONTROL_MSG_DESTROY_XFER_RING = 3,
+       BCM4377_CONTROL_MSG_DESTROY_COMPLETION_RING = 4,
+};
+
+/*
+ * Control message used to create a completion ring
+ *
+ * msg_type: Must be BCM4377_CONTROL_MSG_CREATE_COMPLETION_RING
+ * header_size: Unknown, but probably reserved space in front of the entry
+ * footer_size: Number of 32 bit words reserved for payloads after the entry
+ * id/id_again: Completion ring index
+ * ring_iova: DMA address of the ring buffer
+ * n_elements: Number of elements inside the ring buffer
+ * msi: MSI index, doesn't work for all rings though and should be zero
+ * intmod_delay: Unknown delay
+ * intmod_bytes: Unknown
+ */
+struct bcm4377_create_completion_ring_msg {
+       u8 msg_type;
+       u8 header_size;
+       u8 footer_size;
+       u8 _unk0;
+       __le16 id;
+       __le16 id_again;
+       __le64 ring_iova;
+       __le16 n_elements;
+       __le32 unk;
+       u8 _unk1[6];
+       __le16 msi;
+       __le16 intmod_delay;
+       __le32 intmod_bytes;
+       __le16 _unk2;
+       __le32 _unk3;
+       u8 _unk4[10];
+} __packed;
+static_assert(sizeof(struct bcm4377_create_completion_ring_msg) ==
+             BCM4377_CONTROL_MSG_SIZE);
+
+/*
+ * Control ring message used to destroy a completion ring
+ *
+ * msg_type: Must be BCM4377_CONTROL_MSG_DESTROY_COMPLETION_RING
+ * ring_id: Completion ring to be destroyed
+ */
+struct bcm4377_destroy_completion_ring_msg {
+       u8 msg_type;
+       u8 _pad0;
+       __le16 ring_id;
+       u8 _pad1[48];
+} __packed;
+static_assert(sizeof(struct bcm4377_destroy_completion_ring_msg) ==
+             BCM4377_CONTROL_MSG_SIZE);
+
+/*
+ * Control message used to create a transfer ring
+ *
+ * msg_type: Must be BCM4377_CONTROL_MSG_CREATE_XFER_RING
+ * header_size: Number of 32 bit words reserved for unknown content before the
+ *              entry
+ * footer_size: Number of 32 bit words reserved for payloads after the entry
+ * ring_id/ring_id_again: Transfer ring index
+ * ring_iova: DMA address of the ring buffer
+ * n_elements: Number of elements inside the ring buffer
+ * completion_ring_id: Completion ring index for acknowledgements and events
+ * doorbell: Doorbell index used to notify device of new entries
+ * flags: Transfer ring flags
+ *          - virtual: set if there is no associated shared memory and only the
+ *                     corresponding completion ring is used
+ *          - sync: only set for the SCO rings
+ */
+struct bcm4377_create_transfer_ring_msg {
+       u8 msg_type;
+       u8 header_size;
+       u8 footer_size;
+       u8 _unk0;
+       __le16 ring_id;
+       __le16 ring_id_again;
+       __le64 ring_iova;
+       u8 _unk1[8];
+       __le16 n_elements;
+       __le16 completion_ring_id;
+       __le16 doorbell;
+#define BCM4377_XFER_RING_FLAG_VIRTUAL BIT(7)
+#define BCM4377_XFER_RING_FLAG_SYNC    BIT(8)
+       __le16 flags;
+       u8 _unk2[20];
+} __packed;
+static_assert(sizeof(struct bcm4377_create_transfer_ring_msg) ==
+             BCM4377_CONTROL_MSG_SIZE);
+
+/*
+ * Control ring message used to destroy a transfer ring
+ *
+ * msg_type: Must be BCM4377_CONTROL_MSG_DESTROY_XFER_RING
+ * ring_id: Transfer ring to be destroyed
+ */
+struct bcm4377_destroy_transfer_ring_msg {
+       u8 msg_type;
+       u8 _pad0;
+       __le16 ring_id;
+       u8 _pad1[48];
+} __packed;
+static_assert(sizeof(struct bcm4377_destroy_transfer_ring_msg) ==
+             BCM4377_CONTROL_MSG_SIZE);
+
+/*
+ * "Converged IPC" context struct used to make the device aware of all other
+ * shared memory structures. A pointer to this structure is configured inside a
+ * MMIO register.
+ *
+ * version: Protocol version, must be 2.
+ * size: Size of this structure, must be 0x68.
+ * enabled_caps: Enabled capabilities. Unknown bitfield but should be 2.
+ * peripheral_info_addr: DMA address for a 0x20 buffer to which the device will
+ *                       write unknown contents
+ * {completion,xfer}_ring_{tails,heads}_addr: DMA pointers to ring heads/tails
+ * n_completion_rings: Number of completion rings, the firmware only works if
+ *                     this is set to BCM4377_N_COMPLETION_RINGS.
+ * n_xfer_rings: Number of transfer rings, the firmware only works if
+ *               this is set to BCM4377_N_TRANSFER_RINGS.
+ * control_completion_ring_addr: Control completion ring buffer DMA address
+ * control_xfer_ring_addr: Control transfer ring buffer DMA address
+ * control_xfer_ring_n_entries: Number of control transfer ring entries
+ * control_completion_ring_n_entries: Number of control completion ring entries
+ * control_xfer_ring_doorbell: Control transfer ring doorbell
+ * control_completion_ring_doorbell: Control completion ring doorbell,
+ *                                   must be set to 0xffff
+ * control_xfer_ring_msi: Control transfer ring MSI index, must be 0
+ * control_completion_ring_msi: Control completion ring MSI index, must be 0.
+ * control_xfer_ring_header_size: Number of 32 bit words reserved in front of
+ *                                every control transfer ring entry
+ * control_xfer_ring_footer_size: Number of 32 bit words reserved after every
+ *                                control transfer ring entry
+ * control_completion_ring_header_size: Number of 32 bit words reserved in front
+ *                                      of every control completion ring entry
+ * control_completion_ring_footer_size: Number of 32 bit words reserved after
+ *                                      every control completion ring entry
+ * scratch_pad: Optional scratch pad DMA address
+ * scratch_pad_size: Scratch pad size
+ */
+struct bcm4377_context {
+       __le16 version;
+       __le16 size;
+       __le32 enabled_caps;
+
+       __le64 peripheral_info_addr;
+
+       /* ring heads and tails */
+       __le64 completion_ring_heads_addr;
+       __le64 xfer_ring_tails_addr;
+       __le64 completion_ring_tails_addr;
+       __le64 xfer_ring_heads_addr;
+       __le16 n_completion_rings;
+       __le16 n_xfer_rings;
+
+       /* control ring configuration */
+       __le64 control_completion_ring_addr;
+       __le64 control_xfer_ring_addr;
+       __le16 control_xfer_ring_n_entries;
+       __le16 control_completion_ring_n_entries;
+       __le16 control_xfer_ring_doorbell;
+       __le16 control_completion_ring_doorbell;
+       __le16 control_xfer_ring_msi;
+       __le16 control_completion_ring_msi;
+       u8 control_xfer_ring_header_size;
+       u8 control_xfer_ring_footer_size;
+       u8 control_completion_ring_header_size;
+       u8 control_completion_ring_footer_size;
+
+       __le16 _unk0;
+       __le16 _unk1;
+
+       __le64 scratch_pad;
+       __le32 scratch_pad_size;
+
+       __le32 _unk3;
+} __packed;
+static_assert(sizeof(struct bcm4377_context) == 0x68);
+
+#define BCM4378_CALIBRATION_CHUNK_SIZE 0xe6
+struct bcm4378_hci_send_calibration_cmd {
+       u8 unk;
+       __le16 blocks_left;
+       u8 data[BCM4378_CALIBRATION_CHUNK_SIZE];
+} __packed;
+
+#define BCM4378_PTB_CHUNK_SIZE 0xcf
+struct bcm4378_hci_send_ptb_cmd {
+       __le16 blocks_left;
+       u8 data[BCM4378_PTB_CHUNK_SIZE];
+} __packed;
+
+/*
+ * Shared memory structure used to store the ring head and tail pointers.
+ */
+struct bcm4377_ring_state {
+       __le16 completion_ring_head[BCM4377_N_COMPLETION_RINGS];
+       __le16 completion_ring_tail[BCM4377_N_COMPLETION_RINGS];
+       __le16 xfer_ring_head[BCM4377_N_TRANSFER_RINGS];
+       __le16 xfer_ring_tail[BCM4377_N_TRANSFER_RINGS];
+};
+
+/*
+ * A transfer ring can be used in two configurations:
+ *  1) Send control or HCI messages to the device which are then acknowledged
+ *     in the corresponding completion ring
+ *  2) Receive HCI frames from the device. In this case the transfer ring
+ *     itself contains empty messages that are acknowledged once data is
+ *     available from the device. If the payloads fit inside the footers
+ *     of the completion ring the transfer ring can be configured to be
+ *     virtual such that it has no ring buffer.
+ *
+ * ring_id: ring index hardcoded in the firmware
+ * doorbell: doorbell index to notify device of new entries
+ * payload_size: optional in-place payload size
+ * mapped_payload_size: optional out-of-place payload size
+ * completion_ring: index of corresponding completion ring
+ * n_entries: number of entries inside this ring
+ * generation: ring generation; incremented on hci_open to detect stale messages
+ * sync: set to true for SCO rings
+ * virtual: set to true if this ring has no entries and is just required to
+ *          set up a corresponding completion ring for device->host messages
+ * d2h_buffers_only: set to true if this ring is only used to provide large
+ *                   buffers used by device->host messages in the completion
+ *                   ring
+ * allow_wait: allow waiting for messages to be acknowledged
+ * enabled: true once the ring has been created and can be used
+ * ring: ring buffer for entries (struct bcm4377_xfer_ring_entry)
+ * ring_dma: DMA address for ring entry buffer
+ * payloads: payload buffer for mapped_payload_size payloads
+ * payloads_dma: DMA address for payload buffer
+ * events: pointer to array of completions if waiting is allowed
+ * msgids: bitmap to keep track of used message ids
+ * lock: Spinlock to protect access to ring structures used in the IRQ handler
+ */
+struct bcm4377_transfer_ring {
+       enum bcm4377_transfer_ring_id ring_id;
+       enum bcm4377_doorbell doorbell;
+       size_t payload_size;
+       size_t mapped_payload_size;
+       u8 completion_ring;
+       u16 n_entries;
+       u8 generation;
+
+       bool sync;
+       bool virtual;
+       bool d2h_buffers_only;
+       bool allow_wait;
+       bool enabled;
+
+       void *ring;
+       dma_addr_t ring_dma;
+
+       void *payloads;
+       dma_addr_t payloads_dma;
+
+       struct completion **events;
+       DECLARE_BITMAP(msgids, BCM4377_MAX_RING_SIZE);
+       spinlock_t lock;
+};
+
+/*
+ * A completion ring can be used either to acknowledge messages sent in
+ * the corresponding transfer ring or to receive messages associated with the
+ * transfer ring. When used to receive messages the transfer ring either
+ * has no ring buffer and is only advanced ("virtual transfer ring") or it
+ * only contains empty DMA buffers to be used for the payloads.
+ *
+ * ring_id: completion ring id, hardcoded in firmware
+ * payload_size: optional payload size after each entry
+ * delay: unknown delay
+ * n_entries: number of entries in this ring
+ * enabled: true once the ring has been created and can be used
+ * ring: ring buffer for entries (struct bcm4377_completion_ring_entry)
+ * ring_dma: DMA address of ring buffer
+ * transfer_rings: bitmap of corresponding transfer ring ids
+ */
+struct bcm4377_completion_ring {
+       enum bcm4377_completion_ring_id ring_id;
+       u16 payload_size;
+       u16 delay;
+       u16 n_entries;
+       bool enabled;
+
+       void *ring;
+       dma_addr_t ring_dma;
+
+       unsigned long transfer_rings;
+};
+
+struct bcm4377_data;
+
+/*
+ * Chip-specific configuration struct
+ *
+ * id: Chip id (e.g. 0x4377 for BCM4377)
+ * otp_offset: Offset to the start of the OTP inside BAR0
+ * bar0_window1: Backplane address mapped to the first window in BAR0
+ * bar0_window2: Backplane address mapped to the second window in BAR0
+ * bar0_core2_window2: Optional backplane address mapped to the second core's
+ *                     second window in BAR0
+ * has_bar0_core2_window2: Set to true if this chip requires the second core's
+ *                         second window to be configured
+ * clear_pciecfg_subsystem_ctrl_bit19: Set to true if bit 19 in the
+ *                                     vendor-specific subsystem control
+ *                                     register has to be cleared
+ * disable_aspm: Set to true if ASPM must be disabled due to hardware errata
+ * broken_ext_scan: Set to true if the chip erroneously claims to support
+ *                  extended scanning
+ * broken_mws_transport_config: Set to true if the chip erroneously claims to
+ *                              support MWS Transport Configuration
+ * send_calibration: Optional callback to send calibration data
+ * send_ptb: Callback to send "PTB" regulatory/calibration data
+ */
+struct bcm4377_hw {
+       unsigned int id;
+
+       u32 otp_offset;
+
+       u32 bar0_window1;
+       u32 bar0_window2;
+       u32 bar0_core2_window2;
+
+       unsigned long has_bar0_core2_window2 : 1;
+       unsigned long clear_pciecfg_subsystem_ctrl_bit19 : 1;
+       unsigned long disable_aspm : 1;
+       unsigned long broken_ext_scan : 1;
+       unsigned long broken_mws_transport_config : 1;
+
+       int (*send_calibration)(struct bcm4377_data *bcm4377);
+       int (*send_ptb)(struct bcm4377_data *bcm4377,
+                       const struct firmware *fw);
+};
+
+static const struct bcm4377_hw bcm4377_hw_variants[];
+static const struct dmi_system_id bcm4377_dmi_board_table[];
+
+/*
+ * Private struct associated with each device containing global state
+ *
+ * pdev: Pointer to associated struct pci_dev
+ * hdev: Pointer to associated struct hci_dev
+ * bar0: iomem pointing to BAR0
+ * bar2: iomem pointing to BAR2
+ * bootstage: Current value of the bootstage
+ * rti_status: Current "RTI" status value
+ * hw: Pointer to chip-specific struct bcm4377_hw
+ * taurus_cal_blob: "Taurus" calibration blob used for some chips
+ * taurus_cal_size: "Taurus" calibration blob size
+ * taurus_beamforming_cal_blob: "Taurus" beamforming calibration blob used for
+ *                              some chips
+ * taurus_beamforming_cal_size: "Taurus" beamforming calibration blob size
+ * stepping: Chip stepping read from OTP; used for firmware selection
+ * vendor: Antenna vendor read from OTP; used for firmware selection
+ * board_type: Board type from FDT or DMI match; used for firmware selection
+ * event: Event for changed bootstage or rti_status; used for booting firmware
+ * ctx: "Converged IPC" context
+ * ctx_dma: "Converged IPC" context DMA address
+ * ring_state: Shared memory buffer containing ring head and tail indexes
+ * ring_state_dma: DMA address for ring_state
+ * {control,hci_acl,sco}_ack_ring: Completion rings used to acknowledge messages
+ * {hci_acl,sco}_event_ring: Completion rings used for device->host messages
+ * control_h2d_ring: Transfer ring used for control messages
+ * {hci,sco,acl}_h2d_ring: Transfer ring used to transfer HCI frames
+ * {hci,sco,acl}_d2h_ring: Transfer ring used to receive HCI frames in the
+ *                         corresponding completion ring
+ */
+struct bcm4377_data {
+       struct pci_dev *pdev;
+       struct hci_dev *hdev;
+
+       void __iomem *bar0;
+       void __iomem *bar2;
+
+       u32 bootstage;
+       u32 rti_status;
+
+       const struct bcm4377_hw *hw;
+
+       const void *taurus_cal_blob;
+       int taurus_cal_size;
+       const void *taurus_beamforming_cal_blob;
+       int taurus_beamforming_cal_size;
+
+       char stepping[BCM4377_OTP_MAX_PARAM_LEN];
+       char vendor[BCM4377_OTP_MAX_PARAM_LEN];
+       const char *board_type;
+
+       struct completion event;
+
+       struct bcm4377_context *ctx;
+       dma_addr_t ctx_dma;
+
+       struct bcm4377_ring_state *ring_state;
+       dma_addr_t ring_state_dma;
+
+       /*
+        * The HCI and ACL rings have to be merged because this structure is
+        * hardcoded in the firmware.
+        */
+       struct bcm4377_completion_ring control_ack_ring;
+       struct bcm4377_completion_ring hci_acl_ack_ring;
+       struct bcm4377_completion_ring hci_acl_event_ring;
+       struct bcm4377_completion_ring sco_ack_ring;
+       struct bcm4377_completion_ring sco_event_ring;
+
+       struct bcm4377_transfer_ring control_h2d_ring;
+       struct bcm4377_transfer_ring hci_h2d_ring;
+       struct bcm4377_transfer_ring hci_d2h_ring;
+       struct bcm4377_transfer_ring sco_h2d_ring;
+       struct bcm4377_transfer_ring sco_d2h_ring;
+       struct bcm4377_transfer_ring acl_h2d_ring;
+       struct bcm4377_transfer_ring acl_d2h_ring;
+};
+
+static void bcm4377_ring_doorbell(struct bcm4377_data *bcm4377, u8 doorbell,
+                                 u16 val)
+{
+       u32 db = 0;
+
+       db |= FIELD_PREP(BCM4377_BAR0_DOORBELL_VALUE, val);
+       db |= FIELD_PREP(BCM4377_BAR0_DOORBELL_IDX, doorbell);
+       db |= BCM4377_BAR0_DOORBELL_RING;
+
+       dev_dbg(&bcm4377->pdev->dev, "write %d to doorbell #%d (0x%x)\n", val,
+               doorbell, db);
+       iowrite32(db, bcm4377->bar0 + BCM4377_BAR0_DOORBELL);
+}
+
+static int bcm4377_extract_msgid(struct bcm4377_data *bcm4377,
+                                struct bcm4377_transfer_ring *ring,
+                                u16 raw_msgid, u8 *msgid)
+{
+       u8 generation = FIELD_GET(BCM4377_MSGID_GENERATION, raw_msgid);
+       *msgid = FIELD_GET(BCM4377_MSGID_ID, raw_msgid);
+
+       if (generation != ring->generation) {
+               dev_warn(
+                       &bcm4377->pdev->dev,
+                       "invalid message generation %d should be %d in entry for ring %d\n",
+                       generation, ring->generation, ring->ring_id);
+               return -EINVAL;
+       }
+
+       if (*msgid >= ring->n_entries) {
+               dev_warn(&bcm4377->pdev->dev,
+                        "invalid message id in entry for ring %d: %d > %d\n",
+                        ring->ring_id, *msgid, ring->n_entries);
+               return -EINVAL;
+       }
+
+       return 0;
+}
+
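+/*
+ * Deliver a device->host payload to the HCI core. The payload either sits
+ * in the completion ring footer or, for d2h_buffers_only rings, in the
+ * mapped buffer selected by the message id. Afterwards the transfer ring
+ * head is advanced and the doorbell rung so the device can reuse the slot.
+ */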
+static void bcm4377_handle_event(struct bcm4377_data *bcm4377,
+                                struct bcm4377_transfer_ring *ring,
+                                u16 raw_msgid, u8 entry_flags, u8 type,
+                                void *payload, size_t len)
+{
+       struct sk_buff *skb;
+       u16 head;
+       u8 msgid;
+       unsigned long flags;
+
+       spin_lock_irqsave(&ring->lock, flags);
+       if (!ring->enabled) {
+               dev_warn(&bcm4377->pdev->dev,
+                        "event for disabled transfer ring %d\n",
+                        ring->ring_id);
+               goto out;
+       }
+
+       if (ring->d2h_buffers_only &&
+           entry_flags & BCM4377_XFER_RING_FLAG_PAYLOAD_MAPPED) {
+               if (bcm4377_extract_msgid(bcm4377, ring, raw_msgid, &msgid))
+                       goto out;
+
+               if (len > ring->mapped_payload_size) {
+                       dev_warn(
+                               &bcm4377->pdev->dev,
+                               "invalid payload len in event for ring %d: %zu > %zu\n",
+                               ring->ring_id, len, ring->mapped_payload_size);
+                       goto out;
+               }
+
+               payload = ring->payloads + msgid * ring->mapped_payload_size;
+       }
+
+       skb = bt_skb_alloc(len, GFP_ATOMIC);
+       if (!skb)
+               goto out;
+
+       memcpy(skb_put(skb, len), payload, len);
+       hci_skb_pkt_type(skb) = type;
+       hci_recv_frame(bcm4377->hdev, skb);
+
+out:
+       head = le16_to_cpu(bcm4377->ring_state->xfer_ring_head[ring->ring_id]);
+       head = (head + 1) % ring->n_entries;
+       bcm4377->ring_state->xfer_ring_head[ring->ring_id] = cpu_to_le16(head);
+
+       bcm4377_ring_doorbell(bcm4377, ring->doorbell, head);
+
+       spin_unlock_irqrestore(&ring->lock, flags);
+}
+
+static void bcm4377_handle_ack(struct bcm4377_data *bcm4377,
+                              struct bcm4377_transfer_ring *ring,
+                              u16 raw_msgid)
+{
+       unsigned long flags;
+       u8 msgid;
+
+       spin_lock_irqsave(&ring->lock, flags);
+
+       if (bcm4377_extract_msgid(bcm4377, ring, raw_msgid, &msgid))
+               goto unlock;
+
+       if (!test_bit(msgid, ring->msgids)) {
+               dev_warn(
+                       &bcm4377->pdev->dev,
+                       "invalid message id in ack for ring %d: %d is not used\n",
+                       ring->ring_id, msgid);
+               goto unlock;
+       }
+
+       if (ring->allow_wait && ring->events[msgid]) {
+               complete(ring->events[msgid]);
+               ring->events[msgid] = NULL;
+       }
+
+       bitmap_release_region(ring->msgids, msgid, 0);
+
+unlock:
+       spin_unlock_irqrestore(&ring->lock, flags);
+}
+
+static void bcm4377_handle_completion(struct bcm4377_data *bcm4377,
+                                     struct bcm4377_completion_ring *ring,
+                                     u16 pos)
+{
+       struct bcm4377_completion_ring_entry *entry;
+       u16 msg_id, transfer_ring;
+       size_t entry_size, data_len;
+       void *data;
+
+       if (pos >= ring->n_entries) {
+               dev_warn(&bcm4377->pdev->dev,
+                        "invalid offset %d for completion ring %d\n", pos,
+                        ring->ring_id);
+               return;
+       }
+
+       entry_size = sizeof(*entry) + ring->payload_size;
+       entry = ring->ring + pos * entry_size;
+       data = ring->ring + pos * entry_size + sizeof(*entry);
+       data_len = le32_to_cpu(entry->len);
+       msg_id = le16_to_cpu(entry->msg_id);
+       transfer_ring = le16_to_cpu(entry->ring_id);
+
+       if ((ring->transfer_rings & BIT(transfer_ring)) == 0) {
+               dev_warn(
+                       &bcm4377->pdev->dev,
+                       "invalid entry at offset %d for transfer ring %d in completion ring %d\n",
+                       pos, transfer_ring, ring->ring_id);
+               return;
+       }
+
+       dev_dbg(&bcm4377->pdev->dev,
+               "entry in completion ring %d for transfer ring %d with msg_id %d\n",
+               ring->ring_id, transfer_ring, msg_id);
+
+       switch (transfer_ring) {
+       case BCM4377_XFER_RING_CONTROL:
+               bcm4377_handle_ack(bcm4377, &bcm4377->control_h2d_ring, msg_id);
+               break;
+       case BCM4377_XFER_RING_HCI_H2D:
+               bcm4377_handle_ack(bcm4377, &bcm4377->hci_h2d_ring, msg_id);
+               break;
+       case BCM4377_XFER_RING_SCO_H2D:
+               bcm4377_handle_ack(bcm4377, &bcm4377->sco_h2d_ring, msg_id);
+               break;
+       case BCM4377_XFER_RING_ACL_H2D:
+               bcm4377_handle_ack(bcm4377, &bcm4377->acl_h2d_ring, msg_id);
+               break;
+
+       case BCM4377_XFER_RING_HCI_D2H:
+               bcm4377_handle_event(bcm4377, &bcm4377->hci_d2h_ring, msg_id,
+                                    entry->flags, HCI_EVENT_PKT, data,
+                                    data_len);
+               break;
+       case BCM4377_XFER_RING_SCO_D2H:
+               bcm4377_handle_event(bcm4377, &bcm4377->sco_d2h_ring, msg_id,
+                                    entry->flags, HCI_SCODATA_PKT, data,
+                                    data_len);
+               break;
+       case BCM4377_XFER_RING_ACL_D2H:
+               bcm4377_handle_event(bcm4377, &bcm4377->acl_d2h_ring, msg_id,
+                                    entry->flags, HCI_ACLDATA_PKT, data,
+                                    data_len);
+               break;
+
+       default:
+               dev_warn(
+                       &bcm4377->pdev->dev,
+                       "entry in completion ring %d for unknown transfer ring %d with msg_id %d\n",
+                       ring->ring_id, transfer_ring, msg_id);
+       }
+}
+
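+/*
+ * Drain a completion ring: walk the tail towards the head published by the
+ * device in the shared ring state, handle each entry and write the consumed
+ * tail back so the device can see it.
+ */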
+static void bcm4377_poll_completion_ring(struct bcm4377_data *bcm4377,
+                                        struct bcm4377_completion_ring *ring)
+{
+       u16 tail;
+       __le16 *heads = bcm4377->ring_state->completion_ring_head;
+       __le16 *tails = bcm4377->ring_state->completion_ring_tail;
+
+       if (!ring->enabled)
+               return;
+
+       tail = le16_to_cpu(tails[ring->ring_id]);
+       dev_dbg(&bcm4377->pdev->dev,
+               "completion ring #%d: head: %d, tail: %d\n", ring->ring_id,
+               le16_to_cpu(heads[ring->ring_id]), tail);
+
+       while (tail != le16_to_cpu(READ_ONCE(heads[ring->ring_id]))) {
+               /*
+                * Ensure the CPU doesn't speculate through the comparison.
+                * Otherwise it might already read the (empty) queue entry
+                * before the updated head has been loaded and checked.
+                */
+               dma_rmb();
+
+               bcm4377_handle_completion(bcm4377, ring, tail);
+
+               tail = (tail + 1) % ring->n_entries;
+               tails[ring->ring_id] = cpu_to_le16(tail);
+       }
+}
+
+static irqreturn_t bcm4377_irq(int irq, void *data)
+{
+       struct bcm4377_data *bcm4377 = data;
+       u32 bootstage, rti_status;
+
+       bootstage = ioread32(bcm4377->bar2 + BCM4377_BAR2_BOOTSTAGE);
+       rti_status = ioread32(bcm4377->bar2 + BCM4377_BAR2_RTI_STATUS);
+
+       if (bootstage != bcm4377->bootstage ||
+           rti_status != bcm4377->rti_status) {
+               dev_dbg(&bcm4377->pdev->dev,
+                       "bootstage = %d -> %d, rti state = %d -> %d\n",
+                       bcm4377->bootstage, bootstage, bcm4377->rti_status,
+                       rti_status);
+               complete(&bcm4377->event);
+               bcm4377->bootstage = bootstage;
+               bcm4377->rti_status = rti_status;
+       }
+
+       if (rti_status > 2)
+               dev_err(&bcm4377->pdev->dev, "RTI status is %d\n", rti_status);
+
+       bcm4377_poll_completion_ring(bcm4377, &bcm4377->control_ack_ring);
+       bcm4377_poll_completion_ring(bcm4377, &bcm4377->hci_acl_event_ring);
+       bcm4377_poll_completion_ring(bcm4377, &bcm4377->hci_acl_ack_ring);
+       bcm4377_poll_completion_ring(bcm4377, &bcm4377->sco_ack_ring);
+       bcm4377_poll_completion_ring(bcm4377, &bcm4377->sco_event_ring);
+
+       return IRQ_HANDLED;
+}
+
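+/*
+ * Post a single message to a transfer ring: reserve a message id, fill the
+ * entry at the current head, copy the payload either in-place (footer) or
+ * into its mapped buffer, wake the chip, publish the new head and ring the
+ * doorbell (SCO "sync" rings skip the doorbell here). With wait == true the
+ * call blocks until the device acknowledges the message in the completion
+ * ring or a timeout hits.
+ */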
+static int bcm4377_enqueue(struct bcm4377_data *bcm4377,
+                          struct bcm4377_transfer_ring *ring, void *data,
+                          size_t len, bool wait)
+{
+       unsigned long flags;
+       struct bcm4377_xfer_ring_entry *entry;
+       void *payload;
+       size_t offset;
+       u16 head, tail, new_head;
+       u16 raw_msgid;
+       int ret, msgid;
+       DECLARE_COMPLETION_ONSTACK(event);
+
+       if (len > ring->payload_size && len > ring->mapped_payload_size) {
+               dev_warn(
+                       &bcm4377->pdev->dev,
+                       "payload len %zu is too large for ring %d (max is %zu or %zu)\n",
+                       len, ring->ring_id, ring->payload_size,
+                       ring->mapped_payload_size);
+               return -EINVAL;
+       }
+       if (wait && !ring->allow_wait)
+               return -EINVAL;
+       if (ring->virtual)
+               return -EINVAL;
+
+       spin_lock_irqsave(&ring->lock, flags);
+
+       head = le16_to_cpu(bcm4377->ring_state->xfer_ring_head[ring->ring_id]);
+       tail = le16_to_cpu(bcm4377->ring_state->xfer_ring_tail[ring->ring_id]);
+
+       new_head = (head + 1) % ring->n_entries;
+
+       if (new_head == tail) {
+               dev_warn(&bcm4377->pdev->dev,
+                        "can't send message because ring %d is full\n",
+                        ring->ring_id);
+               ret = -EINVAL;
+               goto out;
+       }
+
+       msgid = bitmap_find_free_region(ring->msgids, ring->n_entries, 0);
+       if (msgid < 0) {
+               dev_warn(&bcm4377->pdev->dev,
+                        "can't find message id for ring %d\n", ring->ring_id);
+               ret = -EINVAL;
+               goto out;
+       }
+
+       raw_msgid = FIELD_PREP(BCM4377_MSGID_GENERATION, ring->generation);
+       raw_msgid |= FIELD_PREP(BCM4377_MSGID_ID, msgid);
+
+       offset = head * (sizeof(*entry) + ring->payload_size);
+       entry = ring->ring + offset;
+
+       memset(entry, 0, sizeof(*entry));
+       entry->id = cpu_to_le16(raw_msgid);
+       entry->len = cpu_to_le16(len);
+
+       if (len <= ring->payload_size) {
+               entry->flags = BCM4377_XFER_RING_FLAG_PAYLOAD_IN_FOOTER;
+               payload = ring->ring + offset + sizeof(*entry);
+       } else {
+               entry->flags = BCM4377_XFER_RING_FLAG_PAYLOAD_MAPPED;
+               entry->payload = cpu_to_le64(ring->payloads_dma +
+                                            msgid * ring->mapped_payload_size);
+               payload = ring->payloads + msgid * ring->mapped_payload_size;
+       }
+
+       memcpy(payload, data, len);
+
+       if (wait)
+               ring->events[msgid] = &event;
+
+       /*
+        * The 4377 chips stop responding to any commands as soon as they
+        * have been idle for a while. Poking the sleep control register here
+        * makes them come alive again.
+        */
+       iowrite32(BCM4377_BAR0_SLEEP_CONTROL_AWAKE,
+                 bcm4377->bar0 + BCM4377_BAR0_SLEEP_CONTROL);
+
+       dev_dbg(&bcm4377->pdev->dev,
+               "updating head for transfer queue #%d to %d\n", ring->ring_id,
+               new_head);
+       bcm4377->ring_state->xfer_ring_head[ring->ring_id] =
+               cpu_to_le16(new_head);
+
+       if (!ring->sync)
+               bcm4377_ring_doorbell(bcm4377, ring->doorbell, new_head);
+       ret = 0;
+
+out:
+       spin_unlock_irqrestore(&ring->lock, flags);
+
+       if (ret == 0 && wait) {
+               ret = wait_for_completion_interruptible_timeout(
+                       &event, BCM4377_TIMEOUT);
+               if (ret == 0)
+                       ret = -ETIMEDOUT;
+               else if (ret > 0)
+                       ret = 0;
+
+               spin_lock_irqsave(&ring->lock, flags);
+               ring->events[msgid] = NULL;
+               spin_unlock_irqrestore(&ring->lock, flags);
+       }
+
+       return ret;
+}
+
+static int bcm4377_create_completion_ring(struct bcm4377_data *bcm4377,
+                                         struct bcm4377_completion_ring *ring)
+{
+       struct bcm4377_create_completion_ring_msg msg;
+       int ret;
+
+       if (ring->enabled) {
+               dev_warn(&bcm4377->pdev->dev,
+                        "completion ring %d already enabled\n", ring->ring_id);
+               return 0;
+       }
+
+       memset(ring->ring, 0,
+              ring->n_entries * (sizeof(struct bcm4377_completion_ring_entry) +
+                                 ring->payload_size));
+       memset(&msg, 0, sizeof(msg));
+       msg.msg_type = BCM4377_CONTROL_MSG_CREATE_COMPLETION_RING;
+       msg.id = cpu_to_le16(ring->ring_id);
+       msg.id_again = cpu_to_le16(ring->ring_id);
+       msg.ring_iova = cpu_to_le64(ring->ring_dma);
+       msg.n_elements = cpu_to_le16(ring->n_entries);
+       msg.intmod_bytes = cpu_to_le32(0xffffffff);
+       msg.unk = cpu_to_le32(0xffffffff);
+       msg.intmod_delay = cpu_to_le16(ring->delay);
+       msg.footer_size = ring->payload_size / 4;
+
+       ret = bcm4377_enqueue(bcm4377, &bcm4377->control_h2d_ring, &msg,
+                             sizeof(msg), true);
+       if (!ret)
+               ring->enabled = true;
+
+       return ret;
+}
+
+static int bcm4377_destroy_completion_ring(struct bcm4377_data *bcm4377,
+                                          struct bcm4377_completion_ring *ring)
+{
+       struct bcm4377_destroy_completion_ring_msg msg;
+       int ret;
+
+       memset(&msg, 0, sizeof(msg));
+       msg.msg_type = BCM4377_CONTROL_MSG_DESTROY_COMPLETION_RING;
+       msg.ring_id = cpu_to_le16(ring->ring_id);
+
+       ret = bcm4377_enqueue(bcm4377, &bcm4377->control_h2d_ring, &msg,
+                             sizeof(msg), true);
+       if (ret)
+               dev_warn(&bcm4377->pdev->dev,
+                        "failed to destroy completion ring %d\n",
+                        ring->ring_id);
+
+       ring->enabled = false;
+       return ret;
+}
+
+static int bcm4377_create_transfer_ring(struct bcm4377_data *bcm4377,
+                                       struct bcm4377_transfer_ring *ring)
+{
+       struct bcm4377_create_transfer_ring_msg msg;
+       u16 flags = 0;
+       int ret, i;
+       unsigned long spinlock_flags;
+
+       if (ring->virtual)
+               flags |= BCM4377_XFER_RING_FLAG_VIRTUAL;
+       if (ring->sync)
+               flags |= BCM4377_XFER_RING_FLAG_SYNC;
+
+       spin_lock_irqsave(&ring->lock, spinlock_flags);
+       memset(&msg, 0, sizeof(msg));
+       msg.msg_type = BCM4377_CONTROL_MSG_CREATE_XFER_RING;
+       msg.ring_id = cpu_to_le16(ring->ring_id);
+       msg.ring_id_again = cpu_to_le16(ring->ring_id);
+       msg.ring_iova = cpu_to_le64(ring->ring_dma);
+       msg.n_elements = cpu_to_le16(ring->n_entries);
+       msg.completion_ring_id = cpu_to_le16(ring->completion_ring);
+       msg.doorbell = cpu_to_le16(ring->doorbell);
+       msg.flags = cpu_to_le16(flags);
+       msg.footer_size = ring->payload_size / 4;
+
+       bcm4377->ring_state->xfer_ring_head[ring->ring_id] = 0;
+       bcm4377->ring_state->xfer_ring_tail[ring->ring_id] = 0;
+       ring->generation++;
+       spin_unlock_irqrestore(&ring->lock, spinlock_flags);
+
+       ret = bcm4377_enqueue(bcm4377, &bcm4377->control_h2d_ring, &msg,
+                             sizeof(msg), true);
+
+       spin_lock_irqsave(&ring->lock, spinlock_flags);
+
+       if (ring->d2h_buffers_only) {
+               for (i = 0; i < ring->n_entries; ++i) {
+                       struct bcm4377_xfer_ring_entry *entry =
+                               ring->ring + i * sizeof(*entry);
+                       u16 raw_msgid = FIELD_PREP(BCM4377_MSGID_GENERATION,
+                                                  ring->generation);
+                       raw_msgid |= FIELD_PREP(BCM4377_MSGID_ID, i);
+
+                       memset(entry, 0, sizeof(*entry));
+                       entry->id = cpu_to_le16(raw_msgid);
+                       entry->len = cpu_to_le16(ring->mapped_payload_size);
+                       entry->flags = BCM4377_XFER_RING_FLAG_PAYLOAD_MAPPED;
+                       entry->payload =
+                               cpu_to_le64(ring->payloads_dma +
+                                           i * ring->mapped_payload_size);
+               }
+       }
+
+       /*
+        * Post a few empty messages (head is advanced to 0xf, i.e. 15
+        * entries) if this is a device->host ring so the device can reply
+        * by acknowledging them in the completion ring.
+        */
+       if (ring->virtual || ring->d2h_buffers_only) {
+               bcm4377->ring_state->xfer_ring_head[ring->ring_id] =
+                       cpu_to_le16(0xf);
+               bcm4377_ring_doorbell(bcm4377, ring->doorbell, 0xf);
+       }
+
+       ring->enabled = true;
+       spin_unlock_irqrestore(&ring->lock, spinlock_flags);
+
+       return ret;
+}
+
+static int bcm4377_destroy_transfer_ring(struct bcm4377_data *bcm4377,
+                                        struct bcm4377_transfer_ring *ring)
+{
+       struct bcm4377_destroy_transfer_ring_msg msg;
+       int ret;
+
+       memset(&msg, 0, sizeof(msg));
+       msg.msg_type = BCM4377_CONTROL_MSG_DESTROY_XFER_RING;
+       msg.ring_id = cpu_to_le16(ring->ring_id);
+
+       ret = bcm4377_enqueue(bcm4377, &bcm4377->control_h2d_ring, &msg,
+                             sizeof(msg), true);
+       if (ret)
+               dev_warn(&bcm4377->pdev->dev,
+                        "failed to destroy transfer ring %d\n", ring->ring_id);
+
+       ring->enabled = false;
+       return ret;
+}
+
+static int __bcm4378_send_calibration_chunk(struct bcm4377_data *bcm4377,
+                                           const void *data, size_t data_len,
+                                           u16 blocks_left)
+{
+       struct bcm4378_hci_send_calibration_cmd cmd;
+       struct sk_buff *skb;
+
+       if (data_len > sizeof(cmd.data))
+               return -EINVAL;
+
+       memset(&cmd, 0, sizeof(cmd));
+       cmd.unk = 0x03;
+       cmd.blocks_left = cpu_to_le16(blocks_left);
+       memcpy(cmd.data, data, data_len);
+
+       skb = __hci_cmd_sync(bcm4377->hdev, 0xfd97, sizeof(cmd), &cmd,
+                            HCI_INIT_TIMEOUT);
+       if (IS_ERR(skb))
+               return PTR_ERR(skb);
+
+       kfree_skb(skb);
+       return 0;
+}
+
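+/*
+ * Stream a calibration blob to the firmware in 0xe6 byte chunks via the
+ * vendor command 0xfd97; blocks_left counts down and reaches zero with the
+ * final chunk.
+ */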
+static int __bcm4378_send_calibration(struct bcm4377_data *bcm4377,
+                                     const void *data, size_t data_size)
+{
+       int ret;
+       size_t i, left, transfer_len;
+       size_t blocks =
+               DIV_ROUND_UP(data_size, (size_t)BCM4378_CALIBRATION_CHUNK_SIZE);
+
+       if (!data) {
+               dev_err(&bcm4377->pdev->dev,
+                       "no calibration data available.\n");
+               return -ENOENT;
+       }
+
+       for (i = 0, left = data_size; i < blocks; ++i, left -= transfer_len) {
+               transfer_len =
+                       min_t(size_t, left, BCM4378_CALIBRATION_CHUNK_SIZE);
+
+               ret = __bcm4378_send_calibration_chunk(
+                       bcm4377, data + i * BCM4378_CALIBRATION_CHUNK_SIZE,
+                       transfer_len, blocks - i - 1);
+               if (ret) {
+                       dev_err(&bcm4377->pdev->dev,
+                               "send calibration chunk failed with %d\n", ret);
+                       return ret;
+               }
+       }
+
+       return 0;
+}
+
+static int bcm4378_send_calibration(struct bcm4377_data *bcm4377)
+{
+       if ((strcmp(bcm4377->stepping, "b1") == 0) ||
+           strcmp(bcm4377->stepping, "b3") == 0)
+               return __bcm4378_send_calibration(
+                       bcm4377, bcm4377->taurus_beamforming_cal_blob,
+                       bcm4377->taurus_beamforming_cal_size);
+       else
+               return __bcm4378_send_calibration(bcm4377,
+                                                 bcm4377->taurus_cal_blob,
+                                                 bcm4377->taurus_cal_size);
+}
+
+static int bcm4387_send_calibration(struct bcm4377_data *bcm4377)
+{
+       if (strcmp(bcm4377->stepping, "c2") == 0)
+               return __bcm4378_send_calibration(
+                       bcm4377, bcm4377->taurus_beamforming_cal_blob,
+                       bcm4377->taurus_beamforming_cal_size);
+       else
+               return __bcm4378_send_calibration(bcm4377,
+                                                 bcm4377->taurus_cal_blob,
+                                                 bcm4377->taurus_cal_size);
+}
+
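+/*
+ * Firmware and calibration blobs are selected by chip id, stepping, board
+ * type and antenna vendor; the vendor-specific name is tried first and the
+ * board-only name second before giving up.
+ */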
+static const struct firmware *bcm4377_request_blob(struct bcm4377_data *bcm4377,
+                                                  const char *suffix)
+{
+       const struct firmware *fw;
+       char name0[64], name1[64];
+       int ret;
+
+       snprintf(name0, sizeof(name0), "brcm/brcmbt%04x%s-%s-%s.%s",
+                bcm4377->hw->id, bcm4377->stepping, bcm4377->board_type,
+                bcm4377->vendor, suffix);
+       snprintf(name1, sizeof(name1), "brcm/brcmbt%04x%s-%s.%s",
+                bcm4377->hw->id, bcm4377->stepping, bcm4377->board_type,
+                suffix);
+       dev_dbg(&bcm4377->pdev->dev, "Trying to load firmware: '%s' or '%s'\n",
+               name0, name1);
+
+       ret = firmware_request_nowarn(&fw, name0, &bcm4377->pdev->dev);
+       if (!ret)
+               return fw;
+       ret = firmware_request_nowarn(&fw, name1, &bcm4377->pdev->dev);
+       if (!ret)
+               return fw;
+
+       dev_err(&bcm4377->pdev->dev,
+               "Unable to load firmware; tried '%s' and '%s'\n", name0, name1);
+       return NULL;
+}
+
+static int bcm4377_send_ptb(struct bcm4377_data *bcm4377,
+                           const struct firmware *fw)
+{
+       struct sk_buff *skb;
+
+       skb = __hci_cmd_sync(bcm4377->hdev, 0xfd98, fw->size, fw->data,
+                            HCI_INIT_TIMEOUT);
+       /*
+        * This command seems to always fail on more recent firmware versions
+        * (even in traces taken from the macOS driver). It's unclear why this
+        * happens, but since the PTB file contains calibration and/or
+        * regulatory data and may be required on older firmware, we still
+        * send it here just in case and ignore any failure.
+        */
+       if (!IS_ERR(skb))
+               kfree_skb(skb);
+       return 0;
+}
+
+static int bcm4378_send_ptb_chunk(struct bcm4377_data *bcm4377,
+                                 const void *data, size_t data_len,
+                                 u16 blocks_left)
+{
+       struct bcm4378_hci_send_ptb_cmd cmd;
+       struct sk_buff *skb;
+
+       if (data_len > BCM4378_PTB_CHUNK_SIZE)
+               return -EINVAL;
+
+       memset(&cmd, 0, sizeof(cmd));
+       cmd.blocks_left = cpu_to_le16(blocks_left);
+       memcpy(cmd.data, data, data_len);
+
+       skb = __hci_cmd_sync(bcm4377->hdev, 0xfe0d, sizeof(cmd), &cmd,
+                            HCI_INIT_TIMEOUT);
+       if (IS_ERR(skb))
+               return PTR_ERR(skb);
+
+       kfree_skb(skb);
+       return 0;
+}
+
+static int bcm4378_send_ptb(struct bcm4377_data *bcm4377,
+                           const struct firmware *fw)
+{
+       size_t chunks = DIV_ROUND_UP(fw->size, (size_t)BCM4378_PTB_CHUNK_SIZE);
+       size_t i, left, transfer_len;
+       int ret;
+
+       for (i = 0, left = fw->size; i < chunks; ++i, left -= transfer_len) {
+               transfer_len = min_t(size_t, left, BCM4378_PTB_CHUNK_SIZE);
+
+               dev_dbg(&bcm4377->pdev->dev, "sending ptb chunk %zu/%zu\n",
+                       i + 1, chunks);
+               ret = bcm4378_send_ptb_chunk(
+                       bcm4377, fw->data + i * BCM4378_PTB_CHUNK_SIZE,
+                       transfer_len, chunks - i - 1);
+               if (ret) {
+                       dev_err(&bcm4377->pdev->dev,
+                               "sending ptb chunk %zu failed (%d)", i, ret);
+                       return ret;
+               }
+       }
+
+       return 0;
+}
+
+static int bcm4377_hci_open(struct hci_dev *hdev)
+{
+       struct bcm4377_data *bcm4377 = hci_get_drvdata(hdev);
+       int ret;
+
+       dev_dbg(&bcm4377->pdev->dev, "creating rings\n");
+
+       ret = bcm4377_create_completion_ring(bcm4377,
+                                            &bcm4377->hci_acl_ack_ring);
+       if (ret)
+               return ret;
+       ret = bcm4377_create_completion_ring(bcm4377,
+                                            &bcm4377->hci_acl_event_ring);
+       if (ret)
+               goto destroy_hci_acl_ack;
+       ret = bcm4377_create_completion_ring(bcm4377, &bcm4377->sco_ack_ring);
+       if (ret)
+               goto destroy_hci_acl_event;
+       ret = bcm4377_create_completion_ring(bcm4377, &bcm4377->sco_event_ring);
+       if (ret)
+               goto destroy_sco_ack;
+       dev_dbg(&bcm4377->pdev->dev,
+               "all completion rings successfully created!\n");
+
+       ret = bcm4377_create_transfer_ring(bcm4377, &bcm4377->hci_h2d_ring);
+       if (ret)
+               goto destroy_sco_event;
+       ret = bcm4377_create_transfer_ring(bcm4377, &bcm4377->hci_d2h_ring);
+       if (ret)
+               goto destroy_hci_h2d;
+       ret = bcm4377_create_transfer_ring(bcm4377, &bcm4377->sco_h2d_ring);
+       if (ret)
+               goto destroy_hci_d2h;
+       ret = bcm4377_create_transfer_ring(bcm4377, &bcm4377->sco_d2h_ring);
+       if (ret)
+               goto destroy_sco_h2d;
+       ret = bcm4377_create_transfer_ring(bcm4377, &bcm4377->acl_h2d_ring);
+       if (ret)
+               goto destroy_sco_d2h;
+       ret = bcm4377_create_transfer_ring(bcm4377, &bcm4377->acl_d2h_ring);
+       if (ret)
+               goto destroy_acl_h2d;
+       dev_dbg(&bcm4377->pdev->dev,
+               "all transfer rings successfully created!\n");
+
+       return 0;
+
+destroy_acl_h2d:
+       bcm4377_destroy_transfer_ring(bcm4377, &bcm4377->acl_h2d_ring);
+destroy_sco_d2h:
+       bcm4377_destroy_transfer_ring(bcm4377, &bcm4377->sco_d2h_ring);
+destroy_sco_h2d:
+       bcm4377_destroy_transfer_ring(bcm4377, &bcm4377->sco_h2d_ring);
+destroy_hci_d2h:
+       bcm4377_destroy_transfer_ring(bcm4377, &bcm4377->hci_d2h_ring);
+destroy_hci_h2d:
+       bcm4377_destroy_transfer_ring(bcm4377, &bcm4377->hci_h2d_ring);
+destroy_sco_event:
+       bcm4377_destroy_completion_ring(bcm4377, &bcm4377->sco_event_ring);
+destroy_sco_ack:
+       bcm4377_destroy_completion_ring(bcm4377, &bcm4377->sco_ack_ring);
+destroy_hci_acl_event:
+       bcm4377_destroy_completion_ring(bcm4377, &bcm4377->hci_acl_event_ring);
+destroy_hci_acl_ack:
+       bcm4377_destroy_completion_ring(bcm4377, &bcm4377->hci_acl_ack_ring);
+
+       dev_err(&bcm4377->pdev->dev, "Creating rings failed with %d\n", ret);
+       return ret;
+}
+
+static int bcm4377_hci_close(struct hci_dev *hdev)
+{
+       struct bcm4377_data *bcm4377 = hci_get_drvdata(hdev);
+
+       dev_dbg(&bcm4377->pdev->dev, "destroying rings in hci_close\n");
+
+       bcm4377_destroy_transfer_ring(bcm4377, &bcm4377->acl_d2h_ring);
+       bcm4377_destroy_transfer_ring(bcm4377, &bcm4377->acl_h2d_ring);
+       bcm4377_destroy_transfer_ring(bcm4377, &bcm4377->sco_d2h_ring);
+       bcm4377_destroy_transfer_ring(bcm4377, &bcm4377->sco_h2d_ring);
+       bcm4377_destroy_transfer_ring(bcm4377, &bcm4377->hci_d2h_ring);
+       bcm4377_destroy_transfer_ring(bcm4377, &bcm4377->hci_h2d_ring);
+
+       bcm4377_destroy_completion_ring(bcm4377, &bcm4377->sco_event_ring);
+       bcm4377_destroy_completion_ring(bcm4377, &bcm4377->sco_ack_ring);
+       bcm4377_destroy_completion_ring(bcm4377, &bcm4377->hci_acl_event_ring);
+       bcm4377_destroy_completion_ring(bcm4377, &bcm4377->hci_acl_ack_ring);
+
+       return 0;
+}
+
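+/*
+ * The firmware's default address apparently embeds 93:76:00 in the low
+ * bytes and the chip id in bytes 4 and 5 (byte 3 varies); any address
+ * matching that pattern is treated as unconfigured.
+ */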
+static bool bcm4377_is_valid_bdaddr(struct bcm4377_data *bcm4377,
+                                   bdaddr_t *addr)
+{
+       if (addr->b[0] != 0x93)
+               return true;
+       if (addr->b[1] != 0x76)
+               return true;
+       if (addr->b[2] != 0x00)
+               return true;
+       if (addr->b[4] != (bcm4377->hw->id & 0xff))
+               return true;
+       if (addr->b[5] != (bcm4377->hw->id >> 8))
+               return true;
+       return false;
+}
+
+static int bcm4377_check_bdaddr(struct bcm4377_data *bcm4377)
+{
+       struct hci_rp_read_bd_addr *bda;
+       struct sk_buff *skb;
+
+       skb = __hci_cmd_sync(bcm4377->hdev, HCI_OP_READ_BD_ADDR, 0, NULL,
+                            HCI_INIT_TIMEOUT);
+       if (IS_ERR(skb)) {
+               int err = PTR_ERR(skb);
+
+               dev_err(&bcm4377->pdev->dev, "HCI_OP_READ_BD_ADDR failed (%d)",
+                       err);
+               return err;
+       }
+
+       if (skb->len != sizeof(*bda)) {
+               dev_err(&bcm4377->pdev->dev,
+                       "HCI_OP_READ_BD_ADDR reply length invalid");
+               kfree_skb(skb);
+               return -EIO;
+       }
+
+       bda = (struct hci_rp_read_bd_addr *)skb->data;
+       if (!bcm4377_is_valid_bdaddr(bcm4377, &bda->bdaddr))
+               set_bit(HCI_QUIRK_INVALID_BDADDR, &bcm4377->hdev->quirks);
+
+       kfree_skb(skb);
+       return 0;
+}
+
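+/*
+ * hdev->setup: upload the per-board calibration data where the hardware
+ * variant requires it, then the PTB blob, and finally check whether the
+ * controller reports a usable Bluetooth address.
+ */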
+static int bcm4377_hci_setup(struct hci_dev *hdev)
+{
+       struct bcm4377_data *bcm4377 = hci_get_drvdata(hdev);
+       const struct firmware *fw;
+       int ret;
+
+       if (bcm4377->hw->send_calibration) {
+               ret = bcm4377->hw->send_calibration(bcm4377);
+               if (ret)
+                       return ret;
+       }
+
+       fw = bcm4377_request_blob(bcm4377, "ptb");
+       if (!fw) {
+               dev_err(&bcm4377->pdev->dev, "failed to load PTB data");
+               return -ENOENT;
+       }
+
+       ret = bcm4377->hw->send_ptb(bcm4377, fw);
+       release_firmware(fw);
+       if (ret)
+               return ret;
+
+       return bcm4377_check_bdaddr(bcm4377);
+}
+
+static int bcm4377_hci_send_frame(struct hci_dev *hdev, struct sk_buff *skb)
+{
+       struct bcm4377_data *bcm4377 = hci_get_drvdata(hdev);
+       struct bcm4377_transfer_ring *ring;
+       int ret;
+
+       switch (hci_skb_pkt_type(skb)) {
+       case HCI_COMMAND_PKT:
+               hdev->stat.cmd_tx++;
+               ring = &bcm4377->hci_h2d_ring;
+               break;
+
+       case HCI_ACLDATA_PKT:
+               hdev->stat.acl_tx++;
+               ring = &bcm4377->acl_h2d_ring;
+               break;
+
+       case HCI_SCODATA_PKT:
+               hdev->stat.sco_tx++;
+               ring = &bcm4377->sco_h2d_ring;
+               break;
+
+       default:
+               return -EILSEQ;
+       }
+
+       ret = bcm4377_enqueue(bcm4377, ring, skb->data, skb->len, false);
+       if (ret < 0) {
+               hdev->stat.err_tx++;
+               return ret;
+       }
+
+       hdev->stat.byte_tx += skb->len;
+       kfree_skb(skb);
+       return ret;
+}
+
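+/* 0xfc01 is the Broadcom vendor-specific Set_BD_ADDR HCI command */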
+static int bcm4377_hci_set_bdaddr(struct hci_dev *hdev, const bdaddr_t *bdaddr)
+{
+       struct bcm4377_data *bcm4377 = hci_get_drvdata(hdev);
+       struct sk_buff *skb;
+       int err;
+
+       skb = __hci_cmd_sync(hdev, 0xfc01, 6, bdaddr, HCI_INIT_TIMEOUT);
+       if (IS_ERR(skb)) {
+               err = PTR_ERR(skb);
+               dev_err(&bcm4377->pdev->dev,
+                       "Change address command failed (%d)", err);
+               return err;
+       }
+       kfree_skb(skb);
+
+       return 0;
+}
+
+static int bcm4377_alloc_transfer_ring(struct bcm4377_data *bcm4377,
+                                      struct bcm4377_transfer_ring *ring)
+{
+       size_t entry_size;
+
+       spin_lock_init(&ring->lock);
+       ring->payload_size = ALIGN(ring->payload_size, 4);
+       ring->mapped_payload_size = ALIGN(ring->mapped_payload_size, 4);
+
+       if (ring->payload_size > BCM4377_XFER_RING_MAX_INPLACE_PAYLOAD_SIZE)
+               return -EINVAL;
+       if (ring->n_entries > BCM4377_MAX_RING_SIZE)
+               return -EINVAL;
+       if (ring->virtual && ring->allow_wait)
+               return -EINVAL;
+
+       if (ring->d2h_buffers_only) {
+               if (ring->virtual)
+                       return -EINVAL;
+               if (ring->payload_size)
+                       return -EINVAL;
+               if (!ring->mapped_payload_size)
+                       return -EINVAL;
+       }
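+       /*
+        * Virtual rings seem to exist purely on the device side and need no
+        * host-visible ring buffer, so all allocations below can be skipped.
+        */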
+       if (ring->virtual)
+               return 0;
+
+       entry_size =
+               ring->payload_size + sizeof(struct bcm4377_xfer_ring_entry);
+       ring->ring = dmam_alloc_coherent(&bcm4377->pdev->dev,
+                                        ring->n_entries * entry_size,
+                                        &ring->ring_dma, GFP_KERNEL);
+       if (!ring->ring)
+               return -ENOMEM;
+
+       if (ring->allow_wait) {
+               ring->events = devm_kcalloc(&bcm4377->pdev->dev,
+                                           ring->n_entries,
+                                           sizeof(*ring->events), GFP_KERNEL);
+               if (!ring->events)
+                       return -ENOMEM;
+       }
+
+       if (ring->mapped_payload_size) {
+               ring->payloads = dmam_alloc_coherent(
+                       &bcm4377->pdev->dev,
+                       ring->n_entries * ring->mapped_payload_size,
+                       &ring->payloads_dma, GFP_KERNEL);
+               if (!ring->payloads)
+                       return -ENOMEM;
+       }
+
+       return 0;
+}
+
+static int bcm4377_alloc_completion_ring(struct bcm4377_data *bcm4377,
+                                        struct bcm4377_completion_ring *ring)
+{
+       size_t entry_size;
+
+       ring->payload_size = ALIGN(ring->payload_size, 4);
+       if (ring->payload_size > BCM4377_XFER_RING_MAX_INPLACE_PAYLOAD_SIZE)
+               return -EINVAL;
+       if (ring->n_entries > BCM4377_MAX_RING_SIZE)
+               return -EINVAL;
+
+       entry_size = ring->payload_size +
+                    sizeof(struct bcm4377_completion_ring_entry);
+
+       ring->ring = dmam_alloc_coherent(&bcm4377->pdev->dev,
+                                        ring->n_entries * entry_size,
+                                        &ring->ring_dma, GFP_KERNEL);
+       if (!ring->ring)
+               return -ENOMEM;
+       return 0;
+}
+
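+/*
+ * Set up the context structure that is later handed to the firmware during
+ * the RTI handshake: it points the device at the shared ring head/tail
+ * state and at the two control rings through which the remaining rings are
+ * then created.
+ */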
+static int bcm4377_init_context(struct bcm4377_data *bcm4377)
+{
+       struct device *dev = &bcm4377->pdev->dev;
+       dma_addr_t peripheral_info_dma;
+
+       bcm4377->ctx = dmam_alloc_coherent(dev, sizeof(*bcm4377->ctx),
+                                          &bcm4377->ctx_dma, GFP_KERNEL);
+       if (!bcm4377->ctx)
+               return -ENOMEM;
+       memset(bcm4377->ctx, 0, sizeof(*bcm4377->ctx));
+
+       bcm4377->ring_state =
+               dmam_alloc_coherent(dev, sizeof(*bcm4377->ring_state),
+                                   &bcm4377->ring_state_dma, GFP_KERNEL);
+       if (!bcm4377->ring_state)
+               return -ENOMEM;
+       memset(bcm4377->ring_state, 0, sizeof(*bcm4377->ring_state));
+
+       bcm4377->ctx->version = cpu_to_le16(1);
+       bcm4377->ctx->size = cpu_to_le16(sizeof(*bcm4377->ctx));
+       bcm4377->ctx->enabled_caps = cpu_to_le32(2);
+
+       /*
+        * The BT device will write 0x20 bytes of data to this buffer, but
+        * the exact contents are unknown. It only needs to exist for BT to
+        * work, so we can simply allocate it and then ignore it.
+        */
+       if (!dmam_alloc_coherent(&bcm4377->pdev->dev, 0x20,
+                                &peripheral_info_dma, GFP_KERNEL))
+               return -ENOMEM;
+       bcm4377->ctx->peripheral_info_addr = cpu_to_le64(peripheral_info_dma);
+
+       bcm4377->ctx->xfer_ring_heads_addr = cpu_to_le64(
+               bcm4377->ring_state_dma +
+               offsetof(struct bcm4377_ring_state, xfer_ring_head));
+       bcm4377->ctx->xfer_ring_tails_addr = cpu_to_le64(
+               bcm4377->ring_state_dma +
+               offsetof(struct bcm4377_ring_state, xfer_ring_tail));
+       bcm4377->ctx->completion_ring_heads_addr = cpu_to_le64(
+               bcm4377->ring_state_dma +
+               offsetof(struct bcm4377_ring_state, completion_ring_head));
+       bcm4377->ctx->completion_ring_tails_addr = cpu_to_le64(
+               bcm4377->ring_state_dma +
+               offsetof(struct bcm4377_ring_state, completion_ring_tail));
+
+       bcm4377->ctx->n_completion_rings =
+               cpu_to_le16(BCM4377_N_COMPLETION_RINGS);
+       bcm4377->ctx->n_xfer_rings = cpu_to_le16(BCM4377_N_TRANSFER_RINGS);
+
+       bcm4377->ctx->control_completion_ring_addr =
+               cpu_to_le64(bcm4377->control_ack_ring.ring_dma);
+       bcm4377->ctx->control_completion_ring_n_entries =
+               cpu_to_le16(bcm4377->control_ack_ring.n_entries);
+       bcm4377->ctx->control_completion_ring_doorbell = cpu_to_le16(0xffff);
+       bcm4377->ctx->control_completion_ring_msi = 0;
+       bcm4377->ctx->control_completion_ring_header_size = 0;
+       bcm4377->ctx->control_completion_ring_footer_size = 0;
+
+       bcm4377->ctx->control_xfer_ring_addr =
+               cpu_to_le64(bcm4377->control_h2d_ring.ring_dma);
+       bcm4377->ctx->control_xfer_ring_n_entries =
+               cpu_to_le16(bcm4377->control_h2d_ring.n_entries);
+       bcm4377->ctx->control_xfer_ring_doorbell =
+               cpu_to_le16(bcm4377->control_h2d_ring.doorbell);
+       bcm4377->ctx->control_xfer_ring_msi = 0;
+       bcm4377->ctx->control_xfer_ring_header_size = 0;
+       bcm4377->ctx->control_xfer_ring_footer_size =
+               bcm4377->control_h2d_ring.payload_size / 4;
+
+       dev_dbg(&bcm4377->pdev->dev, "context initialized at IOVA %pad",
+               &bcm4377->ctx_dma);
+
+       return 0;
+}
+
+static int bcm4377_prepare_rings(struct bcm4377_data *bcm4377)
+{
+       int ret;
+
+       /*
+        * Even though many of these settings appear to be configurable when
+        * sending the "create ring" messages, most of them are actually
+        * hardcoded in some (and quite possibly all) firmware versions, and
+        * changing them on the host has no effect.
+        * Specifically, this applies to at least the doorbells, the transfer
+        * and completion ring ids, and their mapping (e.g. both HCI and ACL
+        * entries will always be queued in completion rings 1 and 2 no matter
+        * what we configure here).
+        */
+       bcm4377->control_ack_ring.ring_id = BCM4377_ACK_RING_CONTROL;
+       bcm4377->control_ack_ring.n_entries = 32;
+       bcm4377->control_ack_ring.transfer_rings =
+               BIT(BCM4377_XFER_RING_CONTROL);
+
+       bcm4377->hci_acl_ack_ring.ring_id = BCM4377_ACK_RING_HCI_ACL;
+       bcm4377->hci_acl_ack_ring.n_entries = 2 * BCM4377_RING_N_ENTRIES;
+       bcm4377->hci_acl_ack_ring.transfer_rings =
+               BIT(BCM4377_XFER_RING_HCI_H2D) | BIT(BCM4377_XFER_RING_ACL_H2D);
+       bcm4377->hci_acl_ack_ring.delay = 1000;
+
+       /*
+        * A payload size of MAX_EVENT_PAYLOAD_SIZE is enough here since large
+        * ACL packets will be transmitted inside buffers mapped via
+        * acl_d2h_ring anyway.
+        */
+       bcm4377->hci_acl_event_ring.ring_id = BCM4377_EVENT_RING_HCI_ACL;
+       bcm4377->hci_acl_event_ring.payload_size = MAX_EVENT_PAYLOAD_SIZE;
+       bcm4377->hci_acl_event_ring.n_entries = 2 * BCM4377_RING_N_ENTRIES;
+       bcm4377->hci_acl_event_ring.transfer_rings =
+               BIT(BCM4377_XFER_RING_HCI_D2H) | BIT(BCM4377_XFER_RING_ACL_D2H);
+       bcm4377->hci_acl_event_ring.delay = 1000;
+
+       bcm4377->sco_ack_ring.ring_id = BCM4377_ACK_RING_SCO;
+       bcm4377->sco_ack_ring.n_entries = BCM4377_RING_N_ENTRIES;
+       bcm4377->sco_ack_ring.transfer_rings = BIT(BCM4377_XFER_RING_SCO_H2D);
+
+       bcm4377->sco_event_ring.ring_id = BCM4377_EVENT_RING_SCO;
+       bcm4377->sco_event_ring.payload_size = MAX_SCO_PAYLOAD_SIZE;
+       bcm4377->sco_event_ring.n_entries = BCM4377_RING_N_ENTRIES;
+       bcm4377->sco_event_ring.transfer_rings = BIT(BCM4377_XFER_RING_SCO_D2H);
+
+       bcm4377->control_h2d_ring.ring_id = BCM4377_XFER_RING_CONTROL;
+       bcm4377->control_h2d_ring.doorbell = BCM4377_DOORBELL_CONTROL;
+       bcm4377->control_h2d_ring.payload_size = BCM4377_CONTROL_MSG_SIZE;
+       bcm4377->control_h2d_ring.completion_ring = BCM4377_ACK_RING_CONTROL;
+       bcm4377->control_h2d_ring.allow_wait = true;
+       bcm4377->control_h2d_ring.n_entries = BCM4377_RING_N_ENTRIES;
+
+       bcm4377->hci_h2d_ring.ring_id = BCM4377_XFER_RING_HCI_H2D;
+       bcm4377->hci_h2d_ring.doorbell = BCM4377_DOORBELL_HCI_H2D;
+       bcm4377->hci_h2d_ring.payload_size = MAX_EVENT_PAYLOAD_SIZE;
+       bcm4377->hci_h2d_ring.completion_ring = BCM4377_ACK_RING_HCI_ACL;
+       bcm4377->hci_h2d_ring.n_entries = BCM4377_RING_N_ENTRIES;
+
+       bcm4377->hci_d2h_ring.ring_id = BCM4377_XFER_RING_HCI_D2H;
+       bcm4377->hci_d2h_ring.doorbell = BCM4377_DOORBELL_HCI_D2H;
+       bcm4377->hci_d2h_ring.completion_ring = BCM4377_EVENT_RING_HCI_ACL;
+       bcm4377->hci_d2h_ring.virtual = true;
+       bcm4377->hci_d2h_ring.n_entries = BCM4377_RING_N_ENTRIES;
+
+       bcm4377->sco_h2d_ring.ring_id = BCM4377_XFER_RING_SCO_H2D;
+       bcm4377->sco_h2d_ring.doorbell = BCM4377_DOORBELL_SCO;
+       bcm4377->sco_h2d_ring.payload_size = MAX_SCO_PAYLOAD_SIZE;
+       bcm4377->sco_h2d_ring.completion_ring = BCM4377_ACK_RING_SCO;
+       bcm4377->sco_h2d_ring.sync = true;
+       bcm4377->sco_h2d_ring.n_entries = BCM4377_RING_N_ENTRIES;
+
+       bcm4377->sco_d2h_ring.ring_id = BCM4377_XFER_RING_SCO_D2H;
+       bcm4377->sco_d2h_ring.doorbell = BCM4377_DOORBELL_SCO;
+       bcm4377->sco_d2h_ring.completion_ring = BCM4377_EVENT_RING_SCO;
+       bcm4377->sco_d2h_ring.virtual = true;
+       bcm4377->sco_d2h_ring.sync = true;
+       bcm4377->sco_d2h_ring.n_entries = BCM4377_RING_N_ENTRIES;
+
+       /*
+        * This ring has to use mapped_payload_size because the largest ACL
+        * packet doesn't fit inside the largest possible footer.
+        */
+       bcm4377->acl_h2d_ring.ring_id = BCM4377_XFER_RING_ACL_H2D;
+       bcm4377->acl_h2d_ring.doorbell = BCM4377_DOORBELL_ACL_H2D;
+       bcm4377->acl_h2d_ring.mapped_payload_size = MAX_ACL_PAYLOAD_SIZE;
+       bcm4377->acl_h2d_ring.completion_ring = BCM4377_ACK_RING_HCI_ACL;
+       bcm4377->acl_h2d_ring.n_entries = BCM4377_RING_N_ENTRIES;
+
+       /*
+        * This ring only contains empty buffers to be used by incoming
+        * ACL packets that do not fit inside the footer of hci_acl_event_ring.
+        */
+       bcm4377->acl_d2h_ring.ring_id = BCM4377_XFER_RING_ACL_D2H;
+       bcm4377->acl_d2h_ring.doorbell = BCM4377_DOORBELL_ACL_D2H;
+       bcm4377->acl_d2h_ring.completion_ring = BCM4377_EVENT_RING_HCI_ACL;
+       bcm4377->acl_d2h_ring.d2h_buffers_only = true;
+       bcm4377->acl_d2h_ring.mapped_payload_size = MAX_ACL_PAYLOAD_SIZE;
+       bcm4377->acl_d2h_ring.n_entries = BCM4377_RING_N_ENTRIES;
+
+       /*
+        * no need for any cleanup since this is only called from _probe
+        * and only devres-managed allocations are used
+        */
+       ret = bcm4377_alloc_transfer_ring(bcm4377, &bcm4377->control_h2d_ring);
+       if (ret)
+               return ret;
+       ret = bcm4377_alloc_transfer_ring(bcm4377, &bcm4377->hci_h2d_ring);
+       if (ret)
+               return ret;
+       ret = bcm4377_alloc_transfer_ring(bcm4377, &bcm4377->hci_d2h_ring);
+       if (ret)
+               return ret;
+       ret = bcm4377_alloc_transfer_ring(bcm4377, &bcm4377->sco_h2d_ring);
+       if (ret)
+               return ret;
+       ret = bcm4377_alloc_transfer_ring(bcm4377, &bcm4377->sco_d2h_ring);
+       if (ret)
+               return ret;
+       ret = bcm4377_alloc_transfer_ring(bcm4377, &bcm4377->acl_h2d_ring);
+       if (ret)
+               return ret;
+       ret = bcm4377_alloc_transfer_ring(bcm4377, &bcm4377->acl_d2h_ring);
+       if (ret)
+               return ret;
+
+       ret = bcm4377_alloc_completion_ring(bcm4377,
+                                           &bcm4377->control_ack_ring);
+       if (ret)
+               return ret;
+       ret = bcm4377_alloc_completion_ring(bcm4377,
+                                           &bcm4377->hci_acl_ack_ring);
+       if (ret)
+               return ret;
+       ret = bcm4377_alloc_completion_ring(bcm4377,
+                                           &bcm4377->hci_acl_event_ring);
+       if (ret)
+               return ret;
+       ret = bcm4377_alloc_completion_ring(bcm4377, &bcm4377->sco_ack_ring);
+       if (ret)
+               return ret;
+       ret = bcm4377_alloc_completion_ring(bcm4377, &bcm4377->sco_event_ring);
+       if (ret)
+               return ret;
+
+       dev_dbg(&bcm4377->pdev->dev, "all rings allocated and prepared\n");
+
+       return 0;
+}
+
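+/*
+ * Boot the Bluetooth core: open a DMA window covering the whole host
+ * address space, point the device at the firmware image, ring the doorbell
+ * and then wait for the interrupt handler to signal that the bootstage has
+ * changed.
+ */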
+static int bcm4377_boot(struct bcm4377_data *bcm4377)
+{
+       const struct firmware *fw;
+       void *bfr;
+       dma_addr_t fw_dma;
+       int ret = 0;
+       u32 bootstage, rti_status;
+
+       bootstage = ioread32(bcm4377->bar2 + BCM4377_BAR2_BOOTSTAGE);
+       rti_status = ioread32(bcm4377->bar2 + BCM4377_BAR2_RTI_STATUS);
+
+       if (bootstage != 0) {
+               dev_err(&bcm4377->pdev->dev, "bootstage is %d and not 0\n",
+                       bootstage);
+               return -EINVAL;
+       }
+
+       if (rti_status != 0) {
+               dev_err(&bcm4377->pdev->dev, "RTI status is %d and not 0\n",
+                       rti_status);
+               return -EINVAL;
+       }
+
+       fw = bcm4377_request_blob(bcm4377, "bin");
+       if (!fw) {
+               dev_err(&bcm4377->pdev->dev, "Failed to load firmware\n");
+               return -ENOENT;
+       }
+
+       bfr = dma_alloc_coherent(&bcm4377->pdev->dev, fw->size, &fw_dma,
+                                GFP_KERNEL);
+       if (!bfr) {
+               ret = -ENOMEM;
+               goto out_release_fw;
+       }
+
+       memcpy(bfr, fw->data, fw->size);
+
+       iowrite32(0, bcm4377->bar0 + BCM4377_BAR0_HOST_WINDOW_LO);
+       iowrite32(0, bcm4377->bar0 + BCM4377_BAR0_HOST_WINDOW_HI);
+       iowrite32(BCM4377_DMA_MASK,
+                 bcm4377->bar0 + BCM4377_BAR0_HOST_WINDOW_SIZE);
+
+       iowrite32(lower_32_bits(fw_dma), bcm4377->bar2 + BCM4377_BAR2_FW_LO);
+       iowrite32(upper_32_bits(fw_dma), bcm4377->bar2 + BCM4377_BAR2_FW_HI);
+       iowrite32(fw->size, bcm4377->bar2 + BCM4377_BAR2_FW_SIZE);
+       iowrite32(0, bcm4377->bar0 + BCM4377_BAR0_FW_DOORBELL);
+
+       dev_dbg(&bcm4377->pdev->dev, "waiting for firmware to boot\n");
+
+       ret = wait_for_completion_interruptible_timeout(&bcm4377->event,
+                                                       BCM4377_TIMEOUT);
+       if (ret == 0) {
+               ret = -ETIMEDOUT;
+               goto out_dma_free;
+       } else if (ret < 0) {
+               goto out_dma_free;
+       }
+
+       if (bcm4377->bootstage != 2) {
+               dev_err(&bcm4377->pdev->dev, "bootstage %d != 2\n",
+                       bcm4377->bootstage);
+               ret = -ENXIO;
+               goto out_dma_free;
+       }
+
+       dev_dbg(&bcm4377->pdev->dev, "firmware has booted (stage = %x)\n",
+               bcm4377->bootstage);
+       ret = 0;
+
+out_dma_free:
+       dma_free_coherent(&bcm4377->pdev->dev, fw->size, bfr, fw_dma);
+out_release_fw:
+       release_firmware(fw);
+       return ret;
+}
+
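+/*
+ * The RTI bring-up happens in two steps: the device is first asked to enter
+ * state 1, then the IOVA window and the "Converged IPC" context are
+ * configured before requesting state 2, after which the control ring is
+ * ready for use.
+ */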
+static int bcm4377_setup_rti(struct bcm4377_data *bcm4377)
+{
+       int ret;
+
+       dev_dbg(&bcm4377->pdev->dev, "starting RTI\n");
+       iowrite32(1, bcm4377->bar0 + BCM4377_BAR0_RTI_CONTROL);
+
+       ret = wait_for_completion_interruptible_timeout(&bcm4377->event,
+                                                       BCM4377_TIMEOUT);
+       if (ret == 0) {
+               dev_err(&bcm4377->pdev->dev,
+                       "timed out while waiting for RTI to transition to state 1");
+               return -ETIMEDOUT;
+       } else if (ret < 0) {
+               return ret;
+       }
+
+       if (bcm4377->rti_status != 1) {
+               dev_err(&bcm4377->pdev->dev, "RTI did not ack state 1 (%d)\n",
+                       bcm4377->rti_status);
+               return -ENODEV;
+       }
+       dev_dbg(&bcm4377->pdev->dev, "RTI is in state 1\n");
+
+       /* allow access to the entire IOVA space again */
+       iowrite32(0, bcm4377->bar2 + BCM4377_BAR2_RTI_WINDOW_LO);
+       iowrite32(0, bcm4377->bar2 + BCM4377_BAR2_RTI_WINDOW_HI);
+       iowrite32(BCM4377_DMA_MASK,
+                 bcm4377->bar2 + BCM4377_BAR2_RTI_WINDOW_SIZE);
+
+       /* setup "Converged IPC" context */
+       iowrite32(lower_32_bits(bcm4377->ctx_dma),
+                 bcm4377->bar2 + BCM4377_BAR2_CONTEXT_ADDR_LO);
+       iowrite32(upper_32_bits(bcm4377->ctx_dma),
+                 bcm4377->bar2 + BCM4377_BAR2_CONTEXT_ADDR_HI);
+       iowrite32(2, bcm4377->bar0 + BCM4377_BAR0_RTI_CONTROL);
+
+       ret = wait_for_completion_interruptible_timeout(&bcm4377->event,
+                                                       BCM4377_TIMEOUT);
+       if (ret == 0) {
+               dev_err(&bcm4377->pdev->dev,
+                       "timed out while waiting for RTI to transition to state 2");
+               return -ETIMEDOUT;
+       } else if (ret < 0) {
+               return ret;
+       }
+
+       if (bcm4377->rti_status != 2) {
+               dev_err(&bcm4377->pdev->dev, "RTI did not ack state 2 (%d)\n",
+                       bcm4377->rti_status);
+               return -ENODEV;
+       }
+
+       dev_dbg(&bcm4377->pdev->dev,
+               "RTI is in state 2; control ring is ready\n");
+       bcm4377->control_ack_ring.enabled = true;
+
+       return 0;
+}
+
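+/*
+ * The OTP board/chip parameter strings are space-separated "tag=value"
+ * lists, e.g. "V=m s=b3" (hypothetical values); only the 'V' (vendor) and
+ * 's' (stepping) tags are consumed here, presumably to pick the matching
+ * firmware blob.
+ */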
+static int bcm4377_parse_otp_board_params(struct bcm4377_data *bcm4377,
+                                         char tag, const char *val, size_t len)
+{
+       if (tag != 'V')
+               return 0;
+       if (len >= sizeof(bcm4377->vendor))
+               return -EINVAL;
+
+       strscpy(bcm4377->vendor, val, len + 1);
+       return 0;
+}
+
+static int bcm4377_parse_otp_chip_params(struct bcm4377_data *bcm4377, char tag,
+                                        const char *val, size_t len)
+{
+       size_t idx = 0;
+
+       if (tag != 's')
+               return 0;
+       if (len >= sizeof(bcm4377->stepping))
+               return -EINVAL;
+
+       while (len != 0) {
+               bcm4377->stepping[idx] = tolower(val[idx]);
+               if (val[idx] == '\0')
+                       return 0;
+
+               idx++;
+               len--;
+       }
+
+       bcm4377->stepping[idx] = '\0';
+       return 0;
+}
+
+static int bcm4377_parse_otp_str(struct bcm4377_data *bcm4377, const u8 *str,
+                                enum bcm4377_otp_params_type type)
+{
+       const char *p;
+       int ret;
+
+       p = skip_spaces(str);
+       while (*p) {
+               char tag = *p++;
+               const char *end;
+               size_t len;
+
+               if (*p++ != '=') /* implicit NUL check */
+                       return -EINVAL;
+
+               /* *p might be NUL here, if so end == p and len == 0 */
+               end = strchrnul(p, ' ');
+               len = end - p;
+
+               /* leave 1 byte for NUL in destination string */
+               if (len > (BCM4377_OTP_MAX_PARAM_LEN - 1))
+                       return -EINVAL;
+
+               switch (type) {
+               case BCM4377_OTP_BOARD_PARAMS:
+                       ret = bcm4377_parse_otp_board_params(bcm4377, tag, p,
+                                                            len);
+                       break;
+               case BCM4377_OTP_CHIP_PARAMS:
+                       ret = bcm4377_parse_otp_chip_params(bcm4377, tag, p,
+                                                           len);
+                       break;
+               default:
+                       ret = -EINVAL;
+                       break;
+               }
+
+               if (ret)
+                       return ret;
+
+               /* Skip to next arg, if any */
+               p = skip_spaces(end);
+       }
+
+       return 0;
+}
+
+static int bcm4377_parse_otp_sys_vendor(struct bcm4377_data *bcm4377, u8 *otp,
+                                       size_t size)
+{
+       int idx = 4;
+       const char *chip_params;
+       const char *board_params;
+       int ret;
+
+       /* 4-byte header and two empty strings */
+       if (size < 6)
+               return -EINVAL;
+
+       if (get_unaligned_le32(otp) != BCM4377_OTP_VENDOR_HDR)
+               return -EINVAL;
+
+       chip_params = &otp[idx];
+
+       /* Skip first string, including terminator */
+       idx += strnlen(chip_params, size - idx) + 1;
+       if (idx >= size)
+               return -EINVAL;
+
+       board_params = &otp[idx];
+
+       /* Skip to terminator of second string */
+       idx += strnlen(board_params, size - idx);
+       if (idx >= size)
+               return -EINVAL;
+
+       /* At this point both strings are guaranteed NUL-terminated */
+       dev_dbg(&bcm4377->pdev->dev,
+               "OTP: chip_params='%s' board_params='%s'\n", chip_params,
+               board_params);
+
+       ret = bcm4377_parse_otp_str(bcm4377, chip_params,
+                                   BCM4377_OTP_CHIP_PARAMS);
+       if (ret)
+               return ret;
+
+       ret = bcm4377_parse_otp_str(bcm4377, board_params,
+                                   BCM4377_OTP_BOARD_PARAMS);
+       if (ret)
+               return ret;
+
+       if (!bcm4377->stepping[0] || !bcm4377->vendor[0])
+               return -EINVAL;
+
+       dev_dbg(&bcm4377->pdev->dev, "OTP: stepping=%s, vendor=%s\n",
+               bcm4377->stepping, bcm4377->vendor);
+       return 0;
+}
+
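+/*
+ * OTP contents are laid out as a sequence of (type, length, value) records;
+ * only the SYS_VENDOR record is parsed here, everything else is skipped.
+ */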
+static int bcm4377_parse_otp(struct bcm4377_data *bcm4377)
+{
+       u8 *otp;
+       int i;
+       int ret = -ENOENT;
+
+       otp = kzalloc(BCM4377_OTP_SIZE, GFP_KERNEL);
+       if (!otp)
+               return -ENOMEM;
+
+       for (i = 0; i < BCM4377_OTP_SIZE; ++i)
+               otp[i] = ioread8(bcm4377->bar0 + bcm4377->hw->otp_offset + i);
+
+       i = 0;
+       while (i < (BCM4377_OTP_SIZE - 1)) {
+               u8 type = otp[i];
+               u8 length = otp[i + 1];
+
+               if (type == 0)
+                       break;
+
+               if ((i + 2 + length) > BCM4377_OTP_SIZE)
+                       break;
+
+               switch (type) {
+               case BCM4377_OTP_SYS_VENDOR:
+                       dev_dbg(&bcm4377->pdev->dev,
+                               "OTP @ 0x%x (%d): SYS_VENDOR", i, length);
+                       ret = bcm4377_parse_otp_sys_vendor(bcm4377, &otp[i + 2],
+                                                          length);
+                       break;
+               case BCM4377_OTP_CIS:
+                       dev_dbg(&bcm4377->pdev->dev, "OTP @ 0x%x (%d): CIS", i,
+                               length);
+                       break;
+               default:
+                       dev_dbg(&bcm4377->pdev->dev, "OTP @ 0x%x (%d): unknown",
+                               i, length);
+                       break;
+               }
+
+               i += 2 + length;
+       }
+
+       kfree(otp);
+       return ret;
+}
+
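+/*
+ * Program the PCIe config space window registers, which appear to select
+ * the on-chip address ranges exposed through BAR0/BAR2; the values are
+ * per-chip and come from the bcm4377_hw variants table.
+ */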
+static int bcm4377_init_cfg(struct bcm4377_data *bcm4377)
+{
+       int ret;
+       u32 ctrl;
+
+       ret = pci_write_config_dword(bcm4377->pdev,
+                                    BCM4377_PCIECFG_BAR0_WINDOW1,
+                                    bcm4377->hw->bar0_window1);
+       if (ret)
+               return ret;
+
+       ret = pci_write_config_dword(bcm4377->pdev,
+                                    BCM4377_PCIECFG_BAR0_WINDOW2,
+                                    bcm4377->hw->bar0_window2);
+       if (ret)
+               return ret;
+
+       ret = pci_write_config_dword(
+               bcm4377->pdev, BCM4377_PCIECFG_BAR0_CORE2_WINDOW1,
+               BCM4377_PCIECFG_BAR0_CORE2_WINDOW1_DEFAULT);
+       if (ret)
+               return ret;
+
+       if (bcm4377->hw->has_bar0_core2_window2) {
+               ret = pci_write_config_dword(bcm4377->pdev,
+                                            BCM4377_PCIECFG_BAR0_CORE2_WINDOW2,
+                                            bcm4377->hw->bar0_core2_window2);
+               if (ret)
+                       return ret;
+       }
+
+       ret = pci_write_config_dword(bcm4377->pdev, BCM4377_PCIECFG_BAR2_WINDOW,
+                                    BCM4377_PCIECFG_BAR2_WINDOW_DEFAULT);
+       if (ret)
+               return ret;
+
+       ret = pci_read_config_dword(bcm4377->pdev,
+                                   BCM4377_PCIECFG_SUBSYSTEM_CTRL, &ctrl);
+       if (ret)
+               return ret;
+
+       if (bcm4377->hw->clear_pciecfg_subsystem_ctrl_bit19)
+               ctrl &= ~BIT(19);
+       ctrl |= BIT(16);
+
+       return pci_write_config_dword(bcm4377->pdev,
+                                     BCM4377_PCIECFG_SUBSYSTEM_CTRL, ctrl);
+}
+
+static int bcm4377_probe_dmi(struct bcm4377_data *bcm4377)
+{
+       const struct dmi_system_id *board_type_dmi_id;
+
+       board_type_dmi_id = dmi_first_match(bcm4377_dmi_board_table);
+       if (board_type_dmi_id && board_type_dmi_id->driver_data) {
+               bcm4377->board_type = board_type_dmi_id->driver_data;
+               dev_dbg(&bcm4377->pdev->dev,
+                       "found board type via DMI match: %s\n",
+                       bcm4377->board_type);
+       }
+
+       return 0;
+}
+
+static int bcm4377_probe_of(struct bcm4377_data *bcm4377)
+{
+       struct device_node *np = bcm4377->pdev->dev.of_node;
+       int ret;
+
+       if (!np)
+               return 0;
+
+       ret = of_property_read_string(np, "brcm,board-type",
+                                     &bcm4377->board_type);
+       if (ret) {
+               dev_err(&bcm4377->pdev->dev, "no brcm,board-type property\n");
+               return ret;
+       }
+
+       bcm4377->taurus_beamforming_cal_blob =
+               of_get_property(np, "brcm,taurus-bf-cal-blob",
+                               &bcm4377->taurus_beamforming_cal_size);
+       if (!bcm4377->taurus_beamforming_cal_blob) {
+               dev_err(&bcm4377->pdev->dev,
+                       "no brcm,taurus-bf-cal-blob property\n");
+               return -ENOENT;
+       }
+       bcm4377->taurus_cal_blob = of_get_property(np, "brcm,taurus-cal-blob",
+                                                  &bcm4377->taurus_cal_size);
+       if (!bcm4377->taurus_cal_blob) {
+               dev_err(&bcm4377->pdev->dev,
+                       "no brcm,taurus-cal-blob property\n");
+               return -ENOENT;
+       }
+
+       return 0;
+}
+
+static void bcm4377_disable_aspm(struct bcm4377_data *bcm4377)
+{
+       pci_disable_link_state(bcm4377->pdev,
+                              PCIE_LINK_STATE_L0S | PCIE_LINK_STATE_L1);
+
+       /*
+        * pci_disable_link_state can fail either if CONFIG_PCIEASPM is
+        * disabled or if the BIOS hasn't handed control over to us. Due to
+        * hardware errata we must *always* disable ASPM for this device,
+        * though, so also clear the ASPM bits in the link control register
+        * directly.
+        */
+       pcie_capability_clear_word(bcm4377->pdev, PCI_EXP_LNKCTL,
+                                  PCI_EXP_LNKCTL_ASPMC);
+}
+
+static void bcm4377_pci_free_irq_vectors(void *data)
+{
+       pci_free_irq_vectors(data);
+}
+
+static void bcm4377_hci_free_dev(void *data)
+{
+       hci_free_dev(data);
+}
+
+static void bcm4377_hci_unregister_dev(void *data)
+{
+       hci_unregister_dev(data);
+}
+
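+/*
+ * Note that this driver has no .remove callback: every resource is either
+ * devres-managed directly or torn down through the devm_add_action_or_reset()
+ * helpers above, so unbinding unwinds everything in reverse order
+ * automatically.
+ */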
+static int bcm4377_probe(struct pci_dev *pdev, const struct pci_device_id *id)
+{
+       struct bcm4377_data *bcm4377;
+       struct hci_dev *hdev;
+       int ret, irq;
+
+       ret = dma_set_mask_and_coherent(&pdev->dev, BCM4377_DMA_MASK);
+       if (ret)
+               return ret;
+
+       bcm4377 = devm_kzalloc(&pdev->dev, sizeof(*bcm4377), GFP_KERNEL);
+       if (!bcm4377)
+               return -ENOMEM;
+
+       bcm4377->pdev = pdev;
+       bcm4377->hw = &bcm4377_hw_variants[id->driver_data];
+       init_completion(&bcm4377->event);
+
+       ret = bcm4377_prepare_rings(bcm4377);
+       if (ret)
+               return ret;
+
+       ret = bcm4377_init_context(bcm4377);
+       if (ret)
+               return ret;
+
+       ret = bcm4377_probe_dmi(bcm4377);
+       if (ret)
+               return ret;
+       ret = bcm4377_probe_of(bcm4377);
+       if (ret)
+               return ret;
+       if (!bcm4377->board_type) {
+               dev_err(&pdev->dev, "unable to determine board type\n");
+               return -ENODEV;
+       }
+
+       if (bcm4377->hw->disable_aspm)
+               bcm4377_disable_aspm(bcm4377);
+
+       ret = pci_reset_function_locked(pdev);
+       if (ret)
+               dev_warn(
+                       &pdev->dev,
+                       "function level reset failed with %d; trying to continue anyway\n",
+                       ret);
+
+       /*
+        * If this delay is too short and we access any BAR too early the
+        * device will crash. Experiments have shown that approximately
+        * 50 msec is the minimum amount we have to wait. Let's double
+        * that to be safe.
+        */
+       msleep(100);
+
+       ret = pcim_enable_device(pdev);
+       if (ret)
+               return ret;
+       pci_set_master(pdev);
+
+       ret = bcm4377_init_cfg(bcm4377);
+       if (ret)
+               return ret;
+
+       bcm4377->bar0 = pcim_iomap(pdev, 0, 0);
+       if (!bcm4377->bar0)
+               return -EBUSY;
+       bcm4377->bar2 = pcim_iomap(pdev, 2, 0);
+       if (!bcm4377->bar2)
+               return -EBUSY;
+
+       ret = bcm4377_parse_otp(bcm4377);
+       if (ret) {
+               dev_err(&pdev->dev, "Reading OTP failed with %d\n", ret);
+               return ret;
+       }
+
+       /*
+        * Legacy interrupts result in an IRQ storm because we don't know where
+        * the interrupt mask and status registers for these chips are.
+        * MSIs are acked automatically instead.
+        */
+       ret = pci_alloc_irq_vectors(pdev, 1, 1, PCI_IRQ_MSI);
+       if (ret < 0)
+               return -ENODEV;
+       ret = devm_add_action_or_reset(&pdev->dev, bcm4377_pci_free_irq_vectors,
+                                      pdev);
+       if (ret)
+               return ret;
+
+       irq = pci_irq_vector(pdev, 0);
+       if (irq <= 0)
+               return -ENODEV;
+
+       ret = devm_request_irq(&pdev->dev, irq, bcm4377_irq, 0, "bcm4377",
+                              bcm4377);
+       if (ret)
+               return ret;
+
+       hdev = hci_alloc_dev();
+       if (!hdev)
+               return -ENOMEM;
+       ret = devm_add_action_or_reset(&pdev->dev, bcm4377_hci_free_dev, hdev);
+       if (ret)
+               return ret;
+
+       bcm4377->hdev = hdev;
+
+       hdev->bus = HCI_PCI;
+       hdev->dev_type = HCI_PRIMARY;
+       hdev->open = bcm4377_hci_open;
+       hdev->close = bcm4377_hci_close;
+       hdev->send = bcm4377_hci_send_frame;
+       hdev->set_bdaddr = bcm4377_hci_set_bdaddr;
+       hdev->setup = bcm4377_hci_setup;
+
+       set_bit(HCI_QUIRK_USE_BDADDR_PROPERTY, &hdev->quirks);
+       if (bcm4377->hw->broken_mws_transport_config)
+               set_bit(HCI_QUIRK_BROKEN_MWS_TRANSPORT_CONFIG, &hdev->quirks);
+       if (bcm4377->hw->broken_ext_scan)
+               set_bit(HCI_QUIRK_BROKEN_EXT_SCAN, &hdev->quirks);
+
+       pci_set_drvdata(pdev, bcm4377);
+       hci_set_drvdata(hdev, bcm4377);
+       SET_HCIDEV_DEV(hdev, &pdev->dev);
+
+       ret = bcm4377_boot(bcm4377);
+       if (ret)
+               return ret;
+
+       ret = bcm4377_setup_rti(bcm4377);
+       if (ret)
+               return ret;
+
+       ret = hci_register_dev(hdev);
+       if (ret)
+               return ret;
+       return devm_add_action_or_reset(&pdev->dev, bcm4377_hci_unregister_dev,
+                                       hdev);
+}
+
+static int bcm4377_suspend(struct pci_dev *pdev, pm_message_t state)
+{
+       struct bcm4377_data *bcm4377 = pci_get_drvdata(pdev);
+       int ret;
+
+       ret = hci_suspend_dev(bcm4377->hdev);
+       if (ret)
+               return ret;
+
+       iowrite32(BCM4377_BAR0_SLEEP_CONTROL_QUIESCE,
+                 bcm4377->bar0 + BCM4377_BAR0_SLEEP_CONTROL);
+
+       return 0;
+}
+
+static int bcm4377_resume(struct pci_dev *pdev)
+{
+       struct bcm4377_data *bcm4377 = pci_get_drvdata(pdev);
+
+       iowrite32(BCM4377_BAR0_SLEEP_CONTROL_UNQUIESCE,
+                 bcm4377->bar0 + BCM4377_BAR0_SLEEP_CONTROL);
+
+       return hci_resume_dev(bcm4377->hdev);
+}
+
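+/*
+ * On Intel Macs there is no device tree describing the board, so the board
+ * type (presumably used when selecting firmware) is derived from a DMI
+ * product name match instead.
+ */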
+static const struct dmi_system_id bcm4377_dmi_board_table[] = {
+       {
+               .matches = {
+                       DMI_MATCH(DMI_BOARD_VENDOR, "Apple Inc."),
+                       DMI_MATCH(DMI_PRODUCT_NAME, "MacBookAir9,1"),
+               },
+               .driver_data = "apple,formosa",
+       },
+       {
+               .matches = {
+                       DMI_MATCH(DMI_BOARD_VENDOR, "Apple Inc."),
+                       DMI_MATCH(DMI_PRODUCT_NAME, "MacBookPro15,4"),
+               },
+               .driver_data = "apple,formosa",
+       },
+       {
+               .matches = {
+                       DMI_MATCH(DMI_BOARD_VENDOR, "Apple Inc."),
+                       DMI_MATCH(DMI_PRODUCT_NAME, "MacBookPro16,3"),
+               },
+               .driver_data = "apple,formosa",
+       },
+       {}
+};
+
+static const struct bcm4377_hw bcm4377_hw_variants[] = {
+       [BCM4377] = {
+               .id = 0x4377,
+               .otp_offset = 0x4120,
+               .bar0_window1 = 0x1800b000,
+               .bar0_window2 = 0x1810c000,
+               .disable_aspm = true,
+               .broken_ext_scan = true,
+               .send_ptb = bcm4377_send_ptb,
+       },
+
+       [BCM4378] = {
+               .id = 0x4378,
+               .otp_offset = 0x4120,
+               .bar0_window1 = 0x18002000,
+               .bar0_window2 = 0x1810a000,
+               .bar0_core2_window2 = 0x18107000,
+               .has_bar0_core2_window2 = true,
+               .broken_mws_transport_config = true,
+               .send_calibration = bcm4378_send_calibration,
+               .send_ptb = bcm4378_send_ptb,
+       },
+
+       [BCM4387] = {
+               .id = 0x4387,
+               .otp_offset = 0x413c,
+               .bar0_window1 = 0x18002000,
+               .bar0_window2 = 0x18109000,
+               .bar0_core2_window2 = 0x18106000,
+               .has_bar0_core2_window2 = true,
+               .clear_pciecfg_subsystem_ctrl_bit19 = true,
+               .broken_mws_transport_config = true,
+               .send_calibration = bcm4387_send_calibration,
+               .send_ptb = bcm4378_send_ptb,
+       },
+};
+
+#define BCM4377_DEVID_ENTRY(id)                                             \
+       {                                                                   \
+               PCI_VENDOR_ID_BROADCOM, BCM##id##_DEVICE_ID, PCI_ANY_ID,    \
+                       PCI_ANY_ID, PCI_CLASS_NETWORK_OTHER << 8, 0xffff00, \
+                       BCM##id                                             \
+       }
+
+static const struct pci_device_id bcm4377_devid_table[] = {
+       BCM4377_DEVID_ENTRY(4377),
+       BCM4377_DEVID_ENTRY(4378),
+       BCM4377_DEVID_ENTRY(4387),
+       {},
+};
+MODULE_DEVICE_TABLE(pci, bcm4377_devid_table);
+
+static struct pci_driver bcm4377_pci_driver = {
+       .name = "hci_bcm4377",
+       .id_table = bcm4377_devid_table,
+       .probe = bcm4377_probe,
+       .suspend = bcm4377_suspend,
+       .resume = bcm4377_resume,
+};
+module_pci_driver(bcm4377_pci_driver);
+
+MODULE_AUTHOR("Sven Peter <sven@svenpeter.dev>");
+MODULE_DESCRIPTION("Bluetooth support for Broadcom 4377/4378/4387 devices");
+MODULE_LICENSE("Dual MIT/GPL");
+MODULE_FIRMWARE("brcm/brcmbt4377*.bin");
+MODULE_FIRMWARE("brcm/brcmbt4377*.ptb");
+MODULE_FIRMWARE("brcm/brcmbt4378*.bin");
+MODULE_FIRMWARE("brcm/brcmbt4378*.ptb");
+MODULE_FIRMWARE("brcm/brcmbt4387*.bin");
+MODULE_FIRMWARE("brcm/brcmbt4387*.ptb");
index cf4a560..8055f63 100644 (file)
@@ -378,7 +378,7 @@ static void bcsp_pkt_cull(struct bcsp_struct *bcsp)
                i++;
 
                __skb_unlink(skb, &bcsp->unack);
-               kfree_skb(skb);
+               dev_kfree_skb_irq(skb);
        }
 
        if (skb_queue_empty(&bcsp->unack))
index c5a0409..6455bc4 100644 (file)
@@ -313,7 +313,7 @@ static void h5_pkt_cull(struct h5 *h5)
                        break;
 
                __skb_unlink(skb, &h5->unack);
-               kfree_skb(skb);
+               dev_kfree_skb_irq(skb);
        }
 
        if (skb_queue_empty(&h5->unack))
index 4eb420a..5abc01a 100644 (file)
@@ -345,7 +345,7 @@ static int ll_enqueue(struct hci_uart *hu, struct sk_buff *skb)
        default:
                BT_ERR("illegal hcill state: %ld (losing packet)",
                       ll->hcill_state);
-               kfree_skb(skb);
+               dev_kfree_skb_irq(skb);
                break;
        }
 
index 8df1101..3df9e69 100644 (file)
@@ -912,7 +912,7 @@ static int qca_enqueue(struct hci_uart *hu, struct sk_buff *skb)
        default:
                BT_ERR("Illegal tx state: %d (losing packet)",
                       qca->tx_ibs_state);
-               kfree_skb(skb);
+               dev_kfree_skb_irq(skb);
                break;
        }
 
@@ -1765,7 +1765,8 @@ retry:
                qca_debugfs_init(hdev);
                hu->hdev->hw_error = qca_hw_error;
                hu->hdev->cmd_timeout = qca_cmd_timeout;
-               hu->hdev->wakeup = qca_wakeup;
+               if (device_can_wakeup(hu->serdev->ctrl->dev.parent))
+                       hu->hdev->wakeup = qca_wakeup;
        } else if (ret == -ENOENT) {
                /* No patch/nvm-config found, run with original fw/config */
                set_bit(QCA_ROM_FW, &qca->flags);
index fd281d4..c570c45 100644 (file)
@@ -50,8 +50,11 @@ static int virtbt_add_inbuf(struct virtio_bluetooth *vbt)
 
 static int virtbt_open(struct hci_dev *hdev)
 {
-       struct virtio_bluetooth *vbt = hci_get_drvdata(hdev);
+       return 0;
+}
 
+static int virtbt_open_vdev(struct virtio_bluetooth *vbt)
+{
        if (virtbt_add_inbuf(vbt) < 0)
                return -EIO;
 
@@ -61,7 +64,11 @@ static int virtbt_open(struct hci_dev *hdev)
 
 static int virtbt_close(struct hci_dev *hdev)
 {
-       struct virtio_bluetooth *vbt = hci_get_drvdata(hdev);
+       return 0;
+}
+
+static int virtbt_close_vdev(struct virtio_bluetooth *vbt)
+{
        int i;
 
        cancel_work_sync(&vbt->rx);
@@ -306,7 +313,12 @@ static int virtbt_probe(struct virtio_device *vdev)
        if (virtio_has_feature(vdev, VIRTIO_BT_F_VND_HCI)) {
                __u16 vendor;
 
-               virtio_cread(vdev, struct virtio_bt_config, vendor, &vendor);
+               if (virtio_has_feature(vdev, VIRTIO_BT_F_CONFIG_V2))
+                       virtio_cread(vdev, struct virtio_bt_config_v2,
+                                    vendor, &vendor);
+               else
+                       virtio_cread(vdev, struct virtio_bt_config,
+                                    vendor, &vendor);
 
                switch (vendor) {
                case VIRTIO_BT_CONFIG_VENDOR_ZEPHYR:
@@ -339,8 +351,12 @@ static int virtbt_probe(struct virtio_device *vdev)
        if (virtio_has_feature(vdev, VIRTIO_BT_F_MSFT_EXT)) {
                __u16 msft_opcode;
 
-               virtio_cread(vdev, struct virtio_bt_config,
-                            msft_opcode, &msft_opcode);
+               if (virtio_has_feature(vdev, VIRTIO_BT_F_CONFIG_V2))
+                       virtio_cread(vdev, struct virtio_bt_config_v2,
+                                    msft_opcode, &msft_opcode);
+               else
+                       virtio_cread(vdev, struct virtio_bt_config,
+                                    msft_opcode, &msft_opcode);
 
                hci_set_msft_opcode(hdev, msft_opcode);
        }
@@ -354,8 +370,15 @@ static int virtbt_probe(struct virtio_device *vdev)
                goto failed;
        }
 
+       virtio_device_ready(vdev);
+       err = virtbt_open_vdev(vbt);
+       if (err)
+               goto open_failed;
+
        return 0;
 
+open_failed:
+       hci_free_dev(hdev);
 failed:
        vdev->config->del_vqs(vdev);
        return err;
@@ -368,6 +391,7 @@ static void virtbt_remove(struct virtio_device *vdev)
 
        hci_unregister_dev(hdev);
        virtio_reset_device(vdev);
+       virtbt_close_vdev(vbt);
 
        hci_free_dev(hdev);
        vbt->hdev = NULL;
@@ -387,6 +411,7 @@ static const unsigned int virtbt_features[] = {
        VIRTIO_BT_F_VND_HCI,
        VIRTIO_BT_F_MSFT_EXT,
        VIRTIO_BT_F_AOSP_EXT,
+       VIRTIO_BT_F_CONFIG_V2,
 };
 
 static struct virtio_driver virtbt_driver = {
index 1621ce8..d699052 100644 (file)
@@ -401,13 +401,14 @@ int tpm_pm_suspend(struct device *dev)
            !pm_suspend_via_firmware())
                goto suspended;
 
-       if (!tpm_chip_start(chip)) {
+       rc = tpm_try_get_ops(chip);
+       if (!rc) {
                if (chip->flags & TPM_CHIP_FLAG_TPM2)
                        tpm2_shutdown(chip, TPM2_SU_STATE);
                else
                        rc = tpm1_pm_suspend(chip, tpm_suspend_pcr);
 
-               tpm_chip_stop(chip);
+               tpm_put_ops(chip);
        }
 
 suspended:
index b174f72..1687094 100644 (file)
@@ -40,7 +40,7 @@ static const struct clk_pll_characteristics rm9200_pll_characteristics = {
 };
 
 static const struct sck at91rm9200_systemck[] = {
-       { .n = "udpck", .p = "usbck",    .id = 2 },
+       { .n = "udpck", .p = "usbck",    .id = 1 },
        { .n = "uhpck", .p = "usbck",    .id = 4 },
        { .n = "pck0",  .p = "prog0",    .id = 8 },
        { .n = "pck1",  .p = "prog1",    .id = 9 },
index a18ed88..b319878 100644 (file)
@@ -5364,6 +5364,8 @@ static struct clk_branch gcc_ufs_1_card_clkref_clk = {
                .enable_mask = BIT(0),
                .hw.init = &(const struct clk_init_data) {
                        .name = "gcc_ufs_1_card_clkref_clk",
+                       .parent_data = &gcc_parent_data_tcxo,
+                       .num_parents = 1,
                        .ops = &clk_branch2_ops,
                },
        },
@@ -5432,6 +5434,8 @@ static struct clk_branch gcc_ufs_card_clkref_clk = {
                .enable_mask = BIT(0),
                .hw.init = &(const struct clk_init_data) {
                        .name = "gcc_ufs_card_clkref_clk",
+                       .parent_data = &gcc_parent_data_tcxo,
+                       .num_parents = 1,
                        .ops = &clk_branch2_ops,
                },
        },
@@ -5848,6 +5852,8 @@ static struct clk_branch gcc_ufs_ref_clkref_clk = {
                .enable_mask = BIT(0),
                .hw.init = &(const struct clk_init_data) {
                        .name = "gcc_ufs_ref_clkref_clk",
+                       .parent_data = &gcc_parent_data_tcxo,
+                       .num_parents = 1,
                        .ops = &clk_branch2_ops,
                },
        },
index 7cf5e13..0f21a8a 100644 (file)
@@ -11,7 +11,6 @@
 #include <linux/kernel.h>
 #include <linux/ktime.h>
 #include <linux/pm_domain.h>
-#include <linux/pm_runtime.h>
 #include <linux/regmap.h>
 #include <linux/regulator/consumer.h>
 #include <linux/reset-controller.h>
@@ -56,22 +55,6 @@ enum gdsc_status {
        GDSC_ON
 };
 
-static int gdsc_pm_runtime_get(struct gdsc *sc)
-{
-       if (!sc->dev)
-               return 0;
-
-       return pm_runtime_resume_and_get(sc->dev);
-}
-
-static int gdsc_pm_runtime_put(struct gdsc *sc)
-{
-       if (!sc->dev)
-               return 0;
-
-       return pm_runtime_put_sync(sc->dev);
-}
-
 /* Returns 1 if GDSC status is status, 0 if not, and < 0 on error */
 static int gdsc_check_status(struct gdsc *sc, enum gdsc_status status)
 {
@@ -271,8 +254,9 @@ static void gdsc_retain_ff_on(struct gdsc *sc)
        regmap_update_bits(sc->regmap, sc->gdscr, mask, mask);
 }
 
-static int _gdsc_enable(struct gdsc *sc)
+static int gdsc_enable(struct generic_pm_domain *domain)
 {
+       struct gdsc *sc = domain_to_gdsc(domain);
        int ret;
 
        if (sc->pwrsts == PWRSTS_ON)
@@ -328,22 +312,11 @@ static int _gdsc_enable(struct gdsc *sc)
        return 0;
 }
 
-static int gdsc_enable(struct generic_pm_domain *domain)
+static int gdsc_disable(struct generic_pm_domain *domain)
 {
        struct gdsc *sc = domain_to_gdsc(domain);
        int ret;
 
-       ret = gdsc_pm_runtime_get(sc);
-       if (ret)
-               return ret;
-
-       return _gdsc_enable(sc);
-}
-
-static int _gdsc_disable(struct gdsc *sc)
-{
-       int ret;
-
        if (sc->pwrsts == PWRSTS_ON)
                return gdsc_assert_reset(sc);
 
@@ -388,18 +361,6 @@ static int _gdsc_disable(struct gdsc *sc)
        return 0;
 }
 
-static int gdsc_disable(struct generic_pm_domain *domain)
-{
-       struct gdsc *sc = domain_to_gdsc(domain);
-       int ret;
-
-       ret = _gdsc_disable(sc);
-
-       gdsc_pm_runtime_put(sc);
-
-       return ret;
-}
-
 static int gdsc_init(struct gdsc *sc)
 {
        u32 mask, val;
@@ -447,11 +408,6 @@ static int gdsc_init(struct gdsc *sc)
                                return ret;
                }
 
-               /* ...and the power-domain */
-               ret = gdsc_pm_runtime_get(sc);
-               if (ret)
-                       goto err_disable_supply;
-
                /*
                 * Votable GDSCs can be ON due to Vote from other masters.
                 * If a Votable GDSC is ON, make sure we have a Vote.
@@ -459,14 +415,14 @@ static int gdsc_init(struct gdsc *sc)
                if (sc->flags & VOTABLE) {
                        ret = gdsc_update_collapse_bit(sc, false);
                        if (ret)
-                               goto err_put_rpm;
+                               goto err_disable_supply;
                }
 
                /* Turn on HW trigger mode if supported */
                if (sc->flags & HW_CTRL) {
                        ret = gdsc_hwctrl(sc, true);
                        if (ret < 0)
-                               goto err_put_rpm;
+                               goto err_disable_supply;
                }
 
                /*
@@ -496,13 +452,10 @@ static int gdsc_init(struct gdsc *sc)
 
        ret = pm_genpd_init(&sc->pd, NULL, !on);
        if (ret)
-               goto err_put_rpm;
+               goto err_disable_supply;
 
        return 0;
 
-err_put_rpm:
-       if (on)
-               gdsc_pm_runtime_put(sc);
 err_disable_supply:
        if (on && sc->rsupply)
                regulator_disable(sc->rsupply);
@@ -541,8 +494,6 @@ int gdsc_register(struct gdsc_desc *desc,
        for (i = 0; i < num; i++) {
                if (!scs[i])
                        continue;
-               if (pm_runtime_enabled(dev))
-                       scs[i]->dev = dev;
                scs[i]->regmap = regmap;
                scs[i]->rcdev = rcdev;
                ret = gdsc_init(scs[i]);
index 981a12c..8035126 100644 (file)
@@ -30,7 +30,6 @@ struct reset_controller_dev;
  * @resets: ids of resets associated with this gdsc
  * @reset_count: number of @resets
  * @rcdev: reset controller
- * @dev: the device holding the GDSC, used for pm_runtime calls
  */
 struct gdsc {
        struct generic_pm_domain        pd;
@@ -74,7 +73,6 @@ struct gdsc {
 
        const char                      *supply;
        struct regulator                *rsupply;
-       struct device                   *dev;
 };
 
 struct gdsc_desc {
index 273f77d..e6d6cbf 100644 (file)
@@ -81,17 +81,19 @@ MODULE_DEVICE_TABLE(of, exynos_clkout_ids);
 static int exynos_clkout_match_parent_dev(struct device *dev, u32 *mux_mask)
 {
        const struct exynos_clkout_variant *variant;
+       const struct of_device_id *match;
 
        if (!dev->parent) {
                dev_err(dev, "not instantiated from MFD\n");
                return -EINVAL;
        }
 
-       variant = of_device_get_match_data(dev->parent);
-       if (!variant) {
+       match = of_match_device(exynos_clkout_ids, dev->parent);
+       if (!match) {
                dev_err(dev, "cannot match parent device\n");
                return -EINVAL;
        }
+       variant = match->data;
 
        *mux_mask = variant->mux_mask;
 
index 62ce681..0d2a950 100644 (file)
@@ -231,7 +231,7 @@ static const struct samsung_div_clock top_div_clks[] __initconst = {
            CLK_CON_DIV_PLL_SHARED0_DIV2, 0, 1),
        DIV(CLK_DOUT_SHARED0_DIV3, "dout_shared0_div3", "fout_shared0_pll",
            CLK_CON_DIV_PLL_SHARED0_DIV3, 0, 2),
-       DIV(CLK_DOUT_SHARED0_DIV4, "dout_shared0_div4", "fout_shared0_pll",
+       DIV(CLK_DOUT_SHARED0_DIV4, "dout_shared0_div4", "dout_shared0_div2",
            CLK_CON_DIV_PLL_SHARED0_DIV4, 0, 1),
        DIV(CLK_DOUT_SHARED0_DIV5, "dout_shared0_div5", "fout_shared0_pll",
            CLK_CON_DIV_PLL_SHARED0_DIV5, 0, 3),
@@ -239,7 +239,7 @@ static const struct samsung_div_clock top_div_clks[] __initconst = {
            CLK_CON_DIV_PLL_SHARED1_DIV2, 0, 1),
        DIV(CLK_DOUT_SHARED1_DIV3, "dout_shared1_div3", "fout_shared1_pll",
            CLK_CON_DIV_PLL_SHARED1_DIV3, 0, 2),
-       DIV(CLK_DOUT_SHARED1_DIV4, "dout_shared1_div4", "fout_shared1_pll",
+       DIV(CLK_DOUT_SHARED1_DIV4, "dout_shared1_div4", "dout_shared1_div2",
            CLK_CON_DIV_PLL_SHARED1_DIV4, 0, 1),
 
        /* CORE */
index 969a552..a0d66fa 100644 (file)
@@ -51,7 +51,7 @@ static int riscv_clock_next_event(unsigned long delta,
 static unsigned int riscv_clock_event_irq;
 static DEFINE_PER_CPU(struct clock_event_device, riscv_clock_event) = {
        .name                   = "riscv_timer_clockevent",
-       .features               = CLOCK_EVT_FEAT_ONESHOT | CLOCK_EVT_FEAT_C3STOP,
+       .features               = CLOCK_EVT_FEAT_ONESHOT,
        .rating                 = 100,
        .set_next_event         = riscv_clock_next_event,
 };
index 97086fa..903325a 100644 (file)
@@ -8,6 +8,13 @@
 static bool nohmem;
 module_param_named(disable, nohmem, bool, 0444);
 
+static struct resource hmem_active = {
+       .name = "HMEM devices",
+       .start = 0,
+       .end = -1,
+       .flags = IORESOURCE_MEM,
+};
+
 void hmem_register_device(int target_nid, struct resource *r)
 {
        /* define a clean / non-busy resource for the platform device */
@@ -41,6 +48,12 @@ void hmem_register_device(int target_nid, struct resource *r)
                goto out_pdev;
        }
 
+       if (!__request_region(&hmem_active, res.start, resource_size(&res),
+                             dev_name(&pdev->dev), 0)) {
+               dev_dbg(&pdev->dev, "hmem range %pr already active\n", &res);
+               goto out_active;
+       }
+
        pdev->dev.numa_node = numa_map_to_online_node(target_nid);
        info = (struct memregion_info) {
                .target_node = target_nid,
@@ -66,6 +79,8 @@ void hmem_register_device(int target_nid, struct resource *r)
        return;
 
 out_resource:
+       __release_region(&hmem_active, res.start, resource_size(&res));
+out_active:
        platform_device_put(pdev);
 out_pdev:
        memregion_free(id);
@@ -73,15 +88,6 @@ out_pdev:
 
 static __init int hmem_register_one(struct resource *res, void *data)
 {
-       /*
-        * If the resource is not a top-level resource it was already
-        * assigned to a device by the HMAT parsing.
-        */
-       if (res->parent != &iomem_resource) {
-               pr_info("HMEM: skip %pr, already claimed\n", res);
-               return 0;
-       }
-
        hmem_register_device(phys_to_target_node(res->start), res);
 
        return 0;
index 14e6b3e..6f3ded6 100644 (file)
@@ -226,7 +226,10 @@ found:
                ioport_unmap(gp.pm);
                goto out;
        }
+       return 0;
+
 out:
+       pci_dev_put(pdev);
        return err;
 }
 
@@ -234,6 +237,7 @@ static void __exit amd_gpio_exit(void)
 {
        gpiochip_remove(&gp.chip);
        ioport_unmap(gp.pm);
+       pci_dev_put(gp.pdev);
 }
 
 module_init(amd_gpio_init);
index 870910b..200e43a 100644 (file)
@@ -610,6 +610,7 @@ static int rockchip_gpiolib_register(struct rockchip_pin_bank *bank)
                        return -ENODATA;
 
                pctldev = of_pinctrl_get(pctlnp);
+               of_node_put(pctlnp);
                if (!pctldev)
                        return -ENODEV;
 
index 4756ea0..a70522a 100644 (file)
@@ -526,12 +526,13 @@ static int gpiochip_setup_dev(struct gpio_device *gdev)
        if (ret)
                return ret;
 
+       /* From this point, the .release() function cleans up gpio_device */
+       gdev->dev.release = gpiodevice_release;
+
        ret = gpiochip_sysfs_register(gdev);
        if (ret)
                goto err_remove_device;
 
-       /* From this point, the .release() function cleans up gpio_device */
-       gdev->dev.release = gpiodevice_release;
        dev_dbg(&gdev->dev, "registered GPIOs %d to %d on %s\n", gdev->base,
                gdev->base + gdev->ngpio - 1, gdev->chip->label ? : "generic");
 
@@ -597,10 +598,10 @@ int gpiochip_add_data_with_key(struct gpio_chip *gc, void *data,
        struct fwnode_handle *fwnode = NULL;
        struct gpio_device *gdev;
        unsigned long flags;
-       int base = gc->base;
        unsigned int i;
+       u32 ngpios = 0;
+       int base = 0;
        int ret = 0;
-       u32 ngpios;
 
        if (gc->fwnode)
                fwnode = gc->fwnode;
@@ -647,17 +648,12 @@ int gpiochip_add_data_with_key(struct gpio_chip *gc, void *data,
        else
                gdev->owner = THIS_MODULE;
 
-       gdev->descs = kcalloc(gc->ngpio, sizeof(gdev->descs[0]), GFP_KERNEL);
-       if (!gdev->descs) {
-               ret = -ENOMEM;
-               goto err_free_dev_name;
-       }
-
        /*
         * Try the device properties if the driver didn't supply the number
         * of GPIO lines.
         */
-       if (gc->ngpio == 0) {
+       ngpios = gc->ngpio;
+       if (ngpios == 0) {
                ret = device_property_read_u32(&gdev->dev, "ngpios", &ngpios);
                if (ret == -ENODATA)
                        /*
@@ -668,7 +664,7 @@ int gpiochip_add_data_with_key(struct gpio_chip *gc, void *data,
                         */
                        ngpios = 0;
                else if (ret)
-                       goto err_free_descs;
+                       goto err_free_dev_name;
 
                gc->ngpio = ngpios;
        }
@@ -676,13 +672,19 @@ int gpiochip_add_data_with_key(struct gpio_chip *gc, void *data,
        if (gc->ngpio == 0) {
                chip_err(gc, "tried to insert a GPIO chip with zero lines\n");
                ret = -EINVAL;
-               goto err_free_descs;
+               goto err_free_dev_name;
        }
 
        if (gc->ngpio > FASTPATH_NGPIO)
                chip_warn(gc, "line cnt %u is greater than fast path cnt %u\n",
                          gc->ngpio, FASTPATH_NGPIO);
 
+       gdev->descs = kcalloc(gc->ngpio, sizeof(*gdev->descs), GFP_KERNEL);
+       if (!gdev->descs) {
+               ret = -ENOMEM;
+               goto err_free_dev_name;
+       }
+
        gdev->label = kstrdup_const(gc->label ?: "unknown", GFP_KERNEL);
        if (!gdev->label) {
                ret = -ENOMEM;
@@ -701,11 +703,13 @@ int gpiochip_add_data_with_key(struct gpio_chip *gc, void *data,
         * it may be a pipe dream. It will not happen before we get rid
         * of the sysfs interface anyways.
         */
+       base = gc->base;
        if (base < 0) {
                base = gpiochip_find_base(gc->ngpio);
                if (base < 0) {
-                       ret = base;
                        spin_unlock_irqrestore(&gpio_lock, flags);
+                       ret = base;
+                       base = 0;
                        goto err_free_label;
                }
                /*
@@ -816,6 +820,11 @@ err_remove_of_chip:
 err_free_gpiochip_mask:
        gpiochip_remove_pin_ranges(gc);
        gpiochip_free_valid_mask(gc);
+       if (gdev->dev.release) {
+               /* release() has been registered by gpiochip_setup_dev() */
+               put_device(&gdev->dev);
+               goto err_print_message;
+       }
 err_remove_from_list:
        spin_lock_irqsave(&gpio_lock, flags);
        list_del(&gdev->list);
@@ -829,13 +838,14 @@ err_free_dev_name:
 err_free_ida:
        ida_free(&gpio_ida, gdev->id);
 err_free_gdev:
+       kfree(gdev);
+err_print_message:
        /* failures here can mean systems won't boot... */
        if (ret != -EPROBE_DEFER) {
                pr_err("%s: GPIOs %d..%d (%s) failed to register, %d\n", __func__,
-                      gdev->base, gdev->base + gdev->ngpio - 1,
+                      base, base + (int)ngpios - 1,
                       gc->label ? : "generic", ret);
        }
-       kfree(gdev);
        return ret;
 }
 EXPORT_SYMBOL_GPL(gpiochip_add_data_with_key);
index 0b52af4..ce64ca1 100644 (file)
@@ -156,6 +156,9 @@ int amdgpu_vcn_sw_init(struct amdgpu_device *adev)
                break;
        case IP_VERSION(3, 0, 2):
                fw_name = FIRMWARE_VANGOGH;
+               if ((adev->firmware.load_type == AMDGPU_FW_LOAD_PSP) &&
+                   (adev->pg_flags & AMD_PG_SUPPORT_VCN_DPG))
+                       adev->vcn.indirect_sram = true;
                break;
        case IP_VERSION(3, 0, 16):
                fw_name = FIRMWARE_DIMGREY_CAVEFISH;
index 6925e02..f4f3d26 100644 (file)
@@ -5,6 +5,7 @@ menu "Display Engine Configuration"
 config DRM_AMD_DC
        bool "AMD DC - Enable new display engine"
        default y
+       depends on BROKEN || !CC_IS_CLANG || X86_64 || SPARC64 || ARM64
        select SND_HDA_COMPONENT if SND_HDA_CORE
        select DRM_AMD_DC_DCN if (X86 || PPC_LONG_DOUBLE_128)
        help
@@ -12,6 +13,12 @@ config DRM_AMD_DC
          support for AMDGPU. This adds required support for Vega and
          Raven ASICs.
 
+         calculate_bandwidth() is presently broken on all !(X86_64 || SPARC64 || ARM64)
+         architectures built with Clang (all released versions), whereby the stack
+         frame gets blown up to well over 5k.  This would cause an immediate kernel
+         panic on most architectures.  We'll revert this when the following bug report
+         has been resolved: https://github.com/llvm/llvm-project/issues/41896.
+
 config DRM_AMD_DC_DCN
        def_bool n
        help
index 461c62c..de77054 100644 (file)
@@ -3723,12 +3723,16 @@ out:
 
 static u8 bigjoiner_pipes(struct drm_i915_private *i915)
 {
+       u8 pipes;
+
        if (DISPLAY_VER(i915) >= 12)
-               return BIT(PIPE_A) | BIT(PIPE_B) | BIT(PIPE_C) | BIT(PIPE_D);
+               pipes = BIT(PIPE_A) | BIT(PIPE_B) | BIT(PIPE_C) | BIT(PIPE_D);
        else if (DISPLAY_VER(i915) >= 11)
-               return BIT(PIPE_B) | BIT(PIPE_C);
+               pipes = BIT(PIPE_B) | BIT(PIPE_C);
        else
-               return 0;
+               pipes = 0;
+
+       return pipes & RUNTIME_INFO(i915)->pipe_mask;
 }
 
 static bool transcoder_ddi_func_is_enabled(struct drm_i915_private *dev_priv,
index d0b03a9..7caa341 100644 (file)
@@ -625,8 +625,13 @@ int intel_gt_wait_for_idle(struct intel_gt *gt, long timeout)
                        return -EINTR;
        }
 
-       return timeout ? timeout : intel_uc_wait_for_idle(&gt->uc,
-                                                         remaining_timeout);
+       if (timeout)
+               return timeout;
+
+       if (remaining_timeout < 0)
+               remaining_timeout = 0;
+
+       return intel_uc_wait_for_idle(&gt->uc, remaining_timeout);
 }
 
 int intel_gt_init(struct intel_gt *gt)
@@ -1017,6 +1022,11 @@ static void mmio_invalidate_full(struct intel_gt *gt)
                if (!i915_mmio_reg_offset(rb.reg))
                        continue;
 
+               if (GRAPHICS_VER(i915) == 12 && (engine->class == VIDEO_DECODE_CLASS ||
+                   engine->class == VIDEO_ENHANCEMENT_CLASS ||
+                   engine->class == COMPUTE_CLASS))
+                       rb.bit = _MASKED_BIT_ENABLE(rb.bit);
+
                intel_uncore_write_fw(uncore, rb.reg, rb.bit);
                awake |= engine->mask;
        }
index edb881d..1dfd016 100644 (file)
@@ -199,7 +199,7 @@ out_active: spin_lock(&timelines->lock);
        if (remaining_timeout)
                *remaining_timeout = timeout;
 
-       return active_count ? timeout : 0;
+       return active_count ? timeout ?: -ETIME : 0;
 }
 
 static void retire_work_handler(struct work_struct *work)
index 2403ccd..bba8cb6 100644 (file)
@@ -471,8 +471,7 @@ static int xelpdp_get_dram_info(struct drm_i915_private *i915)
        u32 val = intel_uncore_read(&i915->uncore, MTL_MEM_SS_INFO_GLOBAL);
        struct dram_info *dram_info = &i915->dram_info;
 
-       val = REG_FIELD_GET(MTL_DDR_TYPE_MASK, val);
-       switch (val) {
+       switch (REG_FIELD_GET(MTL_DDR_TYPE_MASK, val)) {
        case 0:
                dram_info->type = INTEL_DRAM_DDR4;
                break;
index 9c1d31f..bd47628 100644 (file)
@@ -1315,6 +1315,9 @@ static s32 snto32(__u32 value, unsigned n)
        if (!value || !n)
                return 0;
 
+       if (n > 32)
+               n = 32;
+
        switch (n) {
        case 8:  return ((__s8)value);
        case 16: return ((__s16)value);
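
The added clamp bounds n before the switch: for widths other than 8/16/32 the default path (outside this hunk) presumably shifts by n, and shifting a 32-bit value by 32 or more bits is undefined behaviour in C. A standalone sketch of clamped n-bit sign extension; snto32_demo is a hypothetical name, not the kernel helper:

#include <stdint.h>

static int32_t snto32_demo(uint32_t value, unsigned int n)
{
	if (!value || !n)
		return 0;
	if (n > 32)
		n = 32;
	/* Move the field's sign bit to bit 31, then arithmetic-shift back
	 * (relies on the usual implementation-defined signed right shift). */
	return (int32_t)(value << (32 - n)) >> (32 - n);
}

/* e.g. snto32_demo(0x80, 8) == -128, snto32_demo(0x7f, 8) == 127 */
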
index dad953f..8f58c3c 100644 (file)
 #define USB_DEVICE_ID_CH_AXIS_295      0x001c
 
 #define USB_VENDOR_ID_CHERRY           0x046a
+#define USB_DEVICE_ID_CHERRY_MOUSE_000C        0x000c
 #define USB_DEVICE_ID_CHERRY_CYMOTION  0x0023
 #define USB_DEVICE_ID_CHERRY_CYMOTION_SOLAR    0x0027
 
 #define USB_DEVICE_ID_MS_XBOX_ONE_S_CONTROLLER 0x02fd
 #define USB_DEVICE_ID_MS_PIXART_MOUSE    0x00cb
 #define USB_DEVICE_ID_8BITDO_SN30_PRO_PLUS      0x02e0
+#define USB_DEVICE_ID_MS_MOUSE_0783      0x0783
 
 #define USB_VENDOR_ID_MOJO             0x8282
 #define USB_DEVICE_ID_RETRO_ADAPTER    0x3201
 #define USB_DEVICE_ID_SYNAPTICS_DELL_K15A      0x6e21
 #define USB_DEVICE_ID_SYNAPTICS_ACER_ONE_S1002 0x73f4
 #define USB_DEVICE_ID_SYNAPTICS_ACER_ONE_S1003 0x73f5
+#define USB_DEVICE_ID_SYNAPTICS_ACER_SWITCH5_017       0x73f6
 #define USB_DEVICE_ID_SYNAPTICS_ACER_SWITCH5   0x81a7
 
 #define USB_VENDOR_ID_TEXAS_INSTRUMENTS        0x2047
 
 #define USB_VENDOR_ID_PRIMAX   0x0461
 #define USB_DEVICE_ID_PRIMAX_MOUSE_4D22        0x4d22
+#define USB_DEVICE_ID_PRIMAX_MOUSE_4E2A        0x4e2a
 #define USB_DEVICE_ID_PRIMAX_KEYBOARD  0x4e05
 #define USB_DEVICE_ID_PRIMAX_REZEL     0x4e72
 #define USB_DEVICE_ID_PRIMAX_PIXART_MOUSE_4D0F 0x4d0f
index 430fa4f..75ebfcf 100644 (file)
@@ -121,6 +121,11 @@ static const struct hid_device_id ite_devices[] = {
                     USB_VENDOR_ID_SYNAPTICS,
                     USB_DEVICE_ID_SYNAPTICS_ACER_ONE_S1003),
          .driver_data = QUIRK_TOUCHPAD_ON_OFF_REPORT },
+       /* ITE8910 USB kbd ctlr, with Synaptics touchpad connected to it. */
+       { HID_DEVICE(BUS_USB, HID_GROUP_GENERIC,
+                    USB_VENDOR_ID_SYNAPTICS,
+                    USB_DEVICE_ID_SYNAPTICS_ACER_SWITCH5_017),
+         .driver_data = QUIRK_TOUCHPAD_ON_OFF_REPORT },
        { }
 };
 MODULE_DEVICE_TABLE(hid, ite_devices);
index 5e6a0ce..e3fcf13 100644 (file)
@@ -872,6 +872,12 @@ static ssize_t lg4ff_alternate_modes_store(struct device *dev, struct device_att
                return -ENOMEM;
 
        i = strlen(lbuf);
+
+       if (i == 0) {
+               kfree(lbuf);
+               return -EINVAL;
+       }
+
        if (lbuf[i-1] == '\n') {
                if (i == 1) {
                        kfree(lbuf);
index 71a9c25..8a2aac1 100644 (file)
@@ -4269,21 +4269,6 @@ static void hidpp_remove(struct hid_device *hdev)
        mutex_destroy(&hidpp->send_mutex);
 }
 
-static const struct hid_device_id unhandled_hidpp_devices[] = {
-       /* Logitech Harmony Adapter for PS3, handled in hid-sony */
-       { HID_BLUETOOTH_DEVICE(USB_VENDOR_ID_LOGITECH, USB_DEVICE_ID_LOGITECH_HARMONY_PS3) },
-       /* Handled in hid-generic */
-       { HID_BLUETOOTH_DEVICE(USB_VENDOR_ID_LOGITECH, USB_DEVICE_ID_LOGITECH_DINOVO_EDGE_KBD) },
-       {}
-};
-
-static bool hidpp_match(struct hid_device *hdev,
-                       bool ignore_special_driver)
-{
-       /* Refuse to handle devices handled by other HID drivers */
-       return !hid_match_id(hdev, unhandled_hidpp_devices);
-}
-
 #define LDJ_DEVICE(product) \
        HID_DEVICE(BUS_USB, HID_GROUP_LOGITECH_DJ_DEVICE, \
                   USB_VENDOR_ID_LOGITECH, (product))
@@ -4367,9 +4352,15 @@ static const struct hid_device_id hidpp_devices[] = {
        { /* MX5500 keyboard over Bluetooth */
          HID_BLUETOOTH_DEVICE(USB_VENDOR_ID_LOGITECH, 0xb30b),
          .driver_data = HIDPP_QUIRK_HIDPP_CONSUMER_VENDOR_KEYS },
-
-       { /* And try to enable HID++ for all the Logitech Bluetooth devices */
-         HID_DEVICE(BUS_BLUETOOTH, HID_GROUP_ANY, USB_VENDOR_ID_LOGITECH, HID_ANY_ID) },
+       { /* M-RCQ142 V470 Cordless Laser Mouse over Bluetooth */
+         HID_BLUETOOTH_DEVICE(USB_VENDOR_ID_LOGITECH, 0xb008) },
+       { /* MX Master mouse over Bluetooth */
+         HID_BLUETOOTH_DEVICE(USB_VENDOR_ID_LOGITECH, 0xb012) },
+       { /* MX Ergo trackball over Bluetooth */
+         HID_BLUETOOTH_DEVICE(USB_VENDOR_ID_LOGITECH, 0xb01d) },
+       { HID_BLUETOOTH_DEVICE(USB_VENDOR_ID_LOGITECH, 0xb01e) },
+       { /* MX Master 3 mouse over Bluetooth */
+         HID_BLUETOOTH_DEVICE(USB_VENDOR_ID_LOGITECH, 0xb023) },
        {}
 };
 
@@ -4383,7 +4374,6 @@ static const struct hid_usage_id hidpp_usages[] = {
 static struct hid_driver hidpp_driver = {
        .name = "logitech-hidpp-device",
        .id_table = hidpp_devices,
-       .match = hidpp_match,
        .report_fixup = hidpp_report_fixup,
        .probe = hidpp_probe,
        .remove = hidpp_remove,
index 50e1c71..0e9702c 100644 (file)
@@ -54,6 +54,7 @@ static const struct hid_device_id hid_quirks[] = {
        { HID_USB_DEVICE(USB_VENDOR_ID_CH, USB_DEVICE_ID_CH_FLIGHT_SIM_YOKE), HID_QUIRK_NOGET },
        { HID_USB_DEVICE(USB_VENDOR_ID_CH, USB_DEVICE_ID_CH_PRO_PEDALS), HID_QUIRK_NOGET },
        { HID_USB_DEVICE(USB_VENDOR_ID_CH, USB_DEVICE_ID_CH_PRO_THROTTLE), HID_QUIRK_NOGET },
+       { HID_USB_DEVICE(USB_VENDOR_ID_CHERRY, USB_DEVICE_ID_CHERRY_MOUSE_000C), HID_QUIRK_ALWAYS_POLL },
        { HID_USB_DEVICE(USB_VENDOR_ID_CORSAIR, USB_DEVICE_ID_CORSAIR_K65RGB), HID_QUIRK_NO_INIT_REPORTS },
        { HID_USB_DEVICE(USB_VENDOR_ID_CORSAIR, USB_DEVICE_ID_CORSAIR_K65RGB_RAPIDFIRE), HID_QUIRK_NO_INIT_REPORTS | HID_QUIRK_ALWAYS_POLL },
        { HID_USB_DEVICE(USB_VENDOR_ID_CORSAIR, USB_DEVICE_ID_CORSAIR_K70RGB), HID_QUIRK_NO_INIT_REPORTS },
@@ -122,6 +123,7 @@ static const struct hid_device_id hid_quirks[] = {
        { HID_USB_DEVICE(USB_VENDOR_ID_LOGITECH, USB_DEVICE_ID_LOGITECH_MOUSE_C05A), HID_QUIRK_ALWAYS_POLL },
        { HID_USB_DEVICE(USB_VENDOR_ID_LOGITECH, USB_DEVICE_ID_LOGITECH_MOUSE_C06A), HID_QUIRK_ALWAYS_POLL },
        { HID_USB_DEVICE(USB_VENDOR_ID_MCS, USB_DEVICE_ID_MCS_GAMEPADBLOCK), HID_QUIRK_MULTI_INPUT },
+       { HID_USB_DEVICE(USB_VENDOR_ID_MICROSOFT, USB_DEVICE_ID_MS_MOUSE_0783), HID_QUIRK_ALWAYS_POLL },
        { HID_USB_DEVICE(USB_VENDOR_ID_MICROSOFT, USB_DEVICE_ID_MS_PIXART_MOUSE), HID_QUIRK_ALWAYS_POLL },
        { HID_USB_DEVICE(USB_VENDOR_ID_MICROSOFT, USB_DEVICE_ID_MS_POWER_COVER), HID_QUIRK_NO_INIT_REPORTS },
        { HID_USB_DEVICE(USB_VENDOR_ID_MICROSOFT, USB_DEVICE_ID_MS_SURFACE3_COVER), HID_QUIRK_NO_INIT_REPORTS },
@@ -146,6 +148,7 @@ static const struct hid_device_id hid_quirks[] = {
        { HID_USB_DEVICE(USB_VENDOR_ID_PIXART, USB_DEVICE_ID_PIXART_OPTICAL_TOUCH_SCREEN), HID_QUIRK_NO_INIT_REPORTS },
        { HID_USB_DEVICE(USB_VENDOR_ID_PIXART, USB_DEVICE_ID_PIXART_USB_OPTICAL_MOUSE), HID_QUIRK_ALWAYS_POLL },
        { HID_USB_DEVICE(USB_VENDOR_ID_PRIMAX, USB_DEVICE_ID_PRIMAX_MOUSE_4D22), HID_QUIRK_ALWAYS_POLL },
+       { HID_USB_DEVICE(USB_VENDOR_ID_PRIMAX, USB_DEVICE_ID_PRIMAX_MOUSE_4E2A), HID_QUIRK_ALWAYS_POLL },
        { HID_USB_DEVICE(USB_VENDOR_ID_PRIMAX, USB_DEVICE_ID_PRIMAX_PIXART_MOUSE_4D0F), HID_QUIRK_ALWAYS_POLL },
        { HID_USB_DEVICE(USB_VENDOR_ID_PRIMAX, USB_DEVICE_ID_PRIMAX_PIXART_MOUSE_4D65), HID_QUIRK_ALWAYS_POLL },
        { HID_USB_DEVICE(USB_VENDOR_ID_PRIMAX, USB_DEVICE_ID_PRIMAX_PIXART_MOUSE_4E22), HID_QUIRK_ALWAYS_POLL },
index 0fbc408..7fa6fe0 100644 (file)
@@ -192,6 +192,7 @@ static int uclogic_probe(struct hid_device *hdev,
         * than the pen, so use QUIRK_MULTI_INPUT for all tablets.
         */
        hdev->quirks |= HID_QUIRK_MULTI_INPUT;
+       hdev->quirks |= HID_QUIRK_HIDINPUT_FORCE;
 
        /* Allocate and assign driver data */
        drvdata = devm_kzalloc(&hdev->dev, sizeof(*drvdata), GFP_KERNEL);
index 4bd54c4..6b73eb0 100644 (file)
@@ -1193,7 +1193,7 @@ __u8 *uclogic_rdesc_template_apply(const __u8 *template_ptr,
                           p[sizeof(btn_head)] < param_num) {
                        v = param_list[p[sizeof(btn_head)]];
                        put_unaligned((__u8)0x2A, p); /* Usage Maximum */
-                       put_unaligned_le16((__force u16)cpu_to_le16(v), p + 1);
+                       put_unaligned((__force u16)cpu_to_le16(v), (s16 *)(p + 1));
                        p += sizeof(btn_head) + 1;
                } else {
                        p++;
index 5273ee2..d65abe6 100644 (file)
@@ -66,6 +66,6 @@ endmenu
 
 config I2C_HID_CORE
        tristate
-       default y if I2C_HID_ACPI=y || I2C_HID_OF=y || I2C_HID_OF_GOODIX=y
-       default m if I2C_HID_ACPI=m || I2C_HID_OF=m || I2C_HID_OF_GOODIX=m
+       default y if I2C_HID_ACPI=y || I2C_HID_OF=y || I2C_HID_OF_ELAN=y || I2C_HID_OF_GOODIX=y
+       default m if I2C_HID_ACPI=m || I2C_HID_OF=m || I2C_HID_OF_ELAN=m || I2C_HID_OF_GOODIX=m
        select HID
index 81e6889..a901e4e 100644 (file)
@@ -938,6 +938,8 @@ static int asus_ec_probe(struct platform_device *pdev)
        ec_data->nr_sensors = hweight_long(ec_data->board_info->sensors);
        ec_data->sensors = devm_kcalloc(dev, ec_data->nr_sensors,
                                        sizeof(struct ec_sensor), GFP_KERNEL);
+       if (!ec_data->sensors)
+               return -ENOMEM;
 
        status = setup_lock_data(dev);
        if (status) {
index 8bf32c6..9bee4d3 100644 (file)
@@ -242,10 +242,13 @@ static int adjust_tjmax(struct cpuinfo_x86 *c, u32 id, struct device *dev)
         */
        if (host_bridge && host_bridge->vendor == PCI_VENDOR_ID_INTEL) {
                for (i = 0; i < ARRAY_SIZE(tjmax_pci_table); i++) {
-                       if (host_bridge->device == tjmax_pci_table[i].device)
+                       if (host_bridge->device == tjmax_pci_table[i].device) {
+                               pci_dev_put(host_bridge);
                                return tjmax_pci_table[i].tjmax;
+                       }
                }
        }
+       pci_dev_put(host_bridge);
 
        for (i = 0; i < ARRAY_SIZE(tjmax_table); i++) {
                if (strstr(c->x86_model_id, tjmax_table[i].id))
@@ -533,6 +536,10 @@ static void coretemp_remove_core(struct platform_data *pdata, int indx)
 {
        struct temp_data *tdata = pdata->core_data[indx];
 
+       /* if we errored on add then this is already gone */
+       if (!tdata)
+               return;
+
        /* Remove the sysfs attributes */
        sysfs_remove_group(&pdata->hwmon_dev->kobj, &tdata->attr_group);
 
index 05f68e9..23b9f94 100644 (file)
@@ -117,7 +117,7 @@ static int i5500_temp_probe(struct pci_dev *pdev,
        u32 tstimer;
        s8 tsfsc;
 
-       err = pci_enable_device(pdev);
+       err = pcim_enable_device(pdev);
        if (err) {
                dev_err(&pdev->dev, "Failed to enable device\n");
                return err;
index f6ec165..1837ccc 100644 (file)
@@ -502,6 +502,7 @@ static void ibmpex_register_bmc(int iface, struct device *dev)
        return;
 
 out_register:
+       list_del(&data->list);
        hwmon_device_unregister(data->hwmon_dev);
 out_user:
        ipmi_destroy_user(data->user);
index 2a57f4b..e061869 100644 (file)
@@ -228,7 +228,7 @@ static int ina3221_read_value(struct ina3221_data *ina, unsigned int reg,
         * Shunt Voltage Sum register has 14-bit value with 1-bit shift
         * Other Shunt Voltage registers have 12 bits with 3-bit shift
         */
-       if (reg == INA3221_SHUNT_SUM)
+       if (reg == INA3221_SHUNT_SUM || reg == INA3221_CRIT_SUM)
                *val = sign_extend32(regval >> 1, 14);
        else
                *val = sign_extend32(regval >> 3, 12);
@@ -465,7 +465,7 @@ static int ina3221_write_curr(struct device *dev, u32 attr,
         *     SHUNT_SUM: (1 / 40uV) << 1 = 1 / 20uV
         *     SHUNT[1-3]: (1 / 40uV) << 3 = 1 / 5uV
         */
-       if (reg == INA3221_SHUNT_SUM)
+       if (reg == INA3221_SHUNT_SUM || reg == INA3221_CRIT_SUM)
                regval = DIV_ROUND_CLOSEST(voltage_uv, 20) & 0xfffe;
        else
                regval = DIV_ROUND_CLOSEST(voltage_uv, 5) & 0xfff8;
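
For reference, the kernel's sign_extend32(value, index) treats bit index as the field's sign bit. A userspace re-implementation showing the decode arithmetic used above (sign bit at index 14 after the 1-bit shift for the sum registers, index 12 after the 3-bit shift for the per-channel registers); the _demo helpers are local stand-ins:

#include <stdint.h>

static int32_t sign_extend32_demo(uint32_t value, int index)
{
	uint8_t shift = 31 - index;

	return (int32_t)(value << shift) >> shift;
}

/* Sum registers: signed value lives in bits 15:1. */
static int32_t decode_shunt_sum(uint16_t regval)
{
	return sign_extend32_demo(regval >> 1, 14);
}

/* Per-channel shunt registers: signed value lives in bits 15:3. */
static int32_t decode_shunt(uint16_t regval)
{
	return sign_extend32_demo(regval >> 3, 12);
}
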
index 7404e97..2dbbbac 100644 (file)
@@ -396,7 +396,7 @@ static int ltc2947_read_temp(struct device *dev, const u32 attr, long *val,
                return ret;
 
        /* in millidegrees Celsius, temp is given by: */
-       *val = (__val * 204) + 550;
+       *val = (__val * 204) + 5500;
 
        return 0;
 }
index fe0cd20..f58943c 100644 (file)
@@ -852,7 +852,8 @@ static int cdns_i2c_master_xfer(struct i2c_adapter *adap, struct i2c_msg *msgs,
                                         CDNS_I2C_POLL_US, CDNS_I2C_TIMEOUT_US);
        if (ret) {
                ret = -EAGAIN;
-               i2c_recover_bus(adap);
+               if (id->adap.bus_recovery_info)
+                       i2c_recover_bus(adap);
                goto out;
        }
 
@@ -1263,8 +1264,13 @@ static int cdns_i2c_probe(struct platform_device *pdev)
 
        id->rinfo.pinctrl = devm_pinctrl_get(&pdev->dev);
        if (IS_ERR(id->rinfo.pinctrl)) {
+               int err = PTR_ERR(id->rinfo.pinctrl);
+
                dev_info(&pdev->dev, "can't get pinctrl, bus recovery not supported\n");
-               return PTR_ERR(id->rinfo.pinctrl);
+               if (err != -ENODEV)
+                       return err;
+       } else {
+               id->adap.bus_recovery_info = &id->rinfo;
        }
 
        id->membase = devm_platform_get_and_ioremap_resource(pdev, 0, &r_mem);
@@ -1283,7 +1289,6 @@ static int cdns_i2c_probe(struct platform_device *pdev)
        id->adap.retries = 3;           /* Default retry value. */
        id->adap.algo_data = id;
        id->adap.dev.parent = &pdev->dev;
-       id->adap.bus_recovery_info = &id->rinfo;
        init_completion(&id->xfer_done);
        snprintf(id->adap.name, sizeof(id->adap.name),
                 "Cadence I2C at %08lx", (unsigned long)r_mem->start);
index 3082183..fc70920 100644 (file)
@@ -1132,7 +1132,8 @@ static int i2c_imx_read(struct imx_i2c_struct *i2c_imx, struct i2c_msg *msgs,
        int i, result;
        unsigned int temp;
        int block_data = msgs->flags & I2C_M_RECV_LEN;
-       int use_dma = i2c_imx->dma && msgs->len >= DMA_THRESHOLD && !block_data;
+       int use_dma = i2c_imx->dma && msgs->flags & I2C_M_DMA_SAFE &&
+               msgs->len >= DMA_THRESHOLD && !block_data;
 
        dev_dbg(&i2c_imx->adapter.dev,
                "<%s> write slave address: addr=0x%x\n",
@@ -1298,7 +1299,8 @@ static int i2c_imx_xfer_common(struct i2c_adapter *adapter,
                        result = i2c_imx_read(i2c_imx, &msgs[i], is_lastmsg, atomic);
                } else {
                        if (!atomic &&
-                           i2c_imx->dma && msgs[i].len >= DMA_THRESHOLD)
+                           i2c_imx->dma && msgs[i].len >= DMA_THRESHOLD &&
+                               msgs[i].flags & I2C_M_DMA_SAFE)
                                result = i2c_imx_dma_write(i2c_imx, &msgs[i]);
                        else
                                result = i2c_imx_write(i2c_imx, &msgs[i], atomic);
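
Both hunks above gate the DMA path on I2C_M_DMA_SAFE, since arbitrary client buffers (stack, vmalloc) must not be mapped for DMA. A driver that still wants DMA for unmarked messages can bounce them through the core helpers instead; a sketch under the assumption that i2c_get_dma_safe_msg_buf()/i2c_put_dma_safe_msg_buf() keep their <linux/i2c.h> semantics, with xfer_pio()/xfer_dma() as hypothetical driver paths:

static int xfer_one_msg(struct i2c_msg *msg)
{
	u8 *buf;
	int err;

	/* Returns msg->buf when I2C_M_DMA_SAFE is set, a kmalloc'ed bounce
	 * buffer for long unsafe messages, or NULL (short message / OOM). */
	buf = i2c_get_dma_safe_msg_buf(msg, DMA_THRESHOLD);
	if (!buf)
		return xfer_pio(msg);

	err = xfer_dma(msg, buf);

	/* Last argument: data was transferred, so reads are copied back. */
	i2c_put_dma_safe_msg_buf(buf, msg, err == 0);
	return err;
}
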
index 0c365b5..8345735 100644 (file)
@@ -2393,8 +2393,17 @@ static struct platform_driver npcm_i2c_bus_driver = {
 
 static int __init npcm_i2c_init(void)
 {
+       int ret;
+
        npcm_i2c_debugfs_dir = debugfs_create_dir("npcm_i2c", NULL);
-       return platform_driver_register(&npcm_i2c_bus_driver);
+
+       ret = platform_driver_register(&npcm_i2c_bus_driver);
+       if (ret) {
+               debugfs_remove_recursive(npcm_i2c_debugfs_dir);
+               return ret;
+       }
+
+       return 0;
 }
 module_init(npcm_i2c_init);
 
index 84a7751..8fce98b 100644 (file)
@@ -626,7 +626,6 @@ static int geni_i2c_gpi_xfer(struct geni_i2c_dev *gi2c, struct i2c_msg msgs[], i
                        dev_err(gi2c->se.dev, "I2C timeout gpi flags:%d addr:0x%x\n",
                                gi2c->cur->flags, gi2c->cur->addr);
                        gi2c->err = -ETIMEDOUT;
-                       goto err;
                }
 
                if (gi2c->err) {
index 9aa7b9d..13fafb7 100644 (file)
@@ -467,6 +467,7 @@ static int i2c_device_probe(struct device *dev)
 {
        struct i2c_client       *client = i2c_verify_client(dev);
        struct i2c_driver       *driver;
+       bool do_power_on;
        int status;
 
        if (!client)
@@ -545,8 +546,8 @@ static int i2c_device_probe(struct device *dev)
        if (status < 0)
                goto err_clear_wakeup_irq;
 
-       status = dev_pm_domain_attach(&client->dev,
-                                     !i2c_acpi_waive_d0_probe(dev));
+       do_power_on = !i2c_acpi_waive_d0_probe(dev);
+       status = dev_pm_domain_attach(&client->dev, do_power_on);
        if (status)
                goto err_clear_wakeup_irq;
 
@@ -585,7 +586,7 @@ static int i2c_device_probe(struct device *dev)
 err_release_driver_resources:
        devres_release_group(&client->dev, client->devres_group_id);
 err_detach_pm_domain:
-       dev_pm_domain_detach(&client->dev, !i2c_acpi_waive_d0_probe(dev));
+       dev_pm_domain_detach(&client->dev, do_power_on);
 err_clear_wakeup_irq:
        dev_pm_clear_wake_irq(&client->dev);
        device_init_wakeup(&client->dev, false);
@@ -610,7 +611,7 @@ static void i2c_device_remove(struct device *dev)
 
        devres_release_group(&client->dev, client->devres_group_id);
 
-       dev_pm_domain_detach(&client->dev, !i2c_acpi_waive_d0_probe(dev));
+       dev_pm_domain_detach(&client->dev, true);
 
        dev_pm_clear_wake_irq(&client->dev);
        device_init_wakeup(&client->dev, false);
index 3a49529..3d9c575 100644 (file)
@@ -211,12 +211,14 @@ static int raydium_i2c_send(struct i2c_client *client,
 
                error = raydium_i2c_xfer(client, addr, xfer, ARRAY_SIZE(xfer));
                if (likely(!error))
-                       return 0;
+                       goto out;
 
                msleep(RM_RETRY_DELAY_MS);
        } while (++tries < RM_MAX_RETRIES);
 
        dev_err(&client->dev, "%s failed: %d\n", __func__, error);
+out:
+       kfree(tx_buf);
        return error;
 }
 
index 5a8f780..bc94059 100644 (file)
@@ -820,6 +820,7 @@ int __init dmar_dev_scope_init(void)
                        info = dmar_alloc_pci_notify_info(dev,
                                        BUS_NOTIFY_ADD_DEVICE);
                        if (!info) {
+                               pci_dev_put(dev);
                                return dmar_dev_scope_status;
                        } else {
                                dmar_pci_bus_add_dev(info);
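
This is one of several hunks in the series (see also the coretemp hunk above and has_external_pci below) fixing the same reference-count rule: for_each_pci_dev() takes a reference on the current device and drops it when the loop advances, so any early exit still holds one and must pci_dev_put() it. A minimal sketch of the pattern, with wanted() as a hypothetical predicate:

static bool any_wanted_device(void)
{
	struct pci_dev *pdev = NULL;

	for_each_pci_dev(pdev) {
		if (wanted(pdev)) {
			/* Early exit: drop the reference the iterator
			 * still holds on pdev (or hand it to the caller). */
			pci_dev_put(pdev);
			return true;
		}
	}
	return false;
}
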
index 996a8b5..5287efe 100644 (file)
@@ -1396,6 +1396,24 @@ static void domain_update_iotlb(struct dmar_domain *domain)
        spin_unlock_irqrestore(&domain->lock, flags);
 }
 
+/*
+ * The extra devTLB flush quirk impacts those QAT devices with PCI device
+ * IDs ranging from 0x4940 to 0x4943. It is exempted from the risky_device()
+ * check because it applies only to the built-in QAT devices and it doesn't
+ * grant additional privileges.
+ */
+#define BUGGY_QAT_DEVID_MASK 0x4940
+static bool dev_needs_extra_dtlb_flush(struct pci_dev *pdev)
+{
+       if (pdev->vendor != PCI_VENDOR_ID_INTEL)
+               return false;
+
+       if ((pdev->device & 0xfffc) != BUGGY_QAT_DEVID_MASK)
+               return false;
+
+       return true;
+}
+
 static void iommu_enable_pci_caps(struct device_domain_info *info)
 {
        struct pci_dev *pdev;
@@ -1478,6 +1496,7 @@ static void __iommu_flush_dev_iotlb(struct device_domain_info *info,
        qdep = info->ats_qdep;
        qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
                           qdep, addr, mask);
+       quirk_extra_dev_tlb_flush(info, addr, mask, PASID_RID2PASID, qdep);
 }
 
 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
@@ -3854,8 +3873,10 @@ static inline bool has_external_pci(void)
        struct pci_dev *pdev = NULL;
 
        for_each_pci_dev(pdev)
-               if (pdev->external_facing)
+               if (pdev->external_facing) {
+                       pci_dev_put(pdev);
                        return true;
+               }
 
        return false;
 }
@@ -4490,9 +4511,10 @@ static struct iommu_device *intel_iommu_probe_device(struct device *dev)
        if (dev_is_pci(dev)) {
                if (ecap_dev_iotlb_support(iommu->ecap) &&
                    pci_ats_supported(pdev) &&
-                   dmar_ats_supported(pdev, iommu))
+                   dmar_ats_supported(pdev, iommu)) {
                        info->ats_supported = 1;
-
+                       info->dtlb_extra_inval = dev_needs_extra_dtlb_flush(pdev);
+               }
                if (sm_supported(iommu)) {
                        if (pasid_supported(iommu)) {
                                int features = pci_pasid_features(pdev);
@@ -4931,3 +4953,48 @@ static void __init check_tylersburg_isoch(void)
        pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
               vtisochctrl);
 }
+
+/*
+ * Here we deal with a device TLB defect where the device may inadvertently
+ * issue an ATS invalidation completion before posted writes, initiated with
+ * a translated address that used translations matching the invalidation
+ * address range, have landed; this violates the invalidation completion
+ * ordering.
+ * Therefore, any use case that cannot guarantee DMA is stopped before unmap
+ * is vulnerable to this defect. In other words, any dTLB invalidation that
+ * is not initiated under the control of the trusted/privileged host device
+ * driver must use this quirk.
+ * Device TLBs are invalidated under the following six conditions:
+ * 1. Device driver does DMA API unmap IOVA
+ * 2. Device driver unbinds a PASID from a process, sva_unbind_device()
+ * 3. PASID is torn down, after PASID cache is flushed. e.g. process
+ *    exit_mmap() due to crash
+ * 4. Under SVA usage, called by mmu_notifier.invalidate_range() where
+ *    VM has to free pages that were unmapped
+ * 5. Userspace driver unmaps a DMA buffer
+ * 6. Cache invalidation in vSVA usage (upcoming)
+ *
+ * For #1 and #2, device drivers are responsible for stopping DMA traffic
+ * before unmap/unbind. For #3, the iommu driver gets an mmu_notifier to
+ * invalidate the TLB the same way as a normal user unmap, which will use
+ * this quirk. The dTLB invalidation after a PASID cache flush does not
+ * need this quirk.
+ *
+ * As a reminder, #6 will *NEED* this quirk as we enable nested translation.
+ */
+void quirk_extra_dev_tlb_flush(struct device_domain_info *info,
+                              unsigned long address, unsigned long mask,
+                              u32 pasid, u16 qdep)
+{
+       u16 sid;
+
+       if (likely(!info->dtlb_extra_inval))
+               return;
+
+       sid = PCI_DEVID(info->bus, info->devfn);
+       if (pasid == PASID_RID2PASID) {
+               qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
+                                  qdep, address, mask);
+       } else {
+               qi_flush_dev_iotlb_pasid(info->iommu, sid, info->pfsid,
+                                        pasid, qdep, address, mask);
+       }
+}
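
The device-ID test works because the four affected QAT IDs differ only in their low two bits: masking with 0xfffc folds 0x4940-0x4943 onto the single value 0x4940. A standalone check (plain userspace C, illustration only):

#include <assert.h>

int main(void)
{
	for (unsigned int id = 0x4940; id <= 0x4943; id++)
		assert((id & 0xfffc) == 0x4940);

	/* A neighbouring ID does not match. */
	assert((0x4944 & 0xfffc) != 0x4940);
	return 0;
}
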
index 92023df..db9df7c 100644 (file)
@@ -623,6 +623,7 @@ struct device_domain_info {
        u8 pri_enabled:1;
        u8 ats_supported:1;
        u8 ats_enabled:1;
+       u8 dtlb_extra_inval:1;  /* Quirk for devices that need extra flush */
        u8 ats_qdep;
        struct device *dev; /* it's NULL for PCIe-to-PCI bridge */
        struct intel_iommu *iommu; /* IOMMU used by this device */
@@ -728,6 +729,9 @@ void qi_flush_piotlb(struct intel_iommu *iommu, u16 did, u32 pasid, u64 addr,
 void qi_flush_dev_iotlb_pasid(struct intel_iommu *iommu, u16 sid, u16 pfsid,
                              u32 pasid, u16 qdep, u64 addr,
                              unsigned int size_order);
+void quirk_extra_dev_tlb_flush(struct device_domain_info *info,
+                              unsigned long address, unsigned long pages,
+                              u32 pasid, u16 qdep);
 void qi_flush_pasid_cache(struct intel_iommu *iommu, u16 did, u64 granu,
                          u32 pasid);
 
index 7d08eb0..03b2535 100644 (file)
@@ -184,10 +184,13 @@ static void __flush_svm_range_dev(struct intel_svm *svm,
                return;
 
        qi_flush_piotlb(sdev->iommu, sdev->did, svm->pasid, address, pages, ih);
-       if (info->ats_enabled)
+       if (info->ats_enabled) {
                qi_flush_dev_iotlb_pasid(sdev->iommu, sdev->sid, info->pfsid,
                                         svm->pasid, sdev->qdep, address,
                                         order_base_2(pages));
+               quirk_extra_dev_tlb_flush(info, address, order_base_2(pages),
+                                         svm->pasid, sdev->qdep);
+       }
 }
 
 static void intel_flush_svm_range_dev(struct intel_svm *svm,
@@ -745,12 +748,16 @@ bad_req:
                 * If prq is to be handled outside iommu driver via receiver of
                 * the fault notifiers, we skip the page response here.
                 */
-               if (!pdev || intel_svm_prq_report(iommu, &pdev->dev, req))
-                       handle_bad_prq_event(iommu, req, QI_RESP_INVALID);
+               if (!pdev)
+                       goto bad_req;
 
-               trace_prq_report(iommu, &pdev->dev, req->qw_0, req->qw_1,
-                                req->priv_data[0], req->priv_data[1],
-                                iommu->prq_seq_number++);
+               if (intel_svm_prq_report(iommu, &pdev->dev, req))
+                       handle_bad_prq_event(iommu, req, QI_RESP_INVALID);
+               else
+                       trace_prq_report(iommu, &pdev->dev, req->qw_0, req->qw_1,
+                                        req->priv_data[0], req->priv_data[1],
+                                        iommu->prq_seq_number++);
+               pci_dev_put(pdev);
 prq_advance:
                head = (head + sizeof(*req)) & PRQ_RING_MASK;
        }
index 542dde9..1440270 100644 (file)
 int get_vaddr_frames(unsigned long start, unsigned int nr_frames,
                     struct frame_vector *vec)
 {
-       struct mm_struct *mm = current->mm;
-       struct vm_area_struct *vma;
-       int ret_pin_user_pages_fast = 0;
-       int ret = 0;
-       int err;
+       int ret;
 
        if (nr_frames == 0)
                return 0;
@@ -52,57 +48,17 @@ int get_vaddr_frames(unsigned long start, unsigned int nr_frames,
        ret = pin_user_pages_fast(start, nr_frames,
                                  FOLL_FORCE | FOLL_WRITE | FOLL_LONGTERM,
                                  (struct page **)(vec->ptrs));
-       if (ret > 0) {
-               vec->got_ref = true;
-               vec->is_pfns = false;
-               goto out_unlocked;
-       }
-       ret_pin_user_pages_fast = ret;
-
-       mmap_read_lock(mm);
-       vec->got_ref = false;
-       vec->is_pfns = true;
-       ret = 0;
-       do {
-               unsigned long *nums = frame_vector_pfns(vec);
-
-               vma = vma_lookup(mm, start);
-               if (!vma)
-                       break;
-
-               while (ret < nr_frames && start + PAGE_SIZE <= vma->vm_end) {
-                       err = follow_pfn(vma, start, &nums[ret]);
-                       if (err) {
-                               if (ret)
-                                       goto out;
-                               // If follow_pfn() returns -EINVAL, then this
-                               // is not an IO mapping or a raw PFN mapping.
-                               // In that case, return the original error from
-                               // pin_user_pages_fast(). Otherwise this
-                               // function would return -EINVAL when
-                               // pin_user_pages_fast() returned -ENOMEM,
-                               // which makes debugging hard.
-                               if (err == -EINVAL && ret_pin_user_pages_fast)
-                                       ret = ret_pin_user_pages_fast;
-                               else
-                                       ret = err;
-                               goto out;
-                       }
-                       start += PAGE_SIZE;
-                       ret++;
-               }
-               /* Bail out if VMA doesn't completely cover the tail page. */
-               if (start < vma->vm_end)
-                       break;
-       } while (ret < nr_frames);
-out:
-       mmap_read_unlock(mm);
-out_unlocked:
-       if (!ret)
-               ret = -EFAULT;
-       if (ret > 0)
-               vec->nr_frames = ret;
-       return ret;
+       vec->got_ref = true;
+       vec->is_pfns = false;
+       vec->nr_frames = ret;
+
+       if (likely(ret > 0))
+               return ret;
+
+       /* This used to (racily) return non-refcounted pfns. Let people know */
+       WARN_ONCE(1, "get_vaddr_frames() cannot follow VM_IO mapping");
+       vec->nr_frames = 0;
+       return ret ? ret : -EFAULT;
 }
 EXPORT_SYMBOL(get_vaddr_frames);
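
With the follow_pfn() fallback removed, get_vaddr_frames() only returns refcounted pages from pin_user_pages_fast(), and VM_IO/VM_PFNMAP mappings now fail loudly. For callers the contract is the usual pin/unpin pairing; a hedged sketch (kernel-internal API, signatures as I understand them from <linux/mm.h>):

static long pin_user_range(unsigned long start, int nr_pages,
			   struct page **pages)
{
	long pinned;

	pinned = pin_user_pages_fast(start, nr_pages,
				     FOLL_WRITE | FOLL_LONGTERM, pages);
	if (pinned < 0)
		return pinned;	/* e.g. -EFAULT on a VM_IO mapping */

	/* ... DMA to/from the pinned pages ... */

	unpin_user_pages(pages, pinned);
	return pinned;
}
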
 
index ab9697f..92efc46 100644 (file)
@@ -813,7 +813,13 @@ int vb2_core_reqbufs(struct vb2_queue *q, enum vb2_memory memory,
        num_buffers = max_t(unsigned int, *count, q->min_buffers_needed);
        num_buffers = min_t(unsigned int, num_buffers, VB2_MAX_FRAME);
        memset(q->alloc_devs, 0, sizeof(q->alloc_devs));
+       /*
+        * Set this now to ensure that drivers see the correct q->memory value
+        * in the queue_setup op.
+        */
+       mutex_lock(&q->mmap_lock);
        q->memory = memory;
+       mutex_unlock(&q->mmap_lock);
        set_queue_coherency(q, non_coherent_mem);
 
        /*
@@ -823,22 +829,27 @@ int vb2_core_reqbufs(struct vb2_queue *q, enum vb2_memory memory,
        ret = call_qop(q, queue_setup, q, &num_buffers, &num_planes,
                       plane_sizes, q->alloc_devs);
        if (ret)
-               return ret;
+               goto error;
 
        /* Check that driver has set sane values */
-       if (WARN_ON(!num_planes))
-               return -EINVAL;
+       if (WARN_ON(!num_planes)) {
+               ret = -EINVAL;
+               goto error;
+       }
 
        for (i = 0; i < num_planes; i++)
-               if (WARN_ON(!plane_sizes[i]))
-                       return -EINVAL;
+               if (WARN_ON(!plane_sizes[i])) {
+                       ret = -EINVAL;
+                       goto error;
+               }
 
        /* Finally, allocate buffers and video memory */
        allocated_buffers =
                __vb2_queue_alloc(q, memory, num_buffers, num_planes, plane_sizes);
        if (allocated_buffers == 0) {
                dprintk(q, 1, "memory allocation failed\n");
-               return -ENOMEM;
+               ret = -ENOMEM;
+               goto error;
        }
 
        /*
@@ -879,7 +890,8 @@ int vb2_core_reqbufs(struct vb2_queue *q, enum vb2_memory memory,
        if (ret < 0) {
                /*
                 * Note: __vb2_queue_free() will subtract 'allocated_buffers'
-                * from q->num_buffers.
+                * from q->num_buffers and it will reset q->memory to
+                * VB2_MEMORY_UNKNOWN.
                 */
                __vb2_queue_free(q, allocated_buffers);
                mutex_unlock(&q->mmap_lock);
@@ -895,6 +907,12 @@ int vb2_core_reqbufs(struct vb2_queue *q, enum vb2_memory memory,
        q->waiting_for_buffers = !q->is_output;
 
        return 0;
+
+error:
+       mutex_lock(&q->mmap_lock);
+       q->memory = VB2_MEMORY_UNKNOWN;
+       mutex_unlock(&q->mmap_lock);
+       return ret;
 }
 EXPORT_SYMBOL_GPL(vb2_core_reqbufs);
 
@@ -906,6 +924,7 @@ int vb2_core_create_bufs(struct vb2_queue *q, enum vb2_memory memory,
        unsigned int num_planes = 0, num_buffers, allocated_buffers;
        unsigned plane_sizes[VB2_MAX_PLANES] = { };
        bool non_coherent_mem = flags & V4L2_MEMORY_FLAG_NON_COHERENT;
+       bool no_previous_buffers = !q->num_buffers;
        int ret;
 
        if (q->num_buffers == VB2_MAX_FRAME) {
@@ -913,13 +932,19 @@ int vb2_core_create_bufs(struct vb2_queue *q, enum vb2_memory memory,
                return -ENOBUFS;
        }
 
-       if (!q->num_buffers) {
+       if (no_previous_buffers) {
                if (q->waiting_in_dqbuf && *count) {
                        dprintk(q, 1, "another dup()ped fd is waiting for a buffer\n");
                        return -EBUSY;
                }
                memset(q->alloc_devs, 0, sizeof(q->alloc_devs));
+               /*
+                * Set this now to ensure that drivers see the correct q->memory
+                * value in the queue_setup op.
+                */
+               mutex_lock(&q->mmap_lock);
                q->memory = memory;
+               mutex_unlock(&q->mmap_lock);
                q->waiting_for_buffers = !q->is_output;
                set_queue_coherency(q, non_coherent_mem);
        } else {
@@ -945,14 +970,15 @@ int vb2_core_create_bufs(struct vb2_queue *q, enum vb2_memory memory,
        ret = call_qop(q, queue_setup, q, &num_buffers,
                       &num_planes, plane_sizes, q->alloc_devs);
        if (ret)
-               return ret;
+               goto error;
 
        /* Finally, allocate buffers and video memory */
        allocated_buffers = __vb2_queue_alloc(q, memory, num_buffers,
                                num_planes, plane_sizes);
        if (allocated_buffers == 0) {
                dprintk(q, 1, "memory allocation failed\n");
-               return -ENOMEM;
+               ret = -ENOMEM;
+               goto error;
        }
 
        /*
@@ -983,7 +1009,8 @@ int vb2_core_create_bufs(struct vb2_queue *q, enum vb2_memory memory,
        if (ret < 0) {
                /*
                 * Note: __vb2_queue_free() will subtract 'allocated_buffers'
-                * from q->num_buffers.
+                * from q->num_buffers and it will reset q->memory to
+                * VB2_MEMORY_UNKNOWN.
                 */
                __vb2_queue_free(q, allocated_buffers);
                mutex_unlock(&q->mmap_lock);
@@ -998,6 +1025,14 @@ int vb2_core_create_bufs(struct vb2_queue *q, enum vb2_memory memory,
        *count = allocated_buffers;
 
        return 0;
+
+error:
+       if (no_previous_buffers) {
+               mutex_lock(&q->mmap_lock);
+               q->memory = VB2_MEMORY_UNKNOWN;
+               mutex_unlock(&q->mmap_lock);
+       }
+       return ret;
 }
 EXPORT_SYMBOL_GPL(vb2_core_create_bufs);
 
@@ -2165,6 +2200,22 @@ static int __find_plane_by_offset(struct vb2_queue *q, unsigned long off,
        unsigned int buffer, plane;
 
        /*
+        * Sanity checks to ensure the lock is held, MEMORY_MMAP is
+        * used and fileio isn't active.
+        */
+       lockdep_assert_held(&q->mmap_lock);
+
+       if (q->memory != VB2_MEMORY_MMAP) {
+               dprintk(q, 1, "queue is not currently set up for mmap\n");
+               return -EINVAL;
+       }
+
+       if (vb2_fileio_is_active(q)) {
+               dprintk(q, 1, "file io in progress\n");
+               return -EBUSY;
+       }
+
+       /*
         * Go over all buffers and their planes, comparing the given offset
         * with an offset assigned to each plane. If a match is found,
         * return its buffer and plane numbers.
@@ -2265,11 +2316,6 @@ int vb2_mmap(struct vb2_queue *q, struct vm_area_struct *vma)
        int ret;
        unsigned long length;
 
-       if (q->memory != VB2_MEMORY_MMAP) {
-               dprintk(q, 1, "queue is not currently set up for mmap\n");
-               return -EINVAL;
-       }
-
        /*
         * Check memory area access mode.
         */
@@ -2291,14 +2337,9 @@ int vb2_mmap(struct vb2_queue *q, struct vm_area_struct *vma)
 
        mutex_lock(&q->mmap_lock);
 
-       if (vb2_fileio_is_active(q)) {
-               dprintk(q, 1, "mmap: file io in progress\n");
-               ret = -EBUSY;
-               goto unlock;
-       }
-
        /*
-        * Find the plane corresponding to the offset passed by userspace.
+        * Find the plane corresponding to the offset passed by userspace. This
+        * will return an error if not MEMORY_MMAP or file I/O is in progress.
         */
        ret = __find_plane_by_offset(q, off, &buffer, &plane);
        if (ret)
@@ -2351,22 +2392,25 @@ unsigned long vb2_get_unmapped_area(struct vb2_queue *q,
        void *vaddr;
        int ret;
 
-       if (q->memory != VB2_MEMORY_MMAP) {
-               dprintk(q, 1, "queue is not currently set up for mmap\n");
-               return -EINVAL;
-       }
+       mutex_lock(&q->mmap_lock);
 
        /*
-        * Find the plane corresponding to the offset passed by userspace.
+        * Find the plane corresponding to the offset passed by userspace. This
+        * will return an error if not MEMORY_MMAP or file I/O is in progress.
         */
        ret = __find_plane_by_offset(q, off, &buffer, &plane);
        if (ret)
-               return ret;
+               goto unlock;
 
        vb = q->bufs[buffer];
 
        vaddr = vb2_plane_vaddr(vb, plane);
+       mutex_unlock(&q->mmap_lock);
        return vaddr ? (unsigned long)vaddr : -EINVAL;
+
+unlock:
+       mutex_unlock(&q->mmap_lock);
+       return ret;
 }
 EXPORT_SYMBOL_GPL(vb2_get_unmapped_area);
 #endif
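
The __find_plane_by_offset() change consolidates the MEMORY_MMAP and file-I/O checks in the one helper both mmap paths share, and records the locking contract with lockdep_assert_held() so a caller that forgets mmap_lock trips lockdep in debug builds. The general shape, with hypothetical names:

static int my_lookup(struct my_queue *q, unsigned long off)
{
	/* Compiles out without lockdep; splats if q->lock is not held. */
	lockdep_assert_held(&q->lock);

	if (q->state != MY_STATE_READY)
		return -EINVAL;

	/* ... lookup performed safely under q->lock ... */
	return 0;
}
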
index c5de202..de1cc9e 100644 (file)
@@ -1484,6 +1484,11 @@ void mmc_init_erase(struct mmc_card *card)
                card->pref_erase = 0;
 }
 
+static bool is_trim_arg(unsigned int arg)
+{
+       return (arg & MMC_TRIM_OR_DISCARD_ARGS) && arg != MMC_DISCARD_ARG;
+}
+
 static unsigned int mmc_mmc_erase_timeout(struct mmc_card *card,
                                          unsigned int arg, unsigned int qty)
 {
@@ -1766,7 +1771,7 @@ int mmc_erase(struct mmc_card *card, unsigned int from, unsigned int nr,
            !(card->ext_csd.sec_feature_support & EXT_CSD_SEC_ER_EN))
                return -EOPNOTSUPP;
 
-       if (mmc_card_mmc(card) && (arg & MMC_TRIM_ARGS) &&
+       if (mmc_card_mmc(card) && is_trim_arg(arg) &&
            !(card->ext_csd.sec_feature_support & EXT_CSD_SEC_GB_CL_EN))
                return -EOPNOTSUPP;
 
@@ -1796,7 +1801,7 @@ int mmc_erase(struct mmc_card *card, unsigned int from, unsigned int nr,
         * identified by the card->eg_boundary flag.
         */
        rem = card->erase_size - (from % card->erase_size);
-       if ((arg & MMC_TRIM_ARGS) && (card->eg_boundary) && (nr > rem)) {
+       if ((arg & MMC_TRIM_OR_DISCARD_ARGS) && card->eg_boundary && nr > rem) {
                err = mmc_do_erase(card, from, from + rem - 1, arg);
                from += rem;
                if ((err) || (to <= from))
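
is_trim_arg() separates the TRIM-class arguments (which require EXT_CSD_SEC_GB_CL_EN) from DISCARD, which shares bit 0 with TRIM and was previously misclassified by the plain MMC_TRIM_ARGS test. A standalone illustration; the argument values are quoted from <linux/mmc/core.h> from memory and should be treated as assumptions:

#include <assert.h>
#include <stdbool.h>

#define MMC_ERASE_ARG			0x00000000u
#define MMC_SECURE_ERASE_ARG		0x80000000u
#define MMC_TRIM_ARG			0x00000001u
#define MMC_DISCARD_ARG			0x00000003u
#define MMC_SECURE_TRIM1_ARG		0x80000001u
#define MMC_SECURE_TRIM2_ARG		0x80008000u
#define MMC_TRIM_ARGS			0x00008001u	/* TRIM + SECURE_TRIM bits */
#define MMC_TRIM_OR_DISCARD_ARGS	(MMC_TRIM_ARGS | MMC_DISCARD_ARG)

static bool is_trim_arg(unsigned int arg)
{
	return (arg & MMC_TRIM_OR_DISCARD_ARGS) && arg != MMC_DISCARD_ARG;
}

int main(void)
{
	assert(is_trim_arg(MMC_TRIM_ARG));
	assert(is_trim_arg(MMC_SECURE_TRIM1_ARG));
	assert(is_trim_arg(MMC_SECURE_TRIM2_ARG));
	assert(!is_trim_arg(MMC_DISCARD_ARG));	/* shares bit 0 with TRIM */
	assert(!is_trim_arg(MMC_ERASE_ARG));
	assert(!is_trim_arg(MMC_SECURE_ERASE_ARG));
	return 0;
}
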
index 8d9bcee..155ce2b 100644 (file)
@@ -3179,7 +3179,8 @@ static int __mmc_test_register_dbgfs_file(struct mmc_card *card,
        struct mmc_test_dbgfs_file *df;
 
        if (card->debugfs_root)
-               debugfs_create_file(name, mode, card->debugfs_root, card, fops);
+               file = debugfs_create_file(name, mode, card->debugfs_root,
+                                          card, fops);
 
        df = kmalloc(sizeof(*df), GFP_KERNEL);
        if (!df) {
index df94143..26bc59b 100644 (file)
@@ -2588,13 +2588,11 @@ static int msdc_of_clock_parse(struct platform_device *pdev,
                        return PTR_ERR(host->src_clk_cg);
        }
 
-       host->sys_clk_cg = devm_clk_get_optional(&pdev->dev, "sys_cg");
+       /* If present, always enable this clock gate */
+       host->sys_clk_cg = devm_clk_get_optional_enabled(&pdev->dev, "sys_cg");
        if (IS_ERR(host->sys_clk_cg))
                host->sys_clk_cg = NULL;
 
-       /* If present, always enable for this clock gate */
-       clk_prepare_enable(host->sys_clk_cg);
-
        host->bulk_clks[0].id = "pclk_cg";
        host->bulk_clks[1].id = "axi_cg";
        host->bulk_clks[2].id = "ahb_cg";
index 31ea0a2..ffeb575 100644 (file)
@@ -1512,7 +1512,7 @@ static void esdhc_cqe_enable(struct mmc_host *mmc)
         * system resume back.
         */
        cqhci_writel(cq_host, 0, CQHCI_CTL);
-       if (cqhci_readl(cq_host, CQHCI_CTL) && CQHCI_HALT)
+       if (cqhci_readl(cq_host, CQHCI_CTL) & CQHCI_HALT)
                dev_err(mmc_dev(host->mmc),
                        "failed to exit halt state when enable CQE\n");
 
index b92a408..bec3f9e 100644 (file)
@@ -470,7 +470,7 @@ static int sdhci_sprd_voltage_switch(struct mmc_host *mmc, struct mmc_ios *ios)
        }
 
        if (IS_ERR(sprd_host->pinctrl))
-               return 0;
+               goto reset;
 
        switch (ios->signal_voltage) {
        case MMC_SIGNAL_VOLTAGE_180:
@@ -498,6 +498,8 @@ static int sdhci_sprd_voltage_switch(struct mmc_host *mmc, struct mmc_ios *ios)
 
        /* Wait for 300 ~ 500 us for pin state stable */
        usleep_range(300, 500);
+
+reset:
        sdhci_reset(host, SDHCI_RESET_CMD | SDHCI_RESET_DATA);
 
        return 0;
index fef03de..c7ad32a 100644 (file)
@@ -373,6 +373,7 @@ static void sdhci_init(struct sdhci_host *host, int soft)
        if (soft) {
                /* force clock reconfiguration */
                host->clock = 0;
+               host->reinit_uhs = true;
                mmc->ops->set_ios(mmc, &mmc->ios);
        }
 }
@@ -2293,11 +2294,46 @@ void sdhci_set_uhs_signaling(struct sdhci_host *host, unsigned timing)
 }
 EXPORT_SYMBOL_GPL(sdhci_set_uhs_signaling);
 
+static bool sdhci_timing_has_preset(unsigned char timing)
+{
+       switch (timing) {
+       case MMC_TIMING_UHS_SDR12:
+       case MMC_TIMING_UHS_SDR25:
+       case MMC_TIMING_UHS_SDR50:
+       case MMC_TIMING_UHS_SDR104:
+       case MMC_TIMING_UHS_DDR50:
+       case MMC_TIMING_MMC_DDR52:
+               return true;
+       }
+       return false;
+}
+
+static bool sdhci_preset_needed(struct sdhci_host *host, unsigned char timing)
+{
+       return !(host->quirks2 & SDHCI_QUIRK2_PRESET_VALUE_BROKEN) &&
+              sdhci_timing_has_preset(timing);
+}
+
+static bool sdhci_presetable_values_change(struct sdhci_host *host, struct mmc_ios *ios)
+{
+       /*
+        * Preset Values are: Driver Strength, Clock Generator and SDCLK/RCLK
+        * Frequency. Check if preset values need to be enabled, or the Driver
+        * Strength needs updating. Note, clock changes are handled separately.
+        */
+       return !host->preset_enabled &&
+              (sdhci_preset_needed(host, ios->timing) || host->drv_type != ios->drv_type);
+}
+
 void sdhci_set_ios(struct mmc_host *mmc, struct mmc_ios *ios)
 {
        struct sdhci_host *host = mmc_priv(mmc);
+       bool reinit_uhs = host->reinit_uhs;
+       bool turning_on_clk = false;
        u8 ctrl;
 
+       host->reinit_uhs = false;
+
        if (ios->power_mode == MMC_POWER_UNDEFINED)
                return;
 
@@ -2323,6 +2359,8 @@ void sdhci_set_ios(struct mmc_host *mmc, struct mmc_ios *ios)
                sdhci_enable_preset_value(host, false);
 
        if (!ios->clock || ios->clock != host->clock) {
+               turning_on_clk = ios->clock && !host->clock;
+
                host->ops->set_clock(host, ios->clock);
                host->clock = ios->clock;
 
@@ -2349,6 +2387,17 @@ void sdhci_set_ios(struct mmc_host *mmc, struct mmc_ios *ios)
 
        host->ops->set_bus_width(host, ios->bus_width);
 
+       /*
+        * Special case to avoid multiple clock changes during voltage
+        * switching.
+        */
+       if (!reinit_uhs &&
+           turning_on_clk &&
+           host->timing == ios->timing &&
+           host->version >= SDHCI_SPEC_300 &&
+           !sdhci_presetable_values_change(host, ios))
+               return;
+
        ctrl = sdhci_readb(host, SDHCI_HOST_CONTROL);
 
        if (!(host->quirks & SDHCI_QUIRK_NO_HISPD_BIT)) {
@@ -2392,6 +2441,7 @@ void sdhci_set_ios(struct mmc_host *mmc, struct mmc_ios *ios)
                        }
 
                        sdhci_writew(host, ctrl_2, SDHCI_HOST_CONTROL2);
+                       host->drv_type = ios->drv_type;
                } else {
                        /*
                         * According to SDHC Spec v3.00, if the Preset Value
@@ -2419,19 +2469,14 @@ void sdhci_set_ios(struct mmc_host *mmc, struct mmc_ios *ios)
                host->ops->set_uhs_signaling(host, ios->timing);
                host->timing = ios->timing;
 
-               if (!(host->quirks2 & SDHCI_QUIRK2_PRESET_VALUE_BROKEN) &&
-                               ((ios->timing == MMC_TIMING_UHS_SDR12) ||
-                                (ios->timing == MMC_TIMING_UHS_SDR25) ||
-                                (ios->timing == MMC_TIMING_UHS_SDR50) ||
-                                (ios->timing == MMC_TIMING_UHS_SDR104) ||
-                                (ios->timing == MMC_TIMING_UHS_DDR50) ||
-                                (ios->timing == MMC_TIMING_MMC_DDR52))) {
+               if (sdhci_preset_needed(host, ios->timing)) {
                        u16 preset;
 
                        sdhci_enable_preset_value(host, true);
                        preset = sdhci_get_preset_value(host);
                        ios->drv_type = FIELD_GET(SDHCI_PRESET_DRV_MASK,
                                                  preset);
+                       host->drv_type = ios->drv_type;
                }
 
                /* Re-enable SD Clock */
@@ -3768,6 +3813,7 @@ int sdhci_resume_host(struct sdhci_host *host)
                sdhci_init(host, 0);
                host->pwr = 0;
                host->clock = 0;
+               host->reinit_uhs = true;
                mmc->ops->set_ios(mmc, &mmc->ios);
        } else {
                sdhci_init(host, (mmc->pm_flags & MMC_PM_KEEP_POWER));
@@ -3830,6 +3876,7 @@ int sdhci_runtime_resume_host(struct sdhci_host *host, int soft_reset)
                /* Force clock and power re-program */
                host->pwr = 0;
                host->clock = 0;
+               host->reinit_uhs = true;
                mmc->ops->start_signal_voltage_switch(mmc, &mmc->ios);
                mmc->ops->set_ios(mmc, &mmc->ios);
 
index d750c46..87a3aaa 100644 (file)
@@ -524,6 +524,8 @@ struct sdhci_host {
 
        unsigned int clock;     /* Current clock (MHz) */
        u8 pwr;                 /* Current voltage */
+       u8 drv_type;            /* Current UHS-I driver type */
+       bool reinit_uhs;        /* Force UHS-related re-initialization */
 
        bool runtime_suspended; /* Host is runtime suspended */
        bool bus_on;            /* Bus power prevents runtime suspend */
index e01bb04..f7767af 100644 (file)
@@ -1632,13 +1632,19 @@ static int bond_master_upper_dev_link(struct bonding *bond, struct slave *slave,
 {
        struct netdev_lag_upper_info lag_upper_info;
        enum netdev_lag_tx_type type;
+       int err;
 
        type = bond_lag_tx_type(bond);
        lag_upper_info.tx_type = type;
        lag_upper_info.hash_type = bond_lag_hash_type(bond, type);
 
-       return netdev_master_upper_dev_link(slave->dev, bond->dev, slave,
-                                           &lag_upper_info, extack);
+       err = netdev_master_upper_dev_link(slave->dev, bond->dev, slave,
+                                          &lag_upper_info, extack);
+       if (err)
+               return err;
+
+       slave->dev->flags |= IFF_SLAVE;
+       return 0;
 }
 
 static void bond_upper_dev_unlink(struct bonding *bond, struct slave *slave)
@@ -1950,8 +1956,8 @@ int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev,
                }
        }
 
-       /* set slave flag before open to prevent IPv6 addrconf */
-       slave_dev->flags |= IFF_SLAVE;
+       /* set no_addrconf flag before open to prevent IPv6 addrconf */
+       slave_dev->priv_flags |= IFF_NO_ADDRCONF;
 
        /* open the slave since the application closed it */
        res = dev_open(slave_dev, extack);
@@ -2254,7 +2260,7 @@ err_close:
        dev_close(slave_dev);
 
 err_restore_mac:
-       slave_dev->flags &= ~IFF_SLAVE;
+       slave_dev->priv_flags &= ~IFF_NO_ADDRCONF;
        if (!bond->params.fail_over_mac ||
            BOND_MODE(bond) != BOND_MODE_ACTIVEBACKUP) {
                /* XXX TODO - fom follow mode needs to change master's
@@ -2446,6 +2452,8 @@ static int __bond_release_one(struct net_device *bond_dev,
        /* close slave before restoring its mac address */
        dev_close(slave_dev);
 
+       slave_dev->priv_flags &= ~IFF_NO_ADDRCONF;
+
        if (bond->params.fail_over_mac != BOND_FOM_ACTIVE ||
            BOND_MODE(bond) != BOND_MODE_ACTIVEBACKUP) {
                /* restore original ("permanent") mac address */
@@ -3249,7 +3257,7 @@ static int bond_na_rcv(const struct sk_buff *skb, struct bonding *bond,
                goto out;
 
        saddr = &combined->ip6.saddr;
-       daddr = &combined->ip6.saddr;
+       daddr = &combined->ip6.daddr;
 
        slave_dbg(bond->dev, slave->dev, "%s: %s/%d av %d sv %d sip %pI6c tip %pI6c\n",
                  __func__, slave->dev->name, bond_slave_state(slave),
index 86e95e9..03ccb7c 100644 (file)
@@ -290,8 +290,7 @@ static int c_can_plat_probe(struct platform_device *pdev)
                goto exit;
        }
 
-       mem = platform_get_resource(pdev, IORESOURCE_MEM, 0);
-       addr = devm_ioremap_resource(&pdev->dev, mem);
+       addr = devm_platform_get_and_ioremap_resource(pdev, 0, &mem);
        if (IS_ERR(addr)) {
                ret =  PTR_ERR(addr);
                goto exit;
index ed3d0b8..dc7192e 100644 (file)
@@ -796,9 +796,9 @@ static int can327_netdev_close(struct net_device *dev)
 
        netif_stop_queue(dev);
 
-       /* Give UART one final chance to flush. */
-       clear_bit(TTY_DO_WRITE_WAKEUP, &elm->tty->flags);
-       flush_work(&elm->tx_work);
+       /* We don't flush the UART TX queue here, as we want final stop
+        * commands (like the above dummy char) to be flushed out.
+        */
 
        can_rx_offload_disable(&elm->offload);
        elm->can.state = CAN_STATE_STOPPED;
@@ -1069,12 +1069,15 @@ static void can327_ldisc_close(struct tty_struct *tty)
 {
        struct can327 *elm = (struct can327 *)tty->disc_data;
 
-       /* unregister_netdev() calls .ndo_stop() so we don't have to.
-        * Our .ndo_stop() also flushes the TTY write wakeup handler,
-        * so we can safely set elm->tty = NULL after this.
-        */
+       /* unregister_netdev() calls .ndo_stop() so we don't have to. */
        unregister_candev(elm->dev);
 
+       /* Give UART one final chance to flush.
+        * No need to clear TTY_DO_WRITE_WAKEUP since .write_wakeup() is
+        * serialised against .close() and will not be called once we return.
+        */
+       flush_work(&elm->tx_work);
+
        /* Mark channel as dead */
        spin_lock_bh(&elm->lock);
        tty->disc_data = NULL;
index 6e20733..f52407f 100644
@@ -23,7 +23,7 @@ config CAN_CTUCANFD_PCI
 
 config CAN_CTUCANFD_PLATFORM
        tristate "CTU CAN-FD IP core platform (FPGA, SoC) driver"
-       depends on HAS_IOMEM && (OF || COMPILE_TEST)
+       depends on HAS_IOMEM && OF
        select CAN_CTUCANFD
        help
          The core has been tested together with OpenCores SJA1000
index 9bdadd7..0aeff34 100644
@@ -345,6 +345,15 @@ static struct flexcan_devtype_data fsl_imx8mp_devtype_data = {
                FLEXCAN_QUIRK_SUPPORT_RX_MAILBOX_RTR,
 };
 
+static struct flexcan_devtype_data fsl_imx93_devtype_data = {
+       .quirks = FLEXCAN_QUIRK_DISABLE_RXFG | FLEXCAN_QUIRK_ENABLE_EACEN_RRS |
+               FLEXCAN_QUIRK_DISABLE_MECR | FLEXCAN_QUIRK_USE_RX_MAILBOX |
+               FLEXCAN_QUIRK_BROKEN_PERR_STATE | FLEXCAN_QUIRK_AUTO_STOP_MODE |
+               FLEXCAN_QUIRK_SUPPORT_FD | FLEXCAN_QUIRK_SUPPORT_ECC |
+               FLEXCAN_QUIRK_SUPPORT_RX_MAILBOX |
+               FLEXCAN_QUIRK_SUPPORT_RX_MAILBOX_RTR,
+};
+
 static const struct flexcan_devtype_data fsl_vf610_devtype_data = {
        .quirks = FLEXCAN_QUIRK_DISABLE_RXFG | FLEXCAN_QUIRK_ENABLE_EACEN_RRS |
                FLEXCAN_QUIRK_DISABLE_MECR | FLEXCAN_QUIRK_USE_RX_MAILBOX |
@@ -532,9 +541,14 @@ static inline int flexcan_enter_stop_mode(struct flexcan_priv *priv)
                ret = flexcan_stop_mode_enable_scfw(priv, true);
                if (ret < 0)
                        return ret;
-       } else {
+       } else if (priv->devtype_data.quirks & FLEXCAN_QUIRK_SETUP_STOP_MODE_GPR) {
                regmap_update_bits(priv->stm.gpr, priv->stm.req_gpr,
                                   1 << priv->stm.req_bit, 1 << priv->stm.req_bit);
+       } else if (priv->devtype_data.quirks & FLEXCAN_QUIRK_AUTO_STOP_MODE) {
+               /* For auto stop mode, the software does nothing; the hardware
+                * handles everything automatically once the system enters low
+                * power mode.
+                */
+               return 0;
        }
 
        return flexcan_low_power_enter_ack(priv);
@@ -551,7 +565,7 @@ static inline int flexcan_exit_stop_mode(struct flexcan_priv *priv)
                ret = flexcan_stop_mode_enable_scfw(priv, false);
                if (ret < 0)
                        return ret;
-       } else {
+       } else if (priv->devtype_data.quirks & FLEXCAN_QUIRK_SETUP_STOP_MODE_GPR) {
                regmap_update_bits(priv->stm.gpr, priv->stm.req_gpr,
                                   1 << priv->stm.req_bit, 0);
        }
@@ -560,6 +574,12 @@ static inline int flexcan_exit_stop_mode(struct flexcan_priv *priv)
        reg_mcr &= ~FLEXCAN_MCR_SLF_WAK;
        priv->write(reg_mcr, &regs->mcr);
 
+       /* For auto stop mode, the hardware exits stop mode
+        * automatically once the system leaves low power mode.
+        */
+       if (priv->devtype_data.quirks & FLEXCAN_QUIRK_AUTO_STOP_MODE)
+               return 0;
+
        return flexcan_low_power_exit_ack(priv);
 }
 
@@ -1974,6 +1994,8 @@ static int flexcan_setup_stop_mode(struct platform_device *pdev)
                ret = flexcan_setup_stop_mode_scfw(pdev);
        else if (priv->devtype_data.quirks & FLEXCAN_QUIRK_SETUP_STOP_MODE_GPR)
                ret = flexcan_setup_stop_mode_gpr(pdev);
+       else if (priv->devtype_data.quirks & FLEXCAN_QUIRK_AUTO_STOP_MODE)
+               ret = 0;
        else
                /* return 0 directly if doesn't support stop mode feature */
                return 0;
@@ -1992,6 +2014,7 @@ static int flexcan_setup_stop_mode(struct platform_device *pdev)
 static const struct of_device_id flexcan_of_match[] = {
        { .compatible = "fsl,imx8qm-flexcan", .data = &fsl_imx8qm_devtype_data, },
        { .compatible = "fsl,imx8mp-flexcan", .data = &fsl_imx8mp_devtype_data, },
+       { .compatible = "fsl,imx93-flexcan", .data = &fsl_imx93_devtype_data, },
        { .compatible = "fsl,imx6q-flexcan", .data = &fsl_imx6q_devtype_data, },
        { .compatible = "fsl,imx28-flexcan", .data = &fsl_imx28_devtype_data, },
        { .compatible = "fsl,imx53-flexcan", .data = &fsl_imx25_devtype_data, },
@@ -2299,8 +2322,16 @@ static int __maybe_unused flexcan_noirq_suspend(struct device *device)
        if (netif_running(dev)) {
                int err;
 
-               if (device_may_wakeup(device))
+               if (device_may_wakeup(device)) {
                        flexcan_enable_wakeup_irq(priv, true);
+                       /* For auto stop mode, the clock must stay on until the
+                        * system enters low power mode. Once it does, the
+                        * hardware puts the flexcan into stop mode and gates
+                        * off the clock automatically.
+                        */
+                       if (priv->devtype_data.quirks & FLEXCAN_QUIRK_AUTO_STOP_MODE)
+                               return 0;
+               }
 
                err = pm_runtime_force_suspend(device);
                if (err)
index 025c341..9140297 100644
@@ -68,6 +68,8 @@
 #define FLEXCAN_QUIRK_SUPPORT_RX_MAILBOX_RTR BIT(15)
 /* Device supports RX via FIFO */
 #define FLEXCAN_QUIRK_SUPPORT_RX_FIFO BIT(16)
+/* Device automatically enters stop mode to support wakeup */
+#define FLEXCAN_QUIRK_AUTO_STOP_MODE BIT(17)
 
 struct flexcan_devtype_data {
        u32 quirks;             /* quirks needed for different IP cores */
index 0bdec28..8e83d69 100644
@@ -9,20 +9,20 @@
  */
 
 #include <linux/bitfield.h>
+#include <linux/can/dev.h>
 #include <linux/ethtool.h>
 #include <linux/interrupt.h>
 #include <linux/io.h>
+#include <linux/iopoll.h>
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/netdevice.h>
 #include <linux/of.h>
 #include <linux/of_device.h>
+#include <linux/phy/phy.h>
+#include <linux/pinctrl/consumer.h>
 #include <linux/platform_device.h>
 #include <linux/pm_runtime.h>
-#include <linux/iopoll.h>
-#include <linux/can/dev.h>
-#include <linux/pinctrl/consumer.h>
-#include <linux/phy/phy.h>
 
 #include "m_can.h"
 
@@ -369,9 +369,14 @@ m_can_txe_fifo_read(struct m_can_classdev *cdev, u32 fgi, u32 offset, u32 *val)
        return cdev->ops->read_fifo(cdev, addr_offset, val, 1);
 }
 
+static inline bool _m_can_tx_fifo_full(u32 txfqs)
+{
+       return !!(txfqs & TXFQS_TFQF);
+}
+
 static inline bool m_can_tx_fifo_full(struct m_can_classdev *cdev)
 {
-       return !!(m_can_read(cdev, M_CAN_TXFQS) & TXFQS_TFQF);
+       return _m_can_tx_fifo_full(m_can_read(cdev, M_CAN_TXFQS));
 }
 
 static void m_can_config_endisable(struct m_can_classdev *cdev, bool enable)
@@ -472,19 +477,16 @@ static void m_can_receive_skb(struct m_can_classdev *cdev,
        }
 }
 
-static int m_can_read_fifo(struct net_device *dev, u32 rxfs)
+static int m_can_read_fifo(struct net_device *dev, u32 fgi)
 {
        struct net_device_stats *stats = &dev->stats;
        struct m_can_classdev *cdev = netdev_priv(dev);
        struct canfd_frame *cf;
        struct sk_buff *skb;
        struct id_and_dlc fifo_header;
-       u32 fgi;
        u32 timestamp = 0;
        int err;
 
-       /* calculate the fifo get index for where to read data */
-       fgi = FIELD_GET(RXFS_FGI_MASK, rxfs);
        err = m_can_fifo_read(cdev, fgi, M_CAN_FIFO_ID, &fifo_header, 2);
        if (err)
                goto out_fail;
@@ -528,9 +530,6 @@ static int m_can_read_fifo(struct net_device *dev, u32 rxfs)
        }
        stats->rx_packets++;
 
-       /* acknowledge rx fifo 0 */
-       m_can_write(cdev, M_CAN_RXF0A, fgi);
-
        timestamp = FIELD_GET(RX_BUF_RXTS_MASK, fifo_header.dlc) << 16;
 
        m_can_receive_skb(cdev, skb, timestamp);
@@ -549,7 +548,11 @@ static int m_can_do_rx_poll(struct net_device *dev, int quota)
        struct m_can_classdev *cdev = netdev_priv(dev);
        u32 pkts = 0;
        u32 rxfs;
-       int err;
+       u32 rx_count;
+       u32 fgi;
+       int ack_fgi = -1;
+       int i;
+       int err = 0;
 
        rxfs = m_can_read(cdev, M_CAN_RXF0S);
        if (!(rxfs & RXFS_FFL_MASK)) {
@@ -557,16 +560,26 @@ static int m_can_do_rx_poll(struct net_device *dev, int quota)
                return 0;
        }
 
-       while ((rxfs & RXFS_FFL_MASK) && (quota > 0)) {
-               err = m_can_read_fifo(dev, rxfs);
+       rx_count = FIELD_GET(RXFS_FFL_MASK, rxfs);
+       fgi = FIELD_GET(RXFS_FGI_MASK, rxfs);
+
+       for (i = 0; i < rx_count && quota > 0; ++i) {
+               err = m_can_read_fifo(dev, fgi);
                if (err)
-                       return err;
+                       break;
 
                quota--;
                pkts++;
-               rxfs = m_can_read(cdev, M_CAN_RXF0S);
+               ack_fgi = fgi;
+               fgi = (++fgi >= cdev->mcfg[MRAM_RXF0].num ? 0 : fgi);
        }
 
+       if (ack_fgi != -1)
+               m_can_write(cdev, M_CAN_RXF0A, ack_fgi);
+
+       if (err)
+               return err;
+
        return pkts;
 }
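
The rework above reads RXF0S once, derives both the fill level and the get
index from it, then walks the FIFO with local index arithmetic and
acknowledges only the last element it consumed. A stand-alone model of that
ring walk in plain user-space C (all values hypothetical):

    #include <stdio.h>

    int main(void)
    {
            unsigned int num = 8;           /* FIFO depth, cf. MRAM_RXF0 num */
            unsigned int fgi = 6;           /* get index from the status reg */
            unsigned int rx_count = 4;      /* fill level from the status reg */
            int ack_fgi = -1;

            for (unsigned int i = 0; i < rx_count; i++) {
                    /* the driver reads the FIFO element at fgi here */
                    ack_fgi = (int)fgi;
                    fgi = (fgi + 1 >= num) ? 0 : fgi + 1;   /* wrap around */
            }

            /* one acknowledge replaces one register write per element */
            if (ack_fgi != -1)
                    printf("write %d to RXF0A once\n", ack_fgi);
            return 0;
    }

The TX event FIFO below gets the same treatment; saving register accesses
matters most for peripheral devices such as the SPI-attached tcan4x5x, where
every access is a bus transaction.
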
 
@@ -900,14 +913,12 @@ static int m_can_handle_bus_errors(struct net_device *dev, u32 irqstatus,
        return work_done;
 }
 
-static int m_can_rx_handler(struct net_device *dev, int quota)
+static int m_can_rx_handler(struct net_device *dev, int quota, u32 irqstatus)
 {
        struct m_can_classdev *cdev = netdev_priv(dev);
        int rx_work_or_err;
        int work_done = 0;
-       u32 irqstatus, psr;
 
-       irqstatus = cdev->irqstatus | m_can_read(cdev, M_CAN_IR);
        if (!irqstatus)
                goto end;
 
@@ -932,13 +943,13 @@ static int m_can_rx_handler(struct net_device *dev, int quota)
                }
        }
 
-       psr = m_can_read(cdev, M_CAN_PSR);
-
        if (irqstatus & IR_ERR_STATE)
-               work_done += m_can_handle_state_errors(dev, psr);
+               work_done += m_can_handle_state_errors(dev,
+                                                      m_can_read(cdev, M_CAN_PSR));
 
        if (irqstatus & IR_ERR_BUS_30X)
-               work_done += m_can_handle_bus_errors(dev, irqstatus, psr);
+               work_done += m_can_handle_bus_errors(dev, irqstatus,
+                                                    m_can_read(cdev, M_CAN_PSR));
 
        if (irqstatus & IR_RF0N) {
                rx_work_or_err = m_can_do_rx_poll(dev, (quota - work_done));
@@ -951,12 +962,12 @@ end:
        return work_done;
 }
 
-static int m_can_rx_peripheral(struct net_device *dev)
+static int m_can_rx_peripheral(struct net_device *dev, u32 irqstatus)
 {
        struct m_can_classdev *cdev = netdev_priv(dev);
        int work_done;
 
-       work_done = m_can_rx_handler(dev, NAPI_POLL_WEIGHT);
+       work_done = m_can_rx_handler(dev, NAPI_POLL_WEIGHT, irqstatus);
 
        /* Don't re-enable interrupts if the driver had a fatal error
         * (e.g., FIFO read failure).
@@ -972,8 +983,11 @@ static int m_can_poll(struct napi_struct *napi, int quota)
        struct net_device *dev = napi->dev;
        struct m_can_classdev *cdev = netdev_priv(dev);
        int work_done;
+       u32 irqstatus;
 
-       work_done = m_can_rx_handler(dev, quota);
+       irqstatus = cdev->irqstatus | m_can_read(cdev, M_CAN_IR);
+
+       work_done = m_can_rx_handler(dev, quota, irqstatus);
 
        /* Don't re-enable interrupts if the driver had a fatal error
         * (e.g., FIFO read failure).
@@ -1014,7 +1028,9 @@ static int m_can_echo_tx_event(struct net_device *dev)
        u32 txe_count = 0;
        u32 m_can_txefs;
        u32 fgi = 0;
+       int ack_fgi = -1;
        int i = 0;
+       int err = 0;
        unsigned int msg_mark;
 
        struct m_can_classdev *cdev = netdev_priv(dev);
@@ -1024,34 +1040,34 @@ static int m_can_echo_tx_event(struct net_device *dev)
 
        /* Get Tx Event fifo element count */
        txe_count = FIELD_GET(TXEFS_EFFL_MASK, m_can_txefs);
+       fgi = FIELD_GET(TXEFS_EFGI_MASK, m_can_txefs);
 
        /* Get and process all sent elements */
        for (i = 0; i < txe_count; i++) {
                u32 txe, timestamp = 0;
-               int err;
-
-               /* retrieve get index */
-               fgi = FIELD_GET(TXEFS_EFGI_MASK, m_can_read(cdev, M_CAN_TXEFS));
 
                /* get message marker, timestamp */
                err = m_can_txe_fifo_read(cdev, fgi, 4, &txe);
                if (err) {
                        netdev_err(dev, "TXE FIFO read returned %d\n", err);
-                       return err;
+                       break;
                }
 
                msg_mark = FIELD_GET(TX_EVENT_MM_MASK, txe);
                timestamp = FIELD_GET(TX_EVENT_TXTS_MASK, txe) << 16;
 
-               /* ack txe element */
-               m_can_write(cdev, M_CAN_TXEFA, FIELD_PREP(TXEFA_EFAI_MASK,
-                                                         fgi));
+               ack_fgi = fgi;
+               fgi = (++fgi >= cdev->mcfg[MRAM_TXE].num ? 0 : fgi);
 
                /* update stats */
                m_can_tx_update_stats(cdev, msg_mark, timestamp);
        }
 
-       return 0;
+       if (ack_fgi != -1)
+               m_can_write(cdev, M_CAN_TXEFA, FIELD_PREP(TXEFA_EFAI_MASK,
+                                                         ack_fgi));
+
+       return err;
 }
 
 static irqreturn_t m_can_isr(int irq, void *dev_id)
@@ -1083,7 +1099,7 @@ static irqreturn_t m_can_isr(int irq, void *dev_id)
                m_can_disable_all_interrupts(cdev);
                if (!cdev->is_peripheral)
                        napi_schedule(&cdev->napi);
-               else if (m_can_rx_peripheral(dev) < 0)
+               else if (m_can_rx_peripheral(dev, ir) < 0)
                        goto out_fail;
        }
 
@@ -1243,10 +1259,17 @@ static int m_can_set_bittiming(struct net_device *dev)
  * - setup bittiming
  * - configure timestamp generation
  */
-static void m_can_chip_config(struct net_device *dev)
+static int m_can_chip_config(struct net_device *dev)
 {
        struct m_can_classdev *cdev = netdev_priv(dev);
        u32 cccr, test;
+       int err;
+
+       err = m_can_init_ram(cdev);
+       if (err) {
+               dev_err(cdev->dev, "Message RAM configuration failed\n");
+               return err;
+       }
 
        m_can_config_endisable(cdev, true);
 
@@ -1370,18 +1393,25 @@ static void m_can_chip_config(struct net_device *dev)
 
        if (cdev->ops->init)
                cdev->ops->init(cdev);
+
+       return 0;
 }
 
-static void m_can_start(struct net_device *dev)
+static int m_can_start(struct net_device *dev)
 {
        struct m_can_classdev *cdev = netdev_priv(dev);
+       int ret;
 
        /* basic m_can configuration */
-       m_can_chip_config(dev);
+       ret = m_can_chip_config(dev);
+       if (ret)
+               return ret;
 
        cdev->can.state = CAN_STATE_ERROR_ACTIVE;
 
        m_can_enable_all_interrupts(cdev);
+
+       return 0;
 }
 
 static int m_can_set_mode(struct net_device *dev, enum can_mode mode)
@@ -1595,6 +1625,7 @@ static netdev_tx_t m_can_tx_handler(struct m_can_classdev *cdev)
        struct sk_buff *skb = cdev->tx_skb;
        struct id_and_dlc fifo_header;
        u32 cccr, fdflags;
+       u32 txfqs;
        int err;
        int putidx;
 
@@ -1651,8 +1682,10 @@ static netdev_tx_t m_can_tx_handler(struct m_can_classdev *cdev)
        } else {
                /* Transmit routine for version >= v3.1.x */
 
+               txfqs = m_can_read(cdev, M_CAN_TXFQS);
+
                /* Check if FIFO full */
-               if (m_can_tx_fifo_full(cdev)) {
+               if (_m_can_tx_fifo_full(txfqs)) {
                        /* This shouldn't happen */
                        netif_stop_queue(dev);
                        netdev_warn(dev,
@@ -1668,8 +1701,7 @@ static netdev_tx_t m_can_tx_handler(struct m_can_classdev *cdev)
                }
 
                /* get put index for frame */
-               putidx = FIELD_GET(TXFQS_TFQPI_MASK,
-                                  m_can_read(cdev, M_CAN_TXFQS));
+               putidx = FIELD_GET(TXFQS_TFQPI_MASK, txfqs);
 
                /* Construct DLC Field, with CAN-FD configuration.
                 * Use the put index of the fifo as the message marker,
@@ -1809,7 +1841,9 @@ static int m_can_open(struct net_device *dev)
        }
 
        /* start the m_can controller */
-       m_can_start(dev);
+       err = m_can_start(dev);
+       if (err)
+               goto exit_irq_fail;
 
        if (!cdev->is_peripheral)
                napi_enable(&cdev->napi);
@@ -2068,9 +2102,13 @@ int m_can_class_resume(struct device *dev)
                ret = m_can_clk_start(cdev);
                if (ret)
                        return ret;
+               ret  = m_can_start(ndev);
+               if (ret) {
+                       m_can_clk_stop(cdev);
+
+                       return ret;
+               }
 
-               m_can_init_ram(cdev);
-               m_can_start(ndev);
                netif_device_attach(ndev);
                netif_start_queue(ndev);
        }
index 52563c0..a839dc7 100644
@@ -7,27 +7,27 @@
 #define _CAN_M_CAN_H_
 
 #include <linux/can/core.h>
+#include <linux/can/dev.h>
 #include <linux/can/rx-offload.h>
+#include <linux/clk.h>
 #include <linux/completion.h>
+#include <linux/delay.h>
 #include <linux/device.h>
 #include <linux/dma-mapping.h>
 #include <linux/freezer.h>
-#include <linux/slab.h>
-#include <linux/uaccess.h>
-#include <linux/clk.h>
-#include <linux/delay.h>
 #include <linux/interrupt.h>
 #include <linux/io.h>
+#include <linux/iopoll.h>
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/netdevice.h>
 #include <linux/of.h>
 #include <linux/of_device.h>
-#include <linux/pm_runtime.h>
-#include <linux/iopoll.h>
-#include <linux/can/dev.h>
-#include <linux/pinctrl/consumer.h>
 #include <linux/phy/phy.h>
+#include <linux/pinctrl/consumer.h>
+#include <linux/pm_runtime.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
 
 /* m_can lec values */
 enum m_can_lec_type {
index eee47ba..9c1dcf8 100644
@@ -5,8 +5,8 @@
 //
 // Copyright (C) 2018-19 Texas Instruments Incorporated - http://www.ti.com/
 
-#include <linux/platform_device.h>
 #include <linux/phy/phy.h>
+#include <linux/platform_device.h>
 
 #include "m_can.h"
 
@@ -140,10 +140,6 @@ static int m_can_plat_probe(struct platform_device *pdev)
 
        platform_set_drvdata(pdev, mcan_class);
 
-       ret = m_can_init_ram(mcan_class);
-       if (ret)
-               goto probe_fail;
-
        pm_runtime_enable(mcan_class->dev);
        ret = m_can_class_register(mcan_class);
        if (ret)
index 41645a2..2342aa0 100644
@@ -10,7 +10,7 @@
 #define TCAN4X5X_DEV_ID1 0x04
 #define TCAN4X5X_REV 0x08
 #define TCAN4X5X_STATUS 0x0C
-#define TCAN4X5X_ERROR_STATUS 0x10
+#define TCAN4X5X_ERROR_STATUS_MASK 0x10
 #define TCAN4X5X_CONTROL 0x14
 
 #define TCAN4X5X_CONFIG 0x800
@@ -204,17 +204,7 @@ static int tcan4x5x_clear_interrupts(struct m_can_classdev *cdev)
        if (ret)
                return ret;
 
-       ret = tcan4x5x_write_tcan_reg(cdev, TCAN4X5X_MCAN_INT_REG,
-                                     TCAN4X5X_ENABLE_MCAN_INT);
-       if (ret)
-               return ret;
-
-       ret = tcan4x5x_write_tcan_reg(cdev, TCAN4X5X_INT_FLAGS,
-                                     TCAN4X5X_CLEAR_ALL_INT);
-       if (ret)
-               return ret;
-
-       return tcan4x5x_write_tcan_reg(cdev, TCAN4X5X_ERROR_STATUS,
+       return tcan4x5x_write_tcan_reg(cdev, TCAN4X5X_INT_FLAGS,
                                       TCAN4X5X_CLEAR_ALL_INT);
 }
 
@@ -234,8 +224,8 @@ static int tcan4x5x_init(struct m_can_classdev *cdev)
        if (ret)
                return ret;
 
-       /* Zero out the MCAN buffers */
-       ret = m_can_init_ram(cdev);
+       ret = tcan4x5x_write_tcan_reg(cdev, TCAN4X5X_ERROR_STATUS_MASK,
+                                     TCAN4X5X_CLEAR_ALL_INT);
        if (ret)
                return ret;
 
index 26e212b..2b218ce 100644
@@ -90,16 +90,47 @@ static int tcan4x5x_regmap_read(void *context,
        return 0;
 }
 
-static const struct regmap_range tcan4x5x_reg_table_yes_range[] = {
-       regmap_reg_range(0x0000, 0x002c),       /* Device ID and SPI Registers */
-       regmap_reg_range(0x0800, 0x083c),       /* Device configuration registers and Interrupt Flags*/
+static const struct regmap_range tcan4x5x_reg_table_wr_range[] = {
+       /* Device ID and SPI Registers */
+       regmap_reg_range(0x000c, 0x0010),
+       /* Device configuration registers and Interrupt Flags*/
+       regmap_reg_range(0x0800, 0x080c),
+       regmap_reg_range(0x0814, 0x0814),
+       regmap_reg_range(0x0820, 0x0820),
+       regmap_reg_range(0x0830, 0x0830),
+       /* M_CAN */
+       regmap_reg_range(0x100c, 0x102c),
+       regmap_reg_range(0x1048, 0x1048),
+       regmap_reg_range(0x1050, 0x105c),
+       regmap_reg_range(0x1080, 0x1088),
+       regmap_reg_range(0x1090, 0x1090),
+       regmap_reg_range(0x1098, 0x10a0),
+       regmap_reg_range(0x10a8, 0x10b0),
+       regmap_reg_range(0x10b8, 0x10c0),
+       regmap_reg_range(0x10c8, 0x10c8),
+       regmap_reg_range(0x10d0, 0x10d4),
+       regmap_reg_range(0x10e0, 0x10e4),
+       regmap_reg_range(0x10f0, 0x10f0),
+       regmap_reg_range(0x10f8, 0x10f8),
+       /* MRAM */
+       regmap_reg_range(0x8000, 0x87fc),
+};
+
+static const struct regmap_range tcan4x5x_reg_table_rd_range[] = {
+       regmap_reg_range(0x0000, 0x0010),       /* Device ID and SPI Registers */
+       regmap_reg_range(0x0800, 0x0830),       /* Device configuration registers and Interrupt Flags*/
        regmap_reg_range(0x1000, 0x10fc),       /* M_CAN */
        regmap_reg_range(0x8000, 0x87fc),       /* MRAM */
 };
 
-static const struct regmap_access_table tcan4x5x_reg_table = {
-       .yes_ranges = tcan4x5x_reg_table_yes_range,
-       .n_yes_ranges = ARRAY_SIZE(tcan4x5x_reg_table_yes_range),
+static const struct regmap_access_table tcan4x5x_reg_table_wr = {
+       .yes_ranges = tcan4x5x_reg_table_wr_range,
+       .n_yes_ranges = ARRAY_SIZE(tcan4x5x_reg_table_wr_range),
+};
+
+static const struct regmap_access_table tcan4x5x_reg_table_rd = {
+       .yes_ranges = tcan4x5x_reg_table_rd_range,
+       .n_yes_ranges = ARRAY_SIZE(tcan4x5x_reg_table_rd_range),
 };
 
 static const struct regmap_config tcan4x5x_regmap = {
@@ -107,8 +138,8 @@ static const struct regmap_config tcan4x5x_regmap = {
        .reg_stride = 4,
        .pad_bits = 8,
        .val_bits = 32,
-       .wr_table = &tcan4x5x_reg_table,
-       .rd_table = &tcan4x5x_reg_table,
+       .wr_table = &tcan4x5x_reg_table_wr,
+       .rd_table = &tcan4x5x_reg_table_rd,
        .max_register = TCAN4X5X_MAX_REGISTER,
        .cache_type = REGCACHE_NONE,
        .read_flag_mask = (__force unsigned long)
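
Splitting the single yes_ranges table into separate read and write tables lets
regmap refuse writes to registers that are only meant to be read while keeping
them readable. A minimal sketch of the pattern with made-up ranges (the foo_*
names are illustrative; the regmap structures and ARRAY_SIZE() are the real
kernel API):

    static const struct regmap_range foo_wr_range[] = {
            regmap_reg_range(0x0008, 0x000c),       /* writable subset */
    };

    static const struct regmap_range foo_rd_range[] = {
            regmap_reg_range(0x0000, 0x000c),       /* readable superset */
    };

    static const struct regmap_access_table foo_wr_table = {
            .yes_ranges = foo_wr_range,
            .n_yes_ranges = ARRAY_SIZE(foo_wr_range),
    };

    static const struct regmap_access_table foo_rd_table = {
            .yes_ranges = foo_rd_range,
            .n_yes_ranges = ARRAY_SIZE(foo_rd_range),
    };

    static const struct regmap_config foo_regmap = {
            .reg_bits = 32,
            .val_bits = 32,
            .wr_table = &foo_wr_table,      /* writes checked against this */
            .rd_table = &foo_rd_table,      /* reads checked against this */
    };
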
index 0a59eab..f6fa715 100644
 
 #define RCANFD_DRV_NAME                        "rcar_canfd"
 
-enum rcanfd_chip_id {
-       RENESAS_RCAR_GEN3 = 0,
-       RENESAS_RZG2L,
-       RENESAS_R8A779A0,
-};
-
 /* Global register bits */
 
 /* RSCFDnCFDGRMCFG */
@@ -522,6 +516,14 @@ enum rcar_canfd_fcanclk {
 
 struct rcar_canfd_global;
 
+struct rcar_canfd_hw_info {
+       u8 max_channels;
+       u8 postdiv;
+       /* hardware features */
+       unsigned shared_global_irqs:1;  /* Has shared global irqs */
+       unsigned multi_channel_irqs:1;  /* Has multiple channel irqs */
+};
+
 /* Channel priv data */
 struct rcar_canfd_channel {
        struct can_priv can;                    /* Must be the first member */
@@ -547,8 +549,7 @@ struct rcar_canfd_global {
        bool fdmode;                    /* CAN FD or Classical CAN only mode */
        struct reset_control *rstc1;
        struct reset_control *rstc2;
-       enum rcanfd_chip_id chip_id;
-       u32 max_channels;
+       const struct rcar_canfd_hw_info *info;
 };
 
 /* CAN FD mode nominal rate constants */
@@ -590,10 +591,28 @@ static const struct can_bittiming_const rcar_canfd_bittiming_const = {
        .brp_inc = 1,
 };
 
+static const struct rcar_canfd_hw_info rcar_gen3_hw_info = {
+       .max_channels = 2,
+       .postdiv = 2,
+       .shared_global_irqs = 1,
+};
+
+static const struct rcar_canfd_hw_info rzg2l_hw_info = {
+       .max_channels = 2,
+       .postdiv = 1,
+       .multi_channel_irqs = 1,
+};
+
+static const struct rcar_canfd_hw_info r8a779a0_hw_info = {
+       .max_channels = 8,
+       .postdiv = 2,
+       .shared_global_irqs = 1,
+};
+
 /* Helper functions */
 static inline bool is_v3u(struct rcar_canfd_global *gpriv)
 {
-       return gpriv->chip_id == RENESAS_R8A779A0;
+       return gpriv->info == &r8a779a0_hw_info;
 }
 
 static inline u32 reg_v3u(struct rcar_canfd_global *gpriv,
@@ -721,7 +740,7 @@ static int rcar_canfd_reset_controller(struct rcar_canfd_global *gpriv)
        rcar_canfd_set_mode(gpriv);
 
        /* Transition all Channels to reset mode */
-       for_each_set_bit(ch, &gpriv->channels_mask, gpriv->max_channels) {
+       for_each_set_bit(ch, &gpriv->channels_mask, gpriv->info->max_channels) {
                rcar_canfd_clear_bit(gpriv->base,
                                     RCANFD_CCTR(ch), RCANFD_CCTR_CSLPR);
 
@@ -762,7 +781,7 @@ static void rcar_canfd_configure_controller(struct rcar_canfd_global *gpriv)
        rcar_canfd_set_bit(gpriv->base, RCANFD_GCFG, cfg);
 
        /* Channel configuration settings */
-       for_each_set_bit(ch, &gpriv->channels_mask, gpriv->max_channels) {
+       for_each_set_bit(ch, &gpriv->channels_mask, gpriv->info->max_channels) {
                rcar_canfd_set_bit(gpriv->base, RCANFD_CCTR(ch),
                                   RCANFD_CCTR_ERRD);
                rcar_canfd_update_bit(gpriv->base, RCANFD_CCTR(ch),
@@ -1142,7 +1161,7 @@ static irqreturn_t rcar_canfd_global_err_interrupt(int irq, void *dev_id)
        struct rcar_canfd_global *gpriv = dev_id;
        u32 ch;
 
-       for_each_set_bit(ch, &gpriv->channels_mask, gpriv->max_channels)
+       for_each_set_bit(ch, &gpriv->channels_mask, gpriv->info->max_channels)
                rcar_canfd_handle_global_err(gpriv, ch);
 
        return IRQ_HANDLED;
@@ -1174,7 +1193,7 @@ static irqreturn_t rcar_canfd_global_receive_fifo_interrupt(int irq, void *dev_i
        struct rcar_canfd_global *gpriv = dev_id;
        u32 ch;
 
-       for_each_set_bit(ch, &gpriv->channels_mask, gpriv->max_channels)
+       for_each_set_bit(ch, &gpriv->channels_mask, gpriv->info->max_channels)
                rcar_canfd_handle_global_receive(gpriv, ch);
 
        return IRQ_HANDLED;
@@ -1188,7 +1207,7 @@ static irqreturn_t rcar_canfd_global_interrupt(int irq, void *dev_id)
        /* Global error interrupts still indicate a condition specific
         * to a channel. RxFIFO interrupt is a global interrupt.
         */
-       for_each_set_bit(ch, &gpriv->channels_mask, gpriv->max_channels) {
+       for_each_set_bit(ch, &gpriv->channels_mask, gpriv->info->max_channels) {
                rcar_canfd_handle_global_err(gpriv, ch);
                rcar_canfd_handle_global_receive(gpriv, ch);
        }
@@ -1284,7 +1303,7 @@ static irqreturn_t rcar_canfd_channel_interrupt(int irq, void *dev_id)
        u32 ch;
 
        /* Common FIFO is a per channel resource */
-       for_each_set_bit(ch, &gpriv->channels_mask, gpriv->max_channels) {
+       for_each_set_bit(ch, &gpriv->channels_mask, gpriv->info->max_channels) {
                rcar_canfd_handle_channel_err(gpriv, ch);
                rcar_canfd_handle_channel_tx(gpriv, ch);
        }
@@ -1696,6 +1715,7 @@ static const struct ethtool_ops rcar_canfd_ethtool_ops = {
 static int rcar_canfd_channel_probe(struct rcar_canfd_global *gpriv, u32 ch,
                                    u32 fcan_freq)
 {
+       const struct rcar_canfd_hw_info *info = gpriv->info;
        struct platform_device *pdev = gpriv->pdev;
        struct rcar_canfd_channel *priv;
        struct net_device *ndev;
@@ -1718,7 +1738,7 @@ static int rcar_canfd_channel_probe(struct rcar_canfd_global *gpriv, u32 ch,
        priv->can.clock.freq = fcan_freq;
        dev_info(&pdev->dev, "can_clk rate is %u\n", priv->can.clock.freq);
 
-       if (gpriv->chip_id == RENESAS_RZG2L) {
+       if (info->multi_channel_irqs) {
                char *irq_name;
                int err_irq;
                int tx_irq;
@@ -1818,6 +1838,7 @@ static void rcar_canfd_channel_remove(struct rcar_canfd_global *gpriv, u32 ch)
 
 static int rcar_canfd_probe(struct platform_device *pdev)
 {
+       const struct rcar_canfd_hw_info *info;
        void __iomem *addr;
        u32 sts, ch, fcan_freq;
        struct rcar_canfd_global *gpriv;
@@ -1826,18 +1847,15 @@ static int rcar_canfd_probe(struct platform_device *pdev)
        int err, ch_irq, g_irq;
        int g_err_irq, g_recc_irq;
        bool fdmode = true;                     /* CAN FD only mode - default */
-       enum rcanfd_chip_id chip_id;
-       int max_channels;
        char name[9] = "channelX";
        int i;
 
-       chip_id = (uintptr_t)of_device_get_match_data(&pdev->dev);
-       max_channels = chip_id == RENESAS_R8A779A0 ? 8 : 2;
+       info = of_device_get_match_data(&pdev->dev);
 
        if (of_property_read_bool(pdev->dev.of_node, "renesas,no-can-fd"))
                fdmode = false;                 /* Classical CAN only mode */
 
-       for (i = 0; i < max_channels; ++i) {
+       for (i = 0; i < info->max_channels; ++i) {
                name[7] = '0' + i;
                of_child = of_get_child_by_name(pdev->dev.of_node, name);
                if (of_child && of_device_is_available(of_child))
@@ -1845,7 +1863,7 @@ static int rcar_canfd_probe(struct platform_device *pdev)
                of_node_put(of_child);
        }
 
-       if (chip_id != RENESAS_RZG2L) {
+       if (info->shared_global_irqs) {
                ch_irq = platform_get_irq_byname_optional(pdev, "ch_int");
                if (ch_irq < 0) {
                        /* For backward compatibility get irq by index */
@@ -1879,8 +1897,7 @@ static int rcar_canfd_probe(struct platform_device *pdev)
        gpriv->pdev = pdev;
        gpriv->channels_mask = channels_mask;
        gpriv->fdmode = fdmode;
-       gpriv->chip_id = chip_id;
-       gpriv->max_channels = max_channels;
+       gpriv->info = info;
 
        gpriv->rstc1 = devm_reset_control_get_optional_exclusive(&pdev->dev,
                                                                 "rstp_n");
@@ -1917,9 +1934,9 @@ static int rcar_canfd_probe(struct platform_device *pdev)
        }
        fcan_freq = clk_get_rate(gpriv->can_clk);
 
-       if (gpriv->fcan == RCANFD_CANFDCLK && gpriv->chip_id != RENESAS_RZG2L)
+       if (gpriv->fcan == RCANFD_CANFDCLK)
                /* CANFD clock is further divided by (1/2) within the IP */
-               fcan_freq /= 2;
+               fcan_freq /= info->postdiv;
 
        addr = devm_platform_ioremap_resource(pdev, 0);
        if (IS_ERR(addr)) {
@@ -1929,7 +1946,7 @@ static int rcar_canfd_probe(struct platform_device *pdev)
        gpriv->base = addr;
 
        /* Request IRQ that's common for both channels */
-       if (gpriv->chip_id != RENESAS_RZG2L) {
+       if (info->shared_global_irqs) {
                err = devm_request_irq(&pdev->dev, ch_irq,
                                       rcar_canfd_channel_interrupt, 0,
                                       "canfd.ch_int", gpriv);
@@ -1995,7 +2012,7 @@ static int rcar_canfd_probe(struct platform_device *pdev)
        rcar_canfd_configure_controller(gpriv);
 
        /* Configure per channel attributes */
-       for_each_set_bit(ch, &gpriv->channels_mask, max_channels) {
+       for_each_set_bit(ch, &gpriv->channels_mask, info->max_channels) {
                /* Configure Channel's Rx fifo */
                rcar_canfd_configure_rx(gpriv, ch);
 
@@ -2021,7 +2038,7 @@ static int rcar_canfd_probe(struct platform_device *pdev)
                goto fail_mode;
        }
 
-       for_each_set_bit(ch, &gpriv->channels_mask, max_channels) {
+       for_each_set_bit(ch, &gpriv->channels_mask, info->max_channels) {
                err = rcar_canfd_channel_probe(gpriv, ch, fcan_freq);
                if (err)
                        goto fail_channel;
@@ -2033,7 +2050,7 @@ static int rcar_canfd_probe(struct platform_device *pdev)
        return 0;
 
 fail_channel:
-       for_each_set_bit(ch, &gpriv->channels_mask, max_channels)
+       for_each_set_bit(ch, &gpriv->channels_mask, info->max_channels)
                rcar_canfd_channel_remove(gpriv, ch);
 fail_mode:
        rcar_canfd_disable_global_interrupts(gpriv);
@@ -2054,7 +2071,7 @@ static int rcar_canfd_remove(struct platform_device *pdev)
        rcar_canfd_reset_controller(gpriv);
        rcar_canfd_disable_global_interrupts(gpriv);
 
-       for_each_set_bit(ch, &gpriv->channels_mask, gpriv->max_channels) {
+       for_each_set_bit(ch, &gpriv->channels_mask, gpriv->info->max_channels) {
                rcar_canfd_disable_channel_interrupts(gpriv->ch[ch]);
                rcar_canfd_channel_remove(gpriv, ch);
        }
@@ -2082,9 +2099,9 @@ static SIMPLE_DEV_PM_OPS(rcar_canfd_pm_ops, rcar_canfd_suspend,
                         rcar_canfd_resume);
 
 static const __maybe_unused struct of_device_id rcar_canfd_of_table[] = {
-       { .compatible = "renesas,rcar-gen3-canfd", .data = (void *)RENESAS_RCAR_GEN3 },
-       { .compatible = "renesas,rzg2l-canfd", .data = (void *)RENESAS_RZG2L },
-       { .compatible = "renesas,r8a779a0-canfd", .data = (void *)RENESAS_R8A779A0 },
+       { .compatible = "renesas,rcar-gen3-canfd", .data = &rcar_gen3_hw_info },
+       { .compatible = "renesas,rzg2l-canfd", .data = &rzg2l_hw_info },
+       { .compatible = "renesas,r8a779a0-canfd", .data = &r8a779a0_hw_info },
        { }
 };
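
With the enum gone, every per-SoC difference lives in one capability struct
hung off the OF match table, and call sites test features instead of comparing
chip IDs. A condensed sketch of the lookup (identifiers from the hunks above,
surrounding probe code elided):

    const struct rcar_canfd_hw_info *info;

    /* match data is a pointer to the capability struct,
     * no longer an enum smuggled through a cast
     */
    info = of_device_get_match_data(&pdev->dev);

    if (info->shared_global_irqs) {
            /* request the shared ch_int/g_int interrupts */
    }
    if (info->multi_channel_irqs) {
            /* request per-channel err/tx interrupts instead */
    }
    fcan_freq /= info->postdiv;

Supporting a new SoC then needs only a new rcar_canfd_hw_info instance and a
match entry.
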
 
index fbb3413..f4db770 100644
@@ -864,12 +864,14 @@ static void slcan_close(struct tty_struct *tty)
 {
        struct slcan *sl = (struct slcan *)tty->disc_data;
 
-       /* unregister_netdev() calls .ndo_stop() so we don't have to.
-        * Our .ndo_stop() also flushes the TTY write wakeup handler,
-        * so we can safely set sl->tty = NULL after this.
-        */
        unregister_candev(sl->dev);
 
+       /*
+        * The netdev needn't be UP (so .ndo_stop() is not called). Hence make
+        * sure the tx work is not still running before freeing things up.
+        */
+       flush_work(&sl->tx_work);
+
        /* Mark channel as dead */
        spin_lock_bh(&sl->lock);
        tty->disc_data = NULL;
index 8c6fea6..445504a 100644
@@ -30,6 +30,7 @@ config CAN_ESD_USB
 config CAN_ETAS_ES58X
        tristate "ETAS ES58X CAN/USB interfaces"
        select CRC16
+       select NET_DEVLINK
        help
          This driver supports the ES581.4, ES582.1 and ES584.1 interfaces
          from ETAS GmbH (https://www.etas.com/en/products/es58x.php).
index 81b88e9..42323f5 100644
@@ -234,6 +234,10 @@ static void esd_usb_rx_event(struct esd_usb_net_priv *priv,
                u8 rxerr = msg->msg.rx.data[2];
                u8 txerr = msg->msg.rx.data[3];
 
+               netdev_dbg(priv->netdev,
+                          "CAN_ERR_EV_EXT: dlc=%#02x state=%02x ecc=%02x rec=%02x tec=%02x\n",
+                          msg->msg.rx.dlc, state, ecc, rxerr, txerr);
+
                skb = alloc_can_err_skb(priv->netdev, &cf);
                if (skb == NULL) {
                        stats->rx_dropped++;
@@ -260,6 +264,8 @@ static void esd_usb_rx_event(struct esd_usb_net_priv *priv,
                                break;
                        default:
                                priv->can.state = CAN_STATE_ERROR_ACTIVE;
+                               txerr = 0;
+                               rxerr = 0;
                                break;
                        }
                } else {
index a129b4a..d6667eb 100644
@@ -1,3 +1,3 @@
 # SPDX-License-Identifier: GPL-2.0
 obj-$(CONFIG_CAN_ETAS_ES58X) += etas_es58x.o
-etas_es58x-y = es58x_core.o es581_4.o es58x_fd.o
+etas_es58x-y = es58x_core.o es58x_devlink.o es581_4.o es58x_fd.o
index 1bcdcec..4151b18 100644
@@ -6,12 +6,12 @@
  *
  * Copyright (c) 2019 Robert Bosch Engineering and Business Solutions. All rights reserved.
  * Copyright (c) 2020 ETAS K.K.. All rights reserved.
- * Copyright (c) 2020, 2021 Vincent Mailhol <mailhol.vincent@wanadoo.fr>
+ * Copyright (c) 2020-2022 Vincent Mailhol <mailhol.vincent@wanadoo.fr>
  */
 
+#include <asm/unaligned.h>
 #include <linux/kernel.h>
 #include <linux/units.h>
-#include <asm/unaligned.h>
 
 #include "es58x_core.h"
 #include "es581_4.h"
index ddb7c57..0c7f750 100644
@@ -7,15 +7,16 @@
  *
  * Copyright (c) 2019 Robert Bosch Engineering and Business Solutions. All rights reserved.
  * Copyright (c) 2020 ETAS K.K.. All rights reserved.
- * Copyright (c) 2020, 2021 Vincent Mailhol <mailhol.vincent@wanadoo.fr>
+ * Copyright (c) 2020-2022 Vincent Mailhol <mailhol.vincent@wanadoo.fr>
  */
 
+#include <asm/unaligned.h>
+#include <linux/crc16.h>
 #include <linux/ethtool.h>
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/usb.h>
-#include <linux/crc16.h>
-#include <asm/unaligned.h>
+#include <net/devlink.h>
 
 #include "es58x_core.h"
 
@@ -2038,10 +2039,16 @@ static int es58x_set_mode(struct net_device *netdev, enum can_mode mode)
  * @es58x_dev: ES58X device.
  * @priv: ES58X private parameters related to the network device.
  * @channel_idx: Index of the network device.
+ *
+ * Return: zero on success, errno if devlink port could not be
+ *     properly registered.
  */
-static void es58x_init_priv(struct es58x_device *es58x_dev,
-                           struct es58x_priv *priv, int channel_idx)
+static int es58x_init_priv(struct es58x_device *es58x_dev,
+                          struct es58x_priv *priv, int channel_idx)
 {
+       struct devlink_port_attrs attrs = {
+               .flavour = DEVLINK_PORT_FLAVOUR_PHYSICAL,
+       };
        const struct es58x_parameters *param = es58x_dev->param;
        struct can_priv *can = &priv->can;
 
@@ -2060,6 +2067,10 @@ static void es58x_init_priv(struct es58x_device *es58x_dev,
        can->state = CAN_STATE_STOPPED;
        can->ctrlmode_supported = param->ctrlmode_supported;
        can->do_set_mode = es58x_set_mode;
+
+       devlink_port_attrs_set(&priv->devlink_port, &attrs);
+       return devlink_port_register(priv_to_devlink(es58x_dev),
+                                    &priv->devlink_port, channel_idx);
 }
 
 /**
@@ -2083,7 +2094,10 @@ static int es58x_init_netdev(struct es58x_device *es58x_dev, int channel_idx)
        }
        SET_NETDEV_DEV(netdev, dev);
        es58x_dev->netdev[channel_idx] = netdev;
-       es58x_init_priv(es58x_dev, es58x_priv(netdev), channel_idx);
+       ret = es58x_init_priv(es58x_dev, es58x_priv(netdev), channel_idx);
+       if (ret)
+               goto free_candev;
+       SET_NETDEV_DEVLINK_PORT(netdev, &es58x_priv(netdev)->devlink_port);
 
        netdev->netdev_ops = &es58x_netdev_ops;
        netdev->ethtool_ops = &es58x_ethtool_ops;
@@ -2091,16 +2105,20 @@ static int es58x_init_netdev(struct es58x_device *es58x_dev, int channel_idx)
        netdev->dev_port = channel_idx;
 
        ret = register_candev(netdev);
-       if (ret) {
-               es58x_dev->netdev[channel_idx] = NULL;
-               free_candev(netdev);
-               return ret;
-       }
+       if (ret)
+               goto devlink_port_unregister;
 
        netdev_queue_set_dql_min_limit(netdev_get_tx_queue(netdev, 0),
                                       es58x_dev->param->dql_min_limit);
 
        return ret;
+
+ devlink_port_unregister:
+       devlink_port_unregister(&es58x_priv(netdev)->devlink_port);
+ free_candev:
+       es58x_dev->netdev[channel_idx] = NULL;
+       free_candev(netdev);
+       return ret;
 }
 
 /**
@@ -2117,54 +2135,13 @@ static void es58x_free_netdevs(struct es58x_device *es58x_dev)
                if (!netdev)
                        continue;
                unregister_candev(netdev);
+               devlink_port_unregister(&es58x_priv(netdev)->devlink_port);
                es58x_dev->netdev[i] = NULL;
                free_candev(netdev);
        }
 }
 
 /**
- * es58x_get_product_info() - Get the product information and print them.
- * @es58x_dev: ES58X device.
- *
- * Do a synchronous call to get the product information.
- *
- * Return: zero on success, errno when any error occurs.
- */
-static int es58x_get_product_info(struct es58x_device *es58x_dev)
-{
-       struct usb_device *udev = es58x_dev->udev;
-       const int es58x_prod_info_idx = 6;
-       /* Empirical tests show a prod_info length of maximum 83,
-        * below should be more than enough.
-        */
-       const size_t prod_info_len = 127;
-       char *prod_info;
-       int ret;
-
-       prod_info = kmalloc(prod_info_len, GFP_KERNEL);
-       if (!prod_info)
-               return -ENOMEM;
-
-       ret = usb_string(udev, es58x_prod_info_idx, prod_info, prod_info_len);
-       if (ret < 0) {
-               dev_err(es58x_dev->dev,
-                       "%s: Could not read the product info: %pe\n",
-                       __func__, ERR_PTR(ret));
-               goto out_free;
-       }
-       if (ret >= prod_info_len - 1) {
-               dev_warn(es58x_dev->dev,
-                        "%s: Buffer is too small, result might be truncated\n",
-                        __func__);
-       }
-       dev_info(es58x_dev->dev, "Product info: %s\n", prod_info);
-
- out_free:
-       kfree(prod_info);
-       return ret < 0 ? ret : 0;
-}
-
-/**
  * es58x_init_es58x_dev() - Initialize the ES58X device.
  * @intf: USB interface.
  * @driver_info: Quirks of the device.
@@ -2177,6 +2154,7 @@ static struct es58x_device *es58x_init_es58x_dev(struct usb_interface *intf,
 {
        struct device *dev = &intf->dev;
        struct es58x_device *es58x_dev;
+       struct devlink *devlink;
        const struct es58x_parameters *param;
        const struct es58x_operators *ops;
        struct usb_device *udev = interface_to_usbdev(intf);
@@ -2199,11 +2177,12 @@ static struct es58x_device *es58x_init_es58x_dev(struct usb_interface *intf,
                ops = &es581_4_ops;
        }
 
-       es58x_dev = devm_kzalloc(dev, es58x_sizeof_es58x_device(param),
-                                GFP_KERNEL);
-       if (!es58x_dev)
+       devlink = devlink_alloc(&es58x_dl_ops, es58x_sizeof_es58x_device(param),
+                               dev);
+       if (!devlink)
                return ERR_PTR(-ENOMEM);
 
+       es58x_dev = devlink_priv(devlink);
        es58x_dev->param = param;
        es58x_dev->ops = ops;
        es58x_dev->dev = dev;
@@ -2240,25 +2219,24 @@ static int es58x_probe(struct usb_interface *intf,
                       const struct usb_device_id *id)
 {
        struct es58x_device *es58x_dev;
-       int ch_idx, ret;
+       int ch_idx;
 
        es58x_dev = es58x_init_es58x_dev(intf, id->driver_info);
        if (IS_ERR(es58x_dev))
                return PTR_ERR(es58x_dev);
 
-       ret = es58x_get_product_info(es58x_dev);
-       if (ret)
-               return ret;
+       es58x_parse_product_info(es58x_dev);
+       devlink_register(priv_to_devlink(es58x_dev));
 
        for (ch_idx = 0; ch_idx < es58x_dev->num_can_ch; ch_idx++) {
-               ret = es58x_init_netdev(es58x_dev, ch_idx);
+               int ret = es58x_init_netdev(es58x_dev, ch_idx);
                if (ret) {
                        es58x_free_netdevs(es58x_dev);
                        return ret;
                }
        }
 
-       return ret;
+       return 0;
 }
 
 /**
@@ -2275,8 +2253,10 @@ static void es58x_disconnect(struct usb_interface *intf)
        dev_info(&intf->dev, "Disconnecting %s %s\n",
                 es58x_dev->udev->manufacturer, es58x_dev->udev->product);
 
+       devlink_unregister(priv_to_devlink(es58x_dev));
        es58x_free_netdevs(es58x_dev);
        es58x_free_urbs(es58x_dev);
+       devlink_free(priv_to_devlink(es58x_dev));
        usb_set_intfdata(intf, NULL);
 }
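
Taken together, the es58x_core.c hunks establish a devlink lifecycle whose
teardown mirrors setup in strict reverse order. A condensed sketch (error
paths compressed; sizeof_priv stands in for es58x_sizeof_es58x_device(param),
everything else appears in the patch):

    /* probe */
    devlink = devlink_alloc(&es58x_dl_ops, sizeof_priv, dev);
    es58x_dev = devlink_priv(devlink);
    devlink_register(devlink);
    devlink_port_register(devlink, &priv->devlink_port, channel_idx);
    register_candev(netdev);

    /* disconnect: reverse order */
    unregister_candev(netdev);
    devlink_port_unregister(&priv->devlink_port);
    devlink_unregister(devlink);
    devlink_free(devlink);

The device private data now lives inside the devlink allocation (via
devlink_priv()), which is why the devm_kzalloc() call could be dropped.
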
 
index 640fe0a..c1ba1a4 100644
@@ -6,17 +6,18 @@
  *
  * Copyright (c) 2019 Robert Bosch Engineering and Business Solutions. All rights reserved.
  * Copyright (c) 2020 ETAS K.K.. All rights reserved.
- * Copyright (c) 2020, 2021 Vincent Mailhol <mailhol.vincent@wanadoo.fr>
+ * Copyright (c) 2020-2022 Vincent Mailhol <mailhol.vincent@wanadoo.fr>
  */
 
 #ifndef __ES58X_COMMON_H__
 #define __ES58X_COMMON_H__
 
-#include <linux/types.h>
-#include <linux/usb.h>
-#include <linux/netdevice.h>
 #include <linux/can.h>
 #include <linux/can/dev.h>
+#include <linux/netdevice.h>
+#include <linux/types.h>
+#include <linux/usb.h>
+#include <net/devlink.h>
 
 #include "es581_4.h"
 #include "es58x_fd.h"
@@ -230,6 +231,7 @@ union es58x_urb_cmd {
  * @can: struct can_priv must be the first member (Socket CAN relies
  *     on the fact that function netdev_priv() returns a pointer to
  *     a struct can_priv).
+ * @devlink_port: devlink instance for the network interface.
  * @es58x_dev: pointer to the corresponding ES58X device.
  * @tx_urb: Used as a buffer to concatenate the TX messages and to do
  *     a bulk send. Please refer to es58x_start_xmit() for more
@@ -255,6 +257,7 @@ union es58x_urb_cmd {
  */
 struct es58x_priv {
        struct can_priv can;
+       struct devlink_port devlink_port;
        struct es58x_device *es58x_dev;
        struct urb *tx_urb;
 
@@ -357,6 +360,39 @@ struct es58x_operators {
 };
 
 /**
+ * struct es58x_sw_version - Version number of the firmware or the
+ *     bootloader.
+ * @major: Version major number, represented on two digits.
+ * @minor: Version minor number, represented on two digits.
+ * @revision: Version revision number, represented on two digits.
+ *
+ * The firmware and the bootloader share the same format: "xx.xx.xx"
+ * where 'x' is a digit. Both can be retrieved from the product
+ * information string.
+ */
+struct es58x_sw_version {
+       u8 major;
+       u8 minor;
+       u8 revision;
+};
+
+/**
+ * struct es58x_hw_revision - Hardware revision number.
+ * @letter: Revision letter.
+ * @major: Version major number, represented on three digits.
+ * @minor: Version minor number, represented on three digits.
+ *
+ * The hardware revision uses its own format: "axxx/xxx" where 'a' is
+ * a letter and 'x' a digit. It can be retrieved from the product
+ * information string.
+ */
+struct es58x_hw_revision {
+       char letter;
+       u16 major;
+       u16 minor;
+};
+
+/**
  * struct es58x_device - All information specific to an ES58X device.
  * @dev: Device information.
  * @udev: USB device information.
@@ -373,6 +409,9 @@ struct es58x_operators {
  *     queue wake/stop logic should prevent this URB from getting
  *     empty. Please refer to es58x_get_tx_urb() for more details.
  * @tx_urbs_idle_cnt: number of urbs in @tx_urbs_idle.
+ * @firmware_version: The firmware version number.
+ * @bootloader_version: The bootloader version number.
+ * @hardware_revision: The hardware revision number.
  * @ktime_req_ns: kernel timestamp when es58x_set_realtime_diff_ns()
  *     was called.
  * @realtime_diff_ns: difference in nanoseconds between the clocks of
@@ -408,6 +447,10 @@ struct es58x_device {
        struct usb_anchor tx_urbs_idle;
        atomic_t tx_urbs_idle_cnt;
 
+       struct es58x_sw_version firmware_version;
+       struct es58x_sw_version bootloader_version;
+       struct es58x_hw_revision hardware_revision;
+
        u64 ktime_req_ns;
        s64 realtime_diff_ns;
 
@@ -674,6 +717,7 @@ static inline enum es58x_flag es58x_get_flags(const struct sk_buff *skb)
        return es58x_flags;
 }
 
+/* es58x_core.c. */
 int es58x_can_get_echo_skb(struct net_device *netdev, u32 packet_idx,
                           u64 *tstamps, unsigned int pkts);
 int es58x_tx_ack_msg(struct net_device *netdev, u16 tx_free_entries,
@@ -691,9 +735,15 @@ int es58x_rx_cmd_ret_u32(struct net_device *netdev,
 int es58x_send_msg(struct es58x_device *es58x_dev, u8 cmd_type, u8 cmd_id,
                   const void *msg, u16 cmd_len, int channel_idx);
 
+/* es58x_devlink.c. */
+void es58x_parse_product_info(struct es58x_device *es58x_dev);
+extern const struct devlink_ops es58x_dl_ops;
+
+/* es581_4.c. */
 extern const struct es58x_parameters es581_4_param;
 extern const struct es58x_operators es581_4_ops;
 
+/* es58x_fd.c. */
 extern const struct es58x_parameters es58x_fd_param;
 extern const struct es58x_operators es58x_fd_ops;
 
diff --git a/drivers/net/can/usb/etas_es58x/es58x_devlink.c b/drivers/net/can/usb/etas_es58x/es58x_devlink.c
new file mode 100644
index 0000000..9fba29e
--- /dev/null
@@ -0,0 +1,235 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/* Driver for ETAS GmbH ES58X USB CAN(-FD) Bus Interfaces.
+ *
+ * File es58x_devlink.c: report the product information using devlink.
+ *
+ * Copyright (c) 2022 Vincent Mailhol <mailhol.vincent@wanadoo.fr>
+ */
+
+#include <linux/ctype.h>
+#include <linux/device.h>
+#include <linux/usb.h>
+#include <net/devlink.h>
+
+#include "es58x_core.h"
+
+/* USB descriptor index containing the product information string. */
+#define ES58X_PROD_INFO_IDX 6
+
+/**
+ * es58x_parse_sw_version() - Extract boot loader or firmware version.
+ * @es58x_dev: ES58X device.
+ * @prod_info: USB custom string returned by the device.
+ * @prefix: Select which information should be parsed. Set it to "FW"
+ *     to parse the firmware version or to "BL" to parse the
+ *     bootloader version.
+ *
+ * The @prod_info string contains the firmware and the bootloader
+ * version number all prefixed by a magic string and concatenated with
+ * other numbers. Depending on the device, the firmware (bootloader)
+ * format is either "FW_Vxx.xx.xx" ("BL_Vxx.xx.xx") or "FW:xx.xx.xx"
+ * ("BL:xx.xx.xx") where 'x' represents a digit. @prod_info must
+ * contain the common part of those prefixes: "FW" or "BL".
+ *
+ * Parse @prod_info and store the version number in
+ * &es58x_dev.firmware_version or &es58x_dev.bootloader_version
+ * according to @prefix value.
+ *
+ * Return: zero on success, -EINVAL if @prefix contains an invalid
+ *     value and -EBADMSG if @prod_info could not be parsed.
+ */
+static int es58x_parse_sw_version(struct es58x_device *es58x_dev,
+                                 const char *prod_info, const char *prefix)
+{
+       struct es58x_sw_version *version;
+       int major, minor, revision;
+
+       if (!strcmp(prefix, "FW"))
+               version = &es58x_dev->firmware_version;
+       else if (!strcmp(prefix, "BL"))
+               version = &es58x_dev->bootloader_version;
+       else
+               return -EINVAL;
+
+       /* Go to prefix */
+       prod_info = strstr(prod_info, prefix);
+       if (!prod_info)
+               return -EBADMSG;
+       /* Go to beginning of the version number */
+       while (!isdigit(*prod_info)) {
+               prod_info++;
+               if (!*prod_info)
+                       return -EBADMSG;
+       }
+
+       if (sscanf(prod_info, "%2u.%2u.%2u", &major, &minor, &revision) != 3)
+               return -EBADMSG;
+
+       version->major = major;
+       version->minor = minor;
+       version->revision = revision;
+
+       return 0;
+}
+
+/**
+ * es58x_parse_hw_rev() - Extract hardware revision number.
+ * @es58x_dev: ES58X device.
+ * @prod_info: USB custom string returned by the device.
+ *
+ * @prod_info contains the hardware revision prefixed by a magic
+ * string and concatenated with other numbers. Depending on
+ * the device, the hardware revision format is either
+ * "HW_VER:axxx/xxx" or "HR:axxx/xxx" where 'a' represents a letter
+ * and 'x' a digit.
+ *
+ * Parse @prod_info and store the hardware revision number in
+ * &es58x_dev.hardware_revision.
+ *
+ * Return: zero on success, -EBADMSG if @prod_info could not be
+ *     parsed.
+ */
+static int es58x_parse_hw_rev(struct es58x_device *es58x_dev,
+                             const char *prod_info)
+{
+       char letter;
+       int major, minor;
+
+       /* The only occurrence of 'H' is in the hardware revision prefix. */
+       prod_info = strchr(prod_info, 'H');
+       if (!prod_info)
+               return -EBADMSG;
+       /* Go to beginning of the hardware revision */
+       prod_info = strchr(prod_info, ':');
+       if (!prod_info)
+               return -EBADMSG;
+       prod_info++;
+
+       if (sscanf(prod_info, "%c%3u/%3u", &letter, &major, &minor) != 3)
+               return -EBADMSG;
+
+       es58x_dev->hardware_revision.letter = letter;
+       es58x_dev->hardware_revision.major = major;
+       es58x_dev->hardware_revision.minor = minor;
+
+       return 0;
+}
+
+/**
+ * es58x_parse_product_info() - Parse the ES58x product information
+ *     string.
+ * @es58x_dev: ES58X device.
+ *
+ * Retrieve the product information string and parse it to extract the
+ * firmware version, the bootloader version and the hardware
+ * revision.
+ *
+ * If the function fails, simply emit a log message and continue
+ * because product information is not critical for the driver to
+ * operate.
+ */
+void es58x_parse_product_info(struct es58x_device *es58x_dev)
+{
+       char *prod_info;
+
+       prod_info = usb_cache_string(es58x_dev->udev, ES58X_PROD_INFO_IDX);
+       if (!prod_info) {
+               dev_warn(es58x_dev->dev,
+                        "could not retrieve the product info string\n");
+               return;
+       }
+
+       if (es58x_parse_sw_version(es58x_dev, prod_info, "FW") ||
+           es58x_parse_sw_version(es58x_dev, prod_info, "BL") ||
+           es58x_parse_hw_rev(es58x_dev, prod_info))
+               dev_info(es58x_dev->dev,
+                        "could not parse product info: '%s'\n", prod_info);
+
+       kfree(prod_info);
+}
+
+/**
+ * es58x_sw_version_is_set() - Check if the version is a valid number.
+ * @sw_ver: Version number of either the firmware or the bootloader.
+ *
+ * If &es58x_sw_version.major, &es58x_sw_version.minor and
+ * &es58x_sw_version.revision are all zero, the product string could
+ * not be parsed and the version number is invalid.
+ */
+static inline bool es58x_sw_version_is_set(struct es58x_sw_version *sw_ver)
+{
+       return sw_ver->major || sw_ver->minor || sw_ver->revision;
+}
+
+/**
+ * es58x_hw_revision_is_set() - Check if the revision is a valid number.
+ * @hw_rev: Revision number of the hardware.
+ *
+ * If &es58x_hw_revision.letter is the null character, the product
+ * string could not be parsed and the hardware revision number is
+ * invalid.
+ */
+static inline bool es58x_hw_revision_is_set(struct es58x_hw_revision *hw_rev)
+{
+       return hw_rev->letter != '\0';
+}
+
+/**
+ * es58x_devlink_info_get() - Report the product information.
+ * @devlink: Devlink.
+ * @req: skb wrapper where to put requested information.
+ * @extack: Unused.
+ *
+ * Report the firmware version, the bootloader version, the hardware
+ * revision and the serial number through netlink.
+ *
+ * Return: zero on success, errno when any error occurs.
+ */
+static int es58x_devlink_info_get(struct devlink *devlink,
+                                 struct devlink_info_req *req,
+                                 struct netlink_ext_ack *extack)
+{
+       struct es58x_device *es58x_dev = devlink_priv(devlink);
+       struct es58x_sw_version *fw_ver = &es58x_dev->firmware_version;
+       struct es58x_sw_version *bl_ver = &es58x_dev->bootloader_version;
+       struct es58x_hw_revision *hw_rev = &es58x_dev->hardware_revision;
+       char buf[max(sizeof("xx.xx.xx"), sizeof("axxx/xxx"))];
+       int ret = 0;
+
+       if (es58x_sw_version_is_set(fw_ver)) {
+               snprintf(buf, sizeof(buf), "%02u.%02u.%02u",
+                        fw_ver->major, fw_ver->minor, fw_ver->revision);
+               ret = devlink_info_version_running_put(req,
+                                                      DEVLINK_INFO_VERSION_GENERIC_FW,
+                                                      buf);
+               if (ret)
+                       return ret;
+       }
+
+       if (es58x_sw_version_is_set(bl_ver)) {
+               snprintf(buf, sizeof(buf), "%02u.%02u.%02u",
+                        bl_ver->major, bl_ver->minor, bl_ver->revision);
+               ret = devlink_info_version_running_put(req,
+                                                      DEVLINK_INFO_VERSION_GENERIC_FW_BOOTLOADER,
+                                                      buf);
+               if (ret)
+                       return ret;
+       }
+
+       if (es58x_hw_revision_is_set(hw_rev)) {
+               snprintf(buf, sizeof(buf), "%c%03u/%03u",
+                        hw_rev->letter, hw_rev->major, hw_rev->minor);
+               ret = devlink_info_version_fixed_put(req,
+                                                    DEVLINK_INFO_VERSION_GENERIC_BOARD_REV,
+                                                    buf);
+               if (ret)
+                       return ret;
+       }
+
+       return devlink_info_serial_number_put(req, es58x_dev->udev->serial);
+}
+
+const struct devlink_ops es58x_dl_ops = {
+       .info_get = es58x_devlink_info_get,
+};
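
For reference, a user-space sketch of the hardware-revision parsing introduced above. The "%c%3u/%3u" scan format and the "%c%03u/%03u" devlink output format are taken from the patch; the sample product string is hypothetical:

/* Minimal sketch, not the driver: the real string comes from
 * usb_cache_string() on the device's product info descriptor.
 */
#include <stdio.h>

int main(void)
{
	const char *prod_info = "S01/014";	/* hypothetical sample */
	char letter;
	unsigned int major, minor;

	if (sscanf(prod_info, "%c%3u/%3u", &letter, &major, &minor) != 3)
		return 1;	/* es58x_parse_hw_rev() returns -EBADMSG */

	printf("board.rev: %c%03u/%03u\n", letter, major, minor);
	return 0;
}
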
index c97ffa7..fa87b0b 100644 (file)
@@ -8,12 +8,12 @@
  *
  * Copyright (c) 2019 Robert Bosch Engineering and Business Solutions. All rights reserved.
  * Copyright (c) 2020 ETAS K.K.. All rights reserved.
- * Copyright (c) 2020, 2021 Vincent Mailhol <mailhol.vincent@wanadoo.fr>
+ * Copyright (c) 2020-2022 Vincent Mailhol <mailhol.vincent@wanadoo.fr>
  */
 
+#include <asm/unaligned.h>
 #include <linux/kernel.h>
 #include <linux/units.h>
-#include <asm/unaligned.h>
 
 #include "es58x_core.h"
 #include "es58x_fd.h"
index 838744d..d476c28 100644 (file)
@@ -299,7 +299,6 @@ struct gs_can {
 
        struct net_device *netdev;
        struct usb_device *udev;
-       struct usb_interface *iface;
 
        struct can_bittiming_const bt_const, data_bt_const;
        unsigned int channel;   /* channel number */
@@ -383,8 +382,7 @@ static int gs_cmd_reset(struct gs_can *dev)
                .mode = GS_CAN_MODE_RESET,
        };
 
-       return usb_control_msg_send(interface_to_usbdev(dev->iface), 0,
-                                   GS_USB_BREQ_MODE,
+       return usb_control_msg_send(dev->udev, 0, GS_USB_BREQ_MODE,
                                    USB_DIR_OUT | USB_TYPE_VENDOR | USB_RECIP_INTERFACE,
                                    dev->channel, 0, &dm, sizeof(dm), 1000,
                                    GFP_KERNEL);
@@ -396,8 +394,7 @@ static inline int gs_usb_get_timestamp(const struct gs_can *dev,
        __le32 timestamp;
        int rc;
 
-       rc = usb_control_msg_recv(interface_to_usbdev(dev->iface), 0,
-                                 GS_USB_BREQ_TIMESTAMP,
+       rc = usb_control_msg_recv(dev->udev, 0, GS_USB_BREQ_TIMESTAMP,
                                  USB_DIR_IN | USB_TYPE_VENDOR | USB_RECIP_INTERFACE,
                                  dev->channel, 0,
                                  &timestamp, sizeof(timestamp),
@@ -674,8 +671,7 @@ static int gs_usb_set_bittiming(struct net_device *netdev)
        };
 
        /* request bit timings */
-       return usb_control_msg_send(interface_to_usbdev(dev->iface), 0,
-                                   GS_USB_BREQ_BITTIMING,
+       return usb_control_msg_send(dev->udev, 0, GS_USB_BREQ_BITTIMING,
                                    USB_DIR_OUT | USB_TYPE_VENDOR | USB_RECIP_INTERFACE,
                                    dev->channel, 0, &dbt, sizeof(dbt), 1000,
                                    GFP_KERNEL);
@@ -698,8 +694,7 @@ static int gs_usb_set_data_bittiming(struct net_device *netdev)
                request = GS_USB_BREQ_QUIRK_CANTACT_PRO_DATA_BITTIMING;
 
        /* request data bit timings */
-       return usb_control_msg_send(interface_to_usbdev(dev->iface), 0,
-                                   request,
+       return usb_control_msg_send(dev->udev, 0, request,
                                    USB_DIR_OUT | USB_TYPE_VENDOR | USB_RECIP_INTERFACE,
                                    dev->channel, 0, &dbt, sizeof(dbt), 1000,
                                    GFP_KERNEL);
@@ -941,8 +936,7 @@ static int gs_can_open(struct net_device *netdev)
        /* finally start device */
        dev->can.state = CAN_STATE_ERROR_ACTIVE;
        dm.flags = cpu_to_le32(flags);
-       rc = usb_control_msg_send(interface_to_usbdev(dev->iface), 0,
-                                 GS_USB_BREQ_MODE,
+       rc = usb_control_msg_send(dev->udev, 0, GS_USB_BREQ_MODE,
                                  USB_DIR_OUT | USB_TYPE_VENDOR | USB_RECIP_INTERFACE,
                                  dev->channel, 0, &dm, sizeof(dm), 1000,
                                  GFP_KERNEL);
@@ -969,8 +963,7 @@ static int gs_usb_get_state(const struct net_device *netdev,
        struct gs_device_state ds;
        int rc;
 
-       rc = usb_control_msg_recv(interface_to_usbdev(dev->iface), 0,
-                                 GS_USB_BREQ_GET_STATE,
+       rc = usb_control_msg_recv(dev->udev, 0, GS_USB_BREQ_GET_STATE,
                                  USB_DIR_IN | USB_TYPE_VENDOR | USB_RECIP_INTERFACE,
                                  dev->channel, 0,
                                  &ds, sizeof(ds),
@@ -1064,8 +1057,7 @@ static int gs_usb_set_identify(struct net_device *netdev, bool do_identify)
        else
                imode.mode = cpu_to_le32(GS_CAN_IDENTIFY_OFF);
 
-       return usb_control_msg_send(interface_to_usbdev(dev->iface), 0,
-                                   GS_USB_BREQ_IDENTIFY,
+       return usb_control_msg_send(dev->udev, 0, GS_USB_BREQ_IDENTIFY,
                                    USB_DIR_OUT | USB_TYPE_VENDOR | USB_RECIP_INTERFACE,
                                    dev->channel, 0, &imode, sizeof(imode), 100,
                                    GFP_KERNEL);
@@ -1118,8 +1110,7 @@ static int gs_usb_get_termination(struct net_device *netdev, u16 *term)
        struct gs_device_termination_state term_state;
        int rc;
 
-       rc = usb_control_msg_recv(interface_to_usbdev(dev->iface), 0,
-                                 GS_USB_BREQ_GET_TERMINATION,
+       rc = usb_control_msg_recv(dev->udev, 0, GS_USB_BREQ_GET_TERMINATION,
                                  USB_DIR_IN | USB_TYPE_VENDOR | USB_RECIP_INTERFACE,
                                  dev->channel, 0,
                                  &term_state, sizeof(term_state), 1000,
@@ -1145,8 +1136,7 @@ static int gs_usb_set_termination(struct net_device *netdev, u16 term)
        else
                term_state.state = cpu_to_le32(GS_CAN_TERMINATION_STATE_OFF);
 
-       return usb_control_msg_send(interface_to_usbdev(dev->iface), 0,
-                                   GS_USB_BREQ_SET_TERMINATION,
+       return usb_control_msg_send(dev->udev, 0, GS_USB_BREQ_SET_TERMINATION,
                                    USB_DIR_OUT | USB_TYPE_VENDOR | USB_RECIP_INTERFACE,
                                    dev->channel, 0,
                                    &term_state, sizeof(term_state), 1000,
@@ -1210,7 +1200,6 @@ static struct gs_can *gs_make_candev(unsigned int channel,
        dev->bt_const.brp_inc = le32_to_cpu(bt_const.brp_inc);
 
        dev->udev = interface_to_usbdev(intf);
-       dev->iface = intf;
        dev->netdev = netdev;
        dev->channel = channel;
 
index 3a2bfaa..d4c5356 100644 (file)
@@ -536,12 +536,11 @@ static int kvaser_usb_set_bittiming(struct net_device *netdev)
        struct kvaser_usb *dev = priv->dev;
        const struct kvaser_usb_dev_ops *ops = dev->driver_info->ops;
        struct can_bittiming *bt = &priv->can.bittiming;
-
        struct kvaser_usb_busparams busparams;
        int tseg1 = bt->prop_seg + bt->phase_seg1;
        int tseg2 = bt->phase_seg2;
        int sjw = bt->sjw;
-       int err = -EOPNOTSUPP;
+       int err;
 
        busparams.bitrate = cpu_to_le32(bt->bitrate);
        busparams.sjw = (u8)sjw;
@@ -581,7 +580,6 @@ static int kvaser_usb_set_data_bittiming(struct net_device *netdev)
        struct kvaser_usb *dev = priv->dev;
        const struct kvaser_usb_dev_ops *ops = dev->driver_info->ops;
        struct can_bittiming *dbt = &priv->can.data_bittiming;
-
        struct kvaser_usb_busparams busparams;
        int tseg1 = dbt->prop_seg + dbt->phase_seg1;
        int tseg2 = dbt->phase_seg2;
index ffa38f5..a0f7bce 100644 (file)
@@ -277,7 +277,6 @@ struct ucan_priv {
 
        /* linux USB device structures */
        struct usb_device *udev;
-       struct usb_interface *intf;
        struct net_device *netdev;
 
        /* lock for can->echo_skb (used around
@@ -1501,7 +1500,6 @@ static int ucan_probe(struct usb_interface *intf,
 
        /* initialize data */
        up->udev = udev;
-       up->intf = intf;
        up->netdev = netdev;
        up->intf_index = iface_desc->desc.bInterfaceNumber;
        up->in_ep_addr = in_ep_addr;
@@ -1534,9 +1532,8 @@ static int ucan_probe(struct usb_interface *intf,
                                     sizeof(union ucan_ctl_payload));
        if (ret > 0) {
                /* copy string while ensuring zero termination */
-               strncpy(firmware_str, up->ctl_msg_buffer->raw,
-                       sizeof(union ucan_ctl_payload));
-               firmware_str[sizeof(union ucan_ctl_payload)] = '\0';
+               strscpy(firmware_str, up->ctl_msg_buffer->raw,
+                       sizeof(union ucan_ctl_payload) + 1);
        } else {
                strcpy(firmware_str, "unknown");
        }
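
The strncpy() pattern replaced above needed a manual terminator write because strncpy() does not guarantee NUL termination; strscpy() always terminates and returns -E2BIG on truncation. A user-space approximation of that contract (strscpy_like() below is a local stand-in, not the kernel implementation):

#include <stdio.h>
#include <string.h>

#define E2BIG 7

/* Local stand-in mirroring strscpy()'s contract: always NUL-terminate,
 * return the copied length or -E2BIG on truncation.
 */
static long strscpy_like(char *dst, const char *src, size_t size)
{
	size_t len = strlen(src);

	if (!size)
		return -E2BIG;
	if (len >= size) {
		memcpy(dst, src, size - 1);
		dst[size - 1] = '\0';
		return -E2BIG;
	}
	memcpy(dst, src, len + 1);
	return (long)len;
}

int main(void)
{
	char buf[8];
	long ret = strscpy_like(buf, "firmware-1.2.3", sizeof(buf));

	printf("ret=%ld buf=\"%s\"\n", ret, buf);	/* truncated but terminated */
	return 0;
}
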
index 8582b4b..ea05abf 100644 (file)
@@ -57,5 +57,6 @@ int ksz8_reset_switch(struct ksz_device *dev);
 int ksz8_switch_detect(struct ksz_device *dev);
 int ksz8_switch_init(struct ksz_device *dev);
 void ksz8_switch_exit(struct ksz_device *dev);
+int ksz8_change_mtu(struct ksz_device *dev, int port, int mtu);
 
 #endif
index bd3b133..003b0ac 100644 (file)
@@ -76,6 +76,57 @@ int ksz8_reset_switch(struct ksz_device *dev)
        return 0;
 }
 
+static int ksz8863_change_mtu(struct ksz_device *dev, int frame_size)
+{
+       u8 ctrl2 = 0;
+
+       if (frame_size <= KSZ8_LEGAL_PACKET_SIZE)
+               ctrl2 |= KSZ8863_LEGAL_PACKET_ENABLE;
+       else if (frame_size > KSZ8863_NORMAL_PACKET_SIZE)
+               ctrl2 |= KSZ8863_HUGE_PACKET_ENABLE;
+
+       return ksz_rmw8(dev, REG_SW_CTRL_2, KSZ8863_LEGAL_PACKET_ENABLE |
+                       KSZ8863_HUGE_PACKET_ENABLE, ctrl2);
+}
+
+static int ksz8795_change_mtu(struct ksz_device *dev, int frame_size)
+{
+       u8 ctrl1 = 0, ctrl2 = 0;
+       int ret;
+
+       if (frame_size > KSZ8_LEGAL_PACKET_SIZE)
+               ctrl2 |= SW_LEGAL_PACKET_DISABLE;
+       if (frame_size > KSZ8863_NORMAL_PACKET_SIZE)
+               ctrl1 |= SW_HUGE_PACKET;
+
+       ret = ksz_rmw8(dev, REG_SW_CTRL_1, SW_HUGE_PACKET, ctrl1);
+       if (ret)
+               return ret;
+
+       return ksz_rmw8(dev, REG_SW_CTRL_2, SW_LEGAL_PACKET_DISABLE, ctrl2);
+}
+
+int ksz8_change_mtu(struct ksz_device *dev, int port, int mtu)
+{
+       u16 frame_size;
+
+       if (!dsa_is_cpu_port(dev->ds, port))
+               return 0;
+
+       frame_size = mtu + VLAN_ETH_HLEN + ETH_FCS_LEN;
+
+       switch (dev->chip_id) {
+       case KSZ8795_CHIP_ID:
+       case KSZ8794_CHIP_ID:
+       case KSZ8765_CHIP_ID:
+               return ksz8795_change_mtu(dev, frame_size);
+       case KSZ8830_CHIP_ID:
+               return ksz8863_change_mtu(dev, frame_size);
+       }
+
+       return -EOPNOTSUPP;
+}
+
 static void ksz8795_set_prio_queue(struct ksz_device *dev, int port, int queue)
 {
        u8 hi, lo;
@@ -1233,8 +1284,6 @@ void ksz8_config_cpu_port(struct dsa_switch *ds)
        masks = dev->info->masks;
        regs = dev->info->regs;
 
-       /* Switch marks the maximum frame with extra byte as oversize. */
-       ksz_cfg(dev, REG_SW_CTRL_2, SW_LEGAL_PACKET_DISABLE, true);
        ksz_cfg(dev, regs[S_TAIL_TAG_CTRL], masks[SW_TAIL_TAG_ENABLE], true);
 
        p = &dev->ports[dev->cpu_port];
@@ -1308,6 +1357,18 @@ int ksz8_setup(struct dsa_switch *ds)
        struct ksz_device *dev = ds->priv;
        int i;
 
+       ds->mtu_enforcement_ingress = true;
+
+       /* We rely on software untagging on the CPU port, so that we
+        * can support both tagged and untagged VLANs
+        */
+       ds->untag_bridge_pvid = true;
+
+       /* VLAN filtering is partly controlled by the global VLAN
+        * Enable flag
+        */
+       ds->vlan_filtering_is_global = true;
+
        ksz_cfg(dev, S_REPLACE_VID_CTRL, SW_FLOW_CTRL, true);
 
        /* Enable automatic fast aging when link changed detected. */
@@ -1367,16 +1428,6 @@ int ksz8_switch_init(struct ksz_device *dev)
        dev->phy_port_cnt = dev->info->port_cnt - 1;
        dev->port_mask = (BIT(dev->phy_port_cnt) - 1) | dev->info->cpu_ports;
 
-       /* We rely on software untagging on the CPU port, so that we
-        * can support both tagged and untagged VLANs
-        */
-       dev->ds->untag_bridge_pvid = true;
-
-       /* VLAN filtering is partly controlled by the global VLAN
-        * Enable flag
-        */
-       dev->ds->vlan_filtering_is_global = true;
-
        return 0;
 }
 
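The thresholds above translate MTU to wire frame size as mtu + VLAN_ETH_HLEN (18) + ETH_FCS_LEN (4), so even the default 1500-byte MTU yields a 1522-byte frame that exceeds KSZ8_LEGAL_PACKET_SIZE. A standalone sketch of the KSZ8795 decision (constants from this patch; the MTU samples are arbitrary):

#include <stdio.h>

#define VLAN_ETH_HLEN			18
#define ETH_FCS_LEN			4
#define KSZ8_LEGAL_PACKET_SIZE		1518
#define KSZ8863_NORMAL_PACKET_SIZE	1536

int main(void)
{
	const int mtus[] = { 1400, 1500, 1900 };

	for (int i = 0; i < 3; i++) {
		int frame_size = mtus[i] + VLAN_ETH_HLEN + ETH_FCS_LEN;

		/* Mirrors the two independent checks in ksz8795_change_mtu() */
		printf("MTU %4d -> frame %4d: legal-packet-disable=%d huge=%d\n",
		       mtus[i], frame_size,
		       frame_size > KSZ8_LEGAL_PACKET_SIZE,
		       frame_size > KSZ8863_NORMAL_PACKET_SIZE);
	}
	return 0;
}
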
index 77487d6..7a57c60 100644 (file)
@@ -48,6 +48,9 @@
 #define NO_EXC_COLLISION_DROP          BIT(3)
 #define SW_LEGAL_PACKET_DISABLE                BIT(1)
 
+#define KSZ8863_HUGE_PACKET_ENABLE     BIT(2)
+#define KSZ8863_LEGAL_PACKET_ENABLE    BIT(1)
+
 #define REG_SW_CTRL_3                  0x05
  #define WEIGHTED_FAIR_QUEUE_ENABLE    BIT(3)
 
index 0d6b409..47b54ec 100644 (file)
@@ -45,24 +45,15 @@ static void ksz9477_port_cfg32(struct ksz_device *dev, int port, int offset,
 
 int ksz9477_change_mtu(struct ksz_device *dev, int port, int mtu)
 {
-       u16 frame_size, max_frame = 0;
-       int i;
-
-       frame_size = mtu + VLAN_ETH_HLEN + ETH_FCS_LEN;
+       u16 frame_size;
 
-       /* Cache the per-port MTU setting */
-       dev->ports[port].max_frame = frame_size;
+       if (!dsa_is_cpu_port(dev->ds, port))
+               return 0;
 
-       for (i = 0; i < dev->info->port_cnt; i++)
-               max_frame = max(max_frame, dev->ports[i].max_frame);
+       frame_size = mtu + VLAN_ETH_HLEN + ETH_FCS_LEN;
 
        return regmap_update_bits(dev->regmap[1], REG_SW_MTU__2,
-                                 REG_SW_MTU_MASK, max_frame);
-}
-
-int ksz9477_max_mtu(struct ksz_device *dev, int port)
-{
-       return KSZ9477_MAX_FRAME_SIZE - VLAN_ETH_HLEN - ETH_FCS_LEN;
+                                 REG_SW_MTU_MASK, frame_size);
 }
 
 static int ksz9477_wait_vlan_ctrl_ready(struct ksz_device *dev)
@@ -1143,6 +1134,8 @@ int ksz9477_setup(struct dsa_switch *ds)
        struct ksz_device *dev = ds->priv;
        int ret = 0;
 
+       ds->mtu_enforcement_ingress = true;
+
        /* Required for port partitioning. */
        ksz9477_cfg32(dev, REG_SW_QM_CTRL__4, UNICAST_VLAN_BOUNDARY,
                      true);
index 00862c4..7c5bb30 100644 (file)
@@ -50,7 +50,6 @@ int ksz9477_mdb_add(struct ksz_device *dev, int port,
 int ksz9477_mdb_del(struct ksz_device *dev, int port,
                    const struct switchdev_obj_port_mdb *mdb, struct dsa_db db);
 int ksz9477_change_mtu(struct ksz_device *dev, int port, int mtu);
-int ksz9477_max_mtu(struct ksz_device *dev, int port);
 void ksz9477_config_cpu_port(struct dsa_switch *ds);
 int ksz9477_enable_stp_addr(struct ksz_device *dev);
 int ksz9477_reset_switch(struct ksz_device *dev);
index 53c68d2..cc457fa 100644 (file)
 #define PTP_TRIG_UNIT_M                        (BIT(MAX_TRIG_UNIT) - 1)
 #define PTP_TS_UNIT_M                  (BIT(MAX_TIMESTAMP_UNIT) - 1)
 
-#define KSZ9477_MAX_FRAME_SIZE         9000
-
 #endif /* KSZ9477_REGS_H */
index 8c8db31..423f944 100644 (file)
@@ -14,6 +14,7 @@
 #include <linux/phy.h>
 #include <linux/etherdevice.h>
 #include <linux/if_bridge.h>
+#include <linux/if_vlan.h>
 #include <linux/irq.h>
 #include <linux/irqdomain.h>
 #include <linux/of_mdio.h>
@@ -69,6 +70,43 @@ struct ksz_stats_raw {
        u64 tx_discards;
 };
 
+struct ksz88xx_stats_raw {
+       u64 rx;
+       u64 rx_hi;
+       u64 rx_undersize;
+       u64 rx_fragments;
+       u64 rx_oversize;
+       u64 rx_jabbers;
+       u64 rx_symbol_err;
+       u64 rx_crc_err;
+       u64 rx_align_err;
+       u64 rx_mac_ctrl;
+       u64 rx_pause;
+       u64 rx_bcast;
+       u64 rx_mcast;
+       u64 rx_ucast;
+       u64 rx_64_or_less;
+       u64 rx_65_127;
+       u64 rx_128_255;
+       u64 rx_256_511;
+       u64 rx_512_1023;
+       u64 rx_1024_1522;
+       u64 tx;
+       u64 tx_hi;
+       u64 tx_late_col;
+       u64 tx_pause;
+       u64 tx_bcast;
+       u64 tx_mcast;
+       u64 tx_ucast;
+       u64 tx_deferred;
+       u64 tx_total_col;
+       u64 tx_exc_col;
+       u64 tx_single_col;
+       u64 tx_mult_col;
+       u64 rx_discards;
+       u64 tx_discards;
+};
+
 static const struct ksz_mib_names ksz88xx_mib_names[] = {
        { 0x00, "rx" },
        { 0x01, "rx_hi" },
@@ -155,6 +193,7 @@ static const struct ksz_dev_ops ksz8_dev_ops = {
        .w_phy = ksz8_w_phy,
        .r_mib_cnt = ksz8_r_mib_cnt,
        .r_mib_pkt = ksz8_r_mib_pkt,
+       .r_mib_stat64 = ksz88xx_r_mib_stats64,
        .freeze_mib = ksz8_freeze_mib,
        .port_init_cnt = ksz8_port_init_cnt,
        .fdb_dump = ksz8_fdb_dump,
@@ -171,6 +210,7 @@ static const struct ksz_dev_ops ksz8_dev_ops = {
        .reset = ksz8_reset_switch,
        .init = ksz8_switch_init,
        .exit = ksz8_switch_exit,
+       .change_mtu = ksz8_change_mtu,
 };
 
 static void ksz9477_phylink_mac_link_up(struct ksz_device *dev, int port,
@@ -206,7 +246,6 @@ static const struct ksz_dev_ops ksz9477_dev_ops = {
        .mdb_add = ksz9477_mdb_add,
        .mdb_del = ksz9477_mdb_del,
        .change_mtu = ksz9477_change_mtu,
-       .max_mtu = ksz9477_max_mtu,
        .phylink_mac_link_up = ksz9477_phylink_mac_link_up,
        .config_cpu_port = ksz9477_config_cpu_port,
        .enable_stp_addr = ksz9477_enable_stp_addr,
@@ -243,7 +282,6 @@ static const struct ksz_dev_ops lan937x_dev_ops = {
        .mdb_add = ksz9477_mdb_add,
        .mdb_del = ksz9477_mdb_del,
        .change_mtu = lan937x_change_mtu,
-       .max_mtu = ksz9477_max_mtu,
        .phylink_mac_link_up = ksz9477_phylink_mac_link_up,
        .config_cpu_port = lan937x_config_cpu_port,
        .enable_stp_addr = ksz9477_enable_stp_addr,
@@ -1583,6 +1621,55 @@ void ksz_r_mib_stats64(struct ksz_device *dev, int port)
        spin_unlock(&mib->stats64_lock);
 }
 
+void ksz88xx_r_mib_stats64(struct ksz_device *dev, int port)
+{
+       struct ethtool_pause_stats *pstats;
+       struct rtnl_link_stats64 *stats;
+       struct ksz88xx_stats_raw *raw;
+       struct ksz_port_mib *mib;
+
+       mib = &dev->ports[port].mib;
+       stats = &mib->stats64;
+       pstats = &mib->pause_stats;
+       raw = (struct ksz88xx_stats_raw *)mib->counters;
+
+       spin_lock(&mib->stats64_lock);
+
+       stats->rx_packets = raw->rx_bcast + raw->rx_mcast + raw->rx_ucast +
+               raw->rx_pause;
+       stats->tx_packets = raw->tx_bcast + raw->tx_mcast + raw->tx_ucast +
+               raw->tx_pause;
+
+       /* HW counters count bytes including the FCS, which is not
+        * acceptable for the rtnl_link_stats64 interface
+        */
+       stats->rx_bytes = raw->rx + raw->rx_hi - stats->rx_packets * ETH_FCS_LEN;
+       stats->tx_bytes = raw->tx + raw->tx_hi - stats->tx_packets * ETH_FCS_LEN;
+
+       stats->rx_length_errors = raw->rx_undersize + raw->rx_fragments +
+               raw->rx_oversize;
+
+       stats->rx_crc_errors = raw->rx_crc_err;
+       stats->rx_frame_errors = raw->rx_align_err;
+       stats->rx_dropped = raw->rx_discards;
+       stats->rx_errors = stats->rx_length_errors + stats->rx_crc_errors +
+               stats->rx_frame_errors + stats->rx_dropped;
+
+       stats->tx_window_errors = raw->tx_late_col;
+       stats->tx_fifo_errors = raw->tx_discards;
+       stats->tx_aborted_errors = raw->tx_exc_col;
+       stats->tx_errors = stats->tx_window_errors + stats->tx_fifo_errors +
+               stats->tx_aborted_errors;
+
+       stats->multicast = raw->rx_mcast;
+       stats->collisions = raw->tx_total_col;
+
+       pstats->tx_pause_frames = raw->tx_pause;
+       pstats->rx_pause_frames = raw->rx_pause;
+
+       spin_unlock(&mib->stats64_lock);
+}
+
 static void ksz_get_stats64(struct dsa_switch *ds, int port,
                            struct rtnl_link_stats64 *s)
 {
@@ -2500,10 +2587,29 @@ static int ksz_max_mtu(struct dsa_switch *ds, int port)
 {
        struct ksz_device *dev = ds->priv;
 
-       if (!dev->dev_ops->max_mtu)
-               return -EOPNOTSUPP;
+       switch (dev->chip_id) {
+       case KSZ8795_CHIP_ID:
+       case KSZ8794_CHIP_ID:
+       case KSZ8765_CHIP_ID:
+               return KSZ8795_HUGE_PACKET_SIZE - VLAN_ETH_HLEN - ETH_FCS_LEN;
+       case KSZ8830_CHIP_ID:
+               return KSZ8863_HUGE_PACKET_SIZE - VLAN_ETH_HLEN - ETH_FCS_LEN;
+       case KSZ8563_CHIP_ID:
+       case KSZ9477_CHIP_ID:
+       case KSZ9563_CHIP_ID:
+       case KSZ9567_CHIP_ID:
+       case KSZ9893_CHIP_ID:
+       case KSZ9896_CHIP_ID:
+       case KSZ9897_CHIP_ID:
+       case LAN9370_CHIP_ID:
+       case LAN9371_CHIP_ID:
+       case LAN9372_CHIP_ID:
+       case LAN9373_CHIP_ID:
+       case LAN9374_CHIP_ID:
+               return KSZ9477_MAX_FRAME_SIZE - VLAN_ETH_HLEN - ETH_FCS_LEN;
+       }
 
-       return dev->dev_ops->max_mtu(dev, port);
+       return -EOPNOTSUPP;
 }
 
 static void ksz_set_xmii(struct ksz_device *dev, int port,
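
The MIB byte counters on the ksz88xx family include the 4-byte FCS of every counted frame, which is why ksz88xx_r_mib_stats64() above subtracts packets * ETH_FCS_LEN from the byte totals. A toy computation with invented counter values:

#include <stdio.h>

#define ETH_FCS_LEN 4

int main(void)
{
	/* Hypothetical raw MIB counters, not real hardware values */
	unsigned long long rx_ucast = 100, rx_mcast = 5, rx_bcast = 2,
			   rx_pause = 1;
	unsigned long long raw_rx_bytes = 160000;

	unsigned long long rx_packets = rx_ucast + rx_mcast + rx_bcast +
					rx_pause;
	unsigned long long rx_bytes = raw_rx_bytes -
				      rx_packets * ETH_FCS_LEN;

	printf("rx_packets=%llu rx_bytes=%llu (removed %llu FCS bytes)\n",
	       rx_packets, rx_bytes,
	       rx_packets * (unsigned long long)ETH_FCS_LEN);
	return 0;
}
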
index c6726cb..055d61f 100644 (file)
@@ -95,7 +95,6 @@ struct ksz_port {
 
        struct ksz_port_mib mib;
        phy_interface_t interface;
-       u16 max_frame;
        u32 rgmii_tx_val;
        u32 rgmii_rx_val;
        struct ksz_device *ksz_dev;
@@ -322,7 +321,6 @@ struct ksz_dev_ops {
        void (*get_caps)(struct ksz_device *dev, int port,
                         struct phylink_config *config);
        int (*change_mtu)(struct ksz_device *dev, int port, int mtu);
-       int (*max_mtu)(struct ksz_device *dev, int port);
        void (*freeze_mib)(struct ksz_device *dev, int port, bool freeze);
        void (*port_init_cnt)(struct ksz_device *dev, int port);
        void (*phylink_mac_config)(struct ksz_device *dev, int port,
@@ -347,6 +345,7 @@ void ksz_switch_remove(struct ksz_device *dev);
 
 void ksz_init_mib_timer(struct ksz_device *dev);
 void ksz_r_mib_stats64(struct ksz_device *dev, int port);
+void ksz88xx_r_mib_stats64(struct ksz_device *dev, int port);
 void ksz_port_stp_state_set(struct dsa_switch *ds, int port, u8 state);
 bool ksz_get_gbit(struct ksz_device *dev, int port);
 phy_interface_t ksz_get_xmii(struct ksz_device *dev, int port, bool gbit);
@@ -456,6 +455,11 @@ static inline int ksz_write64(struct ksz_device *dev, u32 reg, u64 value)
        return regmap_bulk_write(dev->regmap[2], reg, val, 2);
 }
 
+static inline int ksz_rmw8(struct ksz_device *dev, int offset, u8 mask, u8 val)
+{
+       return regmap_update_bits(dev->regmap[0], offset, mask, val);
+}
+
 static inline int ksz_pread8(struct ksz_device *dev, int port, int offset,
                             u8 *data)
 {
@@ -588,6 +592,12 @@ static inline int is_lan937x(struct ksz_device *dev)
 
 #define PORT_SRC_PHY_INT               1
 
+#define KSZ8795_HUGE_PACKET_SIZE       2000
+#define KSZ8863_HUGE_PACKET_SIZE       1916
+#define KSZ8863_NORMAL_PACKET_SIZE     1536
+#define KSZ8_LEGAL_PACKET_SIZE         1518
+#define KSZ9477_MAX_FRAME_SIZE         9000
+
 /* Regmap tables generation */
 #define KSZ_SPI_OP_RD          3
 #define KSZ_SPI_OP_WR          2
index c8eca2b..49bf358 100644 (file)
@@ -15,3 +15,7 @@ mv88e6xxx-objs += port_hidden.o
 mv88e6xxx-$(CONFIG_NET_DSA_MV88E6XXX_PTP) += ptp.o
 mv88e6xxx-objs += serdes.o
 mv88e6xxx-objs += smi.o
+mv88e6xxx-objs += trace.o
+
+# for tracing framework to find trace.h
+CFLAGS_trace.o := -I$(src)
index ccfa475..ba4fff8 100644 (file)
@@ -833,10 +833,13 @@ static void mv88e6xxx_get_caps(struct dsa_switch *ds, int port,
 
        chip->info->ops->phylink_get_caps(chip, port, config);
 
-       /* Internal ports need GMII for PHYLIB */
-       if (mv88e6xxx_phy_is_internal(ds, port))
+       if (mv88e6xxx_phy_is_internal(ds, port)) {
+               __set_bit(PHY_INTERFACE_MODE_INTERNAL,
+                         config->supported_interfaces);
+               /* Internal ports with no phy-mode need GMII for PHYLIB */
                __set_bit(PHY_INTERFACE_MODE_GMII,
                          config->supported_interfaces);
+       }
 }
 
 static void mv88e6xxx_mac_config(struct dsa_switch *ds, int port,
index 40bd67a..61ae2d6 100644 (file)
@@ -12,6 +12,7 @@
 
 #include "chip.h"
 #include "global1.h"
+#include "trace.h"
 
 /* Offset 0x01: ATU FID Register */
 
@@ -114,6 +115,19 @@ static int mv88e6xxx_g1_atu_op_wait(struct mv88e6xxx_chip *chip)
        return mv88e6xxx_g1_wait_bit(chip, MV88E6XXX_G1_ATU_OP, bit, 0);
 }
 
+static int mv88e6xxx_g1_read_atu_violation(struct mv88e6xxx_chip *chip)
+{
+       int err;
+
+       err = mv88e6xxx_g1_write(chip, MV88E6XXX_G1_ATU_OP,
+                                MV88E6XXX_G1_ATU_OP_BUSY |
+                                MV88E6XXX_G1_ATU_OP_GET_CLR_VIOLATION);
+       if (err)
+               return err;
+
+       return mv88e6xxx_g1_atu_op_wait(chip);
+}
+
 static int mv88e6xxx_g1_atu_op(struct mv88e6xxx_chip *chip, u16 fid, u16 op)
 {
        u16 val;
@@ -159,6 +173,41 @@ int mv88e6xxx_g1_atu_get_next(struct mv88e6xxx_chip *chip, u16 fid)
        return mv88e6xxx_g1_atu_op(chip, fid, MV88E6XXX_G1_ATU_OP_GET_NEXT_DB);
 }
 
+static int mv88e6xxx_g1_atu_fid_read(struct mv88e6xxx_chip *chip, u16 *fid)
+{
+       u16 val = 0, upper = 0, op = 0;
+       int err = -EOPNOTSUPP;
+
+       if (mv88e6xxx_num_databases(chip) > 256) {
+               err = mv88e6xxx_g1_read(chip, MV88E6352_G1_ATU_FID, &val);
+               if (err)
+                       return err;
+               val &= 0xfff;
+       } else {
+               err = mv88e6xxx_g1_read(chip, MV88E6XXX_G1_ATU_OP, &op);
+               if (err)
+                       return err;
+               if (mv88e6xxx_num_databases(chip) > 64) {
+                       /* ATU DBNum[7:4] are located in ATU Control 15:12 */
+                       err = mv88e6xxx_g1_read(chip, MV88E6XXX_G1_ATU_CTL,
+                                               &upper);
+                       if (err)
+                               return err;
+
+                       upper = (upper >> 8) & 0x00f0;
+               } else if (mv88e6xxx_num_databases(chip) > 16) {
+                       /* ATU DBNum[5:4] are located in ATU Operation 9:8 */
+                       upper = (op >> 4) & 0x30;
+               }
+
+               /* ATU DBNum[3:0] are located in ATU Operation 3:0 */
+               val = (op & 0xf) | upper;
+       }
+       *fid = val;
+
+       return err;
+}
+
 /* Offset 0x0C: ATU Data Register */
 
 static int mv88e6xxx_g1_atu_data_read(struct mv88e6xxx_chip *chip,
@@ -353,14 +402,12 @@ static irqreturn_t mv88e6xxx_g1_atu_prob_irq_thread_fn(int irq, void *dev_id)
 {
        struct mv88e6xxx_chip *chip = dev_id;
        struct mv88e6xxx_atu_entry entry;
-       int spid;
-       int err;
-       u16 val;
+       int err, spid;
+       u16 val, fid;
 
        mv88e6xxx_reg_lock(chip);
 
-       err = mv88e6xxx_g1_atu_op(chip, 0,
-                                 MV88E6XXX_G1_ATU_OP_GET_CLR_VIOLATION);
+       err = mv88e6xxx_g1_read_atu_violation(chip);
        if (err)
                goto out;
 
@@ -368,6 +415,10 @@ static irqreturn_t mv88e6xxx_g1_atu_prob_irq_thread_fn(int irq, void *dev_id)
        if (err)
                goto out;
 
+       err = mv88e6xxx_g1_atu_fid_read(chip, &fid);
+       if (err)
+               goto out;
+
        err = mv88e6xxx_g1_atu_data_read(chip, &entry);
        if (err)
                goto out;
@@ -378,30 +429,24 @@ static irqreturn_t mv88e6xxx_g1_atu_prob_irq_thread_fn(int irq, void *dev_id)
 
        spid = entry.state;
 
-       if (val & MV88E6XXX_G1_ATU_OP_AGE_OUT_VIOLATION) {
-               dev_err_ratelimited(chip->dev,
-                                   "ATU age out violation for %pM\n",
-                                   entry.mac);
-       }
-
        if (val & MV88E6XXX_G1_ATU_OP_MEMBER_VIOLATION) {
-               dev_err_ratelimited(chip->dev,
-                                   "ATU member violation for %pM portvec %x spid %d\n",
-                                   entry.mac, entry.portvec, spid);
+               trace_mv88e6xxx_atu_member_violation(chip->dev, spid,
+                                                    entry.portvec, entry.mac,
+                                                    fid);
                chip->ports[spid].atu_member_violation++;
        }
 
        if (val & MV88E6XXX_G1_ATU_OP_MISS_VIOLATION) {
-               dev_err_ratelimited(chip->dev,
-                                   "ATU miss violation for %pM portvec %x spid %d\n",
-                                   entry.mac, entry.portvec, spid);
+               trace_mv88e6xxx_atu_miss_violation(chip->dev, spid,
+                                                  entry.portvec, entry.mac,
+                                                  fid);
                chip->ports[spid].atu_miss_violation++;
        }
 
        if (val & MV88E6XXX_G1_ATU_OP_FULL_VIOLATION) {
-               dev_err_ratelimited(chip->dev,
-                                   "ATU full violation for %pM portvec %x spid %d\n",
-                                   entry.mac, entry.portvec, spid);
+               trace_mv88e6xxx_atu_full_violation(chip->dev, spid,
+                                                  entry.portvec, entry.mac,
+                                                  fid);
                chip->ports[spid].atu_full_violation++;
        }
        mv88e6xxx_reg_unlock(chip);
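
On chips with more than 64 databases, the FID bits are split across two registers: DBNum[3:0] sit in ATU Operation bits 3:0 and DBNum[7:4] in ATU Control bits 15:12, as the new mv88e6xxx_g1_atu_fid_read() reassembles. A standalone sketch of that bit surgery (register values are invented):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint16_t atu_op = 0x000a;	/* DBNum[3:0] in bits 3:0 */
	uint16_t atu_ctl = 0x3000;	/* DBNum[7:4] in bits 15:12 */

	/* Shift bits 15:12 down to positions 7:4, then merge */
	uint16_t upper = (atu_ctl >> 8) & 0x00f0;
	uint16_t fid = (atu_op & 0xf) | upper;

	printf("fid = 0x%02x\n", fid);	/* prints 0x3a */
	return 0;
}
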
index 38e18f5..bcfb4a8 100644 (file)
@@ -13,6 +13,7 @@
 
 #include "chip.h"
 #include "global1.h"
+#include "trace.h"
 
 /* Offset 0x02: VTU FID Register */
 
@@ -628,14 +629,12 @@ static irqreturn_t mv88e6xxx_g1_vtu_prob_irq_thread_fn(int irq, void *dev_id)
        spid = val & MV88E6XXX_G1_VTU_OP_SPID_MASK;
 
        if (val & MV88E6XXX_G1_VTU_OP_MEMBER_VIOLATION) {
-               dev_err_ratelimited(chip->dev, "VTU member violation for vid %d, source port %d\n",
-                                   vid, spid);
+               trace_mv88e6xxx_vtu_member_violation(chip->dev, spid, vid);
                chip->ports[spid].vtu_member_violation++;
        }
 
        if (val & MV88E6XXX_G1_VTU_OP_MISS_VIOLATION) {
-               dev_dbg_ratelimited(chip->dev, "VTU miss violation for vid %d, source port %d\n",
-                                   vid, spid);
+               trace_mv88e6xxx_vtu_miss_violation(chip->dev, spid, vid);
                chip->ports[spid].vtu_miss_violation++;
        }
 
diff --git a/drivers/net/dsa/mv88e6xxx/trace.c b/drivers/net/dsa/mv88e6xxx/trace.c
new file mode 100644 (file)
index 0000000..7833cb5
--- /dev/null
@@ -0,0 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/* Copyright 2022 NXP
+ */
+
+#define CREATE_TRACE_POINTS
+#include "trace.h"
diff --git a/drivers/net/dsa/mv88e6xxx/trace.h b/drivers/net/dsa/mv88e6xxx/trace.h
new file mode 100644 (file)
index 0000000..f59ca04
--- /dev/null
@@ -0,0 +1,96 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/* Copyright 2022 NXP
+ */
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM   mv88e6xxx
+
+#if !defined(_MV88E6XXX_TRACE_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _MV88E6XXX_TRACE_H
+
+#include <linux/device.h>
+#include <linux/if_ether.h>
+#include <linux/tracepoint.h>
+
+DECLARE_EVENT_CLASS(mv88e6xxx_atu_violation,
+
+       TP_PROTO(const struct device *dev, int spid, u16 portvec,
+                const unsigned char *addr, u16 fid),
+
+       TP_ARGS(dev, spid, portvec, addr, fid),
+
+       TP_STRUCT__entry(
+               __string(name, dev_name(dev))
+               __field(int, spid)
+               __field(u16, portvec)
+               __array(unsigned char, addr, ETH_ALEN)
+               __field(u16, fid)
+       ),
+
+       TP_fast_assign(
+               __assign_str(name, dev_name(dev));
+               __entry->spid = spid;
+               __entry->portvec = portvec;
+               memcpy(__entry->addr, addr, ETH_ALEN);
+               __entry->fid = fid;
+       ),
+
+       TP_printk("dev %s spid %d portvec 0x%x addr %pM fid %u",
+                 __get_str(name), __entry->spid, __entry->portvec,
+                 __entry->addr, __entry->fid)
+);
+
+DEFINE_EVENT(mv88e6xxx_atu_violation, mv88e6xxx_atu_member_violation,
+            TP_PROTO(const struct device *dev, int spid, u16 portvec,
+                     const unsigned char *addr, u16 fid),
+            TP_ARGS(dev, spid, portvec, addr, fid));
+
+DEFINE_EVENT(mv88e6xxx_atu_violation, mv88e6xxx_atu_miss_violation,
+            TP_PROTO(const struct device *dev, int spid, u16 portvec,
+                     const unsigned char *addr, u16 fid),
+            TP_ARGS(dev, spid, portvec, addr, fid));
+
+DEFINE_EVENT(mv88e6xxx_atu_violation, mv88e6xxx_atu_full_violation,
+            TP_PROTO(const struct device *dev, int spid, u16 portvec,
+                     const unsigned char *addr, u16 fid),
+            TP_ARGS(dev, spid, portvec, addr, fid));
+
+DECLARE_EVENT_CLASS(mv88e6xxx_vtu_violation,
+
+       TP_PROTO(const struct device *dev, int spid, u16 vid),
+
+       TP_ARGS(dev, spid, vid),
+
+       TP_STRUCT__entry(
+               __string(name, dev_name(dev))
+               __field(int, spid)
+               __field(u16, vid)
+       ),
+
+       TP_fast_assign(
+               __assign_str(name, dev_name(dev));
+               __entry->spid = spid;
+               __entry->vid = vid;
+       ),
+
+       TP_printk("dev %s spid %d vid %u",
+                 __get_str(name), __entry->spid, __entry->vid)
+);
+
+DEFINE_EVENT(mv88e6xxx_vtu_violation, mv88e6xxx_vtu_member_violation,
+            TP_PROTO(const struct device *dev, int spid, u16 vid),
+            TP_ARGS(dev, spid, vid));
+
+DEFINE_EVENT(mv88e6xxx_vtu_violation, mv88e6xxx_vtu_miss_violation,
+            TP_PROTO(const struct device *dev, int spid, u16 vid),
+            TP_ARGS(dev, spid, vid));
+
+#endif /* _MV88E6XXX_TRACE_H */
+
+/* We don't want to use include/trace/events */
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH .
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_FILE     trace
+/* This part must be outside protection */
+#include <trace/define_trace.h>
index da53261..30b1f1b 100644 (file)
@@ -95,6 +95,8 @@ static int sja1105_setup_devlink_regions(struct dsa_switch *ds)
                if (IS_ERR(region)) {
                        while (--i >= 0)
                                dsa_devlink_region_destroy(priv->regions[i]);
+
+                       kfree(priv->regions);
                        return PTR_ERR(region);
                }
 
index 4126661..b70dcf3 100644 (file)
@@ -1038,7 +1038,7 @@ static int sja1105_init_l2_policing(struct sja1105_private *priv)
 
                policing[bcast].sharindx = port;
                /* Only SJA1110 has multicast policers */
-               if (mcast <= table->ops->max_entry_count)
+               if (mcast < table->ops->max_entry_count)
                        policing[mcast].sharindx = port;
        }
 
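The one-character change above matters because policer indices are zero-based: an index equal to max_entry_count is one past the last valid entry. In miniature:

/* Off-by-one illustration: with max_entry_count entries, valid
 * indices are 0 .. max_entry_count - 1, so the bound check must
 * use '<', not '<='.
 */
#include <stdio.h>

int main(void)
{
	int max_entry_count = 40;	/* hypothetical table size */
	int mcast = 40;			/* first invalid index */

	printf("mcast <= max: %d (wrongly accepted)\n",
	       mcast <= max_entry_count);
	printf("mcast <  max: %d (correctly rejected)\n",
	       mcast < max_entry_count);
	return 0;
}
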
index e104fb0..aa0d2f3 100644 (file)
@@ -258,6 +258,7 @@ static int greth_init_rings(struct greth_private *greth)
                        if (dma_mapping_error(greth->dev, dma_addr)) {
                                if (netif_msg_ifup(greth))
                                        dev_err(greth->dev, "Could not create initial DMA mapping\n");
+                               dev_kfree_skb(skb);
                                goto cleanup;
                        }
                        greth->rx_skbuff[i] = skb;
index 55dfdb3..f4ca0c6 100644 (file)
@@ -71,13 +71,14 @@ config BCM63XX_ENET
 config BCMGENET
        tristate "Broadcom GENET internal MAC support"
        depends on HAS_IOMEM
+       depends on PTP_1588_CLOCK_OPTIONAL || !ARCH_BCM2835
        select MII
        select PHYLIB
        select FIXED_PHY
        select BCM7XXX_PHY
        select MDIO_BCM_UNIMAC
        select DIMLIB
-       select BROADCOM_PHY if (ARCH_BCM2835 && PTP_1588_CLOCK_OPTIONAL)
+       select BROADCOM_PHY if ARCH_BCM2835
        help
          This driver supports the built-in Ethernet MACs found in the
          Broadcom BCM7xxx Set Top Box family chipset.
index dbe3101..9f47385 100644 (file)
@@ -3045,7 +3045,7 @@ error:
 
        dma_unmap_single(&bp->pdev->dev, dma_addr, bp->rx_buf_use_size,
                         DMA_FROM_DEVICE);
-       skb = build_skb(data, 0);
+       skb = slab_build_skb(data);
        if (!skb) {
                kfree(data);
                goto error;
index 0fe164b..4c7d07c 100644 (file)
@@ -389,6 +389,9 @@ static netdev_tx_t bnxt_start_xmit(struct sk_buff *skb, struct net_device *dev)
                        return NETDEV_TX_BUSY;
        }
 
+       if (unlikely(ipv6_hopopt_jumbo_remove(skb)))
+               goto tx_free;
+
        length = skb->len;
        len = skb_headlen(skb);
        last_frag = skb_shinfo(skb)->nr_frags;
@@ -11315,6 +11318,7 @@ static bool bnxt_exthdr_check(struct bnxt *bp, struct sk_buff *skb, int nw_off,
                              u8 **nextp)
 {
        struct ipv6hdr *ip6h = (struct ipv6hdr *)(skb->data + nw_off);
+       struct hop_jumbo_hdr *jhdr;
        int hdr_count = 0;
        u8 *nexthdr;
        int start;
@@ -11342,9 +11346,27 @@ static bool bnxt_exthdr_check(struct bnxt *bp, struct sk_buff *skb, int nw_off,
 
                if (hdrlen > 64)
                        return false;
+
+               /* The ext header may be a hop-by-hop header inserted for
+                * big TCP purposes. The NIC will strip it before sending,
+                * so do not count it.
+                */
+               if (*nexthdr == NEXTHDR_HOP) {
+                       if (likely(skb->len <= GRO_LEGACY_MAX_SIZE))
+                               goto increment_hdr;
+
+                       jhdr = (struct hop_jumbo_hdr *)hp;
+                       if (jhdr->tlv_type != IPV6_TLV_JUMBO || jhdr->hdrlen != 0 ||
+                           jhdr->nexthdr != IPPROTO_TCP)
+                               goto increment_hdr;
+
+                       goto next_hdr;
+               }
+increment_hdr:
+               hdr_count++;
+next_hdr:
                nexthdr = &hp->nexthdr;
                start += hdrlen;
-               hdr_count++;
        }
        if (nextp) {
                /* Caller will check inner protocol */
@@ -13657,6 +13679,8 @@ static int bnxt_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
                dev->features &= ~NETIF_F_LRO;
        dev->priv_flags |= IFF_UNICAST_FLT;
 
+       netif_set_tso_max_size(dev, GSO_MAX_SIZE);
+
 #ifdef CONFIG_BNXT_SRIOV
        init_waitqueue_head(&bp->sriov_cfg_wait);
 #endif
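
The nexthdr walk above skips exactly one kind of Hop-by-Hop header: the jumbogram option the stack inserts for Big TCP, i.e. an IPV6_TLV_JUMBO TLV with hdrlen 0 and TCP as the next header. A standalone sketch of that check; the struct below is a local mirror of the RFC 2675 jumbogram option layout for illustration, not the kernel definition:

#include <stdio.h>
#include <stdint.h>

#define IPV6_TLV_JUMBO	0xC2
#define IPPROTO_TCP	6

struct hop_jumbo_hdr {
	uint8_t  nexthdr;
	uint8_t  hdrlen;	/* 0: the header is 8 bytes total */
	uint8_t  tlv_type;
	uint8_t  tlv_len;
	uint32_t jumbo_payload_len;	/* big-endian on the wire */
};

static int is_big_tcp_hbh(const struct hop_jumbo_hdr *jhdr)
{
	/* Same three-field test as the bnxt_exthdr_check() hunk */
	return jhdr->tlv_type == IPV6_TLV_JUMBO &&
	       jhdr->hdrlen == 0 &&
	       jhdr->nexthdr == IPPROTO_TCP;
}

int main(void)
{
	struct hop_jumbo_hdr jhdr = {
		.nexthdr = IPPROTO_TCP,
		.hdrlen = 0,
		.tlv_type = IPV6_TLV_JUMBO,
		.tlv_len = 4,
	};

	printf("big TCP hop-by-hop header: %s\n",
	       is_big_tcp_hbh(&jhdr) ? "yes" : "no");
	return 0;
}
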
index a8ce8d0..2197304 100644 (file)
@@ -117,24 +117,6 @@ static inline void dmadesc_set(struct bcmgenet_priv *priv,
        dmadesc_set_length_status(priv, d, val);
 }
 
-static inline dma_addr_t dmadesc_get_addr(struct bcmgenet_priv *priv,
-                                         void __iomem *d)
-{
-       dma_addr_t addr;
-
-       addr = bcmgenet_readl(d + DMA_DESC_ADDRESS_LO);
-
-       /* Register writes to GISB bus can take couple hundred nanoseconds
-        * and are done for each packet, save these expensive writes unless
-        * the platform is explicitly configured for 64-bits/LPAE.
-        */
-#ifdef CONFIG_PHYS_ADDR_T_64BIT
-       if (priv->hw_params->flags & GENET_HAS_40BITS)
-               addr |= (u64)bcmgenet_readl(d + DMA_DESC_ADDRESS_HI) << 32;
-#endif
-       return addr;
-}
-
 #define GENET_VER_FMT  "%1d.%1d EPHY: 0x%04x"
 
 #define GENET_MSG_DEFAULT      (NETIF_MSG_DRV | NETIF_MSG_PROBE | \
index 28feabe..67c3570 100644 (file)
@@ -247,8 +247,7 @@ static const struct cvmx_bootmem_named_block_desc
                                        struct cvmx_bootmem_named_block_desc,
                                        size));
 
-               strncpy(desc->name, name, sizeof(desc->name));
-               desc->name[sizeof(desc->name) - 1] = 0;
+               strscpy(desc->name, name, sizeof(desc->name));
                return &oct->bootmem_named_block_desc;
        } else {
                return NULL;
@@ -471,8 +470,8 @@ static void output_console_line(struct octeon_device *oct,
        if (line != &console_buffer[bytes_read]) {
                console_buffer[bytes_read] = '\0';
                len = strlen(console->leftover);
-               strncpy(&console->leftover[len], line,
-                       sizeof(console->leftover) - len);
+               strscpy(&console->leftover[len], line,
+                       sizeof(console->leftover) - len + 1);
        }
 }
 
index 98f3dc4..f2f9549 100644 (file)
@@ -2239,7 +2239,7 @@ static int nicvf_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
        err = register_netdev(netdev);
        if (err) {
                dev_err(dev, "Failed to register netdevice\n");
-               goto err_unregister_interrupts;
+               goto err_destroy_workqueue;
        }
 
        nic->msg_enable = debug;
@@ -2248,6 +2248,8 @@ static int nicvf_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 
        return 0;
 
+err_destroy_workqueue:
+       destroy_workqueue(nic->nicvf_rx_mode_wq);
 err_unregister_interrupts:
        nicvf_unregister_interrupts(nic);
 err_free_netdev:
index 5855905..ca21794 100644 (file)
@@ -283,6 +283,10 @@ static int ch_ipsec_xfrm_add_state(struct xfrm_state *x)
                pr_debug("Cannot offload xfrm states with geniv other than seqiv\n");
                return -EINVAL;
        }
+       if (x->xso.type != XFRM_DEV_OFFLOAD_CRYPTO) {
+               pr_debug("Unsupported xfrm offload\n");
+               return -EINVAL;
+       }
 
        sa_entry = kzalloc(sizeof(*sa_entry), GFP_KERNEL);
        if (!sa_entry) {
index cacd454..c39b866 100644 (file)
@@ -132,6 +132,7 @@ int dpaa2_switch_acl_entry_add(struct dpaa2_switch_filter_block *filter_block,
                                                 DMA_TO_DEVICE);
        if (unlikely(dma_mapping_error(dev, acl_entry_cfg->key_iova))) {
                dev_err(dev, "DMA mapping failed\n");
+               kfree(cmd_buff);
                return -EFAULT;
        }
 
@@ -142,6 +143,7 @@ int dpaa2_switch_acl_entry_add(struct dpaa2_switch_filter_block *filter_block,
                         DMA_TO_DEVICE);
        if (err) {
                dev_err(dev, "dpsw_acl_add_entry() failed %d\n", err);
+               kfree(cmd_buff);
                return err;
        }
 
@@ -172,6 +174,7 @@ dpaa2_switch_acl_entry_remove(struct dpaa2_switch_filter_block *block,
                                                 DMA_TO_DEVICE);
        if (unlikely(dma_mapping_error(dev, acl_entry_cfg->key_iova))) {
                dev_err(dev, "DMA mapping failed\n");
+               kfree(cmd_buff);
                return -EFAULT;
        }
 
@@ -182,6 +185,7 @@ dpaa2_switch_acl_entry_remove(struct dpaa2_switch_filter_block *block,
                         DMA_TO_DEVICE);
        if (err) {
                dev_err(dev, "dpsw_acl_remove_entry() failed %d\n", err);
+               kfree(cmd_buff);
                return err;
        }
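The added kfree() calls above plug leaks where an early error return skipped freeing cmd_buff. The shape of the bug and the fix, with a toy resource:

#include <stdlib.h>

/* Toy model of the leak class fixed above: every error path taken
 * after a successful allocation must release it before returning.
 */
static int do_work(int fail_early)
{
	char *cmd_buff = malloc(64);

	if (!cmd_buff)
		return -1;

	if (fail_early) {
		free(cmd_buff);		/* the kfree() the fix adds */
		return -1;
	}

	/* ... use cmd_buff ... */
	free(cmd_buff);
	return 0;
}

int main(void)
{
	do_work(1);
	return do_work(0);
}
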
 
index 9471baa..5528b0a 100644 (file)
@@ -1216,7 +1216,8 @@ fec_restart(struct net_device *ndev)
                writel(0, fep->hwp + FEC_IMASK);
 
        /* Init the interrupt coalescing */
-       fec_enet_itr_coal_set(ndev);
+       if (fep->quirks & FEC_QUIRK_HAS_COALESCE)
+               fec_enet_itr_coal_set(ndev);
 }
 
 static int fec_enet_ipc_handle_init(struct fec_enet_private *fep)
index 93846ba..ce2571c 100644 (file)
@@ -283,7 +283,7 @@ static int hisi_femac_rx(struct net_device *dev, int limit)
                skb->protocol = eth_type_trans(skb, dev);
                napi_gro_receive(&priv->napi, skb);
                dev->stats.rx_packets++;
-               dev->stats.rx_bytes += skb->len;
+               dev->stats.rx_bytes += len;
 next:
                pos = (pos + 1) % rxq->num;
                if (rx_pkts_num >= limit)
index ffcf797..f867e95 100644 (file)
@@ -550,7 +550,7 @@ static int hix5hd2_rx(struct net_device *dev, int limit)
                skb->protocol = eth_type_trans(skb, dev);
                napi_gro_receive(&priv->napi, skb);
                dev->stats.rx_packets++;
-               dev->stats.rx_bytes += skb->len;
+               dev->stats.rx_bytes += len;
 next:
                pos = dma_ring_incr(pos, RX_DESC_NUM);
        }
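
Both hunks above record the frame length before the statistics update because eth_type_trans() pulls the 14-byte Ethernet header off the skb, shrinking skb->len. A toy model of the bookkeeping:

/* Toy model: eth_type_trans() effectively does skb_pull(skb, ETH_HLEN),
 * so skb->len no longer equals the received frame length afterwards.
 */
#include <stdio.h>

#define ETH_HLEN 14

int main(void)
{
	unsigned int len = 1514;	/* bytes received, hypothetical */
	unsigned int skb_len = len;

	skb_len -= ETH_HLEN;		/* header pulled by eth_type_trans() */

	printf("rx_bytes += skb->len would add %u\n", skb_len);
	printf("rx_bytes += len correctly adds %u\n", len);
	return 0;
}
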
index 54faf0f..b54f370 100644 (file)
@@ -644,18 +644,15 @@ static void hns_nic_get_drvinfo(struct net_device *net_dev,
 {
        struct hns_nic_priv *priv = netdev_priv(net_dev);
 
-       strncpy(drvinfo->version, HNAE_DRIVER_VERSION,
+       strscpy(drvinfo->version, HNAE_DRIVER_VERSION,
                sizeof(drvinfo->version));
-       drvinfo->version[sizeof(drvinfo->version) - 1] = '\0';
 
-       strncpy(drvinfo->driver, HNAE_DRIVER_NAME, sizeof(drvinfo->driver));
-       drvinfo->driver[sizeof(drvinfo->driver) - 1] = '\0';
+       strscpy(drvinfo->driver, HNAE_DRIVER_NAME, sizeof(drvinfo->driver));
 
-       strncpy(drvinfo->bus_info, priv->dev->bus->name,
+       strscpy(drvinfo->bus_info, priv->dev->bus->name,
                sizeof(drvinfo->bus_info));
-       drvinfo->bus_info[ETHTOOL_BUSINFO_LEN - 1] = '\0';
 
-       strncpy(drvinfo->fw_version, "N/A", ETHTOOL_FWVERS_LEN);
+       strscpy(drvinfo->fw_version, "N/A", ETHTOOL_FWVERS_LEN);
        drvinfo->eedump_len = 0;
 }
 
index cdf76fb..55306fe 100644 (file)
@@ -639,13 +639,11 @@ static void hns3_get_drvinfo(struct net_device *netdev,
                return;
        }
 
-       strncpy(drvinfo->driver, dev_driver_string(&h->pdev->dev),
+       strscpy(drvinfo->driver, dev_driver_string(&h->pdev->dev),
                sizeof(drvinfo->driver));
-       drvinfo->driver[sizeof(drvinfo->driver) - 1] = '\0';
 
-       strncpy(drvinfo->bus_info, pci_name(h->pdev),
+       strscpy(drvinfo->bus_info, pci_name(h->pdev),
                sizeof(drvinfo->bus_info));
-       drvinfo->bus_info[ETHTOOL_BUSINFO_LEN - 1] = '\0';
 
        fw_version = priv->ae_handle->ae_algo->ops->get_fw_version(h);
 
index 36bc4fd..04acd1a 100644 (file)
@@ -5931,9 +5931,9 @@ static netdev_tx_t e1000_xmit_frame(struct sk_buff *skb,
                e1000_tx_queue(tx_ring, tx_flags, count);
                /* Make sure there is space in the ring for the next send. */
                e1000_maybe_stop_tx(tx_ring,
-                                   (MAX_SKB_FRAGS *
+                                   ((MAX_SKB_FRAGS + 1) *
                                     DIV_ROUND_UP(PAGE_SIZE,
-                                                 adapter->tx_fifo_limit) + 2));
+                                                 adapter->tx_fifo_limit) + 4));
 
                if (!netdev_xmit_more() ||
                    netif_xmit_stopped(netdev_get_tx_queue(netdev, 0))) {
index 60f9e0a..3357d65 100644 (file)
@@ -1795,9 +1795,11 @@ I40E_CHECK_CMD_LENGTH(i40e_aqc_an_advt_reg);
 /* Set Loopback mode (0x0618) */
 struct i40e_aqc_set_lb_mode {
        __le16  lb_mode;
-#define I40E_AQ_LB_PHY_LOCAL   0x01
-#define I40E_AQ_LB_PHY_REMOTE  0x02
-#define I40E_AQ_LB_MAC_LOCAL   0x04
+#define I40E_LEGACY_LOOPBACK_NVM_VER   0x6000
+#define I40E_AQ_LB_MAC_LOCAL           0x01
+#define I40E_AQ_LB_PHY_LOCAL           0x05
+#define I40E_AQ_LB_PHY_REMOTE          0x06
+#define I40E_AQ_LB_MAC_LOCAL_LEGACY    0x04
        u8      reserved[14];
 };
 
index 4f01e2a..8f764ff 100644 (file)
@@ -1831,6 +1831,32 @@ i40e_status i40e_aq_set_phy_int_mask(struct i40e_hw *hw,
 }
 
 /**
+ * i40e_aq_set_mac_loopback
+ * @hw: pointer to the HW struct
+ * @ena_lpbk: Enable or Disable loopback
+ * @cmd_details: pointer to command details structure or NULL
+ *
+ * Enable/disable loopback on a given port
+ */
+i40e_status i40e_aq_set_mac_loopback(struct i40e_hw *hw, bool ena_lpbk,
+                                    struct i40e_asq_cmd_details *cmd_details)
+{
+       struct i40e_aq_desc desc;
+       struct i40e_aqc_set_lb_mode *cmd =
+               (struct i40e_aqc_set_lb_mode *)&desc.params.raw;
+
+       i40e_fill_default_direct_cmd_desc(&desc, i40e_aqc_opc_set_lb_modes);
+       if (ena_lpbk) {
+               if (hw->nvm.version <= I40E_LEGACY_LOOPBACK_NVM_VER)
+                       cmd->lb_mode = cpu_to_le16(I40E_AQ_LB_MAC_LOCAL_LEGACY);
+               else
+                       cmd->lb_mode = cpu_to_le16(I40E_AQ_LB_MAC_LOCAL);
+       }
+
+       return i40e_asq_send_command(hw, &desc, NULL, 0, cmd_details);
+}
+
+/**
  * i40e_aq_set_phy_debug
  * @hw: pointer to the hw struct
  * @cmd_flags: debug command flags
index 616d27e..887a735 100644 (file)
@@ -4466,11 +4466,7 @@ static int i40e_check_fdir_input_set(struct i40e_vsi *vsi,
                        return -EOPNOTSUPP;
 
                /* First 4 bytes of L4 header */
-               if (usr_ip4_spec->l4_4_bytes == htonl(0xFFFFFFFF))
-                       new_mask |= I40E_L4_SRC_MASK | I40E_L4_DST_MASK;
-               else if (!usr_ip4_spec->l4_4_bytes)
-                       new_mask &= ~(I40E_L4_SRC_MASK | I40E_L4_DST_MASK);
-               else
+               if (usr_ip4_spec->l4_4_bytes)
                        return -EOPNOTSUPP;
 
                /* Filtering on Type of Service is not supported. */
@@ -4509,11 +4505,7 @@ static int i40e_check_fdir_input_set(struct i40e_vsi *vsi,
                else
                        return -EOPNOTSUPP;
 
-               if (usr_ip6_spec->l4_4_bytes == htonl(0xFFFFFFFF))
-                       new_mask |= I40E_L4_SRC_MASK | I40E_L4_DST_MASK;
-               else if (!usr_ip6_spec->l4_4_bytes)
-                       new_mask &= ~(I40E_L4_SRC_MASK | I40E_L4_DST_MASK);
-               else
+               if (usr_ip6_spec->l4_4_bytes)
                        return -EOPNOTSUPP;
 
                /* Filtering on Traffic class is not supported. */
index 6861b3e..6d2e302 100644 (file)
@@ -10656,6 +10656,21 @@ static int i40e_rebuild_channels(struct i40e_vsi *vsi)
 }
 
 /**
+ * i40e_clean_xps_state - clean xps state for every tx_ring
+ * @vsi: ptr to the VSI
+ **/
+static void i40e_clean_xps_state(struct i40e_vsi *vsi)
+{
+       int i;
+
+       if (vsi->tx_rings)
+               for (i = 0; i < vsi->num_queue_pairs; i++)
+                       if (vsi->tx_rings[i])
+                               clear_bit(__I40E_TX_XPS_INIT_DONE,
+                                         vsi->tx_rings[i]->state);
+}
+
+/**
  * i40e_prep_for_reset - prep for the core to reset
  * @pf: board private structure
  *
@@ -10679,8 +10694,10 @@ static void i40e_prep_for_reset(struct i40e_pf *pf)
        i40e_pf_quiesce_all_vsi(pf);
 
        for (v = 0; v < pf->num_alloc_vsi; v++) {
-               if (pf->vsi[v])
+               if (pf->vsi[v]) {
+                       i40e_clean_xps_state(pf->vsi[v]);
                        pf->vsi[v]->seid = 0;
+               }
        }
 
        i40e_shutdown_adminq(&pf->hw);
@@ -12921,6 +12938,29 @@ static void i40e_clear_rss_lut(struct i40e_vsi *vsi)
 }
 
 /**
+ * i40e_set_loopback - turn on/off loopback mode on underlying PF
+ * @vsi: ptr to VSI
+ * @ena: flag to indicate the on/off setting
+ */
+static int i40e_set_loopback(struct i40e_vsi *vsi, bool ena)
+{
+       bool if_running = netif_running(vsi->netdev) &&
+                         !test_and_set_bit(__I40E_VSI_DOWN, vsi->state);
+       int ret;
+
+       if (if_running)
+               i40e_down(vsi);
+
+       ret = i40e_aq_set_mac_loopback(&vsi->back->hw, ena, NULL);
+       if (ret)
+               netdev_err(vsi->netdev, "Failed to toggle loopback state\n");
+       if (if_running)
+               i40e_up(vsi);
+
+       return ret;
+}
+
+/**
  * i40e_set_features - set the netdev feature flags
  * @netdev: ptr to the netdev being adjusted
  * @features: the feature set that the stack is suggesting
@@ -12960,6 +13000,9 @@ static int i40e_set_features(struct net_device *netdev,
        if (need_reset)
                i40e_do_reset(pf, I40E_PF_RESET_FLAG, true);
 
+       if ((features ^ netdev->features) & NETIF_F_LOOPBACK)
+               return i40e_set_loopback(vsi, !!(features & NETIF_F_LOOPBACK));
+
        return 0;
 }
 
@@ -13722,7 +13765,7 @@ static int i40e_config_netdev(struct i40e_vsi *vsi)
        if (!(pf->flags & I40E_FLAG_MFP_ENABLED))
                hw_features |= NETIF_F_NTUPLE | NETIF_F_HW_TC;
 
-       netdev->hw_features |= hw_features;
+       netdev->hw_features |= hw_features | NETIF_F_LOOPBACK;
 
        netdev->features |= hw_features | NETIF_F_HW_VLAN_CTAG_FILTER;
        netdev->hw_enc_features |= NETIF_F_TSO_MANGLEID;
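
The (features ^ netdev->features) & NETIF_F_LOOPBACK test above is the standard idiom for reacting only when one particular feature bit toggled: XOR leaves set exactly the bits that differ. In miniature (the flag value below is a stand-in, not the kernel's):

#include <stdio.h>

#define F_LOOPBACK	(1u << 3)	/* stand-in bit for illustration */
#define F_OTHER		(1u << 5)

int main(void)
{
	unsigned int old_feat = F_OTHER;
	unsigned int new_feat = F_OTHER | F_LOOPBACK;

	/* XOR exposes changed bits; mask selects the one we care about */
	if ((new_feat ^ old_feat) & F_LOOPBACK)
		printf("loopback toggled, now %s\n",
		       (new_feat & F_LOOPBACK) ? "on" : "off");
	return 0;
}
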
index ebdcde6..9a71121 100644 (file)
@@ -105,6 +105,9 @@ enum i40e_status_code i40e_aq_set_phy_config(struct i40e_hw *hw,
                                struct i40e_asq_cmd_details *cmd_details);
 enum i40e_status_code i40e_set_fc(struct i40e_hw *hw, u8 *aq_failures,
                                  bool atomic_reset);
+i40e_status i40e_aq_set_mac_loopback(struct i40e_hw *hw,
+                                    bool ena_lpbk,
+                                    struct i40e_asq_cmd_details *cmd_details);
 i40e_status i40e_aq_set_phy_int_mask(struct i40e_hw *hw, u16 mask,
                                     struct i40e_asq_cmd_details *cmd_details);
 i40e_status i40e_aq_clear_pxe_mode(struct i40e_hw *hw,
index 72ddcef..635f93d 100644 (file)
@@ -1578,6 +1578,7 @@ bool i40e_reset_vf(struct i40e_vf *vf, bool flr)
        i40e_cleanup_reset_vf(vf);
 
        i40e_flush(hw);
+       usleep_range(20000, 40000);
        clear_bit(I40E_VF_STATE_RESETTING, &vf->vf_states);
 
        return true;
@@ -1701,6 +1702,7 @@ bool i40e_reset_all_vfs(struct i40e_pf *pf, bool flr)
        }
 
        i40e_flush(hw);
+       usleep_range(20000, 40000);
        clear_bit(__I40E_VF_DISABLE, pf->state);
 
        return true;
index 2b23b47..a9a7f8b 100644 (file)
@@ -1111,8 +1111,7 @@ ice_link_event(struct ice_pf *pf, struct ice_port_info *pi, bool link_up,
        if (link_up == old_link && link_speed == old_link_speed)
                return 0;
 
-       if (!ice_is_e810(&pf->hw))
-               ice_ptp_link_change(pf, pf->hw.pf_id, link_up);
+       ice_ptp_link_change(pf, pf->hw.pf_id, link_up);
 
        if (ice_is_dcb_active(pf)) {
                if (test_bit(ICE_FLAG_DCB_ENA, pf->flags))
@@ -6340,8 +6339,7 @@ static int ice_up_complete(struct ice_vsi *vsi)
                ice_print_link_msg(vsi, true);
                netif_tx_start_all_queues(vsi->netdev);
                netif_carrier_on(vsi->netdev);
-               if (!ice_is_e810(&pf->hw))
-                       ice_ptp_link_change(pf, pf->hw.pf_id, true);
+               ice_ptp_link_change(pf, pf->hw.pf_id, true);
        }
 
        /* Perform an initial read of the statistics registers now to
@@ -6773,8 +6771,7 @@ int ice_down(struct ice_vsi *vsi)
 
        if (vsi->netdev && vsi->type == ICE_VSI_PF) {
                vlan_err = ice_vsi_del_vlan_zero(vsi);
-               if (!ice_is_e810(&vsi->back->hw))
-                       ice_ptp_link_change(vsi->back, vsi->back->hw.pf_id, false);
+               ice_ptp_link_change(vsi->back, vsi->back->hw.pf_id, false);
                netif_carrier_off(vsi->netdev);
                netif_tx_disable(vsi->netdev);
        } else if (vsi->type == ICE_VSI_SWITCHDEV_CTRL) {
index 13e7527..d63161d 100644 (file)
@@ -600,6 +600,23 @@ static u64 ice_ptp_extend_40b_ts(struct ice_pf *pf, u64 in_tstamp)
 }
 
 /**
+ * ice_ptp_is_tx_tracker_up - Check if Tx tracker is ready for new timestamps
+ * @tx: the PTP Tx timestamp tracker to check
+ *
+ * Check that a given PTP Tx timestamp tracker is up, i.e. that it is ready
+ * to accept new timestamp requests.
+ *
+ * Assumes the tx->lock spinlock is already held.
+ */
+static bool
+ice_ptp_is_tx_tracker_up(struct ice_ptp_tx *tx)
+{
+       lockdep_assert_held(&tx->lock);
+
+       return tx->init && !tx->calibrating;
+}
+
+/**
  * ice_ptp_tx_tstamp - Process Tx timestamps for a port
  * @tx: the PTP Tx timestamp tracker
  *
@@ -608,11 +625,13 @@ static u64 ice_ptp_extend_40b_ts(struct ice_pf *pf, u64 in_tstamp)
  *
  * If a given index has a valid timestamp, perform the following steps:
  *
- * 1) copy the timestamp out of the PHY register
- * 4) clear the timestamp valid bit in the PHY register
- * 5) unlock the index by clearing the associated in_use bit.
- * 2) extend the 40b timestamp value to get a 64bit timestamp
- * 3) send that timestamp to the stack
+ * 1) check that the timestamp request is not stale
+ * 2) check that a timestamp is ready and available in the PHY memory bank
+ * 3) read and copy the timestamp out of the PHY register
+ * 4) unlock the index by clearing the associated in_use bit
+ * 5) check if the timestamp is stale, and discard if so
+ * 6) extend the 40 bit timestamp value to get a 64 bit timestamp value
+ * 7) send this 64 bit timestamp to the stack
  *
  * Returns true if all timestamps were handled, and false if any slots remain
  * without a timestamp.
@@ -623,24 +642,45 @@ static u64 ice_ptp_extend_40b_ts(struct ice_pf *pf, u64 in_tstamp)
  * interrupt. In some cases hardware might not interrupt us again when the
  * timestamp is captured.
  *
- * Note that we only take the tracking lock when clearing the bit and when
- * checking if we need to re-queue this task. The only place where bits can be
- * set is the hard xmit routine where an SKB has a request flag set. The only
- * places where we clear bits are this work function, or the periodic cleanup
- * thread. If the cleanup thread clears a bit we're processing we catch it
- * when we lock to clear the bit and then grab the SKB pointer. If a Tx thread
- * starts a new timestamp, we might not begin processing it right away but we
- * will notice it at the end when we re-queue the task. If a Tx thread starts
- * a new timestamp just after this function exits without re-queuing,
- * the interrupt when the timestamp finishes should trigger. Avoiding holding
- * the lock for the entire function is important in order to ensure that Tx
- * threads do not get blocked while waiting for the lock.
+ * Note that we do not hold the tracking lock while reading the Tx timestamp.
+ * This is because reading the timestamp requires taking a mutex that might
+ * sleep.
+ *
+ * The only place where we set in_use is when a new timestamp is initiated
+ * with a slot index. This is only called in the hard xmit routine where an
+ * SKB has a request flag set. The only places where we clear this bit is this
+ * function, or during teardown when the Tx timestamp tracker is being
+ * removed. A timestamp index will never be re-used until the in_use bit for
+ * that index is cleared.
+ *
+ * If a Tx thread starts a new timestamp, we might not begin processing it
+ * right away but we will notice it at the end when we re-queue the task.
+ *
+ * If a Tx thread starts a new timestamp just after this function exits, the
+ * interrupt for that timestamp should re-trigger this function once
+ * a timestamp is ready.
+ *
+ * In cases where the PTP hardware clock was directly adjusted, some
+ * timestamps may not be able to safely use the timestamp extension math. In
+ * this case, software will set the stale bit for any outstanding Tx
+ * timestamps when the clock is adjusted. Then this function will discard
+ * those captured timestamps instead of sending them to the stack.
+ *
+ * If a Tx packet has been waiting for more than 2 seconds, it is not possible
+ * to correctly extend the timestamp using the cached PHC time. It is
+ * extremely unlikely that a packet will ever take this long to timestamp. If
+ * we detect a Tx timestamp request that has waited for this long we assume
+ * the packet will never be sent by hardware and discard it without reading
+ * the timestamp register.
  */
 static bool ice_ptp_tx_tstamp(struct ice_ptp_tx *tx)
 {
        struct ice_ptp_port *ptp_port;
-       bool ts_handled = true;
+       bool more_timestamps;
        struct ice_pf *pf;
+       struct ice_hw *hw;
+       u64 tstamp_ready;
+       int err;
        u8 idx;
 
        if (!tx->init)
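
A note on the extension math referenced in the doc comment above: the PHY reports only a truncated timestamp (in this driver the low 8 bits of the 40 bit raw value carry the valid flag and sub-nanosecond data, leaving 32 bits of nanoseconds), which gets spliced onto a recently cached 64 bit PHC time. The following is a minimal user-space sketch of that splice, not the driver's code; names and values are illustrative.

    #include <stdint.h>
    #include <stdio.h>

    /* Minimal sketch, assuming the PHY supplies the low 32 bits of a
     * nanosecond counter: splice them onto a cached 64-bit PHC time.
     * A delta above UINT32_MAX / 2 is taken to mean the capture happened
     * before the cached time (the 32-bit counter wrapped in between).
     */
    static uint64_t extend_32b_ts(uint64_t cached_phc_time, uint32_t in_tstamp)
    {
        uint32_t phc_lo = (uint32_t)cached_phc_time;
        uint32_t delta = in_tstamp - phc_lo;    /* wraps modulo 2^32 */

        if (delta > UINT32_MAX / 2)
            return cached_phc_time - (phc_lo - in_tstamp);
        return cached_phc_time + delta;
    }

    int main(void)
    {
        uint64_t phc = 0x12345678ABCDEF00ULL;   /* assumed cached PHC time */

        printf("%llx\n", (unsigned long long)extend_32b_ts(phc, 0xABCDF000));
        return 0;
    }

If more than about 2^31 ns (~2.1 s) elapse between the cache update and the capture, the wrap test above picks the wrong direction; that is exactly why the doc comment discards any request older than 2 seconds rather than extending it.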
@@ -648,44 +688,86 @@ static bool ice_ptp_tx_tstamp(struct ice_ptp_tx *tx)
 
        ptp_port = container_of(tx, struct ice_ptp_port, tx);
        pf = ptp_port_to_pf(ptp_port);
+       hw = &pf->hw;
+
+       /* Read the Tx ready status first */
+       err = ice_get_phy_tx_tstamp_ready(hw, tx->block, &tstamp_ready);
+       if (err)
+               return false;
 
        for_each_set_bit(idx, tx->in_use, tx->len) {
                struct skb_shared_hwtstamps shhwtstamps = {};
-               u8 phy_idx = idx + tx->quad_offset;
-               u64 raw_tstamp, tstamp;
+               u8 phy_idx = idx + tx->offset;
+               u64 raw_tstamp = 0, tstamp;
+               bool drop_ts = false;
                struct sk_buff *skb;
-               int err;
+
+               /* Drop packets which have waited for more than 2 seconds */
+               if (time_is_before_jiffies(tx->tstamps[idx].start + 2 * HZ)) {
+                       drop_ts = true;
+
+                       /* Count the number of Tx timestamps that timed out */
+                       pf->ptp.tx_hwtstamp_timeouts++;
+               }
+
+               /* Only read a timestamp from the PHY if it's marked as ready
+                * by the tstamp_ready register. This avoids unnecessary
+                * reading of timestamps which are not yet valid. This is
+                * important as we must read all the timestamps which are
+                * valid, and only those, during each interrupt.
+                * If we do not, the hardware logic for generating a new
+                * interrupt can get stuck on some devices.
+                */
+               if (!(tstamp_ready & BIT_ULL(phy_idx))) {
+                       if (drop_ts)
+                               goto skip_ts_read;
+
+                       continue;
+               }
 
                ice_trace(tx_tstamp_fw_req, tx->tstamps[idx].skb, idx);
 
-               err = ice_read_phy_tstamp(&pf->hw, tx->quad, phy_idx,
-                                         &raw_tstamp);
+               err = ice_read_phy_tstamp(hw, tx->block, phy_idx, &raw_tstamp);
                if (err)
                        continue;
 
                ice_trace(tx_tstamp_fw_done, tx->tstamps[idx].skb, idx);
 
-               /* Check if the timestamp is invalid or stale */
-               if (!(raw_tstamp & ICE_PTP_TS_VALID) ||
+               /* For PHYs which don't implement a proper timestamp ready
+                * bitmap, verify that the timestamp value is different
+                * from the last cached timestamp. If it is not, skip this for
+                * now, assuming it hasn't yet been captured by hardware.
+                */
+               if (!drop_ts && tx->verify_cached &&
                    raw_tstamp == tx->tstamps[idx].cached_tstamp)
                        continue;
 
-               /* The timestamp is valid, so we'll go ahead and clear this
-                * index and then send the timestamp up to the stack.
-                */
+               /* Discard any timestamp value without the valid bit set */
+               if (!(raw_tstamp & ICE_PTP_TS_VALID))
+                       drop_ts = true;
+
+skip_ts_read:
                spin_lock(&tx->lock);
-               tx->tstamps[idx].cached_tstamp = raw_tstamp;
+               if (tx->verify_cached && raw_tstamp)
+                       tx->tstamps[idx].cached_tstamp = raw_tstamp;
                clear_bit(idx, tx->in_use);
                skb = tx->tstamps[idx].skb;
                tx->tstamps[idx].skb = NULL;
+               if (test_and_clear_bit(idx, tx->stale))
+                       drop_ts = true;
                spin_unlock(&tx->lock);
 
-               /* it's (unlikely but) possible we raced with the cleanup
-                * thread for discarding old timestamp requests.
+               /* It is unlikely but possible that the SKB will have been
+                * flushed at this point due to link change or teardown.
                 */
                if (!skb)
                        continue;
 
+               if (drop_ts) {
+                       dev_kfree_skb_any(skb);
+                       continue;
+               }
+
                /* Extend the timestamp using cached PHC time */
                tstamp = ice_ptp_extend_40b_ts(pf, raw_tstamp);
                if (tstamp) {
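
The ready-mask gating in the loop above compresses to a few lines. A standalone sketch with illustrative values (not driver state): only indexes that are both in use and marked ready in the 64 bit mask are read; everything else is deferred to a later interrupt instead of being read speculatively.

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        uint64_t tstamp_ready = 0x25;   /* assumed: indexes 0, 2 and 5 ready */
        uint64_t in_use = 0x27;         /* assumed: requests at 0, 1, 2 and 5 */
        unsigned int idx;

        for (idx = 0; idx < 64; idx++) {
            if (!(in_use & (1ULL << idx)))
                continue;       /* no outstanding request here */
            if (!(tstamp_ready & (1ULL << idx)))
                continue;       /* not captured yet: leave for next IRQ */
            printf("would read and complete index %u\n", idx);
        }
        return 0;
    }

In the driver the skipped slots keep their in_use bit set, so the bitmap_empty() check at the end of the function reports work remaining and the caller re-polls.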
@@ -701,11 +783,10 @@ static bool ice_ptp_tx_tstamp(struct ice_ptp_tx *tx)
         * poll for remaining timestamps.
         */
        spin_lock(&tx->lock);
-       if (!bitmap_empty(tx->in_use, tx->len))
-               ts_handled = false;
+       more_timestamps = tx->init && !bitmap_empty(tx->in_use, tx->len);
        spin_unlock(&tx->lock);
 
-       return ts_handled;
+       return !more_timestamps;
 }
 
 /**
@@ -713,26 +794,33 @@ static bool ice_ptp_tx_tstamp(struct ice_ptp_tx *tx)
  * @tx: Tx tracking structure to initialize
  *
  * Assumes that the length has already been initialized. Do not call directly,
- * use the ice_ptp_init_tx_e822 or ice_ptp_init_tx_e810 instead.
+ * use one of the ice_ptp_init_tx_* functions instead.
  */
 static int
 ice_ptp_alloc_tx_tracker(struct ice_ptp_tx *tx)
 {
-       tx->tstamps = kcalloc(tx->len, sizeof(*tx->tstamps), GFP_KERNEL);
-       if (!tx->tstamps)
-               return -ENOMEM;
+       unsigned long *in_use, *stale;
+       struct ice_tx_tstamp *tstamps;
+
+       tstamps = kcalloc(tx->len, sizeof(*tstamps), GFP_KERNEL);
+       in_use = bitmap_zalloc(tx->len, GFP_KERNEL);
+       stale = bitmap_zalloc(tx->len, GFP_KERNEL);
+
+       if (!tstamps || !in_use || !stale) {
+               kfree(tstamps);
+               bitmap_free(in_use);
+               bitmap_free(stale);
 
-       tx->in_use = bitmap_zalloc(tx->len, GFP_KERNEL);
-       if (!tx->in_use) {
-               kfree(tx->tstamps);
-               tx->tstamps = NULL;
                return -ENOMEM;
        }
 
-       spin_lock_init(&tx->lock);
-
+       tx->tstamps = tstamps;
+       tx->in_use = in_use;
+       tx->stale = stale;
        tx->init = 1;
 
+       spin_lock_init(&tx->lock);
+
        return 0;
 }
 
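
The reworked allocator relies on kfree(NULL) and bitmap_free(NULL) being no-ops, which lets a single error path free whichever of the three allocations happened to succeed. A portable user-space sketch of the same idiom, using free(NULL)'s identical guarantee (sizes and names here are placeholders, not the driver's):

    #include <stdlib.h>

    struct tracker {
        void *tstamps;
        unsigned long *in_use;
        unsigned long *stale;
    };

    /* Allocate-everything, check-once idiom: on any failure, free every
     * pointer; free(NULL) is defined to do nothing, so the error path
     * needs no bookkeeping about which allocation failed.
     */
    static int tracker_alloc(struct tracker *t, size_t len)
    {
        void *tstamps = calloc(len, 16);                /* placeholder size */
        unsigned long *in_use = calloc((len + 63) / 64, 8);
        unsigned long *stale = calloc((len + 63) / 64, 8);

        if (!tstamps || !in_use || !stale) {
            free(tstamps);
            free(in_use);
            free(stale);
            return -1;
        }

        t->tstamps = tstamps;
        t->in_use = in_use;
        t->stale = stale;
        return 0;
    }

    int main(void)
    {
        struct tracker t;

        return tracker_alloc(&t, 64);
    }

The payoff is one failure branch instead of three nested unwind paths, at the cost of attempting all allocations even when an earlier one fails.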
@@ -740,31 +828,71 @@ ice_ptp_alloc_tx_tracker(struct ice_ptp_tx *tx)
  * ice_ptp_flush_tx_tracker - Flush any remaining timestamps from the tracker
  * @pf: Board private structure
  * @tx: the tracker to flush
+ *
+ * Called during teardown when a Tx tracker is being removed.
  */
 static void
 ice_ptp_flush_tx_tracker(struct ice_pf *pf, struct ice_ptp_tx *tx)
 {
+       struct ice_hw *hw = &pf->hw;
+       u64 tstamp_ready;
+       int err;
        u8 idx;
 
-       for (idx = 0; idx < tx->len; idx++) {
-               u8 phy_idx = idx + tx->quad_offset;
+       err = ice_get_phy_tx_tstamp_ready(hw, tx->block, &tstamp_ready);
+       if (err) {
+               dev_dbg(ice_pf_to_dev(pf), "Failed to get the Tx tstamp ready bitmap for block %u, err %d\n",
+                       tx->block, err);
+
+               /* If we fail to read the Tx timestamp ready bitmap just
+                * skip clearing the PHY timestamps.
+                */
+               tstamp_ready = 0;
+       }
+
+       for_each_set_bit(idx, tx->in_use, tx->len) {
+               u8 phy_idx = idx + tx->offset;
+               struct sk_buff *skb;
+
+               /* In case this timestamp is ready, we need to clear it. */
+               if (!hw->reset_ongoing && (tstamp_ready & BIT_ULL(phy_idx)))
+                       ice_clear_phy_tstamp(hw, tx->block, phy_idx);
 
                spin_lock(&tx->lock);
-               if (tx->tstamps[idx].skb) {
-                       dev_kfree_skb_any(tx->tstamps[idx].skb);
-                       tx->tstamps[idx].skb = NULL;
-                       pf->ptp.tx_hwtstamp_flushed++;
-               }
+               skb = tx->tstamps[idx].skb;
+               tx->tstamps[idx].skb = NULL;
                clear_bit(idx, tx->in_use);
+               clear_bit(idx, tx->stale);
                spin_unlock(&tx->lock);
 
-               /* Clear any potential residual timestamp in the PHY block */
-               if (!pf->hw.reset_ongoing)
-                       ice_clear_phy_tstamp(&pf->hw, tx->quad, phy_idx);
+               /* Count the number of Tx timestamps flushed */
+               pf->ptp.tx_hwtstamp_flushed++;
+
+               /* Free the SKB after we've cleared the bit */
+               dev_kfree_skb_any(skb);
        }
 }
 
 /**
+ * ice_ptp_mark_tx_tracker_stale - Mark unfinished timestamps as stale
+ * @tx: the tracker to mark
+ *
+ * Mark currently outstanding Tx timestamps as stale. This prevents sending
+ * their timestamp value to the stack. This is required to prevent extending
+ * the 40 bit hardware timestamp incorrectly.
+ *
+ * This should be called when the PTP clock is modified, such as after a set
+ * time request.
+ */
+static void
+ice_ptp_mark_tx_tracker_stale(struct ice_ptp_tx *tx)
+{
+       spin_lock(&tx->lock);
+       bitmap_or(tx->stale, tx->stale, tx->in_use, tx->len);
+       spin_unlock(&tx->lock);
+}
+
+/**
  * ice_ptp_release_tx_tracker - Release allocated memory for Tx tracker
  * @pf: Board private structure
  * @tx: Tx tracking structure to release
@@ -774,7 +902,12 @@ ice_ptp_flush_tx_tracker(struct ice_pf *pf, struct ice_ptp_tx *tx)
 static void
 ice_ptp_release_tx_tracker(struct ice_pf *pf, struct ice_ptp_tx *tx)
 {
+       spin_lock(&tx->lock);
        tx->init = 0;
+       spin_unlock(&tx->lock);
+
+       /* wait for potentially outstanding interrupt to complete */
+       synchronize_irq(pf->msix_entries[pf->oicr_idx].vector);
 
        ice_ptp_flush_tx_tracker(pf, tx);
 
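
The release path above is a flag-then-synchronize pattern: clear init under the lock so the request and interrupt paths stop admitting work, wait out any handler already in flight (synchronize_irq()), and only then flush and free. Below is a user-space analogue with a thread standing in for the interrupt handler; it is a sketch of the ordering only, not driver code.

    #include <pthread.h>
    #include <stdbool.h>
    #include <stdio.h>

    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
    static bool init_flag = true;

    /* Stand-in for the interrupt handler: keeps processing until it
     * observes the tracker going down.
     */
    static void *handler(void *arg)
    {
        (void)arg;
        for (;;) {
            bool up;

            pthread_mutex_lock(&lock);
            up = init_flag;
            pthread_mutex_unlock(&lock);
            if (!up)
                return NULL;
            /* ... process timestamp slots ... */
        }
    }

    int main(void)
    {
        pthread_t t;

        pthread_create(&t, NULL, handler, NULL);

        pthread_mutex_lock(&lock);
        init_flag = false;          /* 1: mark the tracker down */
        pthread_mutex_unlock(&lock);

        pthread_join(t, NULL);      /* 2: analogous to synchronize_irq() */
        puts("3: safe to flush and free the tracker");
        return 0;
    }

Because the handler re-checks the flag under the same lock, it can never touch tracker memory after the join returns, which is the property the driver needs before ice_ptp_flush_tx_tracker() runs.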
@@ -784,6 +917,9 @@ ice_ptp_release_tx_tracker(struct ice_pf *pf, struct ice_ptp_tx *tx)
        bitmap_free(tx->in_use);
        tx->in_use = NULL;
 
+       bitmap_free(tx->stale);
+       tx->stale = NULL;
+
        tx->len = 0;
 }
 
@@ -801,9 +937,10 @@ ice_ptp_release_tx_tracker(struct ice_pf *pf, struct ice_ptp_tx *tx)
 static int
 ice_ptp_init_tx_e822(struct ice_pf *pf, struct ice_ptp_tx *tx, u8 port)
 {
-       tx->quad = port / ICE_PORTS_PER_QUAD;
-       tx->quad_offset = (port % ICE_PORTS_PER_QUAD) * INDEX_PER_PORT;
-       tx->len = INDEX_PER_PORT;
+       tx->block = port / ICE_PORTS_PER_QUAD;
+       tx->offset = (port % ICE_PORTS_PER_QUAD) * INDEX_PER_PORT_E822;
+       tx->len = INDEX_PER_PORT_E822;
+       tx->verify_cached = 0;
 
        return ice_ptp_alloc_tx_tracker(tx);
 }
@@ -819,59 +956,19 @@ ice_ptp_init_tx_e822(struct ice_pf *pf, struct ice_ptp_tx *tx, u8 port)
 static int
 ice_ptp_init_tx_e810(struct ice_pf *pf, struct ice_ptp_tx *tx)
 {
-       tx->quad = pf->hw.port_info->lport;
-       tx->quad_offset = 0;
-       tx->len = INDEX_PER_QUAD;
+       tx->block = pf->hw.port_info->lport;
+       tx->offset = 0;
+       tx->len = INDEX_PER_PORT_E810;
+       /* The E810 PHY does not provide a timestamp ready bitmap. Instead,
+        * verify new timestamps against a cached copy of the last read
+        * timestamp.
+        */
+       tx->verify_cached = 1;
 
        return ice_ptp_alloc_tx_tracker(tx);
 }
 
 /**
- * ice_ptp_tx_tstamp_cleanup - Cleanup old timestamp requests that got dropped
- * @pf: pointer to the PF struct
- * @tx: PTP Tx tracker to clean up
- *
- * Loop through the Tx timestamp requests and see if any of them have been
- * waiting for a long time. Discard any SKBs that have been waiting for more
- * than 2 seconds. This is long enough to be reasonably sure that the
- * timestamp will never be captured. This might happen if the packet gets
- * discarded before it reaches the PHY timestamping block.
- */
-static void ice_ptp_tx_tstamp_cleanup(struct ice_pf *pf, struct ice_ptp_tx *tx)
-{
-       struct ice_hw *hw = &pf->hw;
-       u8 idx;
-
-       if (!tx->init)
-               return;
-
-       for_each_set_bit(idx, tx->in_use, tx->len) {
-               struct sk_buff *skb;
-               u64 raw_tstamp;
-
-               /* Check if this SKB has been waiting for too long */
-               if (time_is_after_jiffies(tx->tstamps[idx].start + 2 * HZ))
-                       continue;
-
-               /* Read tstamp to be able to use this register again */
-               ice_read_phy_tstamp(hw, tx->quad, idx + tx->quad_offset,
-                                   &raw_tstamp);
-
-               spin_lock(&tx->lock);
-               skb = tx->tstamps[idx].skb;
-               tx->tstamps[idx].skb = NULL;
-               clear_bit(idx, tx->in_use);
-               spin_unlock(&tx->lock);
-
-               /* Count the number of Tx timestamps which have timed out */
-               pf->ptp.tx_hwtstamp_timeouts++;
-
-               /* Free the SKB after we've cleared the bit */
-               dev_kfree_skb_any(skb);
-       }
-}
-
-/**
  * ice_ptp_update_cached_phctime - Update the cached PHC time values
  * @pf: Board specific private structure
  *
@@ -941,20 +1038,13 @@ static int ice_ptp_update_cached_phctime(struct ice_pf *pf)
  * @pf: Board specific private structure
  *
  * This function must be called when the cached PHC time is no longer valid,
- * such as after a time adjustment. It discards any outstanding Tx timestamps,
- * and updates the cached PHC time for both the PF and Rx rings. If updating
- * the PHC time cannot be done immediately, a warning message is logged and
- * the work item is scheduled.
- *
- * These steps are required in order to ensure that we do not accidentally
- * report a timestamp extended by the wrong PHC cached copy. Note that we
- * do not directly update the cached timestamp here because it is possible
- * this might produce an error when ICE_CFG_BUSY is set. If this occurred, we
- * would have to try again. During that time window, timestamps might be
- * requested and returned with an invalid extension. Thus, on failure to
- * immediately update the cached PHC time we would need to zero the value
- * anyways. For this reason, we just zero the value immediately and queue the
- * update work item.
+ * such as after a time adjustment. It marks any currently outstanding Tx
+ * timestamps as stale and updates the cached PHC time for both the PF and Rx
+ * rings.
+ *
+ * If updating the PHC time cannot be done immediately, a warning message is
+ * logged and the work item is scheduled immediately to minimize the window
+ * with a wrong cached timestamp.
  */
 static void ice_ptp_reset_cached_phctime(struct ice_pf *pf)
 {
@@ -978,8 +1068,12 @@ static void ice_ptp_reset_cached_phctime(struct ice_pf *pf)
                                           msecs_to_jiffies(10));
        }
 
-       /* Flush any outstanding Tx timestamps */
-       ice_ptp_flush_tx_tracker(pf, &pf->ptp.port.tx);
+       /* Mark any outstanding timestamps as stale, since they might have
+        * been captured in hardware before the time update. This could lead
+        * to us extending them with the wrong cached value resulting in
+        * incorrect timestamp values.
+        */
+       ice_ptp_mark_tx_tracker_stale(&pf->ptp.port.tx);
 }
 
 /**
@@ -1060,19 +1154,6 @@ static u64 ice_base_incval(struct ice_pf *pf)
 }
 
 /**
- * ice_ptp_reset_ts_memory_quad - Reset timestamp memory for one quad
- * @pf: The PF private data structure
- * @quad: The quad (0-4)
- */
-static void ice_ptp_reset_ts_memory_quad(struct ice_pf *pf, int quad)
-{
-       struct ice_hw *hw = &pf->hw;
-
-       ice_write_quad_reg_e822(hw, quad, Q_REG_TS_CTRL, Q_REG_TS_CTRL_M);
-       ice_write_quad_reg_e822(hw, quad, Q_REG_TS_CTRL, ~(u32)Q_REG_TS_CTRL_M);
-}
-
-/**
  * ice_ptp_check_tx_fifo - Check whether Tx FIFO is in an OK state
  * @port: PTP port for which Tx FIFO is checked
  */
@@ -1124,7 +1205,7 @@ static int ice_ptp_check_tx_fifo(struct ice_ptp_port *port)
                dev_dbg(ice_pf_to_dev(pf),
                        "Port %d Tx FIFO still not empty; resetting quad %d\n",
                        port->port_num, quad);
-               ice_ptp_reset_ts_memory_quad(pf, quad);
+               ice_ptp_reset_ts_memory_quad_e822(hw, quad);
                port->tx_fifo_busy_cnt = FIFO_OK;
                return 0;
        }
@@ -1133,130 +1214,49 @@ static int ice_ptp_check_tx_fifo(struct ice_ptp_port *port)
 }
 
 /**
- * ice_ptp_check_tx_offset_valid - Check if the Tx PHY offset is valid
- * @port: the PTP port to check
- *
- * Checks whether the Tx offset for the PHY associated with this port is
- * valid. Returns 0 if the offset is valid, and a non-zero error code if it is
- * not.
- */
-static int ice_ptp_check_tx_offset_valid(struct ice_ptp_port *port)
-{
-       struct ice_pf *pf = ptp_port_to_pf(port);
-       struct device *dev = ice_pf_to_dev(pf);
-       struct ice_hw *hw = &pf->hw;
-       u32 val;
-       int err;
-
-       err = ice_ptp_check_tx_fifo(port);
-       if (err)
-               return err;
-
-       err = ice_read_phy_reg_e822(hw, port->port_num, P_REG_TX_OV_STATUS,
-                                   &val);
-       if (err) {
-               dev_err(dev, "Failed to read TX_OV_STATUS for port %d, err %d\n",
-                       port->port_num, err);
-               return -EAGAIN;
-       }
-
-       if (!(val & P_REG_TX_OV_STATUS_OV_M))
-               return -EAGAIN;
-
-       return 0;
-}
-
-/**
- * ice_ptp_check_rx_offset_valid - Check if the Rx PHY offset is valid
- * @port: the PTP port to check
- *
- * Checks whether the Rx offset for the PHY associated with this port is
- * valid. Returns 0 if the offset is valid, and a non-zero error code if it is
- * not.
- */
-static int ice_ptp_check_rx_offset_valid(struct ice_ptp_port *port)
-{
-       struct ice_pf *pf = ptp_port_to_pf(port);
-       struct device *dev = ice_pf_to_dev(pf);
-       struct ice_hw *hw = &pf->hw;
-       int err;
-       u32 val;
-
-       err = ice_read_phy_reg_e822(hw, port->port_num, P_REG_RX_OV_STATUS,
-                                   &val);
-       if (err) {
-               dev_err(dev, "Failed to read RX_OV_STATUS for port %d, err %d\n",
-                       port->port_num, err);
-               return err;
-       }
-
-       if (!(val & P_REG_RX_OV_STATUS_OV_M))
-               return -EAGAIN;
-
-       return 0;
-}
-
-/**
- * ice_ptp_check_offset_valid - Check port offset valid bit
- * @port: Port for which offset valid bit is checked
- *
- * Returns 0 if both Tx and Rx offset are valid, and -EAGAIN if one of the
- * offset is not ready.
- */
-static int ice_ptp_check_offset_valid(struct ice_ptp_port *port)
-{
-       int tx_err, rx_err;
-
-       /* always check both Tx and Rx offset validity */
-       tx_err = ice_ptp_check_tx_offset_valid(port);
-       rx_err = ice_ptp_check_rx_offset_valid(port);
-
-       if (tx_err || rx_err)
-               return -EAGAIN;
-
-       return 0;
-}
-
-/**
- * ice_ptp_wait_for_offset_valid - Check for valid Tx and Rx offsets
+ * ice_ptp_wait_for_offsets - Check for valid Tx and Rx offsets
  * @work: Pointer to the kthread_work structure for this task
  *
- * Check whether both the Tx and Rx offsets are valid for enabling the vernier
- * calibration.
+ * Check whether hardware has completed measuring the Tx and Rx offset values
+ * used to configure and enable vernier timestamp calibration.
+ *
+ * Once the offset in either direction is measured, configure the associated
+ * registers with the calibrated offset values and enable timestamping. The Tx
+ * and Rx directions are configured independently as soon as their associated
+ * offsets are known.
  *
- * Once we have valid offsets from hardware, update the total Tx and Rx
- * offsets, and exit bypass mode. This enables more precise timestamps using
- * the extra data measured during the vernier calibration process.
+ * This function reschedules itself until both Tx and Rx calibration have
+ * completed.
  */
-static void ice_ptp_wait_for_offset_valid(struct kthread_work *work)
+static void ice_ptp_wait_for_offsets(struct kthread_work *work)
 {
        struct ice_ptp_port *port;
-       int err;
-       struct device *dev;
        struct ice_pf *pf;
        struct ice_hw *hw;
+       int tx_err;
+       int rx_err;
 
        port = container_of(work, struct ice_ptp_port, ov_work.work);
        pf = ptp_port_to_pf(port);
        hw = &pf->hw;
-       dev = ice_pf_to_dev(pf);
-
-       if (ice_is_reset_in_progress(pf->state))
-               return;
 
-       if (ice_ptp_check_offset_valid(port)) {
-               /* Offsets not ready yet, try again later */
+       if (ice_is_reset_in_progress(pf->state)) {
+               /* wait for device driver to complete reset */
                kthread_queue_delayed_work(pf->ptp.kworker,
                                           &port->ov_work,
                                           msecs_to_jiffies(100));
                return;
        }
 
-       /* Offsets are valid, so it is safe to exit bypass mode */
-       err = ice_phy_exit_bypass_e822(hw, port->port_num);
-       if (err) {
-               dev_warn(dev, "Failed to exit bypass mode for PHY port %u, err %d\n",
-                        port->port_num, err);
+       tx_err = ice_ptp_check_tx_fifo(port);
+       if (!tx_err)
+               tx_err = ice_phy_cfg_tx_offset_e822(hw, port->port_num);
+       rx_err = ice_phy_cfg_rx_offset_e822(hw, port->port_num);
+       if (tx_err || rx_err) {
+               /* Tx and/or Rx offset not yet configured, try again later */
+               kthread_queue_delayed_work(pf->ptp.kworker,
+                                          &port->ov_work,
+                                          msecs_to_jiffies(100));
                return;
        }
 }
@@ -1317,16 +1317,20 @@ ice_ptp_port_phy_restart(struct ice_ptp_port *ptp_port)
        kthread_cancel_delayed_work_sync(&ptp_port->ov_work);
 
        /* temporarily disable Tx timestamps while calibrating PHY offset */
+       spin_lock(&ptp_port->tx.lock);
        ptp_port->tx.calibrating = true;
+       spin_unlock(&ptp_port->tx.lock);
        ptp_port->tx_fifo_busy_cnt = 0;
 
-       /* Start the PHY timer in bypass mode */
-       err = ice_start_phy_timer_e822(hw, port, true);
+       /* Start the PHY timer in Vernier mode */
+       err = ice_start_phy_timer_e822(hw, port);
        if (err)
                goto out_unlock;
 
        /* Enable Tx timestamps right away */
+       spin_lock(&ptp_port->tx.lock);
        ptp_port->tx.calibrating = false;
+       spin_unlock(&ptp_port->tx.lock);
 
        kthread_queue_delayed_work(pf->ptp.kworker, &ptp_port->ov_work, 0);
 
@@ -1341,45 +1345,33 @@ out_unlock:
 }
 
 /**
- * ice_ptp_link_change - Set or clear port registers for timestamping
+ * ice_ptp_link_change - Reconfigure PTP after link status change
  * @pf: Board private structure
  * @port: Port for which the PHY start is set
  * @linkup: Link is up or down
  */
-int ice_ptp_link_change(struct ice_pf *pf, u8 port, bool linkup)
+void ice_ptp_link_change(struct ice_pf *pf, u8 port, bool linkup)
 {
        struct ice_ptp_port *ptp_port;
 
-       if (!test_bit(ICE_FLAG_PTP_SUPPORTED, pf->flags))
-               return 0;
+       if (!test_bit(ICE_FLAG_PTP, pf->flags))
+               return;
 
-       if (port >= ICE_NUM_EXTERNAL_PORTS)
-               return -EINVAL;
+       if (WARN_ON_ONCE(port >= ICE_NUM_EXTERNAL_PORTS))
+               return;
 
        ptp_port = &pf->ptp.port;
-       if (ptp_port->port_num != port)
-               return -EINVAL;
+       if (WARN_ON_ONCE(ptp_port->port_num != port))
+               return;
 
-       /* Update cached link err for this port immediately */
+       /* Update cached link status for this port immediately */
        ptp_port->link_up = linkup;
 
-       if (!test_bit(ICE_FLAG_PTP, pf->flags))
-               /* PTP is not setup */
-               return -EAGAIN;
-
-       return ice_ptp_port_phy_restart(ptp_port);
-}
-
-/**
- * ice_ptp_reset_ts_memory - Reset timestamp memory for all quads
- * @pf: The PF private data structure
- */
-static void ice_ptp_reset_ts_memory(struct ice_pf *pf)
-{
-       int quad;
+       /* E810 devices do not need to reconfigure the PHY */
+       if (ice_is_e810(&pf->hw))
+               return;
 
-       quad = pf->hw.port_info->lport / ICE_PORTS_PER_QUAD;
-       ice_ptp_reset_ts_memory_quad(pf, quad);
+       ice_ptp_port_phy_restart(ptp_port);
 }
 
 /**
@@ -1397,7 +1389,7 @@ static int ice_ptp_tx_ena_intr(struct ice_pf *pf, bool ena, u32 threshold)
        int quad;
        u32 val;
 
-       ice_ptp_reset_ts_memory(pf);
+       ice_ptp_reset_ts_memory(hw);
 
        for (quad = 0; quad < ICE_MAX_QUAD; quad++) {
                err = ice_read_quad_reg_e822(hw, quad, Q_REG_TX_MEM_GBL_CFG,
@@ -2332,11 +2324,14 @@ s8 ice_ptp_request_ts(struct ice_ptp_tx *tx, struct sk_buff *skb)
 {
        u8 idx;
 
-       /* Check if this tracker is initialized */
-       if (!tx->init || tx->calibrating)
+       spin_lock(&tx->lock);
+
+       /* Check that this tracker is accepting new timestamp requests */
+       if (!ice_ptp_is_tx_tracker_up(tx)) {
+               spin_unlock(&tx->lock);
                return -1;
+       }
 
-       spin_lock(&tx->lock);
        /* Find and set the first available index */
        idx = find_first_zero_bit(tx->in_use, tx->len);
        if (idx < tx->len) {
@@ -2345,6 +2340,7 @@ s8 ice_ptp_request_ts(struct ice_ptp_tx *tx, struct sk_buff *skb)
                 * requests.
                 */
                set_bit(idx, tx->in_use);
+               clear_bit(idx, tx->stale);
                tx->tstamps[idx].start = jiffies;
                tx->tstamps[idx].skb = skb_get(skb);
                skb_shinfo(skb)->tx_flags |= SKBTX_IN_PROGRESS;
@@ -2359,7 +2355,7 @@ s8 ice_ptp_request_ts(struct ice_ptp_tx *tx, struct sk_buff *skb)
        if (idx >= tx->len)
                return -1;
        else
-               return idx + tx->quad_offset;
+               return idx + tx->offset;
 }
 
 /**
@@ -2384,8 +2380,6 @@ static void ice_ptp_periodic_work(struct kthread_work *work)
 
        err = ice_ptp_update_cached_phctime(pf);
 
-       ice_ptp_tx_tstamp_cleanup(pf, &pf->ptp.port.tx);
-
        /* Run twice a second or reschedule if phc update failed */
        kthread_queue_delayed_work(ptp->kworker, &ptp->work,
                                   msecs_to_jiffies(err ? 10 : 500));
@@ -2462,7 +2456,7 @@ pfr:
                err = ice_ptp_init_tx_e810(pf, &ptp->port.tx);
        } else {
                kthread_init_delayed_work(&ptp->port.ov_work,
-                                         ice_ptp_wait_for_offset_valid);
+                                         ice_ptp_wait_for_offsets);
                err = ice_ptp_init_tx_e822(pf, &ptp->port.tx,
                                           ptp->port.port_num);
        }
@@ -2625,7 +2619,7 @@ static int ice_ptp_init_port(struct ice_pf *pf, struct ice_ptp_port *ptp_port)
                return ice_ptp_init_tx_e810(pf, &ptp_port->tx);
 
        kthread_init_delayed_work(&ptp_port->ov_work,
-                                 ice_ptp_wait_for_offset_valid);
+                                 ice_ptp_wait_for_offsets);
        return ice_ptp_init_tx_e822(pf, &ptp_port->tx, ptp_port->port_num);
 }
 
index 0283492..9cda2f4 100644 (file)
@@ -93,9 +93,14 @@ struct ice_perout_channel {
  * we discard old requests that were not fulfilled within a 2 second time
  * window.
  * Timestamp values in the PHY are read only and do not get cleared except at
- * hardware reset or when a new timestamp value is captured. The cached_tstamp
- * field is used to detect the case where a new timestamp has not yet been
- * captured, ensuring that we avoid sending stale timestamp data to the stack.
+ * hardware reset or when a new timestamp value is captured.
+ *
+ * Some PHY types do not provide a "ready" bitmap indicating which timestamp
+ * indexes are valid. In these cases, we use a cached_tstamp to keep track of
+ * the last timestamp we read for a given index. If the current timestamp
+ * value is the same as the cached value, we assume a new timestamp hasn't
+ * been captured. This avoids reporting stale timestamps to the stack. This is
+ * only done if the verify_cached flag is set in the ice_ptp_tx structure.
  */
 struct ice_tx_tstamp {
        struct sk_buff *skb;
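
For PHYs without a ready bitmap, the comparison described above is the only freshness signal available. A small sketch of the resulting accept/reject decision; the valid-bit position here is an assumption for illustration, not taken from the register layout:

    #include <stdbool.h>
    #include <stdint.h>

    #define TS_VALID 0x1ULL /* assumed: low bit flags a captured timestamp */

    /* Decide whether a freshly read timestamp may be reported: an
     * unchanged value is presumed to be the previous capture, and a
     * clear valid bit means nothing was captured at all.
     */
    static bool tstamp_usable(uint64_t raw, uint64_t cached, bool verify_cached)
    {
        if (verify_cached && raw == cached)
            return false;   /* same as last read: skip for now */
        return (raw & TS_VALID) != 0;
    }

    int main(void)
    {
        return tstamp_usable(0x1234 | TS_VALID, 0x1234 | TS_VALID, true);
    }

E822 parts set verify_cached to 0 and skip the comparison entirely, because the tstamp_ready bitmap already guarantees that a set bit refers to a fresh capture.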
@@ -105,30 +110,35 @@ struct ice_tx_tstamp {
 
 /**
  * struct ice_ptp_tx - Tracking structure for all Tx timestamp requests on a port
- * @lock: lock to prevent concurrent write to in_use bitmap
+ * @lock: lock to prevent concurrent access to fields of this struct
  * @tstamps: array of len to store outstanding requests
  * @in_use: bitmap of len to indicate which slots are in use
- * @quad: which quad the timestamps are captured in
- * @quad_offset: offset into timestamp block of the quad to get the real index
+ * @stale: bitmap of len to indicate slots which have stale timestamps
+ * @block: which memory block (quad or port) the timestamps are captured in
+ * @offset: offset into timestamp block to get the real index
  * @len: length of the tstamps and in_use fields.
  * @init: if true, the tracker is initialized;
  * @calibrating: if true, the PHY is calibrating the Tx offset. During this
  *               window, timestamps are temporarily disabled.
+ * @verify_cached: if true, verify new timestamp differs from last read value
  */
 struct ice_ptp_tx {
        spinlock_t lock; /* lock protecting in_use bitmap */
        struct ice_tx_tstamp *tstamps;
        unsigned long *in_use;
-       u8 quad;
-       u8 quad_offset;
+       unsigned long *stale;
+       u8 block;
+       u8 offset;
        u8 len;
-       u8 init;
-       u8 calibrating;
+       u8 init : 1;
+       u8 calibrating : 1;
+       u8 verify_cached : 1;
 };
 
 /* Quad and port information for initializing timestamp blocks */
 #define INDEX_PER_QUAD                 64
-#define INDEX_PER_PORT                 (INDEX_PER_QUAD / ICE_PORTS_PER_QUAD)
+#define INDEX_PER_PORT_E822            16
+#define INDEX_PER_PORT_E810            64
 
 /**
  * struct ice_ptp_port - data used to initialize an external port for PTP
@@ -256,7 +266,7 @@ void ice_ptp_reset(struct ice_pf *pf);
 void ice_ptp_prepare_for_reset(struct ice_pf *pf);
 void ice_ptp_init(struct ice_pf *pf);
 void ice_ptp_release(struct ice_pf *pf);
-int ice_ptp_link_change(struct ice_pf *pf, u8 port, bool linkup);
+void ice_ptp_link_change(struct ice_pf *pf, u8 port, bool linkup);
 #else /* IS_ENABLED(CONFIG_PTP_1588_CLOCK) */
 static inline int ice_ptp_set_ts_config(struct ice_pf *pf, struct ifreq *ifr)
 {
@@ -291,7 +301,8 @@ static inline void ice_ptp_reset(struct ice_pf *pf) { }
 static inline void ice_ptp_prepare_for_reset(struct ice_pf *pf) { }
 static inline void ice_ptp_init(struct ice_pf *pf) { }
 static inline void ice_ptp_release(struct ice_pf *pf) { }
-static inline int ice_ptp_link_change(struct ice_pf *pf, u8 port, bool linkup)
-{ return 0; }
+static inline void ice_ptp_link_change(struct ice_pf *pf, u8 port, bool linkup)
+{
+}
 #endif /* IS_ENABLED(CONFIG_PTP_1588_CLOCK) */
 #endif /* _ICE_PTP_H_ */
index 1f8dd50..a38614d 100644 (file)
@@ -656,6 +656,32 @@ ice_clear_phy_tstamp_e822(struct ice_hw *hw, u8 quad, u8 idx)
 }
 
 /**
+ * ice_ptp_reset_ts_memory_quad_e822 - Clear all timestamps from the quad block
+ * @hw: pointer to the HW struct
+ * @quad: the quad to read from
+ *
+ * Clear all timestamps from the PHY quad block that is shared between the
+ * internal PHYs on the E822 devices.
+ */
+void ice_ptp_reset_ts_memory_quad_e822(struct ice_hw *hw, u8 quad)
+{
+       ice_write_quad_reg_e822(hw, quad, Q_REG_TS_CTRL, Q_REG_TS_CTRL_M);
+       ice_write_quad_reg_e822(hw, quad, Q_REG_TS_CTRL, ~(u32)Q_REG_TS_CTRL_M);
+}
+
+/**
+ * ice_ptp_reset_ts_memory_e822 - Clear all timestamps from all quad blocks
+ * @hw: pointer to the HW struct
+ */
+static void ice_ptp_reset_ts_memory_e822(struct ice_hw *hw)
+{
+       unsigned int quad;
+
+       for (quad = 0; quad < ICE_MAX_QUAD; quad++)
+               ice_ptp_reset_ts_memory_quad_e822(hw, quad);
+}
+
+/**
  * ice_read_cgu_reg_e822 - Read a CGU register
  * @hw: pointer to the HW struct
  * @addr: Register address to read
@@ -1715,21 +1741,48 @@ ice_calc_fixed_tx_offset_e822(struct ice_hw *hw, enum ice_ptp_link_spd link_spd)
  * adjust Tx timestamps by. This is calculated by combining some known static
  * latency along with the Vernier offset computations done by hardware.
  *
- * This function must be called only after the offset registers are valid,
- * i.e. after the Vernier calibration wait has passed, to ensure that the PHY
- * has measured the offset.
+ * This function will not return successfully until the Tx offset calculations
+ * have been completed, which requires waiting until at least one packet has
+ * been transmitted by the device. It is safe to call this function
+ * periodically until calibration succeeds, as it will only program the offset
+ * once.
  *
  * To avoid overflow, when calculating the offset based on the known static
  * latency values, we use measurements in 1/100th of a nanosecond, and divide
  * the TUs per second up front. This avoids overflow while allowing
  * calculation of the adjustment using integer arithmetic.
+ *
+ * Returns zero on success, -EBUSY if the hardware vernier offset
+ * calibration has not completed, or another error code on failure.
  */
-static int ice_phy_cfg_tx_offset_e822(struct ice_hw *hw, u8 port)
+int ice_phy_cfg_tx_offset_e822(struct ice_hw *hw, u8 port)
 {
        enum ice_ptp_link_spd link_spd;
        enum ice_ptp_fec_mode fec_mode;
        u64 total_offset, val;
        int err;
+       u32 reg;
+
+       /* Nothing to do if we've already programmed the offset */
+       err = ice_read_phy_reg_e822(hw, port, P_REG_TX_OR, &reg);
+       if (err) {
+               ice_debug(hw, ICE_DBG_PTP, "Failed to read TX_OR for port %u, err %d\n",
+                         port, err);
+               return err;
+       }
+
+       if (reg)
+               return 0;
+
+       err = ice_read_phy_reg_e822(hw, port, P_REG_TX_OV_STATUS, &reg);
+       if (err) {
+               ice_debug(hw, ICE_DBG_PTP, "Failed to read TX_OV_STATUS for port %u, err %d\n",
+                         port, err);
+               return err;
+       }
+
+       if (!(reg & P_REG_TX_OV_STATUS_OV_M))
+               return -EBUSY;
 
        err = ice_phy_get_speed_and_fec_e822(hw, port, &link_spd, &fec_mode);
        if (err)
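
The overflow note above is easiest to see with numbers. In the sketch below (user-space, with assumed example values rather than the driver's constants), multiplying timer units per second by a latency in 1/100 ns directly would exceed 64 bits, so the rate is divided down first, which is the "divide the TUs per second up front" trick:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        uint64_t tu_per_sec = 1152921504606846976ULL;   /* assumed: 2^60 */
        uint64_t delay_cns = 25000;     /* assumed: 250 ns in 1/100 ns */

        /* offset = delay_cns * tu_per_sec / 1e11, computed without
         * overflowing the 64-bit intermediate by dividing first.
         */
        uint64_t offset_tu = tu_per_sec / 10000000ULL * delay_cns / 10000ULL;

        printf("offset = %llu TU\n", (unsigned long long)offset_tu);
        return 0;
    }

Dividing first discards at most the remainder of tu_per_sec / 1e7, which is negligible at these magnitudes, while the naive product would wrap well past 2^64.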
@@ -1783,46 +1836,8 @@ static int ice_phy_cfg_tx_offset_e822(struct ice_hw *hw, u8 port)
        if (err)
                return err;
 
-       return 0;
-}
-
-/**
- * ice_phy_cfg_fixed_tx_offset_e822 - Configure Tx offset for bypass mode
- * @hw: pointer to the HW struct
- * @port: the PHY port to configure
- *
- * Calculate and program the fixed Tx offset, and indicate that the offset is
- * ready. This can be used when operating in bypass mode.
- */
-static int
-ice_phy_cfg_fixed_tx_offset_e822(struct ice_hw *hw, u8 port)
-{
-       enum ice_ptp_link_spd link_spd;
-       enum ice_ptp_fec_mode fec_mode;
-       u64 total_offset;
-       int err;
-
-       err = ice_phy_get_speed_and_fec_e822(hw, port, &link_spd, &fec_mode);
-       if (err)
-               return err;
-
-       total_offset = ice_calc_fixed_tx_offset_e822(hw, link_spd);
-
-       /* Program the fixed Tx offset into the P_REG_TOTAL_TX_OFFSET_L
-        * register, then indicate that the Tx offset is ready. After this,
-        * timestamps will be enabled.
-        *
-        * Note that this skips including the more precise offsets generated
-        * by the Vernier calibration.
-        */
-       err = ice_write_64b_phy_reg_e822(hw, port, P_REG_TOTAL_TX_OFFSET_L,
-                                        total_offset);
-       if (err)
-               return err;
-
-       err = ice_write_phy_reg_e822(hw, port, P_REG_TX_OR, 1);
-       if (err)
-               return err;
+       dev_info(ice_hw_to_dev(hw), "Port=%d Tx vernier offset calibration complete\n",
+                port);
 
        return 0;
 }
@@ -2026,6 +2041,11 @@ ice_calc_fixed_rx_offset_e822(struct ice_hw *hw, enum ice_ptp_link_spd link_spd)
  * measurements taken in hardware with some data about known fixed delay as
  * well as adjusting for multi-lane alignment delay.
  *
+ * This function will not return successfully until the Rx offset calculations
+ * have been completed, which requires waiting until at least one packet has
+ * been received by the device. It is safe to call this function periodically
+ * until calibration succeeds, as it will only program the offset once.
+ *
  * This function must be called only after the offset registers are valid,
  * i.e. after the Vernier calibration wait has passed, to ensure that the PHY
  * has measured the offset.
@@ -2034,13 +2054,38 @@ ice_calc_fixed_rx_offset_e822(struct ice_hw *hw, enum ice_ptp_link_spd link_spd)
  * latency values, we use measurements in 1/100th of a nanosecond, and divide
  * the TUs per second up front. This avoids overflow while allowing
  * calculation of the adjustment using integer arithmetic.
+ *
+ * Returns zero on success, -EBUSY if the hardware vernier offset
+ * calibration has not completed, or another error code on failure.
  */
-static int ice_phy_cfg_rx_offset_e822(struct ice_hw *hw, u8 port)
+int ice_phy_cfg_rx_offset_e822(struct ice_hw *hw, u8 port)
 {
        enum ice_ptp_link_spd link_spd;
        enum ice_ptp_fec_mode fec_mode;
        u64 total_offset, pmd, val;
        int err;
+       u32 reg;
+
+       /* Nothing to do if we've already programmed the offset */
+       err = ice_read_phy_reg_e822(hw, port, P_REG_RX_OR, &reg);
+       if (err) {
+               ice_debug(hw, ICE_DBG_PTP, "Failed to read RX_OR for port %u, err %d\n",
+                         port, err);
+               return err;
+       }
+
+       if (reg)
+               return 0;
+
+       err = ice_read_phy_reg_e822(hw, port, P_REG_RX_OV_STATUS, &reg);
+       if (err) {
+               ice_debug(hw, ICE_DBG_PTP, "Failed to read RX_OV_STATUS for port %u, err %d\n",
+                         port, err);
+               return err;
+       }
+
+       if (!(reg & P_REG_RX_OV_STATUS_OV_M))
+               return -EBUSY;
 
        err = ice_phy_get_speed_and_fec_e822(hw, port, &link_spd, &fec_mode);
        if (err)
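
Since both configuration helpers return -EBUSY until hardware has measured an offset, and program the registers only once, a caller can simply retry on a timer, which is what ice_ptp_wait_for_offsets() does via its kworker. A compressed user-space sketch of that retry shape (the stub below fakes calibration finishing on the third poll; it is illustrative only):

    #include <errno.h>
    #include <stdio.h>
    #include <unistd.h>

    /* Stand-in for the configure helper: returns 0 once programmed,
     * -EBUSY while the vernier measurement is still outstanding.
     */
    static int cfg_offset(void)
    {
        static int polls;

        return ++polls < 3 ? -EBUSY : 0;
    }

    int main(void)
    {
        while (cfg_offset() == -EBUSY) {
            puts("offset not ready, retrying");
            usleep(100 * 1000);     /* 100 ms, like the kworker delay */
        }
        puts("offset programmed");
        return 0;
    }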
@@ -2101,46 +2146,8 @@ static int ice_phy_cfg_rx_offset_e822(struct ice_hw *hw, u8 port)
        if (err)
                return err;
 
-       return 0;
-}
-
-/**
- * ice_phy_cfg_fixed_rx_offset_e822 - Configure fixed Rx offset for bypass mode
- * @hw: pointer to the HW struct
- * @port: the PHY port to configure
- *
- * Calculate and program the fixed Rx offset, and indicate that the offset is
- * ready. This can be used when operating in bypass mode.
- */
-static int
-ice_phy_cfg_fixed_rx_offset_e822(struct ice_hw *hw, u8 port)
-{
-       enum ice_ptp_link_spd link_spd;
-       enum ice_ptp_fec_mode fec_mode;
-       u64 total_offset;
-       int err;
-
-       err = ice_phy_get_speed_and_fec_e822(hw, port, &link_spd, &fec_mode);
-       if (err)
-               return err;
-
-       total_offset = ice_calc_fixed_rx_offset_e822(hw, link_spd);
-
-       /* Program the fixed Rx offset into the P_REG_TOTAL_RX_OFFSET_L
-        * register, then indicate that the Rx offset is ready. After this,
-        * timestamps will be enabled.
-        *
-        * Note that this skips including the more precise offsets generated
-        * by Vernier calibration.
-        */
-       err = ice_write_64b_phy_reg_e822(hw, port, P_REG_TOTAL_RX_OFFSET_L,
-                                        total_offset);
-       if (err)
-               return err;
-
-       err = ice_write_phy_reg_e822(hw, port, P_REG_RX_OR, 1);
-       if (err)
-               return err;
+       dev_info(ice_hw_to_dev(hw), "Port=%d Rx vernier offset calibration complete\n",
+                port);
 
        return 0;
 }
@@ -2323,20 +2330,14 @@ ice_stop_phy_timer_e822(struct ice_hw *hw, u8 port, bool soft_reset)
  * ice_start_phy_timer_e822 - Start the PHY clock timer
  * @hw: pointer to the HW struct
  * @port: the PHY port to start
- * @bypass: if true, start the PHY in bypass mode
  *
  * Start the clock of a PHY port. This must be done as part of the flow to
  * re-calibrate Tx and Rx timestamping offsets whenever the clock time is
  * initialized or when link speed changes.
  *
- * Bypass mode enables timestamps immediately without waiting for Vernier
- * calibration to complete. Hardware will still continue taking Vernier
- * measurements on Tx or Rx of packets, but they will not be applied to
- * timestamps. Use ice_phy_exit_bypass_e822 to exit bypass mode once hardware
- * has completed offset calculation.
+ * Hardware will take Vernier measurements on Tx or Rx of packets.
  */
-int
-ice_start_phy_timer_e822(struct ice_hw *hw, u8 port, bool bypass)
+int ice_start_phy_timer_e822(struct ice_hw *hw, u8 port)
 {
        u32 lo, hi, val;
        u64 incval;
@@ -2414,110 +2415,42 @@ ice_start_phy_timer_e822(struct ice_hw *hw, u8 port, bool bypass)
        if (err)
                return err;
 
-       if (bypass) {
-               val |= P_REG_PS_BYPASS_MODE_M;
-               /* Enter BYPASS mode, enabling timestamps immediately. */
-               err = ice_write_phy_reg_e822(hw, port, P_REG_PS, val);
-               if (err)
-                       return err;
-
-               /* Program the fixed Tx offset */
-               err = ice_phy_cfg_fixed_tx_offset_e822(hw, port);
-               if (err)
-                       return err;
-
-               /* Program the fixed Rx offset */
-               err = ice_phy_cfg_fixed_rx_offset_e822(hw, port);
-               if (err)
-                       return err;
-       }
-
        ice_debug(hw, ICE_DBG_PTP, "Enabled clock on PHY port %u\n", port);
 
        return 0;
 }
 
 /**
- * ice_phy_exit_bypass_e822 - Exit bypass mode, after vernier calculations
+ * ice_get_phy_tx_tstamp_ready_e822 - Read Tx memory status register
  * @hw: pointer to the HW struct
- * @port: the PHY port to configure
- *
- * After hardware finishes vernier calculations for the Tx and Rx offset, this
- * function can be used to exit bypass mode by updating the total Tx and Rx
- * offsets, and then disabling bypass. This will enable hardware to include
- * the more precise offset calibrations, increasing precision of the generated
- * timestamps.
+ * @quad: the timestamp quad to read from
+ * @tstamp_ready: contents of the Tx memory status register
  *
- * This cannot be done until hardware has measured the offsets, which requires
- * waiting until at least one packet has been sent and received by the device.
+ * Read the Q_REG_TX_MEMORY_STATUS register indicating which timestamps in
+ * the PHY are ready. A set bit means the corresponding timestamp is valid and
+ * ready to be captured from the PHY timestamp block.
  */
-int ice_phy_exit_bypass_e822(struct ice_hw *hw, u8 port)
+static int
+ice_get_phy_tx_tstamp_ready_e822(struct ice_hw *hw, u8 quad, u64 *tstamp_ready)
 {
+       u32 hi, lo;
        int err;
-       u32 val;
-
-       err = ice_read_phy_reg_e822(hw, port, P_REG_TX_OV_STATUS, &val);
-       if (err) {
-               ice_debug(hw, ICE_DBG_PTP, "Failed to read TX_OV_STATUS for port %u, err %d\n",
-                         port, err);
-               return err;
-       }
-
-       if (!(val & P_REG_TX_OV_STATUS_OV_M)) {
-               ice_debug(hw, ICE_DBG_PTP, "Tx offset is not yet valid for port %u\n",
-                         port);
-               return -EBUSY;
-       }
-
-       err = ice_read_phy_reg_e822(hw, port, P_REG_RX_OV_STATUS, &val);
-       if (err) {
-               ice_debug(hw, ICE_DBG_PTP, "Failed to read RX_OV_STATUS for port %u, err %d\n",
-                         port, err);
-               return err;
-       }
-
-       if (!(val & P_REG_TX_OV_STATUS_OV_M)) {
-               ice_debug(hw, ICE_DBG_PTP, "Rx offset is not yet valid for port %u\n",
-                         port);
-               return -EBUSY;
-       }
-
-       err = ice_phy_cfg_tx_offset_e822(hw, port);
-       if (err) {
-               ice_debug(hw, ICE_DBG_PTP, "Failed to program total Tx offset for port %u, err %d\n",
-                         port, err);
-               return err;
-       }
-
-       err = ice_phy_cfg_rx_offset_e822(hw, port);
-       if (err) {
-               ice_debug(hw, ICE_DBG_PTP, "Failed to program total Rx offset for port %u, err %d\n",
-                         port, err);
-               return err;
-       }
 
-       /* Exit bypass mode now that the offset has been updated */
-       err = ice_read_phy_reg_e822(hw, port, P_REG_PS, &val);
+       err = ice_read_quad_reg_e822(hw, quad, Q_REG_TX_MEMORY_STATUS_U, &hi);
        if (err) {
-               ice_debug(hw, ICE_DBG_PTP, "Failed to read P_REG_PS for port %u, err %d\n",
-                         port, err);
+               ice_debug(hw, ICE_DBG_PTP, "Failed to read TX_MEMORY_STATUS_U for quad %u, err %d\n",
+                         quad, err);
                return err;
        }
 
-       if (!(val & P_REG_PS_BYPASS_MODE_M))
-               ice_debug(hw, ICE_DBG_PTP, "Port %u not in bypass mode\n",
-                         port);
-
-       val &= ~P_REG_PS_BYPASS_MODE_M;
-       err = ice_write_phy_reg_e822(hw, port, P_REG_PS, val);
+       err = ice_read_quad_reg_e822(hw, quad, Q_REG_TX_MEMORY_STATUS_L, &lo);
        if (err) {
-               ice_debug(hw, ICE_DBG_PTP, "Failed to disable bypass for port %u, err %d\n",
-                         port, err);
+               ice_debug(hw, ICE_DBG_PTP, "Failed to read TX_MEMORY_STATUS_L for quad %u, err %d\n",
+                         quad, err);
                return err;
        }
 
-       dev_info(ice_hw_to_dev(hw), "Exiting bypass mode on PHY port %u\n",
-                port);
+       *tstamp_ready = (u64)hi << 32 | (u64)lo;
 
        return 0;
 }
@@ -3196,6 +3129,22 @@ int ice_clear_phy_tstamp(struct ice_hw *hw, u8 block, u8 idx)
                return ice_clear_phy_tstamp_e822(hw, block, idx);
 }
 
+/**
+ * ice_get_phy_tx_tstamp_ready_e810 - Read Tx memory status register
+ * @hw: pointer to the HW struct
+ * @port: the PHY port to read
+ * @tstamp_ready: contents of the Tx memory status register
+ *
+ * E810 devices do not use a Tx memory status register. Instead, simply
+ * indicate that all timestamps are currently ready.
+ */
+static int
+ice_get_phy_tx_tstamp_ready_e810(struct ice_hw *hw, u8 port, u64 *tstamp_ready)
+{
+       *tstamp_ready = 0xFFFFFFFFFFFFFFFF;
+       return 0;
+}
+
 /* E810T SMA functions
  *
  * The following functions operate specifically on E810T hardware and are used
@@ -3379,6 +3328,18 @@ bool ice_is_pca9575_present(struct ice_hw *hw)
 }
 
 /**
+ * ice_ptp_reset_ts_memory - Reset timestamp memory for all blocks
+ * @hw: pointer to the HW struct
+ */
+void ice_ptp_reset_ts_memory(struct ice_hw *hw)
+{
+       if (ice_is_e810(hw))
+               return;
+
+       ice_ptp_reset_ts_memory_e822(hw);
+}
+
+/**
  * ice_ptp_init_phc - Initialize PTP hardware clock
  * @hw: pointer to the HW struct
  *
@@ -3399,3 +3360,24 @@ int ice_ptp_init_phc(struct ice_hw *hw)
        else
                return ice_ptp_init_phc_e822(hw);
 }
+
+/**
+ * ice_get_phy_tx_tstamp_ready - Read PHY Tx memory status indication
+ * @hw: pointer to the HW struct
+ * @block: the timestamp block to check
+ * @tstamp_ready: storage for the PHY Tx memory status information
+ *
+ * Check the PHY for Tx timestamp memory status. This reports a 64 bit value
+ * which indicates which timestamps in the block may be captured. A set bit
+ * means the timestamp can be read. An unset bit means the timestamp is not
+ * ready and software should avoid reading the register.
+ */
+int ice_get_phy_tx_tstamp_ready(struct ice_hw *hw, u8 block, u64 *tstamp_ready)
+{
+       if (ice_is_e810(hw))
+               return ice_get_phy_tx_tstamp_ready_e810(hw, block,
+                                                       tstamp_ready);
+       else
+               return ice_get_phy_tx_tstamp_ready_e822(hw, block,
+                                                       tstamp_ready);
+}
index 2bda64c..3b68cb9 100644 (file)
@@ -133,7 +133,9 @@ int ice_ptp_write_incval_locked(struct ice_hw *hw, u64 incval);
 int ice_ptp_adj_clock(struct ice_hw *hw, s32 adj);
 int ice_read_phy_tstamp(struct ice_hw *hw, u8 block, u8 idx, u64 *tstamp);
 int ice_clear_phy_tstamp(struct ice_hw *hw, u8 block, u8 idx);
+void ice_ptp_reset_ts_memory(struct ice_hw *hw);
 int ice_ptp_init_phc(struct ice_hw *hw);
+int ice_get_phy_tx_tstamp_ready(struct ice_hw *hw, u8 block, u64 *tstamp_ready);
 
 /* E822 family functions */
 int ice_read_phy_reg_e822(struct ice_hw *hw, u8 port, u16 offset, u32 *val);
@@ -141,6 +143,7 @@ int ice_write_phy_reg_e822(struct ice_hw *hw, u8 port, u16 offset, u32 val);
 int ice_read_quad_reg_e822(struct ice_hw *hw, u8 quad, u16 offset, u32 *val);
 int ice_write_quad_reg_e822(struct ice_hw *hw, u8 quad, u16 offset, u32 val);
 int ice_ptp_prep_port_adj_e822(struct ice_hw *hw, u8 port, s64 time);
+void ice_ptp_reset_ts_memory_quad_e822(struct ice_hw *hw, u8 quad);
 
 /**
  * ice_e822_time_ref - Get the current TIME_REF from capabilities
@@ -184,8 +187,9 @@ static inline u64 ice_e822_pps_delay(enum ice_time_ref_freq time_ref)
 
 /* E822 Vernier calibration functions */
 int ice_stop_phy_timer_e822(struct ice_hw *hw, u8 port, bool soft_reset);
-int ice_start_phy_timer_e822(struct ice_hw *hw, u8 port, bool bypass);
-int ice_phy_exit_bypass_e822(struct ice_hw *hw, u8 port);
+int ice_start_phy_timer_e822(struct ice_hw *hw, u8 port);
+int ice_phy_cfg_tx_offset_e822(struct ice_hw *hw, u8 port);
+int ice_phy_cfg_rx_offset_e822(struct ice_hw *hw, u8 port);
 
 /* E810 family functions */
 int ice_ptp_init_phy_e810(struct ice_hw *hw);
index 36acec8..7d60da1 100644 (file)
@@ -1413,6 +1413,8 @@ static int igb_intr_test(struct igb_adapter *adapter, u64 *data)
                        *data = 1;
                        return -1;
                }
+               wr32(E1000_IVAR_MISC, E1000_IVAR_VALID << 8);
+               wr32(E1000_EIMS, BIT(0));
        } else if (adapter->flags & IGB_FLAG_HAS_MSI) {
                shared_int = false;
                if (request_irq(irq,
index 774de63..53a969e 100644 (file)
@@ -585,6 +585,11 @@ static int ixgbe_ipsec_add_sa(struct xfrm_state *xs)
                return -EINVAL;
        }
 
+       if (xs->xso.type != XFRM_DEV_OFFLOAD_CRYPTO) {
+               netdev_err(dev, "Unsupported ipsec offload type\n");
+               return -EINVAL;
+       }
+
        if (xs->xso.dir == XFRM_DEV_OFFLOAD_IN) {
                struct rx_sa rsa;
 
index 9984ebc..c1cf540 100644 (file)
@@ -280,6 +280,11 @@ static int ixgbevf_ipsec_add_sa(struct xfrm_state *xs)
                return -EINVAL;
        }
 
+       if (xs->xso.type != XFRM_DEV_OFFLOAD_CRYPTO) {
+               netdev_err(dev, "Unsupported ipsec offload type\n");
+               return -EINVAL;
+       }
+
        if (xs->xso.dir == XFRM_DEV_OFFLOAD_IN) {
                struct rx_sa rsa;
 
index c2cb98d..f8925ca 100644 (file)
@@ -4270,7 +4270,7 @@ static void mvneta_percpu_elect(struct mvneta_port *pp)
        /* Use the cpu associated to the rxq when it is online, in all
         * the other cases, use the cpu 0 which can't be offline.
         */
-       if (cpu_online(pp->rxq_def))
+       if (pp->rxq_def < nr_cpu_ids && cpu_online(pp->rxq_def))
                elected_cpu = pp->rxq_def;
 
        max_cpu = num_present_cpus();
index c8724bf..b2b71fe 100644 (file)
@@ -64,6 +64,7 @@ static int cgx_fwi_link_change(struct cgx *cgx, int lmac_id, bool en);
 static const struct pci_device_id cgx_id_table[] = {
        { PCI_DEVICE(PCI_VENDOR_ID_CAVIUM, PCI_DEVID_OCTEONTX2_CGX) },
        { PCI_DEVICE(PCI_VENDOR_ID_CAVIUM, PCI_DEVID_CN10K_RPM) },
+       { PCI_DEVICE(PCI_VENDOR_ID_CAVIUM, PCI_DEVID_CN10KB_RPM) },
        { 0, }  /* end of table */
 };
 
@@ -73,12 +74,13 @@ static bool is_dev_rpm(void *cgxd)
 {
        struct cgx *cgx = cgxd;
 
-       return (cgx->pdev->device == PCI_DEVID_CN10K_RPM);
+       return (cgx->pdev->device == PCI_DEVID_CN10K_RPM) ||
+              (cgx->pdev->device == PCI_DEVID_CN10KB_RPM);
 }
 
 bool is_lmac_valid(struct cgx *cgx, int lmac_id)
 {
-       if (!cgx || lmac_id < 0 || lmac_id >= MAX_LMAC_PER_CGX)
+       if (!cgx || lmac_id < 0 || lmac_id >= cgx->max_lmac_per_mac)
                return false;
        return test_bit(lmac_id, &cgx->lmac_bmap);
 }
@@ -90,7 +92,7 @@ static int get_sequence_id_of_lmac(struct cgx *cgx, int lmac_id)
 {
        int tmp, id = 0;
 
-       for_each_set_bit(tmp, &cgx->lmac_bmap, MAX_LMAC_PER_CGX) {
+       for_each_set_bit(tmp, &cgx->lmac_bmap, cgx->max_lmac_per_mac) {
                if (tmp == lmac_id)
                        break;
                id++;
@@ -121,7 +123,7 @@ u64 cgx_read(struct cgx *cgx, u64 lmac, u64 offset)
 
 struct lmac *lmac_pdata(u8 lmac_id, struct cgx *cgx)
 {
-       if (!cgx || lmac_id >= MAX_LMAC_PER_CGX)
+       if (!cgx || lmac_id >= cgx->max_lmac_per_mac)
                return NULL;
 
        return cgx->lmac_idmap[lmac_id];
@@ -485,7 +487,7 @@ int cgx_set_pkind(void *cgxd, u8 lmac_id, int pkind)
        if (!is_lmac_valid(cgx, lmac_id))
                return -ENODEV;
 
-       cgx_write(cgx, lmac_id, CGXX_CMRX_RX_ID_MAP, (pkind & 0x3F));
+       cgx_write(cgx, lmac_id, cgx->mac_ops->rxid_map_offset, (pkind & 0x3F));
        return 0;
 }
 
@@ -740,6 +742,10 @@ int cgx_get_fec_stats(void *cgxd, int lmac_id, struct cgx_fec_stats_rsp *rsp)
 
        if (!cgx || lmac_id >= cgx->lmac_count)
                return -ENODEV;
+
+       if (cgx->lmac_idmap[lmac_id]->link_info.fec == OTX2_FEC_NONE)
+               return 0;
+
        fec_stats_count =
                cgx_set_fec_stats_count(&cgx->lmac_idmap[lmac_id]->link_info);
        if (cgx->lmac_idmap[lmac_id]->link_info.fec == OTX2_FEC_BASER) {
@@ -1224,7 +1230,7 @@ static inline void link_status_user_format(u64 lstat,
        linfo->speed = cgx_speed_mbps[FIELD_GET(RESP_LINKSTAT_SPEED, lstat)];
        linfo->an = FIELD_GET(RESP_LINKSTAT_AN, lstat);
        linfo->fec = FIELD_GET(RESP_LINKSTAT_FEC, lstat);
-       linfo->lmac_type_id = cgx_get_lmac_type(cgx, lmac_id);
+       linfo->lmac_type_id = FIELD_GET(RESP_LINKSTAT_LMAC_TYPE, lstat);
        lmac_string = cgx_lmactype_string[linfo->lmac_type_id];
        strncpy(linfo->lmac_type, lmac_string, LMACTYPE_STR_LEN - 1);
 }
@@ -1395,7 +1401,7 @@ int cgx_get_fwdata_base(u64 *base)
        if (!cgx)
                return -ENXIO;
 
-       first_lmac = find_first_bit(&cgx->lmac_bmap, MAX_LMAC_PER_CGX);
+       first_lmac = find_first_bit(&cgx->lmac_bmap, cgx->max_lmac_per_mac);
        req = FIELD_SET(CMDREG_ID, CGX_CMD_GET_FWD_BASE, req);
        err = cgx_fwi_cmd_generic(req, &resp, cgx, first_lmac);
        if (!err)
@@ -1484,7 +1490,7 @@ static int cgx_fwi_link_change(struct cgx *cgx, int lmac_id, bool enable)
 
 static inline int cgx_fwi_read_version(u64 *resp, struct cgx *cgx)
 {
-       int first_lmac = find_first_bit(&cgx->lmac_bmap, MAX_LMAC_PER_CGX);
+       int first_lmac = find_first_bit(&cgx->lmac_bmap, cgx->max_lmac_per_mac);
        u64 req = 0;
 
        req = FIELD_SET(CMDREG_ID, CGX_CMD_GET_FW_VER, req);
@@ -1522,7 +1528,7 @@ static void cgx_lmac_linkup_work(struct work_struct *work)
        int i, err;
 
        /* Do Link up for all the enabled lmacs */
-       for_each_set_bit(i, &cgx->lmac_bmap, MAX_LMAC_PER_CGX) {
+       for_each_set_bit(i, &cgx->lmac_bmap, cgx->max_lmac_per_mac) {
                err = cgx_fwi_link_change(cgx, i, true);
                if (err)
                        dev_info(dev, "cgx port %d:%d Link up command failed\n",
@@ -1542,14 +1548,6 @@ int cgx_lmac_linkup_start(void *cgxd)
        return 0;
 }
 
-static void cgx_lmac_get_fifolen(struct cgx *cgx)
-{
-       u64 cfg;
-
-       cfg = cgx_read(cgx, 0, CGX_CONST);
-       cgx->mac_ops->fifo_len = FIELD_GET(CGX_CONST_RXFIFO_SIZE, cfg);
-}
-
 static int cgx_configure_interrupt(struct cgx *cgx, struct lmac *lmac,
                                   int cnt, bool req_free)
 {
@@ -1604,17 +1602,20 @@ static int cgx_lmac_init(struct cgx *cgx)
        u64 lmac_list;
        int i, err;
 
-       cgx_lmac_get_fifolen(cgx);
-
-       cgx->lmac_count = cgx->mac_ops->get_nr_lmacs(cgx);
        /* lmac_list specifies which lmacs are enabled:
         * when bit n is set to 1, LMAC[n] is enabled
         */
-       if (cgx->mac_ops->non_contiguous_serdes_lane)
-               lmac_list = cgx_read(cgx, 0, CGXX_CMRX_RX_LMACS) & 0xFULL;
+       if (cgx->mac_ops->non_contiguous_serdes_lane) {
+               if (is_dev_rpm2(cgx))
+                       lmac_list =
+                               cgx_read(cgx, 0, RPM2_CMRX_RX_LMACS) & 0xFFULL;
+               else
+                       lmac_list =
+                               cgx_read(cgx, 0, CGXX_CMRX_RX_LMACS) & 0xFULL;
+       }
 
-       if (cgx->lmac_count > MAX_LMAC_PER_CGX)
-               cgx->lmac_count = MAX_LMAC_PER_CGX;
+       if (cgx->lmac_count > cgx->max_lmac_per_mac)
+               cgx->lmac_count = cgx->max_lmac_per_mac;
 
        for (i = 0; i < cgx->lmac_count; i++) {
                lmac = kzalloc(sizeof(struct lmac), GFP_KERNEL);
@@ -1635,7 +1636,9 @@ static int cgx_lmac_init(struct cgx *cgx)
 
                lmac->cgx = cgx;
                lmac->mac_to_index_bmap.max =
-                               MAX_DMAC_ENTRIES_PER_CGX / cgx->lmac_count;
+                               cgx->mac_ops->dmac_filter_count /
+                               cgx->lmac_count;
+
                err = rvu_alloc_bitmap(&lmac->mac_to_index_bmap);
                if (err)
                        goto err_name_free;
@@ -1692,7 +1695,7 @@ static int cgx_lmac_exit(struct cgx *cgx)
        }
 
        /* Free all lmac related resources */
-       for_each_set_bit(i, &cgx->lmac_bmap, MAX_LMAC_PER_CGX) {
+       for_each_set_bit(i, &cgx->lmac_bmap, cgx->max_lmac_per_mac) {
                lmac = cgx->lmac_idmap[i];
                if (!lmac)
                        continue;
@@ -1708,6 +1711,12 @@ static int cgx_lmac_exit(struct cgx *cgx)
 
 static void cgx_populate_features(struct cgx *cgx)
 {
+       u64 cfg;
+
+       cfg = cgx_read(cgx, 0, CGX_CONST);
+       cgx->mac_ops->fifo_len = FIELD_GET(CGX_CONST_RXFIFO_SIZE, cfg);
+       cgx->max_lmac_per_mac = FIELD_GET(CGX_CONST_MAX_LMACS, cfg);
+
        if (is_dev_rpm(cgx))
                cgx->hw_features = (RVU_LMAC_FEAT_DMACF | RVU_MAC_RPM |
                                    RVU_LMAC_FEAT_FC | RVU_LMAC_FEAT_PTP);
@@ -1716,6 +1725,15 @@ static void cgx_populate_features(struct cgx *cgx)
                                    RVU_LMAC_FEAT_PTP | RVU_LMAC_FEAT_DMACF);
 }
 
+static u8 cgx_get_rxid_mapoffset(struct cgx *cgx)
+{
+       if (cgx->pdev->subsystem_device == PCI_SUBSYS_DEVID_CNF10KB_RPM ||
+           is_dev_rpm2(cgx))
+               return 0x80;
+       else
+               return 0x60;
+}
+
 static struct mac_ops  cgx_mac_ops    = {
        .name           =       "cgx",
        .csr_offset     =       0,
@@ -1728,12 +1746,14 @@ static struct mac_ops   cgx_mac_ops    = {
        .non_contiguous_serdes_lane = false,
        .rx_stats_cnt   =       9,
        .tx_stats_cnt   =       18,
+       .dmac_filter_count =    32,
        .get_nr_lmacs   =       cgx_get_nr_lmacs,
        .get_lmac_type  =       cgx_get_lmac_type,
        .lmac_fifo_len  =       cgx_get_lmac_fifo_len,
        .mac_lmac_intl_lbk =    cgx_lmac_internal_loopback,
        .mac_get_rx_stats  =    cgx_get_rx_stats,
        .mac_get_tx_stats  =    cgx_get_tx_stats,
+       .get_fec_stats     =    cgx_get_fec_stats,
        .mac_enadis_rx_pause_fwding =   cgx_lmac_enadis_rx_pause_fwding,
        .mac_get_pause_frm_status =     cgx_lmac_get_pause_frm_status,
        .mac_enadis_pause_frm =         cgx_lmac_enadis_pause_frm,
@@ -1759,11 +1779,13 @@ static int cgx_probe(struct pci_dev *pdev, const struct pci_device_id *id)
        pci_set_drvdata(pdev, cgx);
 
        /* Use mac_ops to get MAC specific features */
-       if (pdev->device == PCI_DEVID_CN10K_RPM)
-               cgx->mac_ops = rpm_get_mac_ops();
+       if (is_dev_rpm(cgx))
+               cgx->mac_ops = rpm_get_mac_ops(cgx);
        else
                cgx->mac_ops = &cgx_mac_ops;
 
+       cgx->mac_ops->rxid_map_offset = cgx_get_rxid_mapoffset(cgx);
+
        err = pci_enable_device(pdev);
        if (err) {
                dev_err(dev, "Failed to enable PCI device\n");
index 0b06788..fb2d376 100644 (file)
 /* PCI BAR nos */
 #define PCI_CFG_REG_BAR_NUM            0
 
-#define CGX_ID_MASK                    0x7
-#define MAX_LMAC_PER_CGX               4
-#define MAX_DMAC_ENTRIES_PER_CGX       32
-#define CGX_FIFO_LEN                   65536 /* 64K for both Rx & Tx */
-#define CGX_OFFSET(x)                  ((x) * MAX_LMAC_PER_CGX)
+#define CGX_ID_MASK                    0xF
 
 /* Registers */
 #define CGXX_CMRX_CFG                  0x00
@@ -56,7 +52,8 @@
 #define CGXX_SCRATCH0_REG              0x1050
 #define CGXX_SCRATCH1_REG              0x1058
 #define CGX_CONST                      0x2000
-#define CGX_CONST_RXFIFO_SIZE          GENMASK_ULL(23, 0)
+#define CGX_CONST_RXFIFO_SIZE          GENMASK_ULL(55, 32)
+#define CGX_CONST_MAX_LMACS            GENMASK_ULL(31, 24)
 #define CGXX_SPUX_CONTROL1             0x10000
 #define CGXX_SPUX_LNX_FEC_CORR_BLOCKS  0x10700
 #define CGXX_SPUX_LNX_FEC_UNCORR_BLOCKS        0x10800
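This CGX_CONST relayout is what lets the driver discover both the RX FIFO size (bits 55:32) and the per-MAC LMAC count (bits 31:24) at runtime in cgx_populate_features(), replacing the deleted compile-time MAX_LMAC_PER_CGX. A user-space sketch of the extraction, with stand-ins for the kernel's GENMASK_ULL()/FIELD_GET() and a hypothetical register value:

#include <stdint.h>
#include <stdio.h>

/* User-space stand-ins for the kernel's GENMASK_ULL()/FIELD_GET(). */
#define GENMASK_ULL(h, l) (((~0ULL) << (l)) & (~0ULL >> (63 - (h))))
#define FIELD_GET_ULL(mask, val) (((val) & (mask)) / ((mask) & -(mask)))

#define CGX_CONST_RXFIFO_SIZE GENMASK_ULL(55, 32)
#define CGX_CONST_MAX_LMACS   GENMASK_ULL(31, 24)

int main(void)
{
        /* Hypothetical CGX_CONST read: 0x40000-byte FIFO, 8 LMACs. */
        uint64_t cfg = (0x40000ULL << 32) | (8ULL << 24);

        printf("fifo_len=0x%llx max_lmacs=%llu\n",
               (unsigned long long)FIELD_GET_ULL(CGX_CONST_RXFIFO_SIZE, cfg),
               (unsigned long long)FIELD_GET_ULL(CGX_CONST_MAX_LMACS, cfg));
        return 0;
}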
index 52b6016..39aaf0e 100644 (file)
@@ -75,6 +75,11 @@ struct mac_ops {
        /* RPM & CGX differs in number of Receive/transmit stats */
        u8                      rx_stats_cnt;
        u8                      tx_stats_cnt;
+       /* Unlike CN10K, which shares the same CSR offset with CGX,
+        * CNF10KB has a different CSR offset
+        */
+       u64                     rxid_map_offset;
+       u8                      dmac_filter_count;
        /* In case of RPM, get the number of lmacs from RPMX_CMR_RX_LMACS[LMAC_EXIST];
         * the number of set bits in lmac_exist gives the number of lmacs
         */
@@ -121,6 +126,9 @@ struct mac_ops {
        int                     (*mac_get_pfc_frm_cfg)(void *cgxd, int lmac_id,
                                                       u8 *tx_pause, u8 *rx_pause);
 
+       /* FEC stats */
+       int                     (*get_fec_stats)(void *cgxd, int lmac_id,
+                                                struct cgx_fec_stats_rsp *rsp);
 };
 
 struct cgx {
@@ -128,7 +136,10 @@ struct cgx {
        struct pci_dev          *pdev;
        u8                      cgx_id;
        u8                      lmac_count;
-       struct lmac             *lmac_idmap[MAX_LMAC_PER_CGX];
+       /* The number of LMACs per MAC can be 4 or 8 */
+       u8                      max_lmac_per_mac;
+#define MAX_LMAC_COUNT         8
+       struct lmac             *lmac_idmap[MAX_LMAC_COUNT];
        struct                  work_struct cgx_cmd_work;
        struct                  workqueue_struct *cgx_cmd_workq;
        struct list_head        cgx_list;
@@ -150,6 +161,6 @@ struct lmac *lmac_pdata(u8 lmac_id, struct cgx *cgx);
 int cgx_fwi_cmd_send(u64 req, u64 *resp, struct lmac *lmac);
 int cgx_fwi_cmd_generic(u64 req, u64 *resp, struct cgx *cgx, int lmac_id);
 bool is_lmac_valid(struct cgx *cgx, int lmac_id);
-struct mac_ops *rpm_get_mac_ops(void);
+struct mac_ops *rpm_get_mac_ops(struct cgx *cgx);
 
 #endif /* LMAC_COMMON_H */
index a70e115..de0d88d 100644 (file)
@@ -8,7 +8,7 @@
 #include "cgx.h"
 #include "lmac_common.h"
 
-static struct mac_ops  rpm_mac_ops   = {
+static struct mac_ops          rpm_mac_ops   = {
        .name           =       "rpm",
        .csr_offset     =       0x4e00,
        .lmac_offset    =       20,
@@ -20,12 +20,14 @@ static struct mac_ops       rpm_mac_ops   = {
        .non_contiguous_serdes_lane = true,
        .rx_stats_cnt   =       43,
        .tx_stats_cnt   =       34,
+       .dmac_filter_count =    32,
        .get_nr_lmacs   =       rpm_get_nr_lmacs,
        .get_lmac_type  =       rpm_get_lmac_type,
        .lmac_fifo_len  =       rpm_get_lmac_fifo_len,
        .mac_lmac_intl_lbk =    rpm_lmac_internal_loopback,
        .mac_get_rx_stats  =    rpm_get_rx_stats,
        .mac_get_tx_stats  =    rpm_get_tx_stats,
+       .get_fec_stats     =    rpm_get_fec_stats,
        .mac_enadis_rx_pause_fwding =   rpm_lmac_enadis_rx_pause_fwding,
        .mac_get_pause_frm_status =     rpm_lmac_get_pause_frm_status,
        .mac_enadis_pause_frm =         rpm_lmac_enadis_pause_frm,
@@ -37,9 +39,50 @@ static struct mac_ops        rpm_mac_ops   = {
        .mac_get_pfc_frm_cfg   =        rpm_lmac_get_pfc_frm_cfg,
 };
 
-struct mac_ops *rpm_get_mac_ops(void)
+static struct mac_ops          rpm2_mac_ops   = {
+       .name           =       "rpm",
+       .csr_offset     =       RPM2_CSR_OFFSET,
+       .lmac_offset    =       20,
+       .int_register   =       RPM2_CMRX_SW_INT,
+       .int_set_reg    =       RPM2_CMRX_SW_INT_ENA_W1S,
+       .irq_offset     =       1,
+       .int_ena_bit    =       BIT_ULL(0),
+       .lmac_fwi       =       RPM_LMAC_FWI,
+       .non_contiguous_serdes_lane = true,
+       .rx_stats_cnt   =       43,
+       .tx_stats_cnt   =       34,
+       .dmac_filter_count =    64,
+       .get_nr_lmacs   =       rpm2_get_nr_lmacs,
+       .get_lmac_type  =       rpm_get_lmac_type,
+       .lmac_fifo_len  =       rpm2_get_lmac_fifo_len,
+       .mac_lmac_intl_lbk =    rpm_lmac_internal_loopback,
+       .mac_get_rx_stats  =    rpm_get_rx_stats,
+       .mac_get_tx_stats  =    rpm_get_tx_stats,
+       .get_fec_stats     =    rpm_get_fec_stats,
+       .mac_enadis_rx_pause_fwding =   rpm_lmac_enadis_rx_pause_fwding,
+       .mac_get_pause_frm_status =     rpm_lmac_get_pause_frm_status,
+       .mac_enadis_pause_frm =         rpm_lmac_enadis_pause_frm,
+       .mac_pause_frm_config =         rpm_lmac_pause_frm_config,
+       .mac_enadis_ptp_config =        rpm_lmac_ptp_config,
+       .mac_rx_tx_enable =             rpm_lmac_rx_tx_enable,
+       .mac_tx_enable =                rpm_lmac_tx_enable,
+       .pfc_config =                   rpm_lmac_pfc_config,
+       .mac_get_pfc_frm_cfg   =        rpm_lmac_get_pfc_frm_cfg,
+};
+
+bool is_dev_rpm2(void *rpmd)
+{
+       rpm_t *rpm = rpmd;
+
+       return (rpm->pdev->device == PCI_DEVID_CN10KB_RPM);
+}
+
+struct mac_ops *rpm_get_mac_ops(rpm_t *rpm)
 {
-       return &rpm_mac_ops;
+       if (is_dev_rpm2(rpm))
+               return &rpm2_mac_ops;
+       else
+               return &rpm_mac_ops;
 }
 
 static void rpm_write(rpm_t *rpm, u64 lmac, u64 offset, u64 val)
@@ -52,6 +95,16 @@ static u64 rpm_read(rpm_t *rpm, u64 lmac, u64 offset)
        return  cgx_read(rpm, lmac, offset);
 }
 
+/* Read the HW major version to determine whether the RPM
+ * MAC type is 100G or USX
+ */
+static bool is_mac_rpmusx(void *rpmd)
+{
+       rpm_t *rpm = rpmd;
+
+       return rpm_read(rpm, 0, RPMX_CONST1) & 0x700ULL;
+}
+
 int rpm_get_nr_lmacs(void *rpmd)
 {
        rpm_t *rpm = rpmd;
@@ -59,6 +112,13 @@ int rpm_get_nr_lmacs(void *rpmd)
        return hweight8(rpm_read(rpm, 0, CGXX_CMRX_RX_LMACS) & 0xFULL);
 }
 
+int rpm2_get_nr_lmacs(void *rpmd)
+{
+       rpm_t *rpm = rpmd;
+
+       return hweight8(rpm_read(rpm, 0, RPM2_CMRX_RX_LMACS) & 0xFFULL);
+}
+
 int rpm_lmac_tx_enable(void *rpmd, int lmac_id, bool enable)
 {
        rpm_t *rpm = rpmd;
@@ -222,6 +282,46 @@ static void rpm_cfg_pfc_quanta_thresh(rpm_t *rpm, int lmac_id,
        }
 }
 
+static void rpm2_lmac_cfg_bp(rpm_t *rpm, int lmac_id, u8 tx_pause, u8 rx_pause)
+{
+       u64 cfg;
+
+       cfg = rpm_read(rpm, lmac_id, RPM2_CMR_RX_OVR_BP);
+       if (tx_pause) {
+               /* Configure CL0 Pause Quanta & threshold
+                * for 802.3X frames
+                */
+               rpm_cfg_pfc_quanta_thresh(rpm, lmac_id, 1, true);
+               cfg &= ~RPM2_CMR_RX_OVR_BP_EN;
+       } else {
+               /* Disable all Pause Quanta & threshold values */
+               rpm_cfg_pfc_quanta_thresh(rpm, lmac_id, 0xffff, false);
+               cfg |= RPM2_CMR_RX_OVR_BP_EN;
+               cfg &= ~RPM2_CMR_RX_OVR_BP_BP;
+       }
+       rpm_write(rpm, lmac_id, RPM2_CMR_RX_OVR_BP, cfg);
+}
+
+static void rpm_lmac_cfg_bp(rpm_t *rpm, int lmac_id, u8 tx_pause, u8 rx_pause)
+{
+       u64 cfg;
+
+       cfg = rpm_read(rpm, 0, RPMX_CMR_RX_OVR_BP);
+       if (tx_pause) {
+               /* Configure CL0 Pause Quanta & threshold for
+                * 802.3X frames
+                */
+               rpm_cfg_pfc_quanta_thresh(rpm, lmac_id, 1, true);
+               cfg &= ~RPMX_CMR_RX_OVR_BP_EN(lmac_id);
+       } else {
+               /* Disable all Pause Quanta & threshold values */
+               rpm_cfg_pfc_quanta_thresh(rpm, lmac_id, 0xffff, false);
+               cfg |= RPMX_CMR_RX_OVR_BP_EN(lmac_id);
+               cfg &= ~RPMX_CMR_RX_OVR_BP_BP(lmac_id);
+       }
+       rpm_write(rpm, 0, RPMX_CMR_RX_OVR_BP, cfg);
+}
+
 int rpm_lmac_enadis_pause_frm(void *rpmd, int lmac_id, u8 tx_pause,
                              u8 rx_pause)
 {
@@ -243,18 +343,11 @@ int rpm_lmac_enadis_pause_frm(void *rpmd, int lmac_id, u8 tx_pause,
        cfg |= tx_pause ? 0x0 : RPMX_MTI_MAC100X_COMMAND_CONFIG_TX_P_DISABLE;
        rpm_write(rpm, lmac_id, RPMX_MTI_MAC100X_COMMAND_CONFIG, cfg);
 
-       cfg = rpm_read(rpm, 0, RPMX_CMR_RX_OVR_BP);
-       if (tx_pause) {
-               /* Configure CL0 Pause Quanta & threshold for 802.3X frames */
-               rpm_cfg_pfc_quanta_thresh(rpm, lmac_id, 1, true);
-               cfg &= ~RPMX_CMR_RX_OVR_BP_EN(lmac_id);
-       } else {
-               /* Disable all Pause Quanta & threshold values */
-               rpm_cfg_pfc_quanta_thresh(rpm, lmac_id, 0xffff, false);
-               cfg |= RPMX_CMR_RX_OVR_BP_EN(lmac_id);
-               cfg &= ~RPMX_CMR_RX_OVR_BP_BP(lmac_id);
-       }
-       rpm_write(rpm, 0, RPMX_CMR_RX_OVR_BP, cfg);
+       if (is_dev_rpm2(rpm))
+               rpm2_lmac_cfg_bp(rpm, lmac_id, tx_pause, rx_pause);
+       else
+               rpm_lmac_cfg_bp(rpm, lmac_id, tx_pause, rx_pause);
+
        return 0;
 }
 
@@ -278,13 +371,16 @@ void rpm_lmac_pause_frm_config(void *rpmd, int lmac_id, bool enable)
        cfg |= RPMX_MTI_MAC100X_COMMAND_CONFIG_TX_P_DISABLE;
        rpm_write(rpm, lmac_id, RPMX_MTI_MAC100X_COMMAND_CONFIG, cfg);
 
+       /* Enable channel mask for all LMACs */
+       if (is_dev_rpm2(rpm))
+               rpm_write(rpm, lmac_id, RPM2_CMR_CHAN_MSK_OR, 0xffff);
+       else
+               rpm_write(rpm, 0, RPMX_CMR_CHAN_MSK_OR, ~0ULL);
+
        /* Disable all PFC classes */
        cfg = rpm_read(rpm, lmac_id, RPMX_CMRX_PRT_CBFC_CTL);
        cfg = FIELD_SET(RPM_PFC_CLASS_MASK, 0, cfg);
        rpm_write(rpm, lmac_id, RPMX_CMRX_PRT_CBFC_CTL, cfg);
-
-       /* Enable channel mask for all LMACS */
-       rpm_write(rpm, 0, RPMX_CMR_CHAN_MSK_OR, ~0ULL);
 }
 
 int rpm_get_rx_stats(void *rpmd, int lmac_id, int idx, u64 *rx_stat)
@@ -292,7 +388,7 @@ int rpm_get_rx_stats(void *rpmd, int lmac_id, int idx, u64 *rx_stat)
        rpm_t *rpm = rpmd;
        u64 val_lo, val_hi;
 
-       if (!rpm || lmac_id >= rpm->lmac_count)
+       if (!is_lmac_valid(rpm, lmac_id))
                return -ENODEV;
 
        mutex_lock(&rpm->lock);
@@ -320,7 +416,7 @@ int rpm_get_tx_stats(void *rpmd, int lmac_id, int idx, u64 *tx_stat)
        rpm_t *rpm = rpmd;
        u64 val_lo, val_hi;
 
-       if (!rpm || lmac_id >= rpm->lmac_count)
+       if (!is_lmac_valid(rpm, lmac_id))
                return -ENODEV;
 
        mutex_lock(&rpm->lock);
@@ -380,13 +476,71 @@ u32 rpm_get_lmac_fifo_len(void *rpmd, int lmac_id)
        return 0;
 }
 
+static int rpmusx_lmac_internal_loopback(rpm_t *rpm, int lmac_id, bool enable)
+{
+       u64 cfg;
+
+       cfg = rpm_read(rpm, lmac_id, RPM2_USX_PCSX_CONTROL1);
+
+       if (enable)
+               cfg |= RPM2_USX_PCS_LBK;
+       else
+               cfg &= ~RPM2_USX_PCS_LBK;
+       rpm_write(rpm, lmac_id, RPM2_USX_PCSX_CONTROL1, cfg);
+
+       return 0;
+}
+
+u32 rpm2_get_lmac_fifo_len(void *rpmd, int lmac_id)
+{
+       u64 hi_perf_lmac, lmac_info;
+       rpm_t *rpm = rpmd;
+       u8 num_lmacs;
+       u32 fifo_len;
+
+       lmac_info = rpm_read(rpm, 0, RPM2_CMRX_RX_LMACS);
+       /* LMACs are divided into two groups, and each group
+        * gets half of the FIFO:
+        * Group0 covers lmac_id range {0..3},
+        * Group1 covers lmac_id range {4..7}.
+        */
+       fifo_len = rpm->mac_ops->fifo_len / 2;
+
+       if (lmac_id < 4) {
+               num_lmacs = hweight8(lmac_info & 0xF);
+               hi_perf_lmac = (lmac_info >> 8) & 0x3ULL;
+       } else {
+               num_lmacs = hweight8(lmac_info & 0xF0);
+               hi_perf_lmac = (lmac_info >> 10) & 0x3ULL;
+               hi_perf_lmac += 4;
+       }
+
+       switch (num_lmacs) {
+       case 1:
+               return fifo_len;
+       case 2:
+               return fifo_len / 2;
+       case 3:
+               /* The LMAC marked as hi_perf gets half of the FIFO;
+                * the rest get a quarter each
+                */
+               if (lmac_id == hi_perf_lmac)
+                       return fifo_len / 2;
+               return fifo_len / 4;
+       case 4:
+       default:
+               return fifo_len / 4;
+       }
+       return 0;
+}
+
 int rpm_lmac_internal_loopback(void *rpmd, int lmac_id, bool enable)
 {
        rpm_t *rpm = rpmd;
        u8 lmac_type;
        u64 cfg;
 
-       if (!rpm || lmac_id >= rpm->lmac_count)
+       if (!is_lmac_valid(rpm, lmac_id))
                return -ENODEV;
        lmac_type = rpm->mac_ops->get_lmac_type(rpm, lmac_id);
 
@@ -395,6 +549,9 @@ int rpm_lmac_internal_loopback(void *rpmd, int lmac_id, bool enable)
                return 0;
        }
 
+       if (is_dev_rpm2(rpm) && is_mac_rpmusx(rpm))
+               return rpmusx_lmac_internal_loopback(rpm, lmac_id, enable);
+
        cfg = rpm_read(rpm, lmac_id, RPMX_MTI_PCS100X_CONTROL1);
 
        if (enable)
@@ -439,8 +596,8 @@ void rpm_lmac_ptp_config(void *rpmd, int lmac_id, bool enable)
 
 int rpm_lmac_pfc_config(void *rpmd, int lmac_id, u8 tx_pause, u8 rx_pause, u16 pfc_en)
 {
+       u64 cfg, class_en, pfc_class_mask_cfg;
        rpm_t *rpm = rpmd;
-       u64 cfg, class_en;
 
        if (!is_lmac_valid(rpm, lmac_id))
                return -ENODEV;
@@ -476,7 +633,10 @@ int rpm_lmac_pfc_config(void *rpmd, int lmac_id, u8 tx_pause, u8 rx_pause, u16 p
 
        rpm_write(rpm, lmac_id, RPMX_MTI_MAC100X_COMMAND_CONFIG, cfg);
 
-       rpm_write(rpm, lmac_id, RPMX_CMRX_PRT_CBFC_CTL, class_en);
+       pfc_class_mask_cfg = is_dev_rpm2(rpm) ? RPM2_CMRX_PRT_CBFC_CTL :
+                                               RPMX_CMRX_PRT_CBFC_CTL;
+
+       rpm_write(rpm, lmac_id, pfc_class_mask_cfg, class_en);
 
        return 0;
 }
@@ -497,3 +657,59 @@ int  rpm_lmac_get_pfc_frm_cfg(void *rpmd, int lmac_id, u8 *tx_pause, u8 *rx_paus
 
        return 0;
 }
+
+int rpm_get_fec_stats(void *rpmd, int lmac_id, struct cgx_fec_stats_rsp *rsp)
+{
+       u64 val_lo, val_hi;
+       rpm_t *rpm = rpmd;
+       u64 cfg;
+
+       if (!is_lmac_valid(rpm, lmac_id))
+               return -ENODEV;
+
+       if (rpm->lmac_idmap[lmac_id]->link_info.fec == OTX2_FEC_NONE)
+               return 0;
+
+       if (rpm->lmac_idmap[lmac_id]->link_info.fec == OTX2_FEC_BASER) {
+               val_lo = rpm_read(rpm, lmac_id, RPMX_MTI_FCFECX_VL0_CCW_LO);
+               val_hi = rpm_read(rpm, lmac_id, RPMX_MTI_FCFECX_CW_HI);
+               rsp->fec_corr_blks = (val_hi << 16 | val_lo);
+
+               val_lo = rpm_read(rpm, lmac_id, RPMX_MTI_FCFECX_VL0_NCCW_LO);
+               val_hi = rpm_read(rpm, lmac_id, RPMX_MTI_FCFECX_CW_HI);
+               rsp->fec_uncorr_blks = (val_hi << 16 | val_lo);
+
+               /* 50G uses 2 physical SerDes lanes */
+               if (rpm->lmac_idmap[lmac_id]->link_info.lmac_type_id ==
+                   LMAC_MODE_50G_R) {
+                       val_lo = rpm_read(rpm, lmac_id,
+                                         RPMX_MTI_FCFECX_VL1_CCW_LO);
+                       val_hi = rpm_read(rpm, lmac_id,
+                                         RPMX_MTI_FCFECX_CW_HI);
+                       rsp->fec_corr_blks += (val_hi << 16 | val_lo);
+
+                       val_lo = rpm_read(rpm, lmac_id,
+                                         RPMX_MTI_FCFECX_VL1_NCCW_LO);
+                       val_hi = rpm_read(rpm, lmac_id,
+                                         RPMX_MTI_FCFECX_CW_HI);
+                       rsp->fec_uncorr_blks += (val_hi << 16 | val_lo);
+               }
+       } else {
+               /* enable RS-FEC capture */
+               cfg = rpm_read(rpm, 0, RPMX_MTI_STAT_STATN_CONTROL);
+               cfg |= RPMX_RSFEC_RX_CAPTURE | BIT(lmac_id);
+               rpm_write(rpm, 0, RPMX_MTI_STAT_STATN_CONTROL, cfg);
+
+               val_lo = rpm_read(rpm, 0,
+                                 RPMX_MTI_RSFEC_STAT_COUNTER_CAPTURE_2);
+               val_hi = rpm_read(rpm, 0, RPMX_MTI_STAT_DATA_HI_CDC);
+               rsp->fec_corr_blks = (val_hi << 32 | val_lo);
+
+               val_lo = rpm_read(rpm, 0,
+                                 RPMX_MTI_RSFEC_STAT_COUNTER_CAPTURE_3);
+               val_hi = rpm_read(rpm, 0, RPMX_MTI_STAT_DATA_HI_CDC);
+               rsp->fec_uncorr_blks = (val_hi << 32 | val_lo);
+       }
+
+       return 0;
+}
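The FC-FEC block counters are split across a 16-bit low register and a shared high register, while the RS-FEC capture path pairs a 32-bit low half with RPMX_MTI_STAT_DATA_HI_CDC; hence the two different shift widths when the halves are joined. A small sketch (the register widths are inferred from the shifts in the code, not from a datasheet):

#include <stdint.h>
#include <stdio.h>

static uint64_t assemble(uint64_t lo, uint64_t hi, unsigned int shift)
{
        /* << binds tighter than |, so this is (hi << shift) | lo. */
        return hi << shift | lo;
}

int main(void)
{
        printf("fcfec=0x%llx rsfec=0x%llx\n",
               (unsigned long long)assemble(0xBEEF, 0x1, 16),
               (unsigned long long)assemble(0xDEADBEEF, 0x1, 32));
        return 0;
}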
index 77f2ef9..22147b4 100644 (file)
 
 /* PCI device IDs */
 #define PCI_DEVID_CN10K_RPM            0xA060
+#define PCI_SUBSYS_DEVID_CNF10KB_RPM   0xBC00
+#define PCI_DEVID_CN10KB_RPM           0xA09F
 
 /* Registers */
 #define RPMX_CMRX_CFG                  0x00
 #define RPMX_RX_TS_PREPEND              BIT_ULL(22)
 #define RPMX_TX_PTP_1S_SUPPORT          BIT_ULL(17)
+#define RPMX_CMRX_RX_ID_MAP            0x80
 #define RPMX_CMRX_SW_INT                0x180
 #define RPMX_CMRX_SW_INT_W1S            0x188
 #define RPMX_CMRX_SW_INT_ENA_W1S        0x198
 #define RPMX_CMRX_LINK_CFG             0x1070
 #define RPMX_MTI_PCS100X_CONTROL1       0x20000
-#define RPMX_MTI_LPCSX_CONTROL1         0x30000
 #define RPMX_MTI_PCS_LBK                BIT_ULL(14)
 #define RPMX_MTI_LPCSX_CONTROL(id)     (0x30000 | ((id) * 0x100))
 
 #define RPMX_MTI_MAC100X_XIF_MODE                      0x8100
 #define RPMX_ONESTEP_ENABLE                            BIT_ULL(5)
 #define RPMX_TS_BINARY_MODE                            BIT_ULL(11)
+#define RPMX_CONST1                                    0x2008
+
+/* FEC stats */
+#define RPMX_MTI_STAT_STATN_CONTROL                    0x10018
+#define RPMX_MTI_STAT_DATA_HI_CDC                      0x10038
+#define RPMX_RSFEC_RX_CAPTURE                          BIT_ULL(27)
+#define RPMX_MTI_RSFEC_STAT_COUNTER_CAPTURE_2          0x40050
+#define RPMX_MTI_RSFEC_STAT_COUNTER_CAPTURE_3          0x40058
+#define RPMX_MTI_FCFECX_VL0_CCW_LO                     0x38618
+#define RPMX_MTI_FCFECX_VL0_NCCW_LO                    0x38620
+#define RPMX_MTI_FCFECX_VL1_CCW_LO                     0x38628
+#define RPMX_MTI_FCFECX_VL1_NCCW_LO                    0x38630
+#define RPMX_MTI_FCFECX_CW_HI                          0x38638
+
+/* CN10KB CSR Declaration */
+#define  RPM2_CMRX_SW_INT                              0x1b0
+#define  RPM2_CMRX_SW_INT_ENA_W1S                      0x1b8
+#define  RPM2_CMR_CHAN_MSK_OR                          0x3120
+#define  RPM2_CMR_RX_OVR_BP_EN                         BIT_ULL(2)
+#define  RPM2_CMR_RX_OVR_BP_BP                         BIT_ULL(1)
+#define  RPM2_CMR_RX_OVR_BP                            0x3130
+#define  RPM2_CSR_OFFSET                               0x3e00
+#define  RPM2_CMRX_PRT_CBFC_CTL                                0x6510
+#define  RPM2_CMRX_RX_LMACS                            0x100
+#define  RPM2_CMRX_RX_LOGL_XON                         0x3100
+#define  RPM2_CMRX_RX_STAT2                            0x3010
+#define  RPM2_USX_PCSX_CONTROL1                                0x80000
+#define  RPM2_USX_PCS_LBK                              BIT_ULL(14)
 
 /* Function Declarations */
 int rpm_get_nr_lmacs(void *rpmd);
 u8 rpm_get_lmac_type(void *rpmd, int lmac_id);
 u32 rpm_get_lmac_fifo_len(void *rpmd, int lmac_id);
+u32 rpm2_get_lmac_fifo_len(void *rpmd, int lmac_id);
 int rpm_lmac_internal_loopback(void *rpmd, int lmac_id, bool enable);
 void rpm_lmac_enadis_rx_pause_fwding(void *rpmd, int lmac_id, bool enable);
 int rpm_lmac_get_pause_frm_status(void *cgxd, int lmac_id, u8 *tx_pause,
@@ -97,4 +128,7 @@ int rpm_lmac_pfc_config(void *rpmd, int lmac_id, u8 tx_pause, u8 rx_pause,
                        u16 pfc_en);
 int rpm_lmac_get_pfc_frm_cfg(void *rpmd, int lmac_id, u8 *tx_pause,
                             u8 *rx_pause);
+int rpm2_get_nr_lmacs(void *rpmd);
+bool is_dev_rpm2(void *rpmd);
+int rpm_get_fec_stats(void *cgxd, int lmac_id, struct cgx_fec_stats_rsp *rsp);
 #endif /* RPM_H */
index f718cbd..7f0a647 100644 (file)
@@ -410,9 +410,15 @@ struct rvu_fwdata {
        u32 ptp_ext_tstamp;
 #define FWDATA_RESERVED_MEM 1022
        u64 reserved[FWDATA_RESERVED_MEM];
-#define CGX_MAX         5
+#define CGX_MAX         9
 #define CGX_LMACS_MAX   4
-       struct cgx_lmac_fwdata_s cgx_fw_data[CGX_MAX][CGX_LMACS_MAX];
+#define CGX_LMACS_USX   8
+       union {
+               struct cgx_lmac_fwdata_s
+                       cgx_fw_data[CGX_MAX][CGX_LMACS_MAX];
+               struct cgx_lmac_fwdata_s
+                       cgx_fw_data_usx[CGX_MAX][CGX_LMACS_USX];
+       };
        /* Do not add new fields below this line */
 };
 
@@ -478,7 +484,7 @@ struct rvu {
        u8                      cgx_mapped_pfs;
        u8                      cgx_cnt_max;     /* CGX port count max */
        u8                      *pf2cgxlmac_map; /* pf to cgx_lmac map */
-       u16                     *cgxlmac2pf_map; /* bitmap of mapped pfs for
+       u64                     *cgxlmac2pf_map; /* bitmap of mapped pfs for
                                                  * every cgx lmac port
                                                  */
        unsigned long           pf_notify_bmap; /* Flags for PF notification */
index addc69f..438b212 100644 (file)
@@ -55,8 +55,9 @@ bool is_mac_feature_supported(struct rvu *rvu, int pf, int feature)
        return  (cgx_features_get(cgxd) & feature);
 }
 
+#define CGX_OFFSET(x)                  ((x) * rvu->hw->lmac_per_cgx)
 /* Returns bitmap of mapped PFs */
-static u16 cgxlmac_to_pfmap(struct rvu *rvu, u8 cgx_id, u8 lmac_id)
+static u64 cgxlmac_to_pfmap(struct rvu *rvu, u8 cgx_id, u8 lmac_id)
 {
        return rvu->cgxlmac2pf_map[CGX_OFFSET(cgx_id) + lmac_id];
 }
@@ -71,7 +72,8 @@ int cgxlmac_to_pf(struct rvu *rvu, int cgx_id, int lmac_id)
        if (!pfmap)
                return -ENODEV;
        else
-               return find_first_bit(&pfmap, 16);
+               return find_first_bit(&pfmap,
+                                     rvu->cgx_cnt_max * rvu->hw->lmac_per_cgx);
 }
 
 static u8 cgxlmac_id_to_bmap(u8 cgx_id, u8 lmac_id)
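Widening cgxlmac2pf_map entries from u16 to u64 matters because the search bound becomes cgx_cnt_max * lmac_per_cgx instead of a hard-coded 16; with CGX_MAX raised to 9 and up to 8 LMACs per MAC, PFs above 15 would otherwise never be found. A user-space model of the find_first_bit() lookup:

#include <stdint.h>
#include <stdio.h>

static int find_first_bit64(uint64_t map, int nbits)
{
        /* A single u64 caps the usable range at 64 bits. */
        if (nbits > 64)
                nbits = 64;
        for (int i = 0; i < nbits; i++)
                if (map & (1ULL << i))
                        return i;
        return nbits; /* kernel semantics: size when no bit is set */
}

int main(void)
{
        uint64_t pfmap = 1ULL << 20; /* PF 20: out of reach of a u16 map */

        printf("pf=%d\n", find_first_bit64(pfmap, 9 * 8));
        return 0;
}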
@@ -129,14 +131,14 @@ static int rvu_map_cgx_lmac_pf(struct rvu *rvu)
        if (!cgx_cnt_max)
                return 0;
 
-       if (cgx_cnt_max > 0xF || MAX_LMAC_PER_CGX > 0xF)
+       if (cgx_cnt_max > 0xF || rvu->hw->lmac_per_cgx > 0xF)
                return -EINVAL;
 
        /* Alloc map table
         * An additional entry is required since PF id starts from 1 and
         * hence entry at offset 0 is invalid.
         */
-       size = (cgx_cnt_max * MAX_LMAC_PER_CGX + 1) * sizeof(u8);
+       size = (cgx_cnt_max * rvu->hw->lmac_per_cgx + 1) * sizeof(u8);
        rvu->pf2cgxlmac_map = devm_kmalloc(rvu->dev, size, GFP_KERNEL);
        if (!rvu->pf2cgxlmac_map)
                return -ENOMEM;
@@ -145,9 +147,10 @@ static int rvu_map_cgx_lmac_pf(struct rvu *rvu)
        memset(rvu->pf2cgxlmac_map, 0xFF, size);
 
        /* Reverse map table */
-       rvu->cgxlmac2pf_map = devm_kzalloc(rvu->dev,
-                                 cgx_cnt_max * MAX_LMAC_PER_CGX * sizeof(u16),
-                                 GFP_KERNEL);
+       rvu->cgxlmac2pf_map =
+               devm_kzalloc(rvu->dev,
+                            cgx_cnt_max * rvu->hw->lmac_per_cgx * sizeof(u64),
+                            GFP_KERNEL);
        if (!rvu->cgxlmac2pf_map)
                return -ENOMEM;
 
@@ -156,7 +159,7 @@ static int rvu_map_cgx_lmac_pf(struct rvu *rvu)
                if (!rvu_cgx_pdata(cgx, rvu))
                        continue;
                lmac_bmap = cgx_get_lmac_bmap(rvu_cgx_pdata(cgx, rvu));
-               for_each_set_bit(iter, &lmac_bmap, MAX_LMAC_PER_CGX) {
+               for_each_set_bit(iter, &lmac_bmap, rvu->hw->lmac_per_cgx) {
                        lmac = cgx_get_lmacid(rvu_cgx_pdata(cgx, rvu),
                                              iter);
                        rvu->pf2cgxlmac_map[pf] = cgxlmac_id_to_bmap(cgx, lmac);
@@ -235,7 +238,8 @@ static void cgx_notify_pfs(struct cgx_link_event *event, struct rvu *rvu)
        pfmap = cgxlmac_to_pfmap(rvu, event->cgx_id, event->lmac_id);
 
        do {
-               pfid = find_first_bit(&pfmap, 16);
+               pfid = find_first_bit(&pfmap,
+                                     rvu->cgx_cnt_max * rvu->hw->lmac_per_cgx);
                clear_bit(pfid, &pfmap);
 
                /* check if notification is enabled */
@@ -310,7 +314,7 @@ static int cgx_lmac_event_handler_init(struct rvu *rvu)
                if (!cgxd)
                        continue;
                lmac_bmap = cgx_get_lmac_bmap(cgxd);
-               for_each_set_bit(lmac, &lmac_bmap, MAX_LMAC_PER_CGX) {
+               for_each_set_bit(lmac, &lmac_bmap, rvu->hw->lmac_per_cgx) {
                        err = cgx_lmac_evh_register(&cb, cgxd, lmac);
                        if (err)
                                dev_err(rvu->dev,
@@ -396,7 +400,7 @@ int rvu_cgx_exit(struct rvu *rvu)
                if (!cgxd)
                        continue;
                lmac_bmap = cgx_get_lmac_bmap(cgxd);
-               for_each_set_bit(lmac, &lmac_bmap, MAX_LMAC_PER_CGX)
+               for_each_set_bit(lmac, &lmac_bmap, rvu->hw->lmac_per_cgx)
                        cgx_lmac_evh_unregister(cgxd, lmac);
        }
 
@@ -468,6 +472,7 @@ void rvu_cgx_disable_dmac_entries(struct rvu *rvu, u16 pcifunc)
 {
        int pf = rvu_get_pf(pcifunc);
        int i = 0, lmac_count = 0;
+       struct mac_ops *mac_ops;
        u8 max_dmac_filters;
        u8 cgx_id, lmac_id;
        void *cgx_dev;
@@ -483,7 +488,12 @@ void rvu_cgx_disable_dmac_entries(struct rvu *rvu, u16 pcifunc)
        rvu_get_cgx_lmac_id(rvu->pf2cgxlmac_map[pf], &cgx_id, &lmac_id);
        cgx_dev = cgx_get_pdata(cgx_id);
        lmac_count = cgx_get_lmac_cnt(cgx_dev);
-       max_dmac_filters = MAX_DMAC_ENTRIES_PER_CGX / lmac_count;
+
+       mac_ops = get_mac_ops(cgx_dev);
+       if (!mac_ops)
+               return;
+
+       max_dmac_filters = mac_ops->dmac_filter_count / lmac_count;
 
        for (i = 0; i < max_dmac_filters; i++)
                cgx_lmac_addr_del(cgx_id, lmac_id, i);
@@ -569,6 +579,7 @@ int rvu_mbox_handler_cgx_fec_stats(struct rvu *rvu,
                                   struct cgx_fec_stats_rsp *rsp)
 {
        int pf = rvu_get_pf(req->hdr.pcifunc);
+       struct mac_ops *mac_ops;
        u8 cgx_idx, lmac;
        void *cgxd;
 
@@ -577,7 +588,8 @@ int rvu_mbox_handler_cgx_fec_stats(struct rvu *rvu,
        rvu_get_cgx_lmac_id(rvu->pf2cgxlmac_map[pf], &cgx_idx, &lmac);
 
        cgxd = rvu_cgx_pdata(cgx_idx, rvu);
-       return cgx_get_fec_stats(cgxd, lmac, rsp);
+       mac_ops = get_mac_ops(cgxd);
+       return  mac_ops->get_fec_stats(cgxd, lmac, rsp);
 }
 
 int rvu_mbox_handler_cgx_mac_addr_set(struct rvu *rvu,
@@ -1110,8 +1122,15 @@ int rvu_mbox_handler_cgx_get_aux_link_info(struct rvu *rvu, struct msg_req *req,
 
        rvu_get_cgx_lmac_id(rvu->pf2cgxlmac_map[pf], &cgx_id, &lmac_id);
 
-       memcpy(&rsp->fwdata, &rvu->fwdata->cgx_fw_data[cgx_id][lmac_id],
-              sizeof(struct cgx_lmac_fwdata_s));
+       if (rvu->hw->lmac_per_cgx == CGX_LMACS_USX)
+               memcpy(&rsp->fwdata,
+                      &rvu->fwdata->cgx_fw_data_usx[cgx_id][lmac_id],
+                      sizeof(struct cgx_lmac_fwdata_s));
+       else
+               memcpy(&rsp->fwdata,
+                      &rvu->fwdata->cgx_fw_data[cgx_id][lmac_id],
+                      sizeof(struct cgx_lmac_fwdata_s));
+
        return 0;
 }
 
index 0eb3085..fa280eb 100644 (file)
@@ -2613,7 +2613,7 @@ static void rvu_dbg_cgx_init(struct rvu *rvu)
                rvu->rvu_dbg.cgx = debugfs_create_dir(dname,
                                                      rvu->rvu_dbg.cgx_root);
 
-               for_each_set_bit(lmac_id, &lmac_bmap, MAX_LMAC_PER_CGX) {
+               for_each_set_bit(lmac_id, &lmac_bmap, rvu->hw->lmac_per_cgx) {
                        /* lmac debugfs dir */
                        sprintf(dname, "lmac%d", lmac_id);
                        rvu->rvu_dbg.lmac =
index a62c1b3..6b8747e 100644 (file)
@@ -3197,8 +3197,12 @@ static void rvu_get_lbk_link_max_frs(struct rvu *rvu,  u16 *max_mtu)
 
 static void rvu_get_lmac_link_max_frs(struct rvu *rvu, u16 *max_mtu)
 {
-       /* RPM supports FIFO len 128 KB */
-       if (rvu_cgx_get_fifolen(rvu) == 0x20000)
+       int fifo_size = rvu_cgx_get_fifolen(rvu);
+
+       /* RPM supports a 128 KB FIFO and RPM2 supports double that
+        * FIFO length to accommodate 8 LMACs
+        */
+       if (fifo_size == 0x20000 || fifo_size == 0x40000)
                *max_mtu = CN10K_LMAC_LINK_MAX_FRS;
        else
                *max_mtu = NIC_HW_MAX_FRS;
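A compact model of the check above; the concrete MTU numbers are placeholders for CN10K_LMAC_LINK_MAX_FRS and NIC_HW_MAX_FRS, whose definitions are not part of this diff:

#include <stdio.h>

static int max_mtu_for_fifo(int fifo_size)
{
        /* A 128 KB (RPM) or 256 KB (RPM2) FIFO selects the larger MTU. */
        if (fifo_size == 0x20000 || fifo_size == 0x40000)
                return 16380; /* placeholder for CN10K_LMAC_LINK_MAX_FRS */
        return 9212;          /* placeholder for NIC_HW_MAX_FRS */
}

int main(void)
{
        printf("%d %d\n", max_mtu_for_fifo(0x40000), max_mtu_for_fifo(0x10000));
        return 0;
}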
@@ -4109,7 +4113,7 @@ static void nix_link_config(struct rvu *rvu, int blkaddr,
 
                /* Get LMAC id's from bitmap */
                lmac_bmap = cgx_get_lmac_bmap(rvu_cgx_pdata(cgx, rvu));
-               for_each_set_bit(iter, &lmac_bmap, MAX_LMAC_PER_CGX) {
+               for_each_set_bit(iter, &lmac_bmap, rvu->hw->lmac_per_cgx) {
                        lmac_fifo_len = rvu_cgx_get_lmac_fifolen(rvu, cgx, iter);
                        if (!lmac_fifo_len) {
                                dev_err(rvu->dev,
index 00aef8f..f69102d 100644 (file)
@@ -1956,7 +1956,9 @@ int rvu_npc_exact_init(struct rvu *rvu)
        /* Install SDP drop rule */
        drop_mcam_idx = &table->num_drop_rules;
 
-       max_lmac_cnt = rvu->cgx_cnt_max * MAX_LMAC_PER_CGX + PF_CGXMAP_BASE;
+       max_lmac_cnt = rvu->cgx_cnt_max * rvu->hw->lmac_per_cgx +
+                      PF_CGXMAP_BASE;
+
        for (i = PF_CGXMAP_BASE; i < max_lmac_cnt; i++) {
                if (rvu->pf2cgxlmac_map[i] == 0xFF)
                        continue;
index 0eb74e8..0f8d1a6 100644 (file)
@@ -1268,6 +1268,39 @@ end:
        return err;
 }
 
+static void otx2_get_fec_stats(struct net_device *netdev,
+                              struct ethtool_fec_stats *fec_stats)
+{
+       struct otx2_nic *pfvf = netdev_priv(netdev);
+       struct cgx_fw_data *rsp;
+
+       otx2_update_lmac_fec_stats(pfvf);
+
+       /* Report MAC FEC stats */
+       fec_stats->corrected_blocks.total     = pfvf->hw.cgx_fec_corr_blks;
+       fec_stats->uncorrectable_blocks.total = pfvf->hw.cgx_fec_uncorr_blks;
+
+       rsp = otx2_get_fwdata(pfvf);
+       if (!IS_ERR(rsp) && rsp->fwdata.phy.misc.has_fec_stats &&
+           !otx2_get_phy_fec_stats(pfvf)) {
+               /* Fetch fwdata again because it's been recently populated with
+                * the latest PHY FEC stats.
+                */
+               rsp = otx2_get_fwdata(pfvf);
+               if (!IS_ERR(rsp)) {
+                       struct fec_stats_s *p = &rsp->fwdata.phy.fec_stats;
+
+                       if (pfvf->linfo.fec == OTX2_FEC_BASER) {
+                               fec_stats->corrected_blocks.total = p->brfec_corr_blks;
+                               fec_stats->uncorrectable_blocks.total = p->brfec_uncorr_blks;
+                       } else {
+                               fec_stats->corrected_blocks.total = p->rsfec_corr_cws;
+                               fec_stats->uncorrectable_blocks.total = p->rsfec_uncorr_cws;
+                       }
+               }
+       }
+}
+
 static const struct ethtool_ops otx2_ethtool_ops = {
        .supported_coalesce_params = ETHTOOL_COALESCE_USECS |
                                     ETHTOOL_COALESCE_MAX_FRAMES |
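otx2_get_fec_stats() above reports the MAC counters first, then overwrites them with PHY-side counters when the firmware advertises has_fec_stats and a fresh fwdata fetch succeeds; once the .get_fec_stats hook below is registered, the numbers surface through ethtool's FEC statistics (ethtool --include-statistics --show-fec on a recent ethtool). A sketch of the preference order, with illustrative names rather than driver API:

#include <stdint.h>
#include <stdio.h>

struct fec_counters {
        uint64_t corr;
        uint64_t uncorr;
};

static struct fec_counters pick_fec_stats(int phy_has_stats,
                                          struct fec_counters phy,
                                          struct fec_counters mac)
{
        /* PHY counters win whenever the firmware can provide them. */
        return phy_has_stats ? phy : mac;
}

int main(void)
{
        struct fec_counters phy = { 5, 1 }, mac = { 7, 2 };
        struct fec_counters out = pick_fec_stats(1, phy, mac);

        printf("corrected=%llu uncorrectable=%llu\n",
               (unsigned long long)out.corr, (unsigned long long)out.uncorr);
        return 0;
}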
@@ -1298,6 +1331,7 @@ static const struct ethtool_ops otx2_ethtool_ops = {
        .get_pauseparam         = otx2_get_pauseparam,
        .set_pauseparam         = otx2_set_pauseparam,
        .get_ts_info            = otx2_get_ts_info,
+       .get_fec_stats          = otx2_get_fec_stats,
        .get_fecparam           = otx2_get_fecparam,
        .set_fecparam           = otx2_set_fecparam,
        .get_link_ksettings     = otx2_get_link_ksettings,
index e421714..044cc21 100644 (file)
@@ -1159,7 +1159,12 @@ int otx2_init_tc(struct otx2_nic *nic)
                return err;
 
        tc->flow_ht_params = tc_flow_ht_params;
-       return rhashtable_init(&tc->flow_table, &tc->flow_ht_params);
+       err = rhashtable_init(&tc->flow_table, &tc->flow_ht_params);
+       if (err) {
+               kfree(tc->tc_entries_bitmap);
+               tc->tc_entries_bitmap = NULL;
+       }
+       return err;
 }
 EXPORT_SYMBOL(otx2_init_tc);
 
index 8b93dab..e3de9a5 100644 (file)
@@ -4593,6 +4593,7 @@ static const struct mtk_soc_data mt7986_data = {
        .hw_features = MTK_HW_FEATURES,
        .required_clks = MT7986_CLKS_BITMAP,
        .required_pctl = false,
+       .offload_version = 2,
        .hash_offset = 4,
        .foe_entry_size = sizeof(struct mtk_foe_entry),
        .txrx = {
index d041615..a627144 100644 (file)
@@ -174,9 +174,10 @@ mtk_wed_wo_reset(struct mtk_wed_device *dev)
        mtk_wdma_tx_reset(dev);
        mtk_wed_reset(dev, MTK_WED_RESET_WED);
 
-       mtk_wed_mcu_send_msg(wo, MTK_WED_MODULE_ID_WO,
-                            MTK_WED_WO_CMD_CHANGE_STATE, &state,
-                            sizeof(state), false);
+       if (mtk_wed_mcu_send_msg(wo, MTK_WED_MODULE_ID_WO,
+                                MTK_WED_WO_CMD_CHANGE_STATE, &state,
+                                sizeof(state), false))
+               return;
 
        if (readx_poll_timeout(mtk_wed_wo_read_status, dev, val,
                               val == MTK_WED_WOIF_DISABLE_DONE,
@@ -576,12 +577,10 @@ mtk_wed_deinit(struct mtk_wed_device *dev)
 }
 
 static void
-mtk_wed_detach(struct mtk_wed_device *dev)
+__mtk_wed_detach(struct mtk_wed_device *dev)
 {
        struct mtk_wed_hw *hw = dev->hw;
 
-       mutex_lock(&hw_lock);
-
        mtk_wed_deinit(dev);
 
        mtk_wdma_rx_reset(dev);
@@ -590,9 +589,11 @@ mtk_wed_detach(struct mtk_wed_device *dev)
        mtk_wed_free_tx_rings(dev);
 
        if (mtk_wed_get_rx_capa(dev)) {
-               mtk_wed_wo_reset(dev);
+               if (hw->wed_wo)
+                       mtk_wed_wo_reset(dev);
                mtk_wed_free_rx_rings(dev);
-               mtk_wed_wo_deinit(hw);
+               if (hw->wed_wo)
+                       mtk_wed_wo_deinit(hw);
        }
 
        if (dev->wlan.bus_type == MTK_WED_BUS_PCIE) {
@@ -612,6 +613,13 @@ mtk_wed_detach(struct mtk_wed_device *dev)
        module_put(THIS_MODULE);
 
        hw->wed_dev = NULL;
+}
+
+static void
+mtk_wed_detach(struct mtk_wed_device *dev)
+{
+       mutex_lock(&hw_lock);
+       __mtk_wed_detach(dev);
        mutex_unlock(&hw_lock);
 }
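Factoring the body out into __mtk_wed_detach() lets the attach error path, later in this series of hunks, tear down while hw_lock is already held; calling the locking wrapper there would self-deadlock on the non-recursive mutex. The locked/unlocked split, sketched with pthreads (the kernel spells the inner variant with a __ prefix):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t hw_lock = PTHREAD_MUTEX_INITIALIZER;

/* Callers that already hold hw_lock use this variant directly. */
static void detach_locked(void)
{
        printf("teardown (lock held)\n");
}

static void detach(void)
{
        pthread_mutex_lock(&hw_lock);
        detach_locked();
        pthread_mutex_unlock(&hw_lock);
}

static int attach(int fail)
{
        int ret = 0;

        pthread_mutex_lock(&hw_lock);
        if (fail) {
                ret = -1;
                detach_locked(); /* must not call detach(): lock is held */
        }
        pthread_mutex_unlock(&hw_lock);
        return ret;
}

int main(void)
{
        attach(1);
        return 0;
}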
 
@@ -1210,7 +1218,8 @@ mtk_wed_wdma_rx_ring_setup(struct mtk_wed_device *dev, int idx, int size,
 }
 
 static int
-mtk_wed_wdma_tx_ring_setup(struct mtk_wed_device *dev, int idx, int size)
+mtk_wed_wdma_tx_ring_setup(struct mtk_wed_device *dev, int idx, int size,
+                          bool reset)
 {
        u32 desc_size = sizeof(struct mtk_wdma_desc) * dev->hw->version;
        struct mtk_wed_ring *wdma;
@@ -1219,8 +1228,8 @@ mtk_wed_wdma_tx_ring_setup(struct mtk_wed_device *dev, int idx, int size)
                return -EINVAL;
 
        wdma = &dev->tx_wdma[idx];
-       if (mtk_wed_ring_alloc(dev, wdma, MTK_WED_WDMA_RING_SIZE, desc_size,
-                              true))
+       if (!reset && mtk_wed_ring_alloc(dev, wdma, MTK_WED_WDMA_RING_SIZE,
+                                        desc_size, true))
                return -ENOMEM;
 
        wdma_w32(dev, MTK_WDMA_RING_TX(idx) + MTK_WED_RING_OFS_BASE,
@@ -1230,6 +1239,9 @@ mtk_wed_wdma_tx_ring_setup(struct mtk_wed_device *dev, int idx, int size)
        wdma_w32(dev, MTK_WDMA_RING_TX(idx) + MTK_WED_RING_OFS_CPU_IDX, 0);
        wdma_w32(dev, MTK_WDMA_RING_TX(idx) + MTK_WED_RING_OFS_DMA_IDX, 0);
 
+       if (reset)
+               mtk_wed_ring_reset(wdma, MTK_WED_WDMA_RING_SIZE, true);
+
        if (!idx)  {
                wed_w32(dev, MTK_WED_WDMA_RING_TX + MTK_WED_RING_OFS_BASE,
                        wdma->desc_phys);
@@ -1490,8 +1502,10 @@ mtk_wed_attach(struct mtk_wed_device *dev)
                ret = mtk_wed_wo_init(hw);
        }
 out:
-       if (ret)
-               mtk_wed_detach(dev);
+       if (ret) {
+               dev_err(dev->hw->dev, "failed to attach wed device\n");
+               __mtk_wed_detach(dev);
+       }
 unlock:
        mutex_unlock(&hw_lock);
 
@@ -1569,18 +1583,20 @@ mtk_wed_txfree_ring_setup(struct mtk_wed_device *dev, void __iomem *regs)
 }
 
 static int
-mtk_wed_rx_ring_setup(struct mtk_wed_device *dev, int idx, void __iomem *regs)
+mtk_wed_rx_ring_setup(struct mtk_wed_device *dev, int idx, void __iomem *regs,
+                     bool reset)
 {
        struct mtk_wed_ring *ring = &dev->rx_ring[idx];
 
        if (WARN_ON(idx >= ARRAY_SIZE(dev->rx_ring)))
                return -EINVAL;
 
-       if (mtk_wed_ring_alloc(dev, ring, MTK_WED_RX_RING_SIZE,
-                              sizeof(*ring->desc), false))
+       if (!reset && mtk_wed_ring_alloc(dev, ring, MTK_WED_RX_RING_SIZE,
+                                        sizeof(*ring->desc), false))
                return -ENOMEM;
 
-       if (mtk_wed_wdma_tx_ring_setup(dev, idx, MTK_WED_WDMA_RING_SIZE))
+       if (mtk_wed_wdma_tx_ring_setup(dev, idx, MTK_WED_WDMA_RING_SIZE,
+                                      reset))
                return -ENOMEM;
 
        ring->reg_base = MTK_WED_RING_RX_DATA(idx);
index f9539e6..6bad0d2 100644 (file)
@@ -207,6 +207,9 @@ int mtk_wed_mcu_msg_update(struct mtk_wed_device *dev, int id, void *data,
        if (dev->hw->version == 1)
                return 0;
 
+       if (WARN_ON(!wo))
+               return -ENODEV;
+
        return mtk_wed_mcu_send_msg(wo, MTK_WED_MODULE_ID_WO, id, data, len,
                                    true);
 }
index a219da8..a0a3964 100644 (file)
@@ -408,8 +408,10 @@ mtk_wed_wo_hardware_init(struct mtk_wed_wo *wo)
                return -ENODEV;
 
        wo->mmio.regs = syscon_regmap_lookup_by_phandle(np, NULL);
-       if (IS_ERR_OR_NULL(wo->mmio.regs))
-               return PTR_ERR(wo->mmio.regs);
+       if (IS_ERR(wo->mmio.regs)) {
+               ret = PTR_ERR(wo->mmio.regs);
+               goto error_put;
+       }
 
        wo->mmio.irq = irq_of_parse_and_map(np, 0);
        wo->mmio.irq_mask = MTK_WED_WO_ALL_INT_MASK;
@@ -457,7 +459,8 @@ mtk_wed_wo_hardware_init(struct mtk_wed_wo *wo)
 
 error:
        devm_free_irq(wo->hw->dev, wo->mmio.irq, wo);
-
+error_put:
+       of_node_put(np);
        return ret;
 }
 
index 43a4102..c575863 100644 (file)
@@ -65,7 +65,7 @@ int mlx4_en_create_tx_ring(struct mlx4_en_priv *priv,
        ring->size = size;
        ring->size_mask = size - 1;
        ring->sp_stride = stride;
-       ring->full_size = ring->size - HEADROOM - MAX_DESC_TXBBS;
+       ring->full_size = ring->size - HEADROOM - MLX4_MAX_DESC_TXBBS;
 
        tmp = size * sizeof(struct mlx4_en_tx_info);
        ring->tx_info = kvmalloc_node(tmp, GFP_KERNEL, node);
@@ -77,9 +77,11 @@ int mlx4_en_create_tx_ring(struct mlx4_en_priv *priv,
        en_dbg(DRV, priv, "Allocated tx_info ring at addr:%p size:%d\n",
                 ring->tx_info, tmp);
 
-       ring->bounce_buf = kmalloc_node(MAX_DESC_SIZE, GFP_KERNEL, node);
+       ring->bounce_buf = kmalloc_node(MLX4_TX_BOUNCE_BUFFER_SIZE,
+                                       GFP_KERNEL, node);
        if (!ring->bounce_buf) {
-               ring->bounce_buf = kmalloc(MAX_DESC_SIZE, GFP_KERNEL);
+               ring->bounce_buf = kmalloc(MLX4_TX_BOUNCE_BUFFER_SIZE,
+                                          GFP_KERNEL);
                if (!ring->bounce_buf) {
                        err = -ENOMEM;
                        goto err_info;
@@ -909,11 +911,6 @@ netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev)
        /* Align descriptor to TXBB size */
        desc_size = ALIGN(real_size, TXBB_SIZE);
        nr_txbb = desc_size >> LOG_TXBB_SIZE;
-       if (unlikely(nr_txbb > MAX_DESC_TXBBS)) {
-               if (netif_msg_tx_err(priv))
-                       en_warn(priv, "Oversized header or SG list\n");
-               goto tx_drop_count;
-       }
 
        bf_ok = ring->bf_enabled;
        if (skb_vlan_tag_present(skb)) {
@@ -941,6 +938,11 @@ netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev)
        if (likely(index + nr_txbb <= ring->size))
                tx_desc = ring->buf + (index << LOG_TXBB_SIZE);
        else {
+               if (unlikely(nr_txbb > MLX4_MAX_DESC_TXBBS)) {
+                       if (netif_msg_tx_err(priv))
+                               en_warn(priv, "Oversized header or SG list\n");
+                       goto tx_drop_count;
+               }
                tx_desc = (struct mlx4_en_tx_desc *) ring->bounce_buf;
                bounce = true;
                bf_ok = false;
index e132ff4..3d4226d 100644 (file)
 #define MLX4_EN_FILTER_HASH_SHIFT 4
 #define MLX4_EN_FILTER_EXPIRY_QUOTA 60
 
-/* Typical TSO descriptor with 16 gather entries is 352 bytes... */
-#define MAX_DESC_SIZE          512
-#define MAX_DESC_TXBBS         (MAX_DESC_SIZE / TXBB_SIZE)
+#define CTRL_SIZE      sizeof(struct mlx4_wqe_ctrl_seg)
+#define DS_SIZE                sizeof(struct mlx4_wqe_data_seg)
+
+/* Maximum size of the bounce buffer:
+ * 256 bytes for LSO headers,
+ * CTRL_SIZE for the control descriptor,
+ * DS_SIZE if skb->head contains some payload,
+ * and DS_SIZE for each of up to MAX_SKB_FRAGS frags.
+ */
+#define MLX4_TX_BOUNCE_BUFFER_SIZE \
+       ALIGN(256 + CTRL_SIZE + DS_SIZE + MAX_SKB_FRAGS * DS_SIZE, TXBB_SIZE)
+
+#define MLX4_MAX_DESC_TXBBS       (MLX4_TX_BOUNCE_BUFFER_SIZE / TXBB_SIZE)
 
 /*
  * OS related constants and tunables
@@ -217,9 +227,7 @@ struct mlx4_en_tx_info {
 
 
 #define MLX4_EN_BIT_DESC_OWN   0x80000000
-#define CTRL_SIZE      sizeof(struct mlx4_wqe_ctrl_seg)
 #define MLX4_EN_MEMTYPE_PAD    0x100
-#define DS_SIZE                sizeof(struct mlx4_wqe_data_seg)
 
 
 struct mlx4_en_tx_desc {
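The new bound can be checked by hand. Assuming TXBB_SIZE = 64, control and data segments of 16 bytes each, and the common MAX_SKB_FRAGS = 17 (all four values are assumptions, not taken from this diff), 256 + 16 + 16 + 17 * 16 = 560 aligns up to 576 bytes, i.e. 9 TXBBs, just above the old fixed 512-byte/8-TXBB cap that made the oversized-descriptor drop in mlx4_en_xmit() reachable:

#include <stdio.h>

#define ALIGN_UP(x, a) (((x) + (a) - 1) / (a) * (a))

int main(void)
{
        int txbb = 64, ctrl = 16, ds = 16, frags = 17; /* assumed sizes */
        int size = ALIGN_UP(256 + ctrl + ds + frags * ds, txbb);

        /* 560 rounds up to 576 bytes, i.e. 9 TXBBs. */
        printf("bounce buffer: %d bytes = %d TXBBs\n", size, size / txbb);
        return 0;
}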
index a22c32a..cd4a1ab 100644 (file)
@@ -111,6 +111,7 @@ mlx5_core-$(CONFIG_MLX5_SW_STEERING) += steering/dr_domain.o steering/dr_table.o
                                        steering/dr_ste_v2.o \
                                        steering/dr_cmd.o steering/dr_fw.o \
                                        steering/dr_action.o steering/fs_dr.o \
+                                       steering/dr_definer.o \
                                        steering/dr_dbg.o lib/smfs.o
 #
 # SF device
index 751bc4a..ddb1979 100644 (file)
@@ -314,6 +314,10 @@ static const struct devlink_ops mlx5_devlink_ops = {
        .rate_node_new = mlx5_esw_devlink_rate_node_new,
        .rate_node_del = mlx5_esw_devlink_rate_node_del,
        .rate_leaf_parent_set = mlx5_esw_devlink_rate_parent_set,
+       .port_fn_roce_get = mlx5_devlink_port_fn_roce_get,
+       .port_fn_roce_set = mlx5_devlink_port_fn_roce_set,
+       .port_fn_migratable_get = mlx5_devlink_port_fn_migratable_get,
+       .port_fn_migratable_set = mlx5_devlink_port_fn_migratable_set,
 #endif
 #ifdef CONFIG_MLX5_SF_MANAGER
        .port_new = mlx5_devlink_sf_port_new,
index c5bb79a..2732128 100644 (file)
@@ -228,6 +228,17 @@ const char *parse_fs_hdrs(struct trace_seq *p,
        return ret;
 }
 
+static const char
+*fs_dest_range_field_to_str(enum mlx5_flow_dest_range_field field)
+{
+       switch (field) {
+       case MLX5_FLOW_DEST_RANGE_FIELD_PKT_LEN:
+               return "packet len";
+       default:
+               return "unknown dest range field";
+       }
+}
+
 const char *parse_fs_dst(struct trace_seq *p,
                         const struct mlx5_flow_destination *dst,
                         u32 counter_id)
@@ -259,6 +270,11 @@ const char *parse_fs_dst(struct trace_seq *p,
        case MLX5_FLOW_DESTINATION_TYPE_PORT:
                trace_seq_printf(p, "port\n");
                break;
+       case MLX5_FLOW_DESTINATION_TYPE_RANGE:
+               trace_seq_printf(p, "field=%s min=%d max=%d\n",
+                                fs_dest_range_field_to_str(dst->range.field),
+                                dst->range.min, dst->range.max);
+               break;
        case MLX5_FLOW_DESTINATION_TYPE_NONE:
                trace_seq_printf(p, "none\n");
                break;
index 65790ff..2d77fb8 100644 (file)
@@ -1245,4 +1245,5 @@ int mlx5e_set_vf_rate(struct net_device *dev, int vf, int min_tx_rate, int max_t
 int mlx5e_get_vf_config(struct net_device *dev, int vf, struct ifla_vf_info *ivi);
 int mlx5e_get_vf_stats(struct net_device *dev, int vf, struct ifla_vf_stats *vf_stats);
 #endif
+int mlx5e_create_mkey(struct mlx5_core_dev *mdev, u32 pdn, u32 *mkey);
 #endif /* __MLX5_EN_H__ */
index bf2741e..379c6dc 100644 (file)
@@ -84,7 +84,8 @@ enum {
        MLX5E_ARFS_FT_LEVEL = MLX5E_INNER_TTC_FT_LEVEL + 1,
 #endif
 #ifdef CONFIG_MLX5_EN_IPSEC
-       MLX5E_ACCEL_FS_ESP_FT_LEVEL = MLX5E_INNER_TTC_FT_LEVEL + 1,
+       MLX5E_ACCEL_FS_POL_FT_LEVEL = MLX5E_INNER_TTC_FT_LEVEL + 1,
+       MLX5E_ACCEL_FS_ESP_FT_LEVEL,
        MLX5E_ACCEL_FS_ESP_FT_ERR_LEVEL,
 #endif
 };
index 21aab96..a278f52 100644 (file)
@@ -28,4 +28,5 @@ tc_act_parse_accept(struct mlx5e_tc_act_parse_state *parse_state,
 struct mlx5e_tc_act mlx5e_tc_act_accept = {
        .can_offload = tc_act_can_offload_accept,
        .parse_action = tc_act_parse_accept,
+       .is_terminating_action = true,
 };
index 3337241..eba0c86 100644 (file)
@@ -11,7 +11,7 @@ static struct mlx5e_tc_act *tc_acts_fdb[NUM_FLOW_ACTIONS] = {
        [FLOW_ACTION_DROP] = &mlx5e_tc_act_drop,
        [FLOW_ACTION_TRAP] = &mlx5e_tc_act_trap,
        [FLOW_ACTION_GOTO] = &mlx5e_tc_act_goto,
-       [FLOW_ACTION_REDIRECT] = &mlx5e_tc_act_mirred,
+       [FLOW_ACTION_REDIRECT] = &mlx5e_tc_act_redirect,
        [FLOW_ACTION_MIRRED] = &mlx5e_tc_act_mirred,
        [FLOW_ACTION_REDIRECT_INGRESS] = &mlx5e_tc_act_redirect_ingress,
        [FLOW_ACTION_VLAN_PUSH] = &mlx5e_tc_act_vlan,
index e1570ff..8346557 100644 (file)
@@ -32,6 +32,11 @@ struct mlx5e_tc_act_parse_state {
        struct mlx5_tc_ct_priv *ct_priv;
 };
 
+struct mlx5e_tc_act_branch_ctrl {
+       enum flow_action_id act_id;
+       u32 extval;
+};
+
 struct mlx5e_tc_act {
        bool (*can_offload)(struct mlx5e_tc_act_parse_state *parse_state,
                            const struct flow_action_entry *act,
@@ -60,6 +65,12 @@ struct mlx5e_tc_act {
 
        int (*stats_action)(struct mlx5e_priv *priv,
                            struct flow_offload_action *fl_act);
+
+       bool (*get_branch_ctrl)(const struct flow_action_entry *act,
+                               struct mlx5e_tc_act_branch_ctrl *cond_true,
+                               struct mlx5e_tc_act_branch_ctrl *cond_false);
+
+       bool is_terminating_action;
 };
 
 struct mlx5e_tc_flow_action {
@@ -81,6 +92,7 @@ extern struct mlx5e_tc_act mlx5e_tc_act_vlan_mangle;
 extern struct mlx5e_tc_act mlx5e_tc_act_mpls_push;
 extern struct mlx5e_tc_act mlx5e_tc_act_mpls_pop;
 extern struct mlx5e_tc_act mlx5e_tc_act_mirred;
+extern struct mlx5e_tc_act mlx5e_tc_act_redirect;
 extern struct mlx5e_tc_act mlx5e_tc_act_mirred_nic;
 extern struct mlx5e_tc_act mlx5e_tc_act_ct;
 extern struct mlx5e_tc_act mlx5e_tc_act_sample;
index dd025a9..7d16aea 100644 (file)
@@ -27,4 +27,5 @@ tc_act_parse_drop(struct mlx5e_tc_act_parse_state *parse_state,
 struct mlx5e_tc_act mlx5e_tc_act_drop = {
        .can_offload = tc_act_can_offload_drop,
        .parse_action = tc_act_parse_drop,
+       .is_terminating_action = true,
 };
index 25174f6..0923e6d 100644 (file)
@@ -121,4 +121,5 @@ struct mlx5e_tc_act mlx5e_tc_act_goto = {
        .can_offload = tc_act_can_offload_goto,
        .parse_action = tc_act_parse_goto,
        .post_parse = tc_act_post_parse_goto,
+       .is_terminating_action = true,
 };
index 4ac7de3..78c427b 100644 (file)
@@ -334,4 +334,11 @@ tc_act_parse_mirred(struct mlx5e_tc_act_parse_state *parse_state,
 struct mlx5e_tc_act mlx5e_tc_act_mirred = {
        .can_offload = tc_act_can_offload_mirred,
        .parse_action = tc_act_parse_mirred,
+       .is_terminating_action = false,
+};
+
+struct mlx5e_tc_act mlx5e_tc_act_redirect = {
+       .can_offload = tc_act_can_offload_mirred,
+       .parse_action = tc_act_parse_mirred,
+       .is_terminating_action = true,
 };
index 90b4c1b..7f40969 100644 (file)
@@ -48,4 +48,5 @@ tc_act_parse_mirred_nic(struct mlx5e_tc_act_parse_state *parse_state,
 struct mlx5e_tc_act mlx5e_tc_act_mirred_nic = {
        .can_offload = tc_act_can_offload_mirred_nic,
        .parse_action = tc_act_parse_mirred_nic,
+       .is_terminating_action = true,
 };
index c8e5ca6..512d431 100644 (file)
@@ -3,6 +3,45 @@
 
 #include "act.h"
 #include "en/tc_priv.h"
+#include "fs_core.h"
+
+static bool police_act_validate_control(enum flow_action_id act_id,
+                                       struct netlink_ext_ack *extack)
+{
+       if (act_id != FLOW_ACTION_PIPE &&
+           act_id != FLOW_ACTION_ACCEPT &&
+           act_id != FLOW_ACTION_JUMP &&
+           act_id != FLOW_ACTION_DROP) {
+               NL_SET_ERR_MSG_MOD(extack,
+                                  "Offload not supported when conform-exceed action is not pipe, ok, jump or drop");
+               return false;
+       }
+
+       return true;
+}
+
+static int police_act_validate(const struct flow_action_entry *act,
+                              struct netlink_ext_ack *extack)
+{
+       if (!police_act_validate_control(act->police.exceed.act_id, extack) ||
+           !police_act_validate_control(act->police.notexceed.act_id, extack))
+               return -EOPNOTSUPP;
+
+       if (act->police.peakrate_bytes_ps ||
+           act->police.avrate || act->police.overhead) {
+               NL_SET_ERR_MSG_MOD(extack,
+                                  "Offload not supported when peakrate/avrate/overhead is configured");
+               return -EOPNOTSUPP;
+       }
+
+       if (act->police.rate_pkt_ps) {
+               NL_SET_ERR_MSG_MOD(extack,
+                                  "QoS offload doesn't support packets per second");
+               return -EOPNOTSUPP;
+       }
+
+       return 0;
+}
 
 static bool
 tc_act_can_offload_police(struct mlx5e_tc_act_parse_state *parse_state,
@@ -10,14 +49,10 @@ tc_act_can_offload_police(struct mlx5e_tc_act_parse_state *parse_state,
                          int act_index,
                          struct mlx5_flow_attr *attr)
 {
-       if (act->police.notexceed.act_id != FLOW_ACTION_PIPE &&
-           act->police.notexceed.act_id != FLOW_ACTION_ACCEPT) {
-               NL_SET_ERR_MSG_MOD(parse_state->extack,
-                                  "Offload not supported when conform action is not pipe or ok");
-               return false;
-       }
-       if (mlx5e_policer_validate(parse_state->flow_action, act,
-                                  parse_state->extack))
+       int err;
+
+       err = police_act_validate(act, parse_state->extack);
+       if (err)
                return false;
 
        return !!mlx5e_get_flow_meters(parse_state->flow->priv->mdev);
@@ -37,6 +72,8 @@ fill_meter_params_from_act(const struct flow_action_entry *act,
                params->mode = MLX5_RATE_LIMIT_PPS;
                params->rate = act->police.rate_pkt_ps;
                params->burst = act->police.burst_pkt;
+       } else if (act->police.mtu) {
+               params->mtu = act->police.mtu;
        } else {
                return -EOPNOTSUPP;
        }
@@ -50,14 +87,25 @@ tc_act_parse_police(struct mlx5e_tc_act_parse_state *parse_state,
                    struct mlx5e_priv *priv,
                    struct mlx5_flow_attr *attr)
 {
+       enum mlx5_flow_namespace_type ns = mlx5e_get_flow_namespace(parse_state->flow);
+       struct mlx5e_flow_meter_params *params = &attr->meter_attr.params;
        int err;
 
-       err = fill_meter_params_from_act(act, &attr->meter_attr.params);
+       err = fill_meter_params_from_act(act, params);
        if (err)
                return err;
 
-       attr->action |= MLX5_FLOW_CONTEXT_ACTION_EXECUTE_ASO;
-       attr->exe_aso_type = MLX5_EXE_ASO_FLOW_METER;
+       if (params->mtu) {
+               if (!(mlx5_fs_get_capabilities(priv->mdev, ns) &
+                     MLX5_FLOW_STEERING_CAP_MATCH_RANGES))
+                       return -EOPNOTSUPP;
+
+               attr->action |= MLX5_FLOW_CONTEXT_ACTION_FWD_DEST;
+               attr->flags |= MLX5_ATTR_FLAG_MTU;
+       } else {
+               attr->action |= MLX5_FLOW_CONTEXT_ACTION_EXECUTE_ASO;
+               attr->exe_aso_type = MLX5_EXE_ASO_FLOW_METER;
+       }
 
        return 0;
 }
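
A compact restatement of the decision just added to tc_act_parse_police,
with invented flag values (CAP_MATCH_RANGES and friends are stand-ins
for the MLX5_* constants): an MTU-based police action requires the
match-ranges steering capability and becomes a forward action tagged
with the MTU flag, while a rate-based one keeps the ASO flow-meter path.

    #include <errno.h>
    #include <stdint.h>
    #include <stdio.h>

    #define CAP_MATCH_RANGES (1u << 0) /* stand-in capability bit */
    #define ACT_FWD_DEST     (1u << 1)
    #define ACT_EXECUTE_ASO  (1u << 2)
    #define ATTR_FLAG_MTU    (1u << 3)

    static int choose_meter_mode(uint32_t caps, uint32_t mtu,
                                 uint32_t *action, uint32_t *flags)
    {
            if (mtu) {
                    if (!(caps & CAP_MATCH_RANGES))
                            return -EOPNOTSUPP;
                    *action |= ACT_FWD_DEST;
                    *flags |= ATTR_FLAG_MTU;
            } else {
                    *action |= ACT_EXECUTE_ASO; /* rate meter via ASO */
            }
            return 0;
    }

    int main(void)
    {
            uint32_t action = 0, flags = 0;

            printf("mtu, no cap -> %d\n",
                   choose_meter_mode(0, 1500, &action, &flags));
            printf("mtu, cap    -> %d\n",
                   choose_meter_mode(CAP_MATCH_RANGES, 1500, &action, &flags));
            return 0;
    }
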
@@ -79,7 +127,7 @@ tc_act_police_offload(struct mlx5e_priv *priv,
        struct mlx5e_flow_meter_handle *meter;
        int err = 0;
 
-       err = mlx5e_policer_validate(&fl_act->action, act, fl_act->extack);
+       err = police_act_validate(act, fl_act->extack);
        if (err)
                return err;
 
@@ -147,6 +195,19 @@ tc_act_police_stats(struct mlx5e_priv *priv,
        return 0;
 }
 
+static bool
+tc_act_police_get_branch_ctrl(const struct flow_action_entry *act,
+                             struct mlx5e_tc_act_branch_ctrl *cond_true,
+                             struct mlx5e_tc_act_branch_ctrl *cond_false)
+{
+       cond_true->act_id = act->police.notexceed.act_id;
+       cond_true->extval = act->police.notexceed.extval;
+
+       cond_false->act_id = act->police.exceed.act_id;
+       cond_false->extval = act->police.exceed.extval;
+       return true;
+}
+
 struct mlx5e_tc_act mlx5e_tc_act_police = {
        .can_offload = tc_act_can_offload_police,
        .parse_action = tc_act_parse_police,
@@ -154,4 +215,5 @@ struct mlx5e_tc_act mlx5e_tc_act_police = {
        .offload_action = tc_act_police_offload,
        .destroy_action = tc_act_police_destroy,
        .stats_action = tc_act_police_stats,
+       .get_branch_ctrl = tc_act_police_get_branch_ctrl,
 };
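
The get_branch_ctrl hook registered above hands the offload core both
branches of a police action in one call. A hypothetical standalone
caller (the struct and enum names are simplified stand-ins for the
flow_action types):

    #include <stdbool.h>
    #include <stdio.h>

    enum flow_act_id { ACT_PIPE, ACT_DROP, ACT_JUMP };

    struct branch_ctrl {
            enum flow_act_id act_id;
            unsigned int extval; /* e.g. jump offset for ACT_JUMP */
    };

    struct police_act {
            struct { enum flow_act_id act_id; unsigned int extval; } notexceed;
            struct { enum flow_act_id act_id; unsigned int extval; } exceed;
    };

    static bool police_get_branch_ctrl(const struct police_act *act,
                                       struct branch_ctrl *cond_true,
                                       struct branch_ctrl *cond_false)
    {
            cond_true->act_id = act->notexceed.act_id;  /* conform branch */
            cond_true->extval = act->notexceed.extval;
            cond_false->act_id = act->exceed.act_id;    /* exceed branch */
            cond_false->extval = act->exceed.extval;
            return true;
    }

    int main(void)
    {
            struct police_act act = {
                    .notexceed = { ACT_PIPE, 0 },
                    .exceed    = { ACT_DROP, 0 },
            };
            struct branch_ctrl t, f;

            if (police_get_branch_ctrl(&act, &t, &f))
                    printf("true=%d false=%d\n", t.act_id, f.act_id);
            return 0;
    }
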
index be74e14..78af8a3 100644 (file)
@@ -162,7 +162,6 @@ mlx5e_tc_meter_modify(struct mlx5_core_dev *mdev,
                           MLX5_ACCESS_ASO_OPC_MOD_FLOW_METER);
 
        aso_ctrl = &aso_wqe->aso_ctrl;
-       memset(aso_ctrl, 0, sizeof(*aso_ctrl));
        aso_ctrl->data_mask_mode = MLX5_ASO_DATA_MASK_MODE_BYTEWISE_64BYTE << 6;
        aso_ctrl->condition_1_0_operand = MLX5_ASO_ALWAYS_TRUE |
                                          MLX5_ASO_ALWAYS_TRUE << 4;
@@ -241,7 +240,7 @@ mlx5e_flow_meter_destroy_aso_obj(struct mlx5_core_dev *mdev, u32 obj_id)
 }
 
 static struct mlx5e_flow_meter_handle *
-__mlx5e_flow_meter_alloc(struct mlx5e_flow_meters *flow_meters)
+__mlx5e_flow_meter_alloc(struct mlx5e_flow_meters *flow_meters, bool alloc_aso)
 {
        struct mlx5_core_dev *mdev = flow_meters->mdev;
        struct mlx5e_flow_meter_aso_obj *meters_obj;
@@ -257,16 +256,19 @@ __mlx5e_flow_meter_alloc(struct mlx5e_flow_meters *flow_meters)
        counter = mlx5_fc_create(mdev, true);
        if (IS_ERR(counter)) {
                err = PTR_ERR(counter);
-               goto err_red_counter;
+               goto err_drop_counter;
        }
-       meter->red_counter = counter;
+       meter->drop_counter = counter;
 
        counter = mlx5_fc_create(mdev, true);
        if (IS_ERR(counter)) {
                err = PTR_ERR(counter);
-               goto err_green_counter;
+               goto err_act_counter;
        }
-       meter->green_counter = counter;
+       meter->act_counter = counter;
+
+       if (!alloc_aso)
+               goto no_aso;
 
        meters_obj = list_first_entry_or_null(&flow_meters->partial_list,
                                              struct mlx5e_flow_meter_aso_obj,
@@ -300,11 +302,12 @@ __mlx5e_flow_meter_alloc(struct mlx5e_flow_meters *flow_meters)
        }
 
        bitmap_set(meters_obj->meters_map, pos, 1);
-       meter->flow_meters = flow_meters;
        meter->meters_obj = meters_obj;
        meter->obj_id = meters_obj->base_id + pos / 2;
        meter->idx = pos % 2;
 
+no_aso:
+       meter->flow_meters = flow_meters;
        mlx5_core_dbg(mdev, "flow meter allocated, obj_id=0x%x, index=%d\n",
                      meter->obj_id, meter->idx);
 
@@ -313,10 +316,10 @@ __mlx5e_flow_meter_alloc(struct mlx5e_flow_meters *flow_meters)
 err_mem:
        mlx5e_flow_meter_destroy_aso_obj(mdev, id);
 err_create:
-       mlx5_fc_destroy(mdev, meter->green_counter);
-err_green_counter:
-       mlx5_fc_destroy(mdev, meter->red_counter);
-err_red_counter:
+       mlx5_fc_destroy(mdev, meter->act_counter);
+err_act_counter:
+       mlx5_fc_destroy(mdev, meter->drop_counter);
+err_drop_counter:
        kfree(meter);
        return ERR_PTR(err);
 }
@@ -329,8 +332,11 @@ __mlx5e_flow_meter_free(struct mlx5e_flow_meter_handle *meter)
        struct mlx5e_flow_meter_aso_obj *meters_obj;
        int n, pos;
 
-       mlx5_fc_destroy(mdev, meter->green_counter);
-       mlx5_fc_destroy(mdev, meter->red_counter);
+       mlx5_fc_destroy(mdev, meter->act_counter);
+       mlx5_fc_destroy(mdev, meter->drop_counter);
+
+       if (meter->params.mtu)
+               goto out_no_aso;
 
        meters_obj = meter->meters_obj;
        pos = (meter->obj_id - meters_obj->base_id) * 2 + meter->idx;
@@ -345,6 +351,7 @@ __mlx5e_flow_meter_free(struct mlx5e_flow_meter_handle *meter)
                list_add(&meters_obj->entry, &flow_meters->partial_list);
        }
 
+out_no_aso:
        mlx5_core_dbg(mdev, "flow meter freed, obj_id=0x%x, index=%d\n",
                      meter->obj_id, meter->idx);
        kfree(meter);
@@ -409,12 +416,13 @@ mlx5e_tc_meter_alloc(struct mlx5e_flow_meters *flow_meters,
 {
        struct mlx5e_flow_meter_handle *meter;
 
-       meter = __mlx5e_flow_meter_alloc(flow_meters);
+       meter = __mlx5e_flow_meter_alloc(flow_meters, !params->mtu);
        if (IS_ERR(meter))
                return meter;
 
        hash_add(flow_meters->hashtbl, &meter->hlist, params->index);
        meter->params.index = params->index;
+       meter->params.mtu = params->mtu;
        meter->refcnt++;
 
        return meter;
@@ -575,8 +583,8 @@ mlx5e_tc_meter_get_stats(struct mlx5e_flow_meter_handle *meter,
        u64 bytes1, packets1, lastuse1;
        u64 bytes2, packets2, lastuse2;
 
-       mlx5_fc_query_cached(meter->green_counter, &bytes1, &packets1, &lastuse1);
-       mlx5_fc_query_cached(meter->red_counter, &bytes2, &packets2, &lastuse2);
+       mlx5_fc_query_cached(meter->act_counter, &bytes1, &packets1, &lastuse1);
+       mlx5_fc_query_cached(meter->drop_counter, &bytes2, &packets2, &lastuse2);
 
        *bytes = bytes1 + bytes2;
        *packets = packets1 + packets2;
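
With green/red renamed to act/drop, the aggregate reported to tc is
unchanged: the sum of both counters. A sketch of the aggregation (the
lastuse handling is our assumption; the hunk does not show it):

    #include <stdint.h>
    #include <stdio.h>

    struct fc_cache { uint64_t bytes, packets, lastuse; };

    static void meter_get_stats(const struct fc_cache *act,
                                const struct fc_cache *drop,
                                uint64_t *bytes, uint64_t *packets,
                                uint64_t *lastuse)
    {
            *bytes = act->bytes + drop->bytes;
            *packets = act->packets + drop->packets;
            *lastuse = act->lastuse > drop->lastuse ? act->lastuse
                                                    : drop->lastuse;
    }

    int main(void)
    {
            struct fc_cache act = { 1000, 10, 5 }, drop = { 200, 2, 7 };
            uint64_t b, p, l;

            meter_get_stats(&act, &drop, &b, &p, &l);
            printf("bytes=%llu packets=%llu lastuse=%llu\n",
                   (unsigned long long)b, (unsigned long long)p,
                   (unsigned long long)l);
            return 0;
    }
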
index 6de6e8a..9b795cd 100644 (file)
@@ -20,6 +20,7 @@ struct mlx5e_flow_meter_params {
        u32 index;
        u64 rate;
        u64 burst;
+       u32 mtu;
 };
 
 struct mlx5e_flow_meter_handle {
@@ -32,8 +33,8 @@ struct mlx5e_flow_meter_handle {
        struct hlist_node hlist;
        struct mlx5e_flow_meter_params params;
 
-       struct mlx5_fc *green_counter;
-       struct mlx5_fc *red_counter;
+       struct mlx5_fc *act_counter;
+       struct mlx5_fc *drop_counter;
 };
 
 struct mlx5e_meter_attr {
index 8b77e82..8d7d761 100644 (file)
@@ -8,23 +8,56 @@
 #define MLX5_PACKET_COLOR_BITS MLX5_REG_MAPPING_MBITS(PACKET_COLOR_TO_REG)
 #define MLX5_PACKET_COLOR_MASK MLX5_REG_MAPPING_MASK(PACKET_COLOR_TO_REG)
 
-struct mlx5e_post_meter_priv {
+struct mlx5e_post_meter_rate_table {
+       struct mlx5_flow_table *ft;
+       struct mlx5_flow_group *fg;
+       struct mlx5_flow_handle *green_rule;
+       struct mlx5_flow_attr *green_attr;
+       struct mlx5_flow_handle *red_rule;
+       struct mlx5_flow_attr *red_attr;
+};
+
+struct mlx5e_post_meter_mtu_table {
        struct mlx5_flow_table *ft;
        struct mlx5_flow_group *fg;
-       struct mlx5_flow_handle *fwd_green_rule;
-       struct mlx5_flow_handle *drop_red_rule;
+       struct mlx5_flow_handle *rule;
+       struct mlx5_flow_attr *attr;
+};
+
+struct mlx5e_post_meter_mtu_tables {
+       struct mlx5e_post_meter_mtu_table green_table;
+       struct mlx5e_post_meter_mtu_table red_table;
+};
+
+struct mlx5e_post_meter_priv {
+       enum mlx5e_post_meter_type type;
+       union {
+               struct mlx5e_post_meter_rate_table rate_steering_table;
+               struct mlx5e_post_meter_mtu_tables mtu_tables;
+       };
 };
 
 struct mlx5_flow_table *
 mlx5e_post_meter_get_ft(struct mlx5e_post_meter_priv *post_meter)
 {
-       return post_meter->ft;
+       return post_meter->rate_steering_table.ft;
 }
 
-static int
+struct mlx5_flow_table *
+mlx5e_post_meter_get_mtu_true_ft(struct mlx5e_post_meter_priv *post_meter)
+{
+       return post_meter->mtu_tables.green_table.ft;
+}
+
+struct mlx5_flow_table *
+mlx5e_post_meter_get_mtu_false_ft(struct mlx5e_post_meter_priv *post_meter)
+{
+       return post_meter->mtu_tables.red_table.ft;
+}
+
+static struct mlx5_flow_table *
 mlx5e_post_meter_table_create(struct mlx5e_priv *priv,
-                             enum mlx5_flow_namespace_type ns_type,
-                             struct mlx5e_post_meter_priv *post_meter)
+                             enum mlx5_flow_namespace_type ns_type)
 {
        struct mlx5_flow_table_attr ft_attr = {};
        struct mlx5_flow_namespace *root_ns;
@@ -32,7 +65,7 @@ mlx5e_post_meter_table_create(struct mlx5e_priv *priv,
        root_ns = mlx5_get_flow_namespace(priv->mdev, ns_type);
        if (!root_ns) {
                mlx5_core_warn(priv->mdev, "Failed to get namespace for flow meter\n");
-               return -EOPNOTSUPP;
+               return ERR_PTR(-EOPNOTSUPP);
        }
 
        ft_attr.flags = MLX5_FLOW_TABLE_UNMANAGED;
@@ -40,19 +73,14 @@ mlx5e_post_meter_table_create(struct mlx5e_priv *priv,
        ft_attr.max_fte = 2;
        ft_attr.level = 1;
 
-       post_meter->ft = mlx5_create_flow_table(root_ns, &ft_attr);
-       if (IS_ERR(post_meter->ft)) {
-               mlx5_core_warn(priv->mdev, "Failed to create post_meter table\n");
-               return PTR_ERR(post_meter->ft);
-       }
-
-       return 0;
+       return mlx5_create_flow_table(root_ns, &ft_attr);
 }
 
 static int
-mlx5e_post_meter_fg_create(struct mlx5e_priv *priv,
-                          struct mlx5e_post_meter_priv *post_meter)
+mlx5e_post_meter_rate_fg_create(struct mlx5e_priv *priv,
+                               struct mlx5e_post_meter_priv *post_meter)
 {
+       struct mlx5e_post_meter_rate_table *table = &post_meter->rate_steering_table;
        int inlen = MLX5_ST_SZ_BYTES(create_flow_group_in);
        void *misc2, *match_criteria;
        u32 *flow_group_in;
@@ -71,25 +99,58 @@ mlx5e_post_meter_fg_create(struct mlx5e_priv *priv,
        MLX5_SET(create_flow_group_in, flow_group_in, start_flow_index, 0);
        MLX5_SET(create_flow_group_in, flow_group_in, end_flow_index, 1);
 
-       post_meter->fg = mlx5_create_flow_group(post_meter->ft, flow_group_in);
-       if (IS_ERR(post_meter->fg)) {
+       table->fg = mlx5_create_flow_group(table->ft, flow_group_in);
+       if (IS_ERR(table->fg)) {
                mlx5_core_warn(priv->mdev, "Failed to create post_meter flow group\n");
-               err = PTR_ERR(post_meter->fg);
+               err = PTR_ERR(table->fg);
        }
 
        kvfree(flow_group_in);
        return err;
 }
 
+static struct mlx5_flow_handle *
+mlx5e_post_meter_add_rule(struct mlx5e_priv *priv,
+                         struct mlx5e_post_meter_priv *post_meter,
+                         struct mlx5_flow_spec *spec,
+                         struct mlx5_flow_attr *attr,
+                         struct mlx5_fc *act_counter,
+                         struct mlx5_fc *drop_counter)
+{
+       struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
+       struct mlx5_flow_handle *ret;
+
+       attr->action |= MLX5_FLOW_CONTEXT_ACTION_COUNT;
+       if (attr->action & MLX5_FLOW_CONTEXT_ACTION_DROP)
+               attr->counter = drop_counter;
+       else
+               attr->counter = act_counter;
+
+       attr->flags |= MLX5_ATTR_FLAG_NO_IN_PORT;
+       attr->outer_match_level = MLX5_MATCH_NONE;
+       attr->chain = 0;
+       attr->prio = 0;
+
+       ret = mlx5_eswitch_add_offloaded_rule(esw, spec, attr);
+
+       /* We did not create the counter, so we can't delete it.
+        * Avoid freeing the counter when the attr is deleted in
+        * free_branching_attr().
+        */
+       attr->action &= ~MLX5_FLOW_CONTEXT_ACTION_COUNT;
+
+       return ret;
+}
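
The helper above centralizes counter ownership for branch rules: drop
rules get the drop counter, everything else the act counter, and COUNT
is cleared again so attr teardown never frees a counter it does not
own. The same rule, as a standalone sketch with invented flag values:

    #include <stdio.h>

    #define ACT_DROP  (1u << 0)
    #define ACT_COUNT (1u << 1)

    struct attr_sketch { unsigned int action; const char *counter; };

    static void attach_counter(struct attr_sketch *a, const char *act_cnt,
                               const char *drop_cnt)
    {
            a->action |= ACT_COUNT;
            a->counter = (a->action & ACT_DROP) ? drop_cnt : act_cnt;
            /* ... rule would be offloaded here ... */
            a->action &= ~ACT_COUNT; /* we don't own the counter */
    }

    int main(void)
    {
            struct attr_sketch fwd = { 0 }, drop = { ACT_DROP };

            attach_counter(&fwd, "act", "drop");
            attach_counter(&drop, "act", "drop");
            printf("fwd->%s drop->%s\n", fwd.counter, drop.counter);
            return 0;
    }
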
+
 static int
-mlx5e_post_meter_rules_create(struct mlx5e_priv *priv,
-                             struct mlx5e_post_meter_priv *post_meter,
-                             struct mlx5e_post_act *post_act,
-                             struct mlx5_fc *green_counter,
-                             struct mlx5_fc *red_counter)
-{
-       struct mlx5_flow_destination dest[2] = {};
-       struct mlx5_flow_act flow_act = {};
+mlx5e_post_meter_rate_rules_create(struct mlx5e_priv *priv,
+                                  struct mlx5e_post_meter_priv *post_meter,
+                                  struct mlx5e_post_act *post_act,
+                                  struct mlx5_fc *act_counter,
+                                  struct mlx5_fc *drop_counter,
+                                  struct mlx5_flow_attr *green_attr,
+                                  struct mlx5_flow_attr *red_attr)
+{
+       struct mlx5e_post_meter_rate_table *table = &post_meter->rate_steering_table;
        struct mlx5_flow_handle *rule;
        struct mlx5_flow_spec *spec;
        int err;
@@ -100,72 +161,242 @@ mlx5e_post_meter_rules_create(struct mlx5e_priv *priv,
 
        mlx5e_tc_match_to_reg_match(spec, PACKET_COLOR_TO_REG,
                                    MLX5_FLOW_METER_COLOR_RED, MLX5_PACKET_COLOR_MASK);
-       flow_act.action = MLX5_FLOW_CONTEXT_ACTION_DROP |
-                         MLX5_FLOW_CONTEXT_ACTION_COUNT;
-       flow_act.flags |= FLOW_ACT_IGNORE_FLOW_LEVEL;
-       dest[0].type = MLX5_FLOW_DESTINATION_TYPE_COUNTER;
-       dest[0].counter_id = mlx5_fc_id(red_counter);
-
-       rule = mlx5_add_flow_rules(post_meter->ft, spec, &flow_act, dest, 1);
+       red_attr->ft = post_meter->rate_steering_table.ft;
+       rule = mlx5e_post_meter_add_rule(priv, post_meter, spec, red_attr,
+                                        act_counter, drop_counter);
        if (IS_ERR(rule)) {
-               mlx5_core_warn(priv->mdev, "Failed to create post_meter flow drop rule\n");
+               mlx5_core_warn(priv->mdev, "Failed to create post_meter exceed rule\n");
                err = PTR_ERR(rule);
                goto err_red;
        }
-       post_meter->drop_red_rule = rule;
+       table->red_rule = rule;
+       table->red_attr = red_attr;
 
        mlx5e_tc_match_to_reg_match(spec, PACKET_COLOR_TO_REG,
                                    MLX5_FLOW_METER_COLOR_GREEN, MLX5_PACKET_COLOR_MASK);
-       flow_act.action = MLX5_FLOW_CONTEXT_ACTION_FWD_DEST |
-                         MLX5_FLOW_CONTEXT_ACTION_COUNT;
-       dest[0].type = MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE;
-       dest[0].ft = mlx5e_tc_post_act_get_ft(post_act);
-       dest[1].type = MLX5_FLOW_DESTINATION_TYPE_COUNTER;
-       dest[1].counter_id = mlx5_fc_id(green_counter);
-
-       rule = mlx5_add_flow_rules(post_meter->ft, spec, &flow_act, dest, 2);
+       green_attr->ft = post_meter->rate_steering_table.ft;
+       rule = mlx5e_post_meter_add_rule(priv, post_meter, spec, green_attr,
+                                        act_counter, drop_counter);
        if (IS_ERR(rule)) {
-               mlx5_core_warn(priv->mdev, "Failed to create post_meter flow fwd rule\n");
+               mlx5_core_warn(priv->mdev, "Failed to create post_meter notexceed rule\n");
                err = PTR_ERR(rule);
                goto err_green;
        }
-       post_meter->fwd_green_rule = rule;
+       table->green_rule = rule;
+       table->green_attr = green_attr;
 
        kvfree(spec);
        return 0;
 
 err_green:
-       mlx5_del_flow_rules(post_meter->drop_red_rule);
+       mlx5_del_flow_rules(table->red_rule);
 err_red:
        kvfree(spec);
        return err;
 }
 
 static void
-mlx5e_post_meter_rules_destroy(struct mlx5e_post_meter_priv *post_meter)
+mlx5e_post_meter_rate_rules_destroy(struct mlx5_eswitch *esw,
+                                   struct mlx5e_post_meter_priv *post_meter)
 {
-       mlx5_del_flow_rules(post_meter->drop_red_rule);
-       mlx5_del_flow_rules(post_meter->fwd_green_rule);
+       struct mlx5e_post_meter_rate_table *rate_table = &post_meter->rate_steering_table;
+
+       mlx5_eswitch_del_offloaded_rule(esw, rate_table->red_rule, rate_table->red_attr);
+       mlx5_eswitch_del_offloaded_rule(esw, rate_table->green_rule, rate_table->green_attr);
 }
 
 static void
-mlx5e_post_meter_fg_destroy(struct mlx5e_post_meter_priv *post_meter)
+mlx5e_post_meter_rate_fg_destroy(struct mlx5e_post_meter_priv *post_meter)
 {
-       mlx5_destroy_flow_group(post_meter->fg);
+       mlx5_destroy_flow_group(post_meter->rate_steering_table.fg);
 }
 
 static void
-mlx5e_post_meter_table_destroy(struct mlx5e_post_meter_priv *post_meter)
+mlx5e_post_meter_rate_table_destroy(struct mlx5e_post_meter_priv *post_meter)
+{
+       mlx5_destroy_flow_table(post_meter->rate_steering_table.ft);
+}
+
+static void
+mlx5e_post_meter_mtu_rules_destroy(struct mlx5e_post_meter_priv *post_meter)
+{
+       struct mlx5e_post_meter_mtu_tables *mtu_tables = &post_meter->mtu_tables;
+
+       mlx5_del_flow_rules(mtu_tables->green_table.rule);
+       mlx5_del_flow_rules(mtu_tables->red_table.rule);
+}
+
+static void
+mlx5e_post_meter_mtu_fg_destroy(struct mlx5e_post_meter_priv *post_meter)
+{
+       struct mlx5e_post_meter_mtu_tables *mtu_tables = &post_meter->mtu_tables;
+
+       mlx5_destroy_flow_group(mtu_tables->green_table.fg);
+       mlx5_destroy_flow_group(mtu_tables->red_table.fg);
+}
+
+static void
+mlx5e_post_meter_mtu_table_destroy(struct mlx5e_post_meter_priv *post_meter)
+{
+       struct mlx5e_post_meter_mtu_tables *mtu_tables = &post_meter->mtu_tables;
+
+       mlx5_destroy_flow_table(mtu_tables->green_table.ft);
+       mlx5_destroy_flow_table(mtu_tables->red_table.ft);
+}
+
+static int
+mlx5e_post_meter_rate_create(struct mlx5e_priv *priv,
+                            enum mlx5_flow_namespace_type ns_type,
+                            struct mlx5e_post_act *post_act,
+                            struct mlx5_fc *act_counter,
+                            struct mlx5_fc *drop_counter,
+                            struct mlx5e_post_meter_priv *post_meter,
+                            struct mlx5_flow_attr *green_attr,
+                            struct mlx5_flow_attr *red_attr)
+{
+       struct mlx5_flow_table *ft;
+       int err;
+
+       post_meter->type = MLX5E_POST_METER_RATE;
+
+       ft = mlx5e_post_meter_table_create(priv, ns_type);
+       if (IS_ERR(ft)) {
+               err = PTR_ERR(ft);
+               mlx5_core_warn(priv->mdev, "Failed to create post_meter table\n");
+               goto err_ft;
+       }
+
+       post_meter->rate_steering_table.ft = ft;
+
+       err = mlx5e_post_meter_rate_fg_create(priv, post_meter);
+       if (err)
+               goto err_fg;
+
+       err = mlx5e_post_meter_rate_rules_create(priv, post_meter, post_act,
+                                                act_counter, drop_counter,
+                                                green_attr, red_attr);
+       if (err)
+               goto err_rules;
+
+       return 0;
+
+err_rules:
+       mlx5e_post_meter_rate_fg_destroy(post_meter);
+err_fg:
+       mlx5e_post_meter_rate_table_destroy(post_meter);
+err_ft:
+       return err;
+}
+
+static int
+mlx5e_post_meter_create_mtu_table(struct mlx5e_priv *priv,
+                                 enum mlx5_flow_namespace_type ns_type,
+                                 struct mlx5e_post_meter_mtu_table *table)
 {
-       mlx5_destroy_flow_table(post_meter->ft);
+       int inlen = MLX5_ST_SZ_BYTES(create_flow_group_in);
+       struct mlx5_flow_group *fg;
+       u32 *flow_group_in;
+       int err;
+
+       flow_group_in = kvzalloc(inlen, GFP_KERNEL);
+       if (!flow_group_in)
+               return -ENOMEM;
+
+       table->ft = mlx5e_post_meter_table_create(priv, ns_type);
+       if (IS_ERR(table->ft)) {
+               err = PTR_ERR(table->ft);
+               goto err_ft;
+       }
+
+       /* create miss group */
+       memset(flow_group_in, 0, inlen);
+       MLX5_SET(create_flow_group_in, flow_group_in, start_flow_index, 1);
+       MLX5_SET(create_flow_group_in, flow_group_in, end_flow_index, 1);
+       fg = mlx5_create_flow_group(table->ft, flow_group_in);
+       if (IS_ERR(fg)) {
+               err = PTR_ERR(fg);
+               goto err_miss_grp;
+       }
+       table->fg = fg;
+
+       kvfree(flow_group_in);
+       return 0;
+
+err_miss_grp:
+       mlx5_destroy_flow_table(table->ft);
+err_ft:
+       kvfree(flow_group_in);
+       return err;
+}
+
+static int
+mlx5e_post_meter_mtu_create(struct mlx5e_priv *priv,
+                           enum mlx5_flow_namespace_type ns_type,
+                           struct mlx5e_post_act *post_act,
+                           struct mlx5_fc *act_counter,
+                           struct mlx5_fc *drop_counter,
+                           struct mlx5e_post_meter_priv *post_meter,
+                           struct mlx5_flow_attr *green_attr,
+                           struct mlx5_flow_attr *red_attr)
+{
+       struct mlx5e_post_meter_mtu_tables *mtu_tables = &post_meter->mtu_tables;
+       static struct mlx5_flow_spec zero_spec = {};
+       struct mlx5_flow_handle *rule;
+       int err;
+
+       post_meter->type = MLX5E_POST_METER_MTU;
+
+       err = mlx5e_post_meter_create_mtu_table(priv, ns_type, &mtu_tables->green_table);
+       if (err)
+               goto err_green_ft;
+
+       green_attr->ft = mtu_tables->green_table.ft;
+       rule = mlx5e_post_meter_add_rule(priv, post_meter, &zero_spec, green_attr,
+                                        act_counter, drop_counter);
+       if (IS_ERR(rule)) {
+               mlx5_core_warn(priv->mdev, "Failed to create post_meter conform rule\n");
+               err = PTR_ERR(rule);
+               goto err_green_rule;
+       }
+       mtu_tables->green_table.rule = rule;
+       mtu_tables->green_table.attr = green_attr;
+
+       err = mlx5e_post_meter_create_mtu_table(priv, ns_type, &mtu_tables->red_table);
+       if (err)
+               goto err_red_ft;
+
+       red_attr->ft = mtu_tables->red_table.ft;
+       rule = mlx5e_post_meter_add_rule(priv, post_meter, &zero_spec, red_attr,
+                                        act_counter, drop_counter);
+       if (IS_ERR(rule)) {
+               mlx5_core_warn(priv->mdev, "Failed to create post_meter exceed rule\n");
+               err = PTR_ERR(rule);
+               goto err_red_rule;
+       }
+       mtu_tables->red_table.rule = rule;
+       mtu_tables->red_table.attr = red_attr;
+
+       return 0;
+
+err_red_rule:
+       mlx5_destroy_flow_table(mtu_tables->red_table.ft);
+err_red_ft:
+       mlx5_del_flow_rules(mtu_tables->green_table.rule);
+err_green_rule:
+       mlx5_destroy_flow_table(mtu_tables->green_table.ft);
+err_green_ft:
+       return err;
 }
 
 struct mlx5e_post_meter_priv *
 mlx5e_post_meter_init(struct mlx5e_priv *priv,
                      enum mlx5_flow_namespace_type ns_type,
                      struct mlx5e_post_act *post_act,
-                     struct mlx5_fc *green_counter,
-                     struct mlx5_fc *red_counter)
+                     enum mlx5e_post_meter_type type,
+                     struct mlx5_fc *act_counter,
+                     struct mlx5_fc *drop_counter,
+                     struct mlx5_flow_attr *branch_true,
+                     struct mlx5_flow_attr *branch_false)
 {
        struct mlx5e_post_meter_priv *post_meter;
        int err;
@@ -174,36 +405,55 @@ mlx5e_post_meter_init(struct mlx5e_priv *priv,
        if (!post_meter)
                return ERR_PTR(-ENOMEM);
 
-       err = mlx5e_post_meter_table_create(priv, ns_type, post_meter);
-       if (err)
-               goto err_ft;
-
-       err = mlx5e_post_meter_fg_create(priv, post_meter);
-       if (err)
-               goto err_fg;
+       switch (type) {
+       case MLX5E_POST_METER_MTU:
+               err = mlx5e_post_meter_mtu_create(priv, ns_type, post_act,
+                                                 act_counter, drop_counter, post_meter,
+                                                 branch_true, branch_false);
+               break;
+       case MLX5E_POST_METER_RATE:
+               err = mlx5e_post_meter_rate_create(priv, ns_type, post_act,
+                                                  act_counter, drop_counter, post_meter,
+                                                  branch_true, branch_false);
+               break;
+       default:
+               err = -EOPNOTSUPP;
+       }
 
-       err = mlx5e_post_meter_rules_create(priv, post_meter, post_act, green_counter,
-                                           red_counter);
        if (err)
-               goto err_rules;
+               goto err;
 
        return post_meter;
 
-err_rules:
-       mlx5e_post_meter_fg_destroy(post_meter);
-err_fg:
-       mlx5e_post_meter_table_destroy(post_meter);
-err_ft:
+err:
        kfree(post_meter);
        return ERR_PTR(err);
 }
 
+static void
+mlx5e_post_meter_rate_destroy(struct mlx5_eswitch *esw, struct mlx5e_post_meter_priv *post_meter)
+{
+       mlx5e_post_meter_rate_rules_destroy(esw, post_meter);
+       mlx5e_post_meter_rate_fg_destroy(post_meter);
+       mlx5e_post_meter_rate_table_destroy(post_meter);
+}
+
+static void
+mlx5e_post_meter_mtu_destroy(struct mlx5e_post_meter_priv *post_meter)
+{
+       mlx5e_post_meter_mtu_rules_destroy(post_meter);
+       mlx5e_post_meter_mtu_fg_destroy(post_meter);
+       mlx5e_post_meter_mtu_table_destroy(post_meter);
+}
+
 void
-mlx5e_post_meter_cleanup(struct mlx5e_post_meter_priv *post_meter)
+mlx5e_post_meter_cleanup(struct mlx5_eswitch *esw, struct mlx5e_post_meter_priv *post_meter)
 {
-       mlx5e_post_meter_rules_destroy(post_meter);
-       mlx5e_post_meter_fg_destroy(post_meter);
-       mlx5e_post_meter_table_destroy(post_meter);
+       if (post_meter->type == MLX5E_POST_METER_RATE)
+               mlx5e_post_meter_rate_destroy(esw, post_meter);
+       else
+               mlx5e_post_meter_mtu_destroy(post_meter);
+
        kfree(post_meter);
 }
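
For MLX5E_POST_METER_MTU the driver builds a green/red table pair
instead of a single color-match table. Our reading (an assumption; the
range match itself is programmed elsewhere) is that hardware compares
packet length against the configured MTU and steers conforming traffic
to the "true" table:

    #include <stdio.h>

    enum mtu_branch { MTU_TRUE_FT, MTU_FALSE_FT };

    /* conceptual classifier, not driver code */
    static enum mtu_branch classify_len(unsigned int pkt_len,
                                        unsigned int mtu)
    {
            return pkt_len <= mtu ? MTU_TRUE_FT : MTU_FALSE_FT;
    }

    int main(void)
    {
            printf("1400 -> %d, 1600 -> %d (mtu 1500)\n",
                   classify_len(1400, 1500), classify_len(1600, 1500));
            return 0;
    }
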
 
index 34d0e4b..e013b77 100644 (file)
 
 struct mlx5e_post_meter_priv;
 
+enum mlx5e_post_meter_type {
+       MLX5E_POST_METER_RATE = 0,
+       MLX5E_POST_METER_MTU
+};
+
+#if IS_ENABLED(CONFIG_MLX5_CLS_ACT)
+
 struct mlx5_flow_table *
 mlx5e_post_meter_get_ft(struct mlx5e_post_meter_priv *post_meter);
 
+struct mlx5_flow_table *
+mlx5e_post_meter_get_mtu_true_ft(struct mlx5e_post_meter_priv *post_meter);
+
+struct mlx5_flow_table *
+mlx5e_post_meter_get_mtu_false_ft(struct mlx5e_post_meter_priv *post_meter);
+
 struct mlx5e_post_meter_priv *
 mlx5e_post_meter_init(struct mlx5e_priv *priv,
                      enum mlx5_flow_namespace_type ns_type,
                      struct mlx5e_post_act *post_act,
-                     struct mlx5_fc *green_counter,
-                     struct mlx5_fc *red_counter);
+                     enum mlx5e_post_meter_type type,
+                     struct mlx5_fc *act_counter,
+                     struct mlx5_fc *drop_counter,
+                     struct mlx5_flow_attr *branch_true,
+                     struct mlx5_flow_attr *branch_false);
+
 void
-mlx5e_post_meter_cleanup(struct mlx5e_post_meter_priv *post_meter);
+mlx5e_post_meter_cleanup(struct mlx5_eswitch *esw, struct mlx5e_post_meter_priv *post_meter);
+
+#else /* CONFIG_MLX5_CLS_ACT */
+
+static inline struct mlx5_flow_table *
+mlx5e_post_meter_get_mtu_true_ft(struct mlx5e_post_meter_priv *post_meter)
+{
+       return NULL;
+}
+
+static inline struct mlx5_flow_table *
+mlx5e_post_meter_get_mtu_false_ft(struct mlx5e_post_meter_priv *post_meter)
+{
+       return NULL;
+}
+
+#endif
 
 #endif /* __MLX5_EN_POST_METER_H__ */
index 2e42d7c..2b7fd1c 100644 (file)
@@ -211,8 +211,4 @@ struct mlx5e_flow_meters *mlx5e_get_flow_meters(struct mlx5_core_dev *dev);
 void *mlx5e_get_match_headers_value(u32 flags, struct mlx5_flow_spec *spec);
 void *mlx5e_get_match_headers_criteria(u32 flags, struct mlx5_flow_spec *spec);
 
-int mlx5e_policer_validate(const struct flow_action *action,
-                          const struct flow_action_entry *act,
-                          struct netlink_ext_ack *extack);
-
 #endif /* __MLX5_EN_TC_PRIV_H__ */
index 1b03ab0..bb90239 100644 (file)
@@ -45,55 +45,9 @@ static struct mlx5e_ipsec_sa_entry *to_ipsec_sa_entry(struct xfrm_state *x)
        return (struct mlx5e_ipsec_sa_entry *)x->xso.offload_handle;
 }
 
-struct xfrm_state *mlx5e_ipsec_sadb_rx_lookup(struct mlx5e_ipsec *ipsec,
-                                             unsigned int handle)
+static struct mlx5e_ipsec_pol_entry *to_ipsec_pol_entry(struct xfrm_policy *x)
 {
-       struct mlx5e_ipsec_sa_entry *sa_entry;
-       struct xfrm_state *ret = NULL;
-
-       rcu_read_lock();
-       hash_for_each_possible_rcu(ipsec->sadb_rx, sa_entry, hlist, handle)
-               if (sa_entry->handle == handle) {
-                       ret = sa_entry->x;
-                       xfrm_state_hold(ret);
-                       break;
-               }
-       rcu_read_unlock();
-
-       return ret;
-}
-
-static int mlx5e_ipsec_sadb_rx_add(struct mlx5e_ipsec_sa_entry *sa_entry)
-{
-       unsigned int handle = sa_entry->ipsec_obj_id;
-       struct mlx5e_ipsec *ipsec = sa_entry->ipsec;
-       struct mlx5e_ipsec_sa_entry *_sa_entry;
-       unsigned long flags;
-
-       rcu_read_lock();
-       hash_for_each_possible_rcu(ipsec->sadb_rx, _sa_entry, hlist, handle)
-               if (_sa_entry->handle == handle) {
-                       rcu_read_unlock();
-                       return  -EEXIST;
-               }
-       rcu_read_unlock();
-
-       spin_lock_irqsave(&ipsec->sadb_rx_lock, flags);
-       sa_entry->handle = handle;
-       hash_add_rcu(ipsec->sadb_rx, &sa_entry->hlist, sa_entry->handle);
-       spin_unlock_irqrestore(&ipsec->sadb_rx_lock, flags);
-
-       return 0;
-}
-
-static void mlx5e_ipsec_sadb_rx_del(struct mlx5e_ipsec_sa_entry *sa_entry)
-{
-       struct mlx5e_ipsec *ipsec = sa_entry->ipsec;
-       unsigned long flags;
-
-       spin_lock_irqsave(&ipsec->sadb_rx_lock, flags);
-       hash_del_rcu(&sa_entry->hlist);
-       spin_unlock_irqrestore(&ipsec->sadb_rx_lock, flags);
+       return (struct mlx5e_ipsec_pol_entry *)x->xdo.offload_handle;
 }
 
 static bool mlx5e_ipsec_update_esn_state(struct mlx5e_ipsec_sa_entry *sa_entry)
@@ -129,9 +83,33 @@ static bool mlx5e_ipsec_update_esn_state(struct mlx5e_ipsec_sa_entry *sa_entry)
        return false;
 }
 
-static void
-mlx5e_ipsec_build_accel_xfrm_attrs(struct mlx5e_ipsec_sa_entry *sa_entry,
-                                  struct mlx5_accel_esp_xfrm_attrs *attrs)
+static void mlx5e_ipsec_init_limits(struct mlx5e_ipsec_sa_entry *sa_entry,
+                                   struct mlx5_accel_esp_xfrm_attrs *attrs)
+{
+       struct xfrm_state *x = sa_entry->x;
+
+       attrs->hard_packet_limit = x->lft.hard_packet_limit;
+       if (x->lft.soft_packet_limit == XFRM_INF)
+               return;
+
+       /* Hardware decrements the hard_packet_limit counter during
+        * operation and fires an event when soft_packet_limit is
+        * reached. This means we must substitute the numbers in
+        * order to count the soft limit properly.
+        *
+        * As an example:
+        * the XFRM user sets a soft limit of 2 and a hard limit of 9,
+        * expecting a soft event after 2 packets and a hard event
+        * after 9 packets. In our case, the hard limit is set to 9
+        * and the soft-limit comparator to 7, so the user gets the
+        * soft event after 2 packets.
+        */
+       attrs->soft_packet_limit =
+               x->lft.hard_packet_limit - x->lft.soft_packet_limit;
+}
+
+void mlx5e_ipsec_build_accel_xfrm_attrs(struct mlx5e_ipsec_sa_entry *sa_entry,
+                                       struct mlx5_accel_esp_xfrm_attrs *attrs)
 {
        struct xfrm_state *x = sa_entry->x;
        struct aes_gcm_keymat *aes_gcm = &attrs->aes_gcm;
@@ -157,33 +135,31 @@ mlx5e_ipsec_build_accel_xfrm_attrs(struct mlx5e_ipsec_sa_entry *sa_entry,
        memcpy(&aes_gcm->salt, x->aead->alg_key + key_len,
               sizeof(aes_gcm->salt));
 
+       attrs->authsize = crypto_aead_authsize(aead) / 4; /* in dwords */
+
        /* iv len */
        aes_gcm->icv_len = x->aead->alg_icv_len;
 
        /* esn */
        if (sa_entry->esn_state.trigger) {
-               attrs->flags |= MLX5_ACCEL_ESP_FLAGS_ESN_TRIGGERED;
+               attrs->esn_trigger = true;
                attrs->esn = sa_entry->esn_state.esn;
-               if (sa_entry->esn_state.overlap)
-                       attrs->flags |= MLX5_ACCEL_ESP_FLAGS_ESN_STATE_OVERLAP;
+               attrs->esn_overlap = sa_entry->esn_state.overlap;
+               attrs->replay_window = x->replay_esn->replay_window;
        }
 
-       /* action */
-       attrs->action = (x->xso.dir == XFRM_DEV_OFFLOAD_OUT) ?
-                               MLX5_ACCEL_ESP_ACTION_ENCRYPT :
-                                     MLX5_ACCEL_ESP_ACTION_DECRYPT;
-       /* flags */
-       attrs->flags |= (x->props.mode == XFRM_MODE_TRANSPORT) ?
-                       MLX5_ACCEL_ESP_FLAGS_TRANSPORT :
-                       MLX5_ACCEL_ESP_FLAGS_TUNNEL;
-
+       attrs->dir = x->xso.dir;
        /* spi */
        attrs->spi = be32_to_cpu(x->id.spi);
 
        /* source , destination ips */
        memcpy(&attrs->saddr, x->props.saddr.a6, sizeof(attrs->saddr));
        memcpy(&attrs->daddr, x->id.daddr.a6, sizeof(attrs->daddr));
-       attrs->is_ipv6 = (x->props.family != AF_INET);
+       attrs->family = x->props.family;
+       attrs->type = x->xso.type;
+       attrs->reqid = x->props.reqid;
+
+       mlx5e_ipsec_init_limits(sa_entry, attrs);
 }
 
 static inline int mlx5e_xfrm_validate_state(struct xfrm_state *x)
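
The comparator substitution in mlx5e_ipsec_init_limits is easy to check
standalone: programming hard - soft as the comparator makes a
down-counting hardware counter fire the soft event after exactly soft
packets.

    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
            uint64_t soft = 2, hard = 9;
            uint64_t comparator = hard - soft; /* 7, as in the comment */
            uint64_t counter = hard;           /* hw counts down */
            uint64_t sent = 0;

            while (counter > comparator) {     /* event at comparator */
                    counter--;
                    sent++;
            }
            assert(sent == soft);
            printf("soft event after %llu packets\n",
                   (unsigned long long)sent);
            return 0;
    }
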
@@ -215,11 +191,6 @@ static inline int mlx5e_xfrm_validate_state(struct xfrm_state *x)
                netdev_info(netdev, "Only IPv4/6 xfrm states may be offloaded\n");
                return -EINVAL;
        }
-       if (x->props.mode != XFRM_MODE_TRANSPORT &&
-           x->props.mode != XFRM_MODE_TUNNEL) {
-               dev_info(&netdev->dev, "Only transport and tunnel xfrm states may be offloaded\n");
-               return -EINVAL;
-       }
        if (x->id.proto != IPPROTO_ESP) {
                netdev_info(netdev, "Only ESP xfrm state may be offloaded\n");
                return -EINVAL;
@@ -253,6 +224,67 @@ static inline int mlx5e_xfrm_validate_state(struct xfrm_state *x)
                netdev_info(netdev, "Cannot offload xfrm states with geniv other than seqiv\n");
                return -EINVAL;
        }
+       switch (x->xso.type) {
+       case XFRM_DEV_OFFLOAD_CRYPTO:
+               if (!(mlx5_ipsec_device_caps(priv->mdev) &
+                     MLX5_IPSEC_CAP_CRYPTO)) {
+                       netdev_info(netdev, "Crypto offload is not supported\n");
+                       return -EINVAL;
+               }
+
+               if (x->props.mode != XFRM_MODE_TRANSPORT &&
+                   x->props.mode != XFRM_MODE_TUNNEL) {
+                       netdev_info(netdev, "Only transport and tunnel xfrm states may be offloaded\n");
+                       return -EINVAL;
+               }
+               break;
+       case XFRM_DEV_OFFLOAD_PACKET:
+               if (!(mlx5_ipsec_device_caps(priv->mdev) &
+                     MLX5_IPSEC_CAP_PACKET_OFFLOAD)) {
+                       netdev_info(netdev, "Packet offload is not supported\n");
+                       return -EINVAL;
+               }
+
+               if (x->props.mode != XFRM_MODE_TRANSPORT) {
+                       netdev_info(netdev, "Only transport xfrm states may be offloaded in packet mode\n");
+                       return -EINVAL;
+               }
+
+               if (x->replay_esn && x->replay_esn->replay_window != 32 &&
+                   x->replay_esn->replay_window != 64 &&
+                   x->replay_esn->replay_window != 128 &&
+                   x->replay_esn->replay_window != 256) {
+                       netdev_info(netdev,
+                                   "Unsupported replay window size %u\n",
+                                   x->replay_esn->replay_window);
+                       return -EINVAL;
+               }
+
+               if (!x->props.reqid) {
+                       netdev_info(netdev, "Cannot offload without reqid\n");
+                       return -EINVAL;
+               }
+
+               if (x->lft.hard_byte_limit != XFRM_INF ||
+                   x->lft.soft_byte_limit != XFRM_INF) {
+                       netdev_info(netdev,
+                                   "Device doesn't support limits in bytes\n");
+                       return -EINVAL;
+               }
+
+               if (x->lft.soft_packet_limit >= x->lft.hard_packet_limit &&
+                   x->lft.hard_packet_limit != XFRM_INF) {
+                       /* XFRM stack doesn't prevent such configuration :(. */
+                       netdev_info(netdev,
+                                   "Hard packet limit must be greater than soft one\n");
+                       return -EINVAL;
+               }
+               break;
+       default:
+               netdev_info(netdev, "Unsupported xfrm offload type %d\n",
+                           x->xso.type);
+               return -EINVAL;
+       }
        return 0;
 }
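
The packet-offload constraints added to mlx5e_xfrm_validate_state boil
down to a few checks. Restated as a user-space checker (XFRM_INF stands
in for the kernel's no-limit sentinel; the function is illustrative,
not the driver's):

    #include <errno.h>
    #include <stdbool.h>
    #include <stdint.h>

    #define XFRM_INF UINT64_MAX

    static bool replay_window_ok(uint32_t w)
    {
            return w == 32 || w == 64 || w == 128 || w == 256;
    }

    static int validate_packet_offload(uint32_t replay_window, uint32_t reqid,
                                       uint64_t soft_bytes, uint64_t hard_bytes,
                                       uint64_t soft_pkts, uint64_t hard_pkts)
    {
            if (replay_window && !replay_window_ok(replay_window))
                    return -EINVAL; /* hw window: 32/64/128/256 */
            if (!reqid)
                    return -EINVAL; /* reqid ties SA to policy */
            if (hard_bytes != XFRM_INF || soft_bytes != XFRM_INF)
                    return -EINVAL; /* byte limits unsupported */
            if (hard_pkts != XFRM_INF && soft_pkts >= hard_pkts)
                    return -EINVAL; /* soft must stay below hard */
            return 0;
    }
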
 
@@ -270,6 +302,7 @@ static int mlx5e_xfrm_add_state(struct xfrm_state *x)
 {
        struct mlx5e_ipsec_sa_entry *sa_entry = NULL;
        struct net_device *netdev = x->xso.real_dev;
+       struct mlx5e_ipsec *ipsec;
        struct mlx5e_priv *priv;
        int err;
 
@@ -277,6 +310,7 @@ static int mlx5e_xfrm_add_state(struct xfrm_state *x)
        if (!priv->ipsec)
                return -EOPNOTSUPP;
 
+       ipsec = priv->ipsec;
        err = mlx5e_xfrm_validate_state(x);
        if (err)
                return err;
@@ -288,7 +322,7 @@ static int mlx5e_xfrm_add_state(struct xfrm_state *x)
        }
 
        sa_entry->x = x;
-       sa_entry->ipsec = priv->ipsec;
+       sa_entry->ipsec = ipsec;
 
        /* check esn */
        mlx5e_ipsec_update_esn_state(sa_entry);
@@ -299,25 +333,29 @@ static int mlx5e_xfrm_add_state(struct xfrm_state *x)
        if (err)
                goto err_xfrm;
 
-       err = mlx5e_accel_ipsec_fs_add_rule(priv, sa_entry);
+       err = mlx5e_accel_ipsec_fs_add_rule(sa_entry);
        if (err)
                goto err_hw_ctx;
 
-       if (x->xso.dir == XFRM_DEV_OFFLOAD_IN) {
-               err = mlx5e_ipsec_sadb_rx_add(sa_entry);
-               if (err)
-                       goto err_add_rule;
-       } else {
+       /* We use the *_bh() variants because xfrm_timer_handler(), which
+        * runs in softirq context, can reach our state delete logic and
+        * we need xa_erase_bh() there.
+        */
+       err = xa_insert_bh(&ipsec->sadb, sa_entry->ipsec_obj_id, sa_entry,
+                          GFP_KERNEL);
+       if (err)
+               goto err_add_rule;
+
+       if (x->xso.dir == XFRM_DEV_OFFLOAD_OUT)
                sa_entry->set_iv_op = (x->props.flags & XFRM_STATE_ESN) ?
                                mlx5e_ipsec_set_iv_esn : mlx5e_ipsec_set_iv;
-       }
 
        INIT_WORK(&sa_entry->modify_work.work, _update_xfrm_state);
        x->xso.offload_handle = (unsigned long)sa_entry;
-       goto out;
+       return 0;
 
 err_add_rule:
-       mlx5e_accel_ipsec_fs_del_rule(priv, sa_entry);
+       mlx5e_accel_ipsec_fs_del_rule(sa_entry);
 err_hw_ctx:
        mlx5_ipsec_free_sa_ctx(sa_entry);
 err_xfrm:
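
Both sides of the new sadb xarray use the _bh variants for the reason
given in the comment above. An illustrative pairing (condensed, not a
complete driver excerpt):

    /* process context: disable softirqs while holding the xarray
     * lock, since the erase below can run from xfrm_timer_handler()
     * in softirq context
     */
    static int sadb_add(struct mlx5e_ipsec *ipsec,
                        struct mlx5e_ipsec_sa_entry *sa_entry)
    {
            return xa_insert_bh(&ipsec->sadb, sa_entry->ipsec_obj_id,
                                sa_entry, GFP_KERNEL);
    }

    static void sadb_del(struct mlx5e_ipsec *ipsec,
                         struct mlx5e_ipsec_sa_entry *sa_entry)
    {
            WARN_ON(xa_erase_bh(&ipsec->sadb, sa_entry->ipsec_obj_id) !=
                    sa_entry);
    }
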
@@ -329,18 +367,19 @@ out:
 static void mlx5e_xfrm_del_state(struct xfrm_state *x)
 {
        struct mlx5e_ipsec_sa_entry *sa_entry = to_ipsec_sa_entry(x);
+       struct mlx5e_ipsec *ipsec = sa_entry->ipsec;
+       struct mlx5e_ipsec_sa_entry *old;
 
-       if (x->xso.dir == XFRM_DEV_OFFLOAD_IN)
-               mlx5e_ipsec_sadb_rx_del(sa_entry);
+       old = xa_erase_bh(&ipsec->sadb, sa_entry->ipsec_obj_id);
+       WARN_ON(old != sa_entry);
 }
 
 static void mlx5e_xfrm_free_state(struct xfrm_state *x)
 {
        struct mlx5e_ipsec_sa_entry *sa_entry = to_ipsec_sa_entry(x);
-       struct mlx5e_priv *priv = netdev_priv(x->xso.dev);
 
        cancel_work_sync(&sa_entry->modify_work.work);
-       mlx5e_accel_ipsec_fs_del_rule(priv, sa_entry);
+       mlx5e_accel_ipsec_fs_del_rule(sa_entry);
        mlx5_ipsec_free_sa_ctx(sa_entry);
        kfree(sa_entry);
 }
@@ -359,23 +398,33 @@ void mlx5e_ipsec_init(struct mlx5e_priv *priv)
        if (!ipsec)
                return;
 
-       hash_init(ipsec->sadb_rx);
-       spin_lock_init(&ipsec->sadb_rx_lock);
+       xa_init_flags(&ipsec->sadb, XA_FLAGS_ALLOC);
        ipsec->mdev = priv->mdev;
        ipsec->wq = alloc_ordered_workqueue("mlx5e_ipsec: %s", 0,
                                            priv->netdev->name);
        if (!ipsec->wq)
                goto err_wq;
 
+       if (mlx5_ipsec_device_caps(priv->mdev) &
+           MLX5_IPSEC_CAP_PACKET_OFFLOAD) {
+               ret = mlx5e_ipsec_aso_init(ipsec);
+               if (ret)
+                       goto err_aso;
+       }
+
        ret = mlx5e_accel_ipsec_fs_init(ipsec);
        if (ret)
                goto err_fs_init;
 
+       ipsec->fs = priv->fs;
        priv->ipsec = ipsec;
        netdev_dbg(priv->netdev, "IPSec attached to netdevice\n");
        return;
 
 err_fs_init:
+       if (mlx5_ipsec_device_caps(priv->mdev) & MLX5_IPSEC_CAP_PACKET_OFFLOAD)
+               mlx5e_ipsec_aso_cleanup(ipsec);
+err_aso:
        destroy_workqueue(ipsec->wq);
 err_wq:
        kfree(ipsec);
@@ -391,6 +440,8 @@ void mlx5e_ipsec_cleanup(struct mlx5e_priv *priv)
                return;
 
        mlx5e_accel_ipsec_fs_cleanup(ipsec);
+       if (mlx5_ipsec_device_caps(priv->mdev) & MLX5_IPSEC_CAP_PACKET_OFFLOAD)
+               mlx5e_ipsec_aso_cleanup(ipsec);
        destroy_workqueue(ipsec->wq);
        kfree(ipsec);
        priv->ipsec = NULL;
@@ -426,6 +477,122 @@ static void mlx5e_xfrm_advance_esn_state(struct xfrm_state *x)
        queue_work(sa_entry->ipsec->wq, &modify_work->work);
 }
 
+static void mlx5e_xfrm_update_curlft(struct xfrm_state *x)
+{
+       struct mlx5e_ipsec_sa_entry *sa_entry = to_ipsec_sa_entry(x);
+       int err;
+
+       lockdep_assert_held(&x->lock);
+
+       if (sa_entry->attrs.soft_packet_limit == XFRM_INF)
+               /* Limits are not configured, as the soft limit
+                * must be lower than the hard limit.
+                */
+               return;
+
+       err = mlx5e_ipsec_aso_query(sa_entry, NULL);
+       if (err)
+               return;
+
+       mlx5e_ipsec_aso_update_curlft(sa_entry, &x->curlft.packets);
+}
+
+static int mlx5e_xfrm_validate_policy(struct xfrm_policy *x)
+{
+       struct net_device *netdev = x->xdo.real_dev;
+
+       if (x->type != XFRM_POLICY_TYPE_MAIN) {
+               netdev_info(netdev, "Cannot offload non-main policy types\n");
+               return -EINVAL;
+       }
+
+       /* Note that we support only a single template */
+       if (x->xfrm_nr > 1) {
+               netdev_info(netdev, "Cannot offload more than one template\n");
+               return -EINVAL;
+       }
+
+       if (x->xdo.dir != XFRM_DEV_OFFLOAD_IN &&
+           x->xdo.dir != XFRM_DEV_OFFLOAD_OUT) {
+               netdev_info(netdev, "Cannot offload forward policy\n");
+               return -EINVAL;
+       }
+
+       if (!x->xfrm_vec[0].reqid) {
+               netdev_info(netdev, "Cannot offload policy without reqid\n");
+               return -EINVAL;
+       }
+
+       if (x->xdo.type != XFRM_DEV_OFFLOAD_PACKET) {
+               netdev_info(netdev, "Unsupported xfrm offload type\n");
+               return -EINVAL;
+       }
+
+       return 0;
+}
+
+static void
+mlx5e_ipsec_build_accel_pol_attrs(struct mlx5e_ipsec_pol_entry *pol_entry,
+                                 struct mlx5_accel_pol_xfrm_attrs *attrs)
+{
+       struct xfrm_policy *x = pol_entry->x;
+       struct xfrm_selector *sel;
+
+       sel = &x->selector;
+       memset(attrs, 0, sizeof(*attrs));
+
+       memcpy(&attrs->saddr, sel->saddr.a6, sizeof(attrs->saddr));
+       memcpy(&attrs->daddr, sel->daddr.a6, sizeof(attrs->daddr));
+       attrs->family = sel->family;
+       attrs->dir = x->xdo.dir;
+       attrs->action = x->action;
+       attrs->type = XFRM_DEV_OFFLOAD_PACKET;
+       attrs->reqid = x->xfrm_vec[0].reqid;
+}
+
+static int mlx5e_xfrm_add_policy(struct xfrm_policy *x)
+{
+       struct net_device *netdev = x->xdo.real_dev;
+       struct mlx5e_ipsec_pol_entry *pol_entry;
+       struct mlx5e_priv *priv;
+       int err;
+
+       priv = netdev_priv(netdev);
+       if (!priv->ipsec)
+               return -EOPNOTSUPP;
+
+       err = mlx5e_xfrm_validate_policy(x);
+       if (err)
+               return err;
+
+       pol_entry = kzalloc(sizeof(*pol_entry), GFP_KERNEL);
+       if (!pol_entry)
+               return -ENOMEM;
+
+       pol_entry->x = x;
+       pol_entry->ipsec = priv->ipsec;
+
+       mlx5e_ipsec_build_accel_pol_attrs(pol_entry, &pol_entry->attrs);
+       err = mlx5e_accel_ipsec_fs_add_pol(pol_entry);
+       if (err)
+               goto err_fs;
+
+       x->xdo.offload_handle = (unsigned long)pol_entry;
+       return 0;
+
+err_fs:
+       kfree(pol_entry);
+       return err;
+}
+
+static void mlx5e_xfrm_free_policy(struct xfrm_policy *x)
+{
+       struct mlx5e_ipsec_pol_entry *pol_entry = to_ipsec_pol_entry(x);
+
+       mlx5e_accel_ipsec_fs_del_pol(pol_entry);
+       kfree(pol_entry);
+}
+
 static const struct xfrmdev_ops mlx5e_ipsec_xfrmdev_ops = {
        .xdo_dev_state_add      = mlx5e_xfrm_add_state,
        .xdo_dev_state_delete   = mlx5e_xfrm_del_state,
@@ -434,6 +601,18 @@ static const struct xfrmdev_ops mlx5e_ipsec_xfrmdev_ops = {
        .xdo_dev_state_advance_esn = mlx5e_xfrm_advance_esn_state,
 };
 
+static const struct xfrmdev_ops mlx5e_ipsec_packet_xfrmdev_ops = {
+       .xdo_dev_state_add      = mlx5e_xfrm_add_state,
+       .xdo_dev_state_delete   = mlx5e_xfrm_del_state,
+       .xdo_dev_state_free     = mlx5e_xfrm_free_state,
+       .xdo_dev_offload_ok     = mlx5e_ipsec_offload_ok,
+       .xdo_dev_state_advance_esn = mlx5e_xfrm_advance_esn_state,
+
+       .xdo_dev_state_update_curlft = mlx5e_xfrm_update_curlft,
+       .xdo_dev_policy_add = mlx5e_xfrm_add_policy,
+       .xdo_dev_policy_free = mlx5e_xfrm_free_policy,
+};
+
 void mlx5e_ipsec_build_netdev(struct mlx5e_priv *priv)
 {
        struct mlx5_core_dev *mdev = priv->mdev;
@@ -443,7 +622,12 @@ void mlx5e_ipsec_build_netdev(struct mlx5e_priv *priv)
                return;
 
        mlx5_core_info(mdev, "mlx5e: IPSec ESP acceleration enabled\n");
-       netdev->xfrmdev_ops = &mlx5e_ipsec_xfrmdev_ops;
+
+       if (mlx5_ipsec_device_caps(mdev) & MLX5_IPSEC_CAP_PACKET_OFFLOAD)
+               netdev->xfrmdev_ops = &mlx5e_ipsec_packet_xfrmdev_ops;
+       else
+               netdev->xfrmdev_ops = &mlx5e_ipsec_xfrmdev_ops;
+
        netdev->features |= NETIF_F_HW_ESP;
        netdev->hw_enc_features |= NETIF_F_HW_ESP;
 
index 4c47347..a92e19c 100644 (file)
 #ifndef __MLX5E_IPSEC_H__
 #define __MLX5E_IPSEC_H__
 
-#ifdef CONFIG_MLX5_EN_IPSEC
-
 #include <linux/mlx5/device.h>
 #include <net/xfrm.h>
 #include <linux/idr.h>
+#include "lib/aso.h"
 
 #define MLX5E_IPSEC_SADB_RX_BITS 10
 #define MLX5E_IPSEC_ESN_SCOPE_MID 0x80000000L
 
-enum mlx5_accel_esp_flags {
-       MLX5_ACCEL_ESP_FLAGS_TUNNEL            = 0,    /* Default */
-       MLX5_ACCEL_ESP_FLAGS_TRANSPORT         = 1UL << 0,
-       MLX5_ACCEL_ESP_FLAGS_ESN_TRIGGERED     = 1UL << 1,
-       MLX5_ACCEL_ESP_FLAGS_ESN_STATE_OVERLAP = 1UL << 2,
-};
-
-enum mlx5_accel_esp_action {
-       MLX5_ACCEL_ESP_ACTION_DECRYPT,
-       MLX5_ACCEL_ESP_ACTION_ENCRYPT,
-};
-
 struct aes_gcm_keymat {
        u64   seq_iv;
 
@@ -66,7 +53,6 @@ struct aes_gcm_keymat {
 };
 
 struct mlx5_accel_esp_xfrm_attrs {
-       enum mlx5_accel_esp_action action;
        u32   esn;
        u32   spi;
        u32   flags;
@@ -82,16 +68,37 @@ struct mlx5_accel_esp_xfrm_attrs {
                __be32 a6[4];
        } daddr;
 
-       u8 is_ipv6;
+       u8 dir : 2;
+       u8 esn_overlap : 1;
+       u8 esn_trigger : 1;
+       u8 type : 2;
+       u8 family;
+       u32 replay_window;
+       u32 authsize;
+       u32 reqid;
+       u64 hard_packet_limit;
+       u64 soft_packet_limit;
 };
 
 enum mlx5_ipsec_cap {
        MLX5_IPSEC_CAP_CRYPTO           = 1 << 0,
        MLX5_IPSEC_CAP_ESN              = 1 << 1,
+       MLX5_IPSEC_CAP_PACKET_OFFLOAD   = 1 << 2,
 };
 
 struct mlx5e_priv;
 
+struct mlx5e_ipsec_hw_stats {
+       u64 ipsec_rx_pkts;
+       u64 ipsec_rx_bytes;
+       u64 ipsec_rx_drop_pkts;
+       u64 ipsec_rx_drop_bytes;
+       u64 ipsec_tx_pkts;
+       u64 ipsec_tx_bytes;
+       u64 ipsec_tx_drop_pkts;
+       u64 ipsec_tx_drop_bytes;
+};
+
 struct mlx5e_ipsec_sw_stats {
        atomic64_t ipsec_rx_drop_sp_alloc;
        atomic64_t ipsec_rx_drop_sadb_miss;
@@ -102,17 +109,38 @@ struct mlx5e_ipsec_sw_stats {
        atomic64_t ipsec_tx_drop_trailer;
 };
 
-struct mlx5e_accel_fs_esp;
+struct mlx5e_ipsec_rx;
 struct mlx5e_ipsec_tx;
 
+struct mlx5e_ipsec_work {
+       struct work_struct work;
+       struct mlx5e_ipsec *ipsec;
+       u32 id;
+};
+
+struct mlx5e_ipsec_aso {
+       u8 ctx[MLX5_ST_SZ_BYTES(ipsec_aso)];
+       dma_addr_t dma_addr;
+       struct mlx5_aso *aso;
+       /* The IPsec ASO caches data on every query call, so in
+        * nested calls we can use this flag to avoid redundant
+        * calls to mlx5e_ipsec_aso_query().
+        */
+       u8 use_cache : 1;
+};
+
 struct mlx5e_ipsec {
        struct mlx5_core_dev *mdev;
-       DECLARE_HASHTABLE(sadb_rx, MLX5E_IPSEC_SADB_RX_BITS);
-       spinlock_t sadb_rx_lock; /* Protects sadb_rx */
+       struct xarray sadb;
        struct mlx5e_ipsec_sw_stats sw_stats;
+       struct mlx5e_ipsec_hw_stats hw_stats;
        struct workqueue_struct *wq;
-       struct mlx5e_accel_fs_esp *rx_fs;
-       struct mlx5e_ipsec_tx *tx_fs;
+       struct mlx5e_flow_steering *fs;
+       struct mlx5e_ipsec_rx *rx_ipv4;
+       struct mlx5e_ipsec_rx *rx_ipv6;
+       struct mlx5e_ipsec_tx *tx;
+       struct mlx5e_ipsec_aso *aso;
+       struct notifier_block nb;
 };
 
 struct mlx5e_ipsec_esn_state {
@@ -123,7 +151,8 @@ struct mlx5e_ipsec_esn_state {
 
 struct mlx5e_ipsec_rule {
        struct mlx5_flow_handle *rule;
-       struct mlx5_modify_hdr *set_modify_hdr;
+       struct mlx5_modify_hdr *modify_hdr;
+       struct mlx5_pkt_reformat *pkt_reformat;
 };
 
 struct mlx5e_ipsec_modify_state_work {
@@ -132,9 +161,7 @@ struct mlx5e_ipsec_modify_state_work {
 };
 
 struct mlx5e_ipsec_sa_entry {
-       struct hlist_node hlist; /* Item in SADB_RX hashtable */
        struct mlx5e_ipsec_esn_state esn_state;
-       unsigned int handle; /* Handle in SADB_RX */
        struct xfrm_state *x;
        struct mlx5e_ipsec *ipsec;
        struct mlx5_accel_esp_xfrm_attrs attrs;
@@ -146,19 +173,43 @@ struct mlx5e_ipsec_sa_entry {
        struct mlx5e_ipsec_modify_state_work modify_work;
 };
 
+struct mlx5_accel_pol_xfrm_attrs {
+       union {
+               __be32 a4;
+               __be32 a6[4];
+       } saddr;
+
+       union {
+               __be32 a4;
+               __be32 a6[4];
+       } daddr;
+
+       u8 family;
+       u8 action;
+       u8 type : 2;
+       u8 dir : 2;
+       u32 reqid;
+};
+
+struct mlx5e_ipsec_pol_entry {
+       struct xfrm_policy *x;
+       struct mlx5e_ipsec *ipsec;
+       struct mlx5e_ipsec_rule ipsec_rule;
+       struct mlx5_accel_pol_xfrm_attrs attrs;
+};
+
+#ifdef CONFIG_MLX5_EN_IPSEC
+
 void mlx5e_ipsec_init(struct mlx5e_priv *priv);
 void mlx5e_ipsec_cleanup(struct mlx5e_priv *priv);
 void mlx5e_ipsec_build_netdev(struct mlx5e_priv *priv);
 
-struct xfrm_state *mlx5e_ipsec_sadb_rx_lookup(struct mlx5e_ipsec *dev,
-                                             unsigned int handle);
-
 void mlx5e_accel_ipsec_fs_cleanup(struct mlx5e_ipsec *ipsec);
 int mlx5e_accel_ipsec_fs_init(struct mlx5e_ipsec *ipsec);
-int mlx5e_accel_ipsec_fs_add_rule(struct mlx5e_priv *priv,
-                                 struct mlx5e_ipsec_sa_entry *sa_entry);
-void mlx5e_accel_ipsec_fs_del_rule(struct mlx5e_priv *priv,
-                                  struct mlx5e_ipsec_sa_entry *sa_entry);
+int mlx5e_accel_ipsec_fs_add_rule(struct mlx5e_ipsec_sa_entry *sa_entry);
+void mlx5e_accel_ipsec_fs_del_rule(struct mlx5e_ipsec_sa_entry *sa_entry);
+int mlx5e_accel_ipsec_fs_add_pol(struct mlx5e_ipsec_pol_entry *pol_entry);
+void mlx5e_accel_ipsec_fs_del_pol(struct mlx5e_ipsec_pol_entry *pol_entry);
 
 int mlx5_ipsec_create_sa_ctx(struct mlx5e_ipsec_sa_entry *sa_entry);
 void mlx5_ipsec_free_sa_ctx(struct mlx5e_ipsec_sa_entry *sa_entry);
@@ -168,11 +219,30 @@ u32 mlx5_ipsec_device_caps(struct mlx5_core_dev *mdev);
 void mlx5_accel_esp_modify_xfrm(struct mlx5e_ipsec_sa_entry *sa_entry,
                                const struct mlx5_accel_esp_xfrm_attrs *attrs);
 
+int mlx5e_ipsec_aso_init(struct mlx5e_ipsec *ipsec);
+void mlx5e_ipsec_aso_cleanup(struct mlx5e_ipsec *ipsec);
+
+int mlx5e_ipsec_aso_query(struct mlx5e_ipsec_sa_entry *sa_entry,
+                         struct mlx5_wqe_aso_ctrl_seg *data);
+void mlx5e_ipsec_aso_update_curlft(struct mlx5e_ipsec_sa_entry *sa_entry,
+                                  u64 *packets);
+
+void mlx5e_accel_ipsec_fs_read_stats(struct mlx5e_priv *priv,
+                                    void *ipsec_stats);
+
+void mlx5e_ipsec_build_accel_xfrm_attrs(struct mlx5e_ipsec_sa_entry *sa_entry,
+                                       struct mlx5_accel_esp_xfrm_attrs *attrs);
 static inline struct mlx5_core_dev *
 mlx5e_ipsec_sa2dev(struct mlx5e_ipsec_sa_entry *sa_entry)
 {
        return sa_entry->ipsec->mdev;
 }
+
+static inline struct mlx5_core_dev *
+mlx5e_ipsec_pol2dev(struct mlx5e_ipsec_pol_entry *pol_entry)
+{
+       return pol_entry->ipsec->mdev;
+}
 #else
 static inline void mlx5e_ipsec_init(struct mlx5e_priv *priv)
 {
index b859e4a..9f19f4b 100644 (file)
@@ -9,53 +9,67 @@
 
 #define NUM_IPSEC_FTE BIT(15)
 
-enum accel_fs_esp_type {
-       ACCEL_FS_ESP4,
-       ACCEL_FS_ESP6,
-       ACCEL_FS_ESP_NUM_TYPES,
+struct mlx5e_ipsec_fc {
+       struct mlx5_fc *cnt;
+       struct mlx5_fc *drop;
 };
 
-struct mlx5e_ipsec_rx_err {
-       struct mlx5_flow_table *ft;
-       struct mlx5_flow_handle *rule;
-       struct mlx5_modify_hdr *copy_modify_hdr;
+struct mlx5e_ipsec_ft {
+       struct mutex mutex; /* Protect changes to this struct */
+       struct mlx5_flow_table *pol;
+       struct mlx5_flow_table *sa;
+       struct mlx5_flow_table *status;
+       u32 refcnt;
 };
 
-struct mlx5e_accel_fs_esp_prot {
-       struct mlx5_flow_table *ft;
-       struct mlx5_flow_group *miss_group;
-       struct mlx5_flow_handle *miss_rule;
-       struct mlx5_flow_destination default_dest;
-       struct mlx5e_ipsec_rx_err rx_err;
-       u32 refcnt;
-       struct mutex prot_mutex; /* protect ESP4/ESP6 protocol */
+struct mlx5e_ipsec_miss {
+       struct mlx5_flow_group *group;
+       struct mlx5_flow_handle *rule;
 };
 
-struct mlx5e_accel_fs_esp {
-       struct mlx5e_accel_fs_esp_prot fs_prot[ACCEL_FS_ESP_NUM_TYPES];
+struct mlx5e_ipsec_rx {
+       struct mlx5e_ipsec_ft ft;
+       struct mlx5e_ipsec_miss pol;
+       struct mlx5e_ipsec_miss sa;
+       struct mlx5e_ipsec_rule status;
+       struct mlx5e_ipsec_fc *fc;
 };
 
 struct mlx5e_ipsec_tx {
+       struct mlx5e_ipsec_ft ft;
+       struct mlx5e_ipsec_miss pol;
        struct mlx5_flow_namespace *ns;
-       struct mlx5_flow_table *ft;
-       struct mutex mutex; /* Protect IPsec TX steering */
-       u32 refcnt;
+       struct mlx5e_ipsec_fc *fc;
 };
 
 /* IPsec RX flow steering */
-static enum mlx5_traffic_types fs_esp2tt(enum accel_fs_esp_type i)
+static enum mlx5_traffic_types family2tt(u32 family)
 {
-       if (i == ACCEL_FS_ESP4)
+       if (family == AF_INET)
                return MLX5_TT_IPV4_IPSEC_ESP;
        return MLX5_TT_IPV6_IPSEC_ESP;
 }
 
-static int rx_err_add_rule(struct mlx5e_priv *priv,
-                          struct mlx5e_accel_fs_esp_prot *fs_prot,
-                          struct mlx5e_ipsec_rx_err *rx_err)
+static struct mlx5_flow_table *ipsec_ft_create(struct mlx5_flow_namespace *ns,
+                                              int level, int prio,
+                                              int max_num_groups)
+{
+       struct mlx5_flow_table_attr ft_attr = {};
+
+       ft_attr.autogroup.num_reserved_entries = 1;
+       ft_attr.autogroup.max_num_groups = max_num_groups;
+       ft_attr.max_fte = NUM_IPSEC_FTE;
+       ft_attr.level = level;
+       ft_attr.prio = prio;
+
+       return mlx5_create_auto_grouped_flow_table(ns, &ft_attr);
+}
+
+static int ipsec_status_rule(struct mlx5_core_dev *mdev,
+                            struct mlx5e_ipsec_rx *rx,
+                            struct mlx5_flow_destination *dest)
 {
        u8 action[MLX5_UN_SZ_BYTES(set_add_copy_action_in_auto)] = {};
-       struct mlx5_core_dev *mdev = priv->mdev;
        struct mlx5_flow_act flow_act = {};
        struct mlx5_modify_hdr *modify_hdr;
        struct mlx5_flow_handle *fte;
@@ -79,26 +93,26 @@ static int rx_err_add_rule(struct mlx5e_priv *priv,
 
        if (IS_ERR(modify_hdr)) {
                err = PTR_ERR(modify_hdr);
-               netdev_err(priv->netdev,
-                          "fail to alloc ipsec copy modify_header_id err=%d\n", err);
+               mlx5_core_err(mdev,
+                             "fail to alloc ipsec copy modify_header_id err=%d\n", err);
                goto out_spec;
        }
 
        /* create fte */
        flow_act.action = MLX5_FLOW_CONTEXT_ACTION_MOD_HDR |
-                         MLX5_FLOW_CONTEXT_ACTION_FWD_DEST;
+                         MLX5_FLOW_CONTEXT_ACTION_FWD_DEST |
+                         MLX5_FLOW_CONTEXT_ACTION_COUNT;
        flow_act.modify_hdr = modify_hdr;
-       fte = mlx5_add_flow_rules(rx_err->ft, spec, &flow_act,
-                                 &fs_prot->default_dest, 1);
+       fte = mlx5_add_flow_rules(rx->ft.status, spec, &flow_act, dest, 2);
        if (IS_ERR(fte)) {
                err = PTR_ERR(fte);
-               netdev_err(priv->netdev, "fail to add ipsec rx err copy rule err=%d\n", err);
+               mlx5_core_err(mdev, "fail to add ipsec rx err copy rule err=%d\n", err);
                goto out;
        }
 
        kvfree(spec);
-       rx_err->rule = fte;
-       rx_err->copy_modify_hdr = modify_hdr;
+       rx->status.rule = fte;
+       rx->status.modify_hdr = modify_hdr;
        return 0;
 
 out:
@@ -108,13 +122,12 @@ out_spec:
        return err;
 }
 
-static int rx_fs_create(struct mlx5e_priv *priv,
-                       struct mlx5e_accel_fs_esp_prot *fs_prot)
+static int ipsec_miss_create(struct mlx5_core_dev *mdev,
+                            struct mlx5_flow_table *ft,
+                            struct mlx5e_ipsec_miss *miss,
+                            struct mlx5_flow_destination *dest)
 {
        int inlen = MLX5_ST_SZ_BYTES(create_flow_group_in);
-       struct mlx5_flow_table *ft = fs_prot->ft;
-       struct mlx5_flow_group *miss_group;
-       struct mlx5_flow_handle *miss_rule;
        MLX5_DECLARE_FLOW_ACT(flow_act);
        struct mlx5_flow_spec *spec;
        u32 *flow_group_in;
@@ -130,450 +143,888 @@ static int rx_fs_create(struct mlx5e_priv *priv,
        /* Create miss_group */
        MLX5_SET(create_flow_group_in, flow_group_in, start_flow_index, ft->max_fte - 1);
        MLX5_SET(create_flow_group_in, flow_group_in, end_flow_index, ft->max_fte - 1);
-       miss_group = mlx5_create_flow_group(ft, flow_group_in);
-       if (IS_ERR(miss_group)) {
-               err = PTR_ERR(miss_group);
-               netdev_err(priv->netdev, "fail to create ipsec rx miss_group err=%d\n", err);
+       miss->group = mlx5_create_flow_group(ft, flow_group_in);
+       if (IS_ERR(miss->group)) {
+               err = PTR_ERR(miss->group);
+               mlx5_core_err(mdev, "fail to create IPsec miss_group err=%d\n",
+                             err);
                goto out;
        }
-       fs_prot->miss_group = miss_group;
 
        /* Create miss rule */
-       miss_rule = mlx5_add_flow_rules(ft, spec, &flow_act, &fs_prot->default_dest, 1);
-       if (IS_ERR(miss_rule)) {
-               mlx5_destroy_flow_group(fs_prot->miss_group);
-               err = PTR_ERR(miss_rule);
-               netdev_err(priv->netdev, "fail to create ipsec rx miss_rule err=%d\n", err);
+       miss->rule = mlx5_add_flow_rules(ft, spec, &flow_act, dest, 1);
+       if (IS_ERR(miss->rule)) {
+               mlx5_destroy_flow_group(miss->group);
+               err = PTR_ERR(miss->rule);
+               mlx5_core_err(mdev, "fail to create IPsec miss_rule err=%d\n",
+                             err);
                goto out;
        }
-       fs_prot->miss_rule = miss_rule;
 out:
        kvfree(flow_group_in);
        kvfree(spec);
        return err;
 }
 
-static void rx_destroy(struct mlx5e_priv *priv, enum accel_fs_esp_type type)
+static void rx_destroy(struct mlx5_core_dev *mdev, struct mlx5e_ipsec_rx *rx)
 {
-       struct mlx5e_accel_fs_esp_prot *fs_prot;
-       struct mlx5e_accel_fs_esp *accel_esp;
-
-       accel_esp = priv->ipsec->rx_fs;
-
-       /* The netdev unreg already happened, so all offloaded rule are already removed */
-       fs_prot = &accel_esp->fs_prot[type];
+       mlx5_del_flow_rules(rx->pol.rule);
+       mlx5_destroy_flow_group(rx->pol.group);
+       mlx5_destroy_flow_table(rx->ft.pol);
 
-       mlx5_del_flow_rules(fs_prot->miss_rule);
-       mlx5_destroy_flow_group(fs_prot->miss_group);
-       mlx5_destroy_flow_table(fs_prot->ft);
+       mlx5_del_flow_rules(rx->sa.rule);
+       mlx5_destroy_flow_group(rx->sa.group);
+       mlx5_destroy_flow_table(rx->ft.sa);
 
-       mlx5_del_flow_rules(fs_prot->rx_err.rule);
-       mlx5_modify_header_dealloc(priv->mdev, fs_prot->rx_err.copy_modify_hdr);
-       mlx5_destroy_flow_table(fs_prot->rx_err.ft);
+       mlx5_del_flow_rules(rx->status.rule);
+       mlx5_modify_header_dealloc(mdev, rx->status.modify_hdr);
+       mlx5_destroy_flow_table(rx->ft.status);
 }
 
-static int rx_create(struct mlx5e_priv *priv, enum accel_fs_esp_type type)
+static int rx_create(struct mlx5_core_dev *mdev, struct mlx5e_ipsec *ipsec,
+                    struct mlx5e_ipsec_rx *rx, u32 family)
 {
-       struct mlx5_flow_namespace *ns = mlx5e_fs_get_ns(priv->fs, false);
-       struct mlx5_ttc_table *ttc = mlx5e_fs_get_ttc(priv->fs, false);
-       struct mlx5_flow_table_attr ft_attr = {};
-       struct mlx5e_accel_fs_esp_prot *fs_prot;
-       struct mlx5e_accel_fs_esp *accel_esp;
+       struct mlx5_flow_namespace *ns = mlx5e_fs_get_ns(ipsec->fs, false);
+       struct mlx5_ttc_table *ttc = mlx5e_fs_get_ttc(ipsec->fs, false);
+       struct mlx5_flow_destination dest[2];
        struct mlx5_flow_table *ft;
        int err;
 
-       accel_esp = priv->ipsec->rx_fs;
-       fs_prot = &accel_esp->fs_prot[type];
-       fs_prot->default_dest =
-               mlx5_ttc_get_default_dest(ttc, fs_esp2tt(type));
-
-       ft_attr.max_fte = 1;
-       ft_attr.autogroup.max_num_groups = 1;
-       ft_attr.level = MLX5E_ACCEL_FS_ESP_FT_ERR_LEVEL;
-       ft_attr.prio = MLX5E_NIC_PRIO;
-       ft = mlx5_create_auto_grouped_flow_table(ns, &ft_attr);
+       ft = ipsec_ft_create(ns, MLX5E_ACCEL_FS_ESP_FT_ERR_LEVEL,
+                            MLX5E_NIC_PRIO, 1);
        if (IS_ERR(ft))
                return PTR_ERR(ft);
 
-       fs_prot->rx_err.ft = ft;
-       err = rx_err_add_rule(priv, fs_prot, &fs_prot->rx_err);
+       rx->ft.status = ft;
+
+       dest[0] = mlx5_ttc_get_default_dest(ttc, family2tt(family));
+       dest[1].type = MLX5_FLOW_DESTINATION_TYPE_COUNTER;
+       dest[1].counter_id = mlx5_fc_id(rx->fc->cnt);
+       err = ipsec_status_rule(mdev, rx, dest);
        if (err)
                goto err_add;
 
        /* Create FT */
-       ft_attr.max_fte = NUM_IPSEC_FTE;
-       ft_attr.level = MLX5E_ACCEL_FS_ESP_FT_LEVEL;
-       ft_attr.prio = MLX5E_NIC_PRIO;
-       ft_attr.autogroup.num_reserved_entries = 1;
-       ft_attr.autogroup.max_num_groups = 1;
-       ft = mlx5_create_auto_grouped_flow_table(ns, &ft_attr);
+       ft = ipsec_ft_create(ns, MLX5E_ACCEL_FS_ESP_FT_LEVEL, MLX5E_NIC_PRIO,
+                            2);
        if (IS_ERR(ft)) {
                err = PTR_ERR(ft);
                goto err_fs_ft;
        }
-       fs_prot->ft = ft;
+       rx->ft.sa = ft;
 
-       err = rx_fs_create(priv, fs_prot);
+       err = ipsec_miss_create(mdev, rx->ft.sa, &rx->sa, dest);
        if (err)
                goto err_fs;
 
+       ft = ipsec_ft_create(ns, MLX5E_ACCEL_FS_POL_FT_LEVEL, MLX5E_NIC_PRIO,
+                            2);
+       if (IS_ERR(ft)) {
+               err = PTR_ERR(ft);
+               goto err_pol_ft;
+       }
+       rx->ft.pol = ft;
+       memset(dest, 0x00, 2 * sizeof(*dest));
+       dest[0].type = MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE;
+       dest[0].ft = rx->ft.sa;
+       err = ipsec_miss_create(mdev, rx->ft.pol, &rx->pol, dest);
+       if (err)
+               goto err_pol_miss;
+
        return 0;
 
+err_pol_miss:
+       mlx5_destroy_flow_table(rx->ft.pol);
+err_pol_ft:
+       mlx5_del_flow_rules(rx->sa.rule);
+       mlx5_destroy_flow_group(rx->sa.group);
 err_fs:
-       mlx5_destroy_flow_table(fs_prot->ft);
+       mlx5_destroy_flow_table(rx->ft.sa);
 err_fs_ft:
-       mlx5_del_flow_rules(fs_prot->rx_err.rule);
-       mlx5_modify_header_dealloc(priv->mdev, fs_prot->rx_err.copy_modify_hdr);
+       mlx5_del_flow_rules(rx->status.rule);
+       mlx5_modify_header_dealloc(mdev, rx->status.modify_hdr);
 err_add:
-       mlx5_destroy_flow_table(fs_prot->rx_err.ft);
+       mlx5_destroy_flow_table(rx->ft.status);
        return err;
 }
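
Taken together, rx_create() chains three tables per address family. A rough sketch of the resulting pipeline, as read from the code above (not authoritative):

	/*
	 * RX pipeline (sketch), per address family:
	 *
	 *   TTC (ESP) -> ft.pol -> ft.sa -> ft.status -> TTC default dest
	 *
	 * A pol miss forwards to ft.sa; an sa miss forwards to the TTC
	 * default destination; the status rule counts and applies the
	 * copy modify-header before forwarding on.
	 */
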
 
-static int rx_ft_get(struct mlx5e_priv *priv, enum accel_fs_esp_type type)
+static struct mlx5e_ipsec_rx *rx_ft_get(struct mlx5_core_dev *mdev,
+                                       struct mlx5e_ipsec *ipsec, u32 family)
 {
-       struct mlx5_ttc_table *ttc = mlx5e_fs_get_ttc(priv->fs, false);
-       struct mlx5e_accel_fs_esp_prot *fs_prot;
+       struct mlx5_ttc_table *ttc = mlx5e_fs_get_ttc(ipsec->fs, false);
        struct mlx5_flow_destination dest = {};
-       struct mlx5e_accel_fs_esp *accel_esp;
+       struct mlx5e_ipsec_rx *rx;
        int err = 0;
 
-       accel_esp = priv->ipsec->rx_fs;
-       fs_prot = &accel_esp->fs_prot[type];
-       mutex_lock(&fs_prot->prot_mutex);
-       if (fs_prot->refcnt)
+       if (family == AF_INET)
+               rx = ipsec->rx_ipv4;
+       else
+               rx = ipsec->rx_ipv6;
+
+       mutex_lock(&rx->ft.mutex);
+       if (rx->ft.refcnt)
                goto skip;
 
        /* create FT */
-       err = rx_create(priv, type);
+       err = rx_create(mdev, ipsec, rx, family);
        if (err)
                goto out;
 
        /* connect */
        dest.type = MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE;
-       dest.ft = fs_prot->ft;
-       mlx5_ttc_fwd_dest(ttc, fs_esp2tt(type), &dest);
+       dest.ft = rx->ft.pol;
+       mlx5_ttc_fwd_dest(ttc, family2tt(family), &dest);
 
 skip:
-       fs_prot->refcnt++;
+       rx->ft.refcnt++;
 out:
-       mutex_unlock(&fs_prot->prot_mutex);
-       return err;
+       mutex_unlock(&rx->ft.mutex);
+       if (err)
+               return ERR_PTR(err);
+       return rx;
 }
 
-static void rx_ft_put(struct mlx5e_priv *priv, enum accel_fs_esp_type type)
+static void rx_ft_put(struct mlx5_core_dev *mdev, struct mlx5e_ipsec *ipsec,
+                     u32 family)
 {
-       struct mlx5_ttc_table *ttc = mlx5e_fs_get_ttc(priv->fs, false);
-       struct mlx5e_accel_fs_esp_prot *fs_prot;
-       struct mlx5e_accel_fs_esp *accel_esp;
+       struct mlx5_ttc_table *ttc = mlx5e_fs_get_ttc(ipsec->fs, false);
+       struct mlx5e_ipsec_rx *rx;
+
+       if (family == AF_INET)
+               rx = ipsec->rx_ipv4;
+       else
+               rx = ipsec->rx_ipv6;
 
-       accel_esp = priv->ipsec->rx_fs;
-       fs_prot = &accel_esp->fs_prot[type];
-       mutex_lock(&fs_prot->prot_mutex);
-       fs_prot->refcnt--;
-       if (fs_prot->refcnt)
+       mutex_lock(&rx->ft.mutex);
+       rx->ft.refcnt--;
+       if (rx->ft.refcnt)
                goto out;
 
        /* disconnect */
-       mlx5_ttc_fwd_default_dest(ttc, fs_esp2tt(type));
+       mlx5_ttc_fwd_default_dest(ttc, family2tt(family));
 
        /* remove FT */
-       rx_destroy(priv, type);
+       rx_destroy(mdev, rx);
 
 out:
-       mutex_unlock(&fs_prot->prot_mutex);
+       mutex_unlock(&rx->ft.mutex);
 }
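
Both directions follow the same refcounted get/put discipline: the first get builds the tables and connects them, the last put disconnects and destroys them. A minimal hypothetical caller, using only the helpers above:

	struct mlx5e_ipsec_rx *rx;

	rx = rx_ft_get(mdev, ipsec, AF_INET);	/* first get creates tables */
	if (IS_ERR(rx))
		return PTR_ERR(rx);

	/* ... install rules in rx->ft.sa / rx->ft.pol ... */

	rx_ft_put(mdev, ipsec, AF_INET);	/* last put tears them down */
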
 
 /* IPsec TX flow steering */
-static int tx_create(struct mlx5e_priv *priv)
+static int tx_create(struct mlx5_core_dev *mdev, struct mlx5e_ipsec_tx *tx)
 {
-       struct mlx5_flow_table_attr ft_attr = {};
-       struct mlx5e_ipsec *ipsec = priv->ipsec;
+       struct mlx5_flow_destination dest = {};
        struct mlx5_flow_table *ft;
        int err;
 
-       ft_attr.max_fte = NUM_IPSEC_FTE;
-       ft_attr.autogroup.max_num_groups = 1;
-       ft = mlx5_create_auto_grouped_flow_table(ipsec->tx_fs->ns, &ft_attr);
+       ft = ipsec_ft_create(tx->ns, 1, 0, 4);
+       if (IS_ERR(ft))
+               return PTR_ERR(ft);
+
+       tx->ft.sa = ft;
+
+       ft = ipsec_ft_create(tx->ns, 0, 0, 2);
        if (IS_ERR(ft)) {
                err = PTR_ERR(ft);
-               netdev_err(priv->netdev, "fail to create ipsec tx ft err=%d\n", err);
-               return err;
+               goto err_pol_ft;
        }
-       ipsec->tx_fs->ft = ft;
+       tx->ft.pol = ft;
+       dest.type = MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE;
+       dest.ft = tx->ft.sa;
+       err = ipsec_miss_create(mdev, tx->ft.pol, &tx->pol, &dest);
+       if (err)
+               goto err_pol_miss;
        return 0;
+
+err_pol_miss:
+       mlx5_destroy_flow_table(tx->ft.pol);
+err_pol_ft:
+       mlx5_destroy_flow_table(tx->ft.sa);
+       return err;
 }
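
tx_create() mirrors this on egress with two tables; a sketch of the layout, as read from the levels passed to ipsec_ft_create():

	/*
	 * TX layout (sketch): ft.pol at level 0 feeds ft.sa at level 1,
	 * and the pol miss rule forwards straight to ft.sa, so traffic
	 * that matches no policy still reaches the SA table.
	 */
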
 
-static int tx_ft_get(struct mlx5e_priv *priv)
+static struct mlx5e_ipsec_tx *tx_ft_get(struct mlx5_core_dev *mdev,
+                                       struct mlx5e_ipsec *ipsec)
 {
-       struct mlx5e_ipsec_tx *tx_fs = priv->ipsec->tx_fs;
+       struct mlx5e_ipsec_tx *tx = ipsec->tx;
        int err = 0;
 
-       mutex_lock(&tx_fs->mutex);
-       if (tx_fs->refcnt)
+       mutex_lock(&tx->ft.mutex);
+       if (tx->ft.refcnt)
                goto skip;
 
-       err = tx_create(priv);
+       err = tx_create(mdev, tx);
        if (err)
                goto out;
 skip:
-       tx_fs->refcnt++;
+       tx->ft.refcnt++;
 out:
-       mutex_unlock(&tx_fs->mutex);
-       return err;
+       mutex_unlock(&tx->ft.mutex);
+       if (err)
+               return ERR_PTR(err);
+       return tx;
 }
 
-static void tx_ft_put(struct mlx5e_priv *priv)
+static void tx_ft_put(struct mlx5e_ipsec *ipsec)
 {
-       struct mlx5e_ipsec_tx *tx_fs = priv->ipsec->tx_fs;
+       struct mlx5e_ipsec_tx *tx = ipsec->tx;
 
-       mutex_lock(&tx_fs->mutex);
-       tx_fs->refcnt--;
-       if (tx_fs->refcnt)
+       mutex_lock(&tx->ft.mutex);
+       tx->ft.refcnt--;
+       if (tx->ft.refcnt)
                goto out;
 
-       mlx5_destroy_flow_table(tx_fs->ft);
+       mlx5_del_flow_rules(tx->pol.rule);
+       mlx5_destroy_flow_group(tx->pol.group);
+       mlx5_destroy_flow_table(tx->ft.pol);
+       mlx5_destroy_flow_table(tx->ft.sa);
 out:
-       mutex_unlock(&tx_fs->mutex);
+       mutex_unlock(&tx->ft.mutex);
 }
 
-static void setup_fte_common(struct mlx5_accel_esp_xfrm_attrs *attrs,
-                            u32 ipsec_obj_id,
-                            struct mlx5_flow_spec *spec,
-                            struct mlx5_flow_act *flow_act)
+static void setup_fte_addr4(struct mlx5_flow_spec *spec, __be32 *saddr,
+                           __be32 *daddr)
 {
-       u8 ip_version = attrs->is_ipv6 ? 6 : 4;
-
-       spec->match_criteria_enable = MLX5_MATCH_OUTER_HEADERS | MLX5_MATCH_MISC_PARAMETERS;
+       spec->match_criteria_enable |= MLX5_MATCH_OUTER_HEADERS;
 
-       /* ip_version */
        MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria, outer_headers.ip_version);
-       MLX5_SET(fte_match_param, spec->match_value, outer_headers.ip_version, ip_version);
+       MLX5_SET(fte_match_param, spec->match_value, outer_headers.ip_version, 4);
+
+       memcpy(MLX5_ADDR_OF(fte_match_param, spec->match_value,
+                           outer_headers.src_ipv4_src_ipv6.ipv4_layout.ipv4), saddr, 4);
+       memcpy(MLX5_ADDR_OF(fte_match_param, spec->match_value,
+                           outer_headers.dst_ipv4_dst_ipv6.ipv4_layout.ipv4), daddr, 4);
+       MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria,
+                        outer_headers.src_ipv4_src_ipv6.ipv4_layout.ipv4);
+       MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria,
+                        outer_headers.dst_ipv4_dst_ipv6.ipv4_layout.ipv4);
+}
 
-       /* Non fragmented */
-       MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria, outer_headers.frag);
-       MLX5_SET(fte_match_param, spec->match_value, outer_headers.frag, 0);
+static void setup_fte_addr6(struct mlx5_flow_spec *spec, __be32 *saddr,
+                           __be32 *daddr)
+{
+       spec->match_criteria_enable |= MLX5_MATCH_OUTER_HEADERS;
 
+       MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria, outer_headers.ip_version);
+       MLX5_SET(fte_match_param, spec->match_value, outer_headers.ip_version, 6);
+
+       memcpy(MLX5_ADDR_OF(fte_match_param, spec->match_value,
+                           outer_headers.src_ipv4_src_ipv6.ipv6_layout.ipv6), saddr, 16);
+       memcpy(MLX5_ADDR_OF(fte_match_param, spec->match_value,
+                           outer_headers.dst_ipv4_dst_ipv6.ipv6_layout.ipv6), daddr, 16);
+       memset(MLX5_ADDR_OF(fte_match_param, spec->match_criteria,
+                           outer_headers.src_ipv4_src_ipv6.ipv6_layout.ipv6), 0xff, 16);
+       memset(MLX5_ADDR_OF(fte_match_param, spec->match_criteria,
+                           outer_headers.dst_ipv4_dst_ipv6.ipv6_layout.ipv6), 0xff, 16);
+}
+
+static void setup_fte_esp(struct mlx5_flow_spec *spec)
+{
        /* ESP header */
+       spec->match_criteria_enable |= MLX5_MATCH_MISC_PARAMETERS;
+
        MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria, outer_headers.ip_protocol);
        MLX5_SET(fte_match_param, spec->match_value, outer_headers.ip_protocol, IPPROTO_ESP);
+}
 
+static void setup_fte_spi(struct mlx5_flow_spec *spec, u32 spi)
+{
        /* SPI number */
+       spec->match_criteria_enable |= MLX5_MATCH_MISC_PARAMETERS;
+
        MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria, misc_parameters.outer_esp_spi);
+       MLX5_SET(fte_match_param, spec->match_value, misc_parameters.outer_esp_spi, spi);
+}
+
+static void setup_fte_no_frags(struct mlx5_flow_spec *spec)
+{
+       /* Non fragmented */
+       spec->match_criteria_enable |= MLX5_MATCH_OUTER_HEADERS;
+
+       MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria, outer_headers.frag);
+       MLX5_SET(fte_match_param, spec->match_value, outer_headers.frag, 0);
+}
+
+static void setup_fte_reg_a(struct mlx5_flow_spec *spec)
+{
+       /* Add IPsec indicator in metadata_reg_a */
+       spec->match_criteria_enable |= MLX5_MATCH_MISC_PARAMETERS_2;
+
+       MLX5_SET(fte_match_param, spec->match_criteria,
+                misc_parameters_2.metadata_reg_a, MLX5_ETH_WQE_FT_META_IPSEC);
        MLX5_SET(fte_match_param, spec->match_value,
-                misc_parameters.outer_esp_spi, attrs->spi);
-
-       if (ip_version == 4) {
-               memcpy(MLX5_ADDR_OF(fte_match_param, spec->match_value,
-                                   outer_headers.src_ipv4_src_ipv6.ipv4_layout.ipv4),
-                      &attrs->saddr.a4, 4);
-               memcpy(MLX5_ADDR_OF(fte_match_param, spec->match_value,
-                                   outer_headers.dst_ipv4_dst_ipv6.ipv4_layout.ipv4),
-                      &attrs->daddr.a4, 4);
-               MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria,
-                                outer_headers.src_ipv4_src_ipv6.ipv4_layout.ipv4);
-               MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria,
-                                outer_headers.dst_ipv4_dst_ipv6.ipv4_layout.ipv4);
-       } else {
-               memcpy(MLX5_ADDR_OF(fte_match_param, spec->match_value,
-                                   outer_headers.src_ipv4_src_ipv6.ipv6_layout.ipv6),
-                      &attrs->saddr.a6, 16);
-               memcpy(MLX5_ADDR_OF(fte_match_param, spec->match_value,
-                                   outer_headers.dst_ipv4_dst_ipv6.ipv6_layout.ipv6),
-                      &attrs->daddr.a6, 16);
-               memset(MLX5_ADDR_OF(fte_match_param, spec->match_criteria,
-                                   outer_headers.src_ipv4_src_ipv6.ipv6_layout.ipv6),
-                      0xff, 16);
-               memset(MLX5_ADDR_OF(fte_match_param, spec->match_criteria,
-                                   outer_headers.dst_ipv4_dst_ipv6.ipv6_layout.ipv6),
-                      0xff, 16);
-       }
+                misc_parameters_2.metadata_reg_a, MLX5_ETH_WQE_FT_META_IPSEC);
+}
 
-       flow_act->crypto.type = MLX5_FLOW_CONTEXT_ENCRYPT_DECRYPT_TYPE_IPSEC;
-       flow_act->crypto.obj_id = ipsec_obj_id;
-       flow_act->flags |= FLOW_ACT_NO_APPEND;
+static void setup_fte_reg_c0(struct mlx5_flow_spec *spec, u32 reqid)
+{
+       /* Pass policy check before choosing this SA */
+       spec->match_criteria_enable |= MLX5_MATCH_MISC_PARAMETERS_2;
+
+       MLX5_SET(fte_match_param, spec->match_criteria,
+                misc_parameters_2.metadata_reg_c_0, reqid);
+       MLX5_SET(fte_match_param, spec->match_value,
+                misc_parameters_2.metadata_reg_c_0, reqid);
 }
 
-static int rx_add_rule(struct mlx5e_priv *priv,
-                      struct mlx5e_ipsec_sa_entry *sa_entry)
+static int setup_modify_header(struct mlx5_core_dev *mdev, u32 val, u8 dir,
+                              struct mlx5_flow_act *flow_act)
 {
        u8 action[MLX5_UN_SZ_BYTES(set_add_copy_action_in_auto)] = {};
-       struct mlx5e_ipsec_rule *ipsec_rule = &sa_entry->ipsec_rule;
+       enum mlx5_flow_namespace_type ns_type;
+       struct mlx5_modify_hdr *modify_hdr;
+
+       MLX5_SET(set_action_in, action, action_type, MLX5_ACTION_TYPE_SET);
+       switch (dir) {
+       case XFRM_DEV_OFFLOAD_IN:
+               MLX5_SET(set_action_in, action, field,
+                        MLX5_ACTION_IN_FIELD_METADATA_REG_B);
+               ns_type = MLX5_FLOW_NAMESPACE_KERNEL;
+               break;
+       case XFRM_DEV_OFFLOAD_OUT:
+               MLX5_SET(set_action_in, action, field,
+                        MLX5_ACTION_IN_FIELD_METADATA_REG_C_0);
+               ns_type = MLX5_FLOW_NAMESPACE_EGRESS;
+               break;
+       default:
+               return -EINVAL;
+       }
+
+       MLX5_SET(set_action_in, action, data, val);
+       MLX5_SET(set_action_in, action, offset, 0);
+       MLX5_SET(set_action_in, action, length, 32);
+
+       modify_hdr = mlx5_modify_header_alloc(mdev, ns_type, 1, action);
+       if (IS_ERR(modify_hdr)) {
+               mlx5_core_err(mdev, "Failed to allocate modify_header %ld\n",
+                             PTR_ERR(modify_hdr));
+               return PTR_ERR(modify_hdr);
+       }
+
+       flow_act->modify_hdr = modify_hdr;
+       flow_act->action |= MLX5_FLOW_CONTEXT_ACTION_MOD_HDR;
+       return 0;
+}
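
The direction picks both the register and the namespace: RX writes metadata REG_B in the kernel namespace, while TX writes REG_C_0 in the egress namespace so setup_fte_reg_c0() can later match it. For instance, rx_add_rule() below tags decrypted packets with the object id plus a marker bit (bit 31, per the scheme the replaced inline code documented):

	err = setup_modify_header(mdev, sa_entry->ipsec_obj_id | BIT(31),
				  XFRM_DEV_OFFLOAD_IN, &flow_act);
	if (err)
		goto err_mod_header;
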
+
+static int setup_pkt_reformat(struct mlx5_core_dev *mdev,
+                             struct mlx5_accel_esp_xfrm_attrs *attrs,
+                             struct mlx5_flow_act *flow_act)
+{
+       enum mlx5_flow_namespace_type ns_type = MLX5_FLOW_NAMESPACE_EGRESS;
+       struct mlx5_pkt_reformat_params reformat_params = {};
+       struct mlx5_pkt_reformat *pkt_reformat;
+       u8 reformatbf[16] = {};
+       __be32 spi;
+
+       if (attrs->dir == XFRM_DEV_OFFLOAD_IN) {
+               reformat_params.type = MLX5_REFORMAT_TYPE_DEL_ESP_TRANSPORT;
+               ns_type = MLX5_FLOW_NAMESPACE_KERNEL;
+               goto cmd;
+       }
+
+       if (attrs->family == AF_INET)
+               reformat_params.type =
+                       MLX5_REFORMAT_TYPE_ADD_ESP_TRANSPORT_OVER_IPV4;
+       else
+               reformat_params.type =
+                       MLX5_REFORMAT_TYPE_ADD_ESP_TRANSPORT_OVER_IPV6;
+
+       /* convert to network format */
+       spi = htonl(attrs->spi);
+       memcpy(reformatbf, &spi, 4);
+
+       reformat_params.param_0 = attrs->authsize;
+       reformat_params.size = sizeof(reformatbf);
+       reformat_params.data = &reformatbf;
+
+cmd:
+       pkt_reformat =
+               mlx5_packet_reformat_alloc(mdev, &reformat_params, ns_type);
+       if (IS_ERR(pkt_reformat))
+               return PTR_ERR(pkt_reformat);
+
+       flow_act->pkt_reformat = pkt_reformat;
+       flow_act->action |= MLX5_FLOW_CONTEXT_ACTION_PACKET_REFORMAT;
+       return 0;
+}
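
Reformat is only needed for packet offload: RX deletes the ESP transport header while TX adds one, seeded with the SPI in network byte order (the rest of the 16-byte buffer stays zero). Both SA paths gate the call the same way:

	switch (attrs->type) {
	case XFRM_DEV_OFFLOAD_PACKET:
		err = setup_pkt_reformat(mdev, attrs, &flow_act);
		if (err)
			goto err_pkt_reformat;
		break;
	default:
		break;
	}
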
+
+static int rx_add_rule(struct mlx5e_ipsec_sa_entry *sa_entry)
+{
        struct mlx5_accel_esp_xfrm_attrs *attrs = &sa_entry->attrs;
-       u32 ipsec_obj_id = sa_entry->ipsec_obj_id;
-       struct mlx5_modify_hdr *modify_hdr = NULL;
-       struct mlx5e_accel_fs_esp_prot *fs_prot;
+       struct mlx5_core_dev *mdev = mlx5e_ipsec_sa2dev(sa_entry);
+       struct mlx5e_ipsec *ipsec = sa_entry->ipsec;
        struct mlx5_flow_destination dest = {};
-       struct mlx5e_accel_fs_esp *accel_esp;
        struct mlx5_flow_act flow_act = {};
        struct mlx5_flow_handle *rule;
-       enum accel_fs_esp_type type;
        struct mlx5_flow_spec *spec;
-       int err = 0;
+       struct mlx5e_ipsec_rx *rx;
+       int err;
 
-       accel_esp = priv->ipsec->rx_fs;
-       type = attrs->is_ipv6 ? ACCEL_FS_ESP6 : ACCEL_FS_ESP4;
-       fs_prot = &accel_esp->fs_prot[type];
-
-       err = rx_ft_get(priv, type);
-       if (err)
-               return err;
+       rx = rx_ft_get(mdev, ipsec, attrs->family);
+       if (IS_ERR(rx))
+               return PTR_ERR(rx);
 
        spec = kvzalloc(sizeof(*spec), GFP_KERNEL);
        if (!spec) {
                err = -ENOMEM;
-               goto out_err;
+               goto err_alloc;
        }
 
-       setup_fte_common(attrs, ipsec_obj_id, spec, &flow_act);
+       if (attrs->family == AF_INET)
+               setup_fte_addr4(spec, &attrs->saddr.a4, &attrs->daddr.a4);
+       else
+               setup_fte_addr6(spec, attrs->saddr.a6, attrs->daddr.a6);
 
-       /* Set bit[31] ipsec marker */
-       /* Set bit[23-0] ipsec_obj_id */
-       MLX5_SET(set_action_in, action, action_type, MLX5_ACTION_TYPE_SET);
-       MLX5_SET(set_action_in, action, field, MLX5_ACTION_IN_FIELD_METADATA_REG_B);
-       MLX5_SET(set_action_in, action, data, (ipsec_obj_id | BIT(31)));
-       MLX5_SET(set_action_in, action, offset, 0);
-       MLX5_SET(set_action_in, action, length, 32);
+       setup_fte_spi(spec, attrs->spi);
+       setup_fte_esp(spec);
+       setup_fte_no_frags(spec);
 
-       modify_hdr = mlx5_modify_header_alloc(priv->mdev, MLX5_FLOW_NAMESPACE_KERNEL,
-                                             1, action);
-       if (IS_ERR(modify_hdr)) {
-               err = PTR_ERR(modify_hdr);
-               netdev_err(priv->netdev,
-                          "fail to alloc ipsec set modify_header_id err=%d\n", err);
-               modify_hdr = NULL;
-               goto out_err;
+       err = setup_modify_header(mdev, sa_entry->ipsec_obj_id | BIT(31),
+                                 XFRM_DEV_OFFLOAD_IN, &flow_act);
+       if (err)
+               goto err_mod_header;
+
+       switch (attrs->type) {
+       case XFRM_DEV_OFFLOAD_PACKET:
+               err = setup_pkt_reformat(mdev, attrs, &flow_act);
+               if (err)
+                       goto err_pkt_reformat;
+               break;
+       default:
+               break;
        }
 
-       flow_act.action = MLX5_FLOW_CONTEXT_ACTION_FWD_DEST |
-                         MLX5_FLOW_CONTEXT_ACTION_CRYPTO_DECRYPT |
-                         MLX5_FLOW_CONTEXT_ACTION_MOD_HDR;
+       flow_act.crypto.type = MLX5_FLOW_CONTEXT_ENCRYPT_DECRYPT_TYPE_IPSEC;
+       flow_act.crypto.obj_id = sa_entry->ipsec_obj_id;
+       flow_act.flags |= FLOW_ACT_NO_APPEND;
+       flow_act.action |= MLX5_FLOW_CONTEXT_ACTION_FWD_DEST |
+                          MLX5_FLOW_CONTEXT_ACTION_CRYPTO_DECRYPT;
        dest.type = MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE;
-       flow_act.modify_hdr = modify_hdr;
-       dest.ft = fs_prot->rx_err.ft;
-       rule = mlx5_add_flow_rules(fs_prot->ft, spec, &flow_act, &dest, 1);
+       dest.ft = rx->ft.status;
+       rule = mlx5_add_flow_rules(rx->ft.sa, spec, &flow_act, &dest, 1);
        if (IS_ERR(rule)) {
                err = PTR_ERR(rule);
-               netdev_err(priv->netdev, "fail to add ipsec rule attrs->action=0x%x, err=%d\n",
-                          attrs->action, err);
-               goto out_err;
+               mlx5_core_err(mdev, "fail to add RX ipsec rule err=%d\n", err);
+               goto err_add_flow;
        }
+       kvfree(spec);
 
-       ipsec_rule->rule = rule;
-       ipsec_rule->set_modify_hdr = modify_hdr;
-       goto out;
-
-out_err:
-       if (modify_hdr)
-               mlx5_modify_header_dealloc(priv->mdev, modify_hdr);
-       rx_ft_put(priv, type);
+       sa_entry->ipsec_rule.rule = rule;
+       sa_entry->ipsec_rule.modify_hdr = flow_act.modify_hdr;
+       sa_entry->ipsec_rule.pkt_reformat = flow_act.pkt_reformat;
+       return 0;
 
-out:
+err_add_flow:
+       if (flow_act.pkt_reformat)
+               mlx5_packet_reformat_dealloc(mdev, flow_act.pkt_reformat);
+err_pkt_reformat:
+       mlx5_modify_header_dealloc(mdev, flow_act.modify_hdr);
+err_mod_header:
        kvfree(spec);
+err_alloc:
+       rx_ft_put(mdev, ipsec, attrs->family);
        return err;
 }
 
-static int tx_add_rule(struct mlx5e_priv *priv,
-                      struct mlx5e_ipsec_sa_entry *sa_entry)
+static int tx_add_rule(struct mlx5e_ipsec_sa_entry *sa_entry)
 {
+       struct mlx5_accel_esp_xfrm_attrs *attrs = &sa_entry->attrs;
+       struct mlx5_core_dev *mdev = mlx5e_ipsec_sa2dev(sa_entry);
+       struct mlx5e_ipsec *ipsec = sa_entry->ipsec;
+       struct mlx5_flow_destination dest = {};
        struct mlx5_flow_act flow_act = {};
        struct mlx5_flow_handle *rule;
        struct mlx5_flow_spec *spec;
+       struct mlx5e_ipsec_tx *tx;
        int err = 0;
 
-       err = tx_ft_get(priv);
-       if (err)
-               return err;
+       tx = tx_ft_get(mdev, ipsec);
+       if (IS_ERR(tx))
+               return PTR_ERR(tx);
 
        spec = kvzalloc(sizeof(*spec), GFP_KERNEL);
        if (!spec) {
                err = -ENOMEM;
-               goto out;
+               goto err_alloc;
        }
 
-       setup_fte_common(&sa_entry->attrs, sa_entry->ipsec_obj_id, spec,
-                        &flow_act);
+       if (attrs->family == AF_INET)
+               setup_fte_addr4(spec, &attrs->saddr.a4, &attrs->daddr.a4);
+       else
+               setup_fte_addr6(spec, attrs->saddr.a6, attrs->daddr.a6);
+
+       setup_fte_no_frags(spec);
+
+       switch (attrs->type) {
+       case XFRM_DEV_OFFLOAD_CRYPTO:
+               setup_fte_spi(spec, attrs->spi);
+               setup_fte_esp(spec);
+               setup_fte_reg_a(spec);
+               break;
+       case XFRM_DEV_OFFLOAD_PACKET:
+               setup_fte_reg_c0(spec, attrs->reqid);
+               err = setup_pkt_reformat(mdev, attrs, &flow_act);
+               if (err)
+                       goto err_pkt_reformat;
+               break;
+       default:
+               break;
+       }
 
-       /* Add IPsec indicator in metadata_reg_a */
-       spec->match_criteria_enable |= MLX5_MATCH_MISC_PARAMETERS_2;
-       MLX5_SET(fte_match_param, spec->match_criteria, misc_parameters_2.metadata_reg_a,
-                MLX5_ETH_WQE_FT_META_IPSEC);
-       MLX5_SET(fte_match_param, spec->match_value, misc_parameters_2.metadata_reg_a,
-                MLX5_ETH_WQE_FT_META_IPSEC);
-
-       flow_act.action = MLX5_FLOW_CONTEXT_ACTION_ALLOW |
-                         MLX5_FLOW_CONTEXT_ACTION_CRYPTO_ENCRYPT;
-       rule = mlx5_add_flow_rules(priv->ipsec->tx_fs->ft, spec, &flow_act, NULL, 0);
+       flow_act.crypto.type = MLX5_FLOW_CONTEXT_ENCRYPT_DECRYPT_TYPE_IPSEC;
+       flow_act.crypto.obj_id = sa_entry->ipsec_obj_id;
+       flow_act.flags |= FLOW_ACT_NO_APPEND;
+       flow_act.action |= MLX5_FLOW_CONTEXT_ACTION_ALLOW |
+                          MLX5_FLOW_CONTEXT_ACTION_CRYPTO_ENCRYPT |
+                          MLX5_FLOW_CONTEXT_ACTION_COUNT;
+       dest.type = MLX5_FLOW_DESTINATION_TYPE_COUNTER;
+       dest.counter_id = mlx5_fc_id(tx->fc->cnt);
+       rule = mlx5_add_flow_rules(tx->ft.sa, spec, &flow_act, &dest, 1);
        if (IS_ERR(rule)) {
                err = PTR_ERR(rule);
-               netdev_err(priv->netdev, "fail to add ipsec rule attrs->action=0x%x, err=%d\n",
-                               sa_entry->attrs.action, err);
-               goto out;
+               mlx5_core_err(mdev, "fail to add TX ipsec rule err=%d\n", err);
+               goto err_add_flow;
        }
 
+       kvfree(spec);
        sa_entry->ipsec_rule.rule = rule;
+       sa_entry->ipsec_rule.pkt_reformat = flow_act.pkt_reformat;
+       return 0;
 
-out:
+err_add_flow:
+       if (flow_act.pkt_reformat)
+               mlx5_packet_reformat_dealloc(mdev, flow_act.pkt_reformat);
+err_pkt_reformat:
        kvfree(spec);
+err_alloc:
+       tx_ft_put(ipsec);
+       return err;
+}
+
+static int tx_add_policy(struct mlx5e_ipsec_pol_entry *pol_entry)
+{
+       struct mlx5_accel_pol_xfrm_attrs *attrs = &pol_entry->attrs;
+       struct mlx5_core_dev *mdev = mlx5e_ipsec_pol2dev(pol_entry);
+       struct mlx5_flow_destination dest[2] = {};
+       struct mlx5_flow_act flow_act = {};
+       struct mlx5_flow_handle *rule;
+       struct mlx5_flow_spec *spec;
+       struct mlx5e_ipsec_tx *tx;
+       int err, dstn = 0;
+
+       tx = tx_ft_get(mdev, pol_entry->ipsec);
+       if (IS_ERR(tx))
+               return PTR_ERR(tx);
+
+       spec = kvzalloc(sizeof(*spec), GFP_KERNEL);
+       if (!spec) {
+               err = -ENOMEM;
+               goto err_alloc;
+       }
+
+       if (attrs->family == AF_INET)
+               setup_fte_addr4(spec, &attrs->saddr.a4, &attrs->daddr.a4);
+       else
+               setup_fte_addr6(spec, attrs->saddr.a6, attrs->daddr.a6);
+
+       setup_fte_no_frags(spec);
+
+       err = setup_modify_header(mdev, attrs->reqid, XFRM_DEV_OFFLOAD_OUT,
+                                 &flow_act);
        if (err)
-               tx_ft_put(priv);
+               goto err_mod_header;
+
+       switch (attrs->action) {
+       case XFRM_POLICY_ALLOW:
+               flow_act.action |= MLX5_FLOW_CONTEXT_ACTION_FWD_DEST;
+               break;
+       case XFRM_POLICY_BLOCK:
+               flow_act.action |= MLX5_FLOW_CONTEXT_ACTION_DROP |
+                                  MLX5_FLOW_CONTEXT_ACTION_COUNT;
+               dest[dstn].type = MLX5_FLOW_DESTINATION_TYPE_COUNTER;
+               dest[dstn].counter_id = mlx5_fc_id(tx->fc->drop);
+               dstn++;
+               break;
+       default:
+               WARN_ON(true);
+               err = -EINVAL;
+               goto err_action;
+       }
+
+       flow_act.flags |= FLOW_ACT_NO_APPEND;
+       dest[dstn].ft = tx->ft.sa;
+       dest[dstn].type = MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE;
+       dstn++;
+       rule = mlx5_add_flow_rules(tx->ft.pol, spec, &flow_act, dest, dstn);
+       if (IS_ERR(rule)) {
+               err = PTR_ERR(rule);
+               mlx5_core_err(mdev, "fail to add TX IPsec policy rule err=%d\n", err);
+               goto err_action;
+       }
+
+       kvfree(spec);
+       pol_entry->ipsec_rule.rule = rule;
+       pol_entry->ipsec_rule.modify_hdr = flow_act.modify_hdr;
+       return 0;
+
+err_action:
+       mlx5_modify_header_dealloc(mdev, flow_act.modify_hdr);
+err_mod_header:
+       kvfree(spec);
+err_alloc:
+       tx_ft_put(pol_entry->ipsec);
        return err;
 }
 
-int mlx5e_accel_ipsec_fs_add_rule(struct mlx5e_priv *priv,
-                                 struct mlx5e_ipsec_sa_entry *sa_entry)
+static int rx_add_policy(struct mlx5e_ipsec_pol_entry *pol_entry)
 {
-       if (sa_entry->attrs.action == MLX5_ACCEL_ESP_ACTION_ENCRYPT)
-               return tx_add_rule(priv, sa_entry);
+       struct mlx5_accel_pol_xfrm_attrs *attrs = &pol_entry->attrs;
+       struct mlx5_core_dev *mdev = mlx5e_ipsec_pol2dev(pol_entry);
+       struct mlx5_flow_destination dest[2];
+       struct mlx5_flow_act flow_act = {};
+       struct mlx5_flow_handle *rule;
+       struct mlx5_flow_spec *spec;
+       struct mlx5e_ipsec_rx *rx;
+       int err, dstn = 0;
+
+       rx = rx_ft_get(mdev, pol_entry->ipsec, attrs->family);
+       if (IS_ERR(rx))
+               return PTR_ERR(rx);
+
+       spec = kvzalloc(sizeof(*spec), GFP_KERNEL);
+       if (!spec) {
+               err = -ENOMEM;
+               goto err_alloc;
+       }
 
-       return rx_add_rule(priv, sa_entry);
+       if (attrs->family == AF_INET)
+               setup_fte_addr4(spec, &attrs->saddr.a4, &attrs->daddr.a4);
+       else
+               setup_fte_addr6(spec, attrs->saddr.a6, attrs->daddr.a6);
+
+       setup_fte_no_frags(spec);
+
+       switch (attrs->action) {
+       case XFRM_POLICY_ALLOW:
+               flow_act.action |= MLX5_FLOW_CONTEXT_ACTION_FWD_DEST;
+               break;
+       case XFRM_POLICY_BLOCK:
+               flow_act.action |= MLX5_FLOW_CONTEXT_ACTION_DROP | MLX5_FLOW_CONTEXT_ACTION_COUNT;
+               dest[dstn].type = MLX5_FLOW_DESTINATION_TYPE_COUNTER;
+               dest[dstn].counter_id = mlx5_fc_id(rx->fc->drop);
+               dstn++;
+               break;
+       default:
+               WARN_ON(true);
+               err = -EINVAL;
+               goto err_action;
+       }
+
+       flow_act.flags |= FLOW_ACT_NO_APPEND;
+       dest[dstn].type = MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE;
+       dest[dstn].ft = rx->ft.sa;
+       dstn++;
+       rule = mlx5_add_flow_rules(rx->ft.pol, spec, &flow_act, dest, dstn);
+       if (IS_ERR(rule)) {
+               err = PTR_ERR(rule);
+               mlx5_core_err(mdev, "Fail to add RX IPsec policy rule err=%d\n", err);
+               goto err_action;
+       }
+
+       kvfree(spec);
+       pol_entry->ipsec_rule.rule = rule;
+       return 0;
+
+err_action:
+       kvfree(spec);
+err_alloc:
+       rx_ft_put(mdev, pol_entry->ipsec, attrs->family);
+       return err;
 }
 
-void mlx5e_accel_ipsec_fs_del_rule(struct mlx5e_priv *priv,
-                                  struct mlx5e_ipsec_sa_entry *sa_entry)
+static void ipsec_fs_destroy_counters(struct mlx5e_ipsec *ipsec)
+{
+       struct mlx5e_ipsec_rx *rx_ipv4 = ipsec->rx_ipv4;
+       struct mlx5_core_dev *mdev = ipsec->mdev;
+       struct mlx5e_ipsec_tx *tx = ipsec->tx;
+
+       mlx5_fc_destroy(mdev, tx->fc->drop);
+       mlx5_fc_destroy(mdev, tx->fc->cnt);
+       kfree(tx->fc);
+       mlx5_fc_destroy(mdev, rx_ipv4->fc->drop);
+       mlx5_fc_destroy(mdev, rx_ipv4->fc->cnt);
+       kfree(rx_ipv4->fc);
+}
+
+static int ipsec_fs_init_counters(struct mlx5e_ipsec *ipsec)
+{
+       struct mlx5e_ipsec_rx *rx_ipv4 = ipsec->rx_ipv4;
+       struct mlx5e_ipsec_rx *rx_ipv6 = ipsec->rx_ipv6;
+       struct mlx5_core_dev *mdev = ipsec->mdev;
+       struct mlx5e_ipsec_tx *tx = ipsec->tx;
+       struct mlx5e_ipsec_fc *fc;
+       struct mlx5_fc *counter;
+       int err;
+
+       fc = kzalloc(sizeof(*rx_ipv4->fc), GFP_KERNEL);
+       if (!fc)
+               return -ENOMEM;
+
+       /* Both IPv4 and IPv6 point to the same flow counters struct. */
+       rx_ipv4->fc = fc;
+       rx_ipv6->fc = fc;
+       counter = mlx5_fc_create(mdev, false);
+       if (IS_ERR(counter)) {
+               err = PTR_ERR(counter);
+               goto err_rx_cnt;
+       }
+
+       fc->cnt = counter;
+       counter = mlx5_fc_create(mdev, false);
+       if (IS_ERR(counter)) {
+               err = PTR_ERR(counter);
+               goto err_rx_drop;
+       }
+
+       fc->drop = counter;
+       fc = kzalloc(sizeof(*tx->fc), GFP_KERNEL);
+       if (!fc) {
+               err = -ENOMEM;
+               goto err_tx_fc;
+       }
+
+       tx->fc = fc;
+       counter = mlx5_fc_create(mdev, false);
+       if (IS_ERR(counter)) {
+               err = PTR_ERR(counter);
+               goto err_tx_cnt;
+       }
+
+       fc->cnt = counter;
+       counter = mlx5_fc_create(mdev, false);
+       if (IS_ERR(counter)) {
+               err = PTR_ERR(counter);
+               goto err_tx_drop;
+       }
+
+       fc->drop = counter;
+       return 0;
+
+err_tx_drop:
+       mlx5_fc_destroy(mdev, tx->fc->cnt);
+err_tx_cnt:
+       kfree(tx->fc);
+err_tx_fc:
+       mlx5_fc_destroy(mdev, rx_ipv4->fc->drop);
+err_rx_drop:
+       mlx5_fc_destroy(mdev, rx_ipv4->fc->cnt);
+err_rx_cnt:
+       kfree(rx_ipv4->fc);
+       return err;
+}
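
Since rx_ipv4 and rx_ipv6 share one mlx5e_ipsec_fc pair, teardown frees only rx_ipv4->fc. Reading a counter back is one query per packet/byte pair; a sketch matching what mlx5e_accel_ipsec_fs_read_stats() does below:

	u64 packets, bytes;

	mlx5_fc_query(mdev, ipsec->rx_ipv4->fc->cnt, &packets, &bytes);
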
+
+void mlx5e_accel_ipsec_fs_read_stats(struct mlx5e_priv *priv, void *ipsec_stats)
+{
+       struct mlx5_core_dev *mdev = priv->mdev;
+       struct mlx5e_ipsec *ipsec = priv->ipsec;
+       struct mlx5e_ipsec_hw_stats *stats;
+       struct mlx5e_ipsec_fc *fc;
+
+       stats = (struct mlx5e_ipsec_hw_stats *)ipsec_stats;
+
+       stats->ipsec_rx_pkts = 0;
+       stats->ipsec_rx_bytes = 0;
+       stats->ipsec_rx_drop_pkts = 0;
+       stats->ipsec_rx_drop_bytes = 0;
+       stats->ipsec_tx_pkts = 0;
+       stats->ipsec_tx_bytes = 0;
+       stats->ipsec_tx_drop_pkts = 0;
+       stats->ipsec_tx_drop_bytes = 0;
+
+       fc = ipsec->rx_ipv4->fc;
+       mlx5_fc_query(mdev, fc->cnt, &stats->ipsec_rx_pkts, &stats->ipsec_rx_bytes);
+       mlx5_fc_query(mdev, fc->drop, &stats->ipsec_rx_drop_pkts,
+                     &stats->ipsec_rx_drop_bytes);
+
+       fc = ipsec->tx->fc;
+       mlx5_fc_query(mdev, fc->cnt, &stats->ipsec_tx_pkts, &stats->ipsec_tx_bytes);
+       mlx5_fc_query(mdev, fc->drop, &stats->ipsec_tx_drop_pkts,
+                     &stats->ipsec_tx_drop_bytes);
+}
+
+int mlx5e_accel_ipsec_fs_add_rule(struct mlx5e_ipsec_sa_entry *sa_entry)
+{
+       if (sa_entry->attrs.dir == XFRM_DEV_OFFLOAD_OUT)
+               return tx_add_rule(sa_entry);
+
+       return rx_add_rule(sa_entry);
+}
+
+void mlx5e_accel_ipsec_fs_del_rule(struct mlx5e_ipsec_sa_entry *sa_entry)
 {
        struct mlx5e_ipsec_rule *ipsec_rule = &sa_entry->ipsec_rule;
        struct mlx5_core_dev *mdev = mlx5e_ipsec_sa2dev(sa_entry);
 
        mlx5_del_flow_rules(ipsec_rule->rule);
 
-       if (sa_entry->attrs.action == MLX5_ACCEL_ESP_ACTION_ENCRYPT) {
-               tx_ft_put(priv);
+       if (ipsec_rule->pkt_reformat)
+               mlx5_packet_reformat_dealloc(mdev, ipsec_rule->pkt_reformat);
+
+       if (sa_entry->attrs.dir == XFRM_DEV_OFFLOAD_OUT) {
+               tx_ft_put(sa_entry->ipsec);
                return;
        }
 
-       mlx5_modify_header_dealloc(mdev, ipsec_rule->set_modify_hdr);
-       rx_ft_put(priv,
-                 sa_entry->attrs.is_ipv6 ? ACCEL_FS_ESP6 : ACCEL_FS_ESP4);
+       mlx5_modify_header_dealloc(mdev, ipsec_rule->modify_hdr);
+       rx_ft_put(mdev, sa_entry->ipsec, sa_entry->attrs.family);
 }
 
-void mlx5e_accel_ipsec_fs_cleanup(struct mlx5e_ipsec *ipsec)
+int mlx5e_accel_ipsec_fs_add_pol(struct mlx5e_ipsec_pol_entry *pol_entry)
 {
-       struct mlx5e_accel_fs_esp_prot *fs_prot;
-       struct mlx5e_accel_fs_esp *accel_esp;
-       enum accel_fs_esp_type i;
+       if (pol_entry->attrs.dir == XFRM_DEV_OFFLOAD_OUT)
+               return tx_add_policy(pol_entry);
 
-       if (!ipsec->rx_fs)
-               return;
+       return rx_add_policy(pol_entry);
+}
+
+void mlx5e_accel_ipsec_fs_del_pol(struct mlx5e_ipsec_pol_entry *pol_entry)
+{
+       struct mlx5e_ipsec_rule *ipsec_rule = &pol_entry->ipsec_rule;
+       struct mlx5_core_dev *mdev = mlx5e_ipsec_pol2dev(pol_entry);
 
-       mutex_destroy(&ipsec->tx_fs->mutex);
-       WARN_ON(ipsec->tx_fs->refcnt);
-       kfree(ipsec->tx_fs);
+       mlx5_del_flow_rules(ipsec_rule->rule);
 
-       accel_esp = ipsec->rx_fs;
-       for (i = 0; i < ACCEL_FS_ESP_NUM_TYPES; i++) {
-               fs_prot = &accel_esp->fs_prot[i];
-               mutex_destroy(&fs_prot->prot_mutex);
-               WARN_ON(fs_prot->refcnt);
+       if (pol_entry->attrs.dir == XFRM_DEV_OFFLOAD_IN) {
+               rx_ft_put(mdev, pol_entry->ipsec, pol_entry->attrs.family);
+               return;
        }
-       kfree(ipsec->rx_fs);
+
+       mlx5_modify_header_dealloc(mdev, ipsec_rule->modify_hdr);
+       tx_ft_put(pol_entry->ipsec);
+}
+
+void mlx5e_accel_ipsec_fs_cleanup(struct mlx5e_ipsec *ipsec)
+{
+       if (!ipsec->tx)
+               return;
+
+       ipsec_fs_destroy_counters(ipsec);
+       mutex_destroy(&ipsec->tx->ft.mutex);
+       WARN_ON(ipsec->tx->ft.refcnt);
+       kfree(ipsec->tx);
+
+       mutex_destroy(&ipsec->rx_ipv4->ft.mutex);
+       WARN_ON(ipsec->rx_ipv4->ft.refcnt);
+       kfree(ipsec->rx_ipv4);
+
+       mutex_destroy(&ipsec->rx_ipv6->ft.mutex);
+       WARN_ON(ipsec->rx_ipv6->ft.refcnt);
+       kfree(ipsec->rx_ipv6);
 }
 
 int mlx5e_accel_ipsec_fs_init(struct mlx5e_ipsec *ipsec)
 {
-       struct mlx5e_accel_fs_esp_prot *fs_prot;
-       struct mlx5e_accel_fs_esp *accel_esp;
        struct mlx5_flow_namespace *ns;
-       enum accel_fs_esp_type i;
        int err = -ENOMEM;
 
        ns = mlx5_get_flow_namespace(ipsec->mdev,
@@ -581,26 +1032,34 @@ int mlx5e_accel_ipsec_fs_init(struct mlx5e_ipsec *ipsec)
        if (!ns)
                return -EOPNOTSUPP;
 
-       ipsec->tx_fs = kzalloc(sizeof(*ipsec->tx_fs), GFP_KERNEL);
-       if (!ipsec->tx_fs)
+       ipsec->tx = kzalloc(sizeof(*ipsec->tx), GFP_KERNEL);
+       if (!ipsec->tx)
                return -ENOMEM;
 
-       ipsec->rx_fs = kzalloc(sizeof(*ipsec->rx_fs), GFP_KERNEL);
-       if (!ipsec->rx_fs)
-               goto err_rx;
+       ipsec->rx_ipv4 = kzalloc(sizeof(*ipsec->rx_ipv4), GFP_KERNEL);
+       if (!ipsec->rx_ipv4)
+               goto err_rx_ipv4;
 
-       mutex_init(&ipsec->tx_fs->mutex);
-       ipsec->tx_fs->ns = ns;
+       ipsec->rx_ipv6 = kzalloc(sizeof(*ipsec->rx_ipv6), GFP_KERNEL);
+       if (!ipsec->rx_ipv6)
+               goto err_rx_ipv6;
 
-       accel_esp = ipsec->rx_fs;
-       for (i = 0; i < ACCEL_FS_ESP_NUM_TYPES; i++) {
-               fs_prot = &accel_esp->fs_prot[i];
-               mutex_init(&fs_prot->prot_mutex);
-       }
+       err = ipsec_fs_init_counters(ipsec);
+       if (err)
+               goto err_counters;
+
+       mutex_init(&ipsec->tx->ft.mutex);
+       mutex_init(&ipsec->rx_ipv4->ft.mutex);
+       mutex_init(&ipsec->rx_ipv6->ft.mutex);
+       ipsec->tx->ns = ns;
 
        return 0;
 
-err_rx:
-       kfree(ipsec->tx_fs);
+err_counters:
+       kfree(ipsec->rx_ipv6);
+err_rx_ipv6:
+       kfree(ipsec->rx_ipv4);
+err_rx_ipv4:
+       kfree(ipsec->tx);
        return err;
 }
index 792724c..8e36142 100644 (file)
@@ -2,9 +2,14 @@
 /* Copyright (c) 2017, Mellanox Technologies inc. All rights reserved. */
 
 #include "mlx5_core.h"
+#include "en.h"
 #include "ipsec.h"
 #include "lib/mlx5.h"
 
+enum {
+       MLX5_IPSEC_ASO_REMOVE_FLOW_PKT_CNT_OFFSET,
+};
+
 u32 mlx5_ipsec_device_caps(struct mlx5_core_dev *mdev)
 {
        u32 caps = 0;
@@ -31,6 +36,12 @@ u32 mlx5_ipsec_device_caps(struct mlx5_core_dev *mdev)
            MLX5_CAP_ETH(mdev, insert_trailer) && MLX5_CAP_ETH(mdev, swp))
                caps |= MLX5_IPSEC_CAP_CRYPTO;
 
+       if (MLX5_CAP_IPSEC(mdev, ipsec_full_offload) &&
+           MLX5_CAP_FLOWTABLE_NIC_TX(mdev, reformat_add_esp_trasport) &&
+           MLX5_CAP_FLOWTABLE_NIC_RX(mdev, reformat_del_esp_trasport) &&
+           MLX5_CAP_FLOWTABLE_NIC_RX(mdev, decap))
+               caps |= MLX5_IPSEC_CAP_PACKET_OFFLOAD;
+
        if (!caps)
                return 0;
 
@@ -46,6 +57,52 @@ u32 mlx5_ipsec_device_caps(struct mlx5_core_dev *mdev)
 }
 EXPORT_SYMBOL_GPL(mlx5_ipsec_device_caps);
 
+static void mlx5e_ipsec_packet_setup(void *obj, u32 pdn,
+                                    struct mlx5_accel_esp_xfrm_attrs *attrs)
+{
+       void *aso_ctx;
+
+       aso_ctx = MLX5_ADDR_OF(ipsec_obj, obj, ipsec_aso);
+       if (attrs->esn_trigger) {
+               MLX5_SET(ipsec_aso, aso_ctx, esn_event_arm, 1);
+
+               if (attrs->dir == XFRM_DEV_OFFLOAD_IN) {
+                       MLX5_SET(ipsec_aso, aso_ctx, window_sz,
+                                attrs->replay_window / 64);
+                       MLX5_SET(ipsec_aso, aso_ctx, mode,
+                                MLX5_IPSEC_ASO_REPLAY_PROTECTION);
+               }
+       }
+
+       /* ASO context */
+       MLX5_SET(ipsec_obj, obj, ipsec_aso_access_pd, pdn);
+       MLX5_SET(ipsec_obj, obj, full_offload, 1);
+       MLX5_SET(ipsec_aso, aso_ctx, valid, 1);
+       /* MLX5_IPSEC_ASO_REG_C_4_5 is a type C register that flow
+        * steering matches against. Be aware that this register was
+        * chosen arbitrarily and can't be used elsewhere as long as
+        * IPsec packet offload is active.
+        */
+       MLX5_SET(ipsec_obj, obj, aso_return_reg, MLX5_IPSEC_ASO_REG_C_4_5);
+       if (attrs->dir == XFRM_DEV_OFFLOAD_OUT)
+               MLX5_SET(ipsec_aso, aso_ctx, mode, MLX5_IPSEC_ASO_INC_SN);
+
+       if (attrs->hard_packet_limit != XFRM_INF) {
+               MLX5_SET(ipsec_aso, aso_ctx, remove_flow_pkt_cnt,
+                        lower_32_bits(attrs->hard_packet_limit));
+               MLX5_SET(ipsec_aso, aso_ctx, hard_lft_arm, 1);
+               MLX5_SET(ipsec_aso, aso_ctx, remove_flow_enable, 1);
+       }
+
+       if (attrs->soft_packet_limit != XFRM_INF) {
+               MLX5_SET(ipsec_aso, aso_ctx, remove_flow_soft_lft,
+                        lower_32_bits(attrs->soft_packet_limit));
+
+               MLX5_SET(ipsec_aso, aso_ctx, soft_lft_arm, 1);
+       }
+}
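
Note that only the low 32 bits of each xfrm lifetime fit in the ASO context; a hedged sketch of the truncation, with a hypothetical limit above 2^32:

	u64 hard = (1ULL << 32) + 10;	/* hypothetical hard_packet_limit */

	/* programs 10, not 2^32 + 10 */
	MLX5_SET(ipsec_aso, aso_ctx, remove_flow_pkt_cnt, lower_32_bits(hard));
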
+
 static int mlx5_create_ipsec_obj(struct mlx5e_ipsec_sa_entry *sa_entry)
 {
        struct mlx5_accel_esp_xfrm_attrs *attrs = &sa_entry->attrs;
@@ -54,6 +111,7 @@ static int mlx5_create_ipsec_obj(struct mlx5e_ipsec_sa_entry *sa_entry)
        u32 out[MLX5_ST_SZ_DW(general_obj_out_cmd_hdr)];
        u32 in[MLX5_ST_SZ_DW(create_ipsec_obj_in)] = {};
        void *obj, *salt_p, *salt_iv_p;
+       struct mlx5e_hw_objs *res;
        int err;
 
        obj = MLX5_ADDR_OF(create_ipsec_obj_in, in, ipsec_object);
@@ -66,11 +124,10 @@ static int mlx5_create_ipsec_obj(struct mlx5e_ipsec_sa_entry *sa_entry)
        salt_iv_p = MLX5_ADDR_OF(ipsec_obj, obj, implicit_iv);
        memcpy(salt_iv_p, &aes_gcm->seq_iv, sizeof(aes_gcm->seq_iv));
        /* esn */
-       if (attrs->flags & MLX5_ACCEL_ESP_FLAGS_ESN_TRIGGERED) {
+       if (attrs->esn_trigger) {
                MLX5_SET(ipsec_obj, obj, esn_en, 1);
                MLX5_SET(ipsec_obj, obj, esn_msb, attrs->esn);
-               if (attrs->flags & MLX5_ACCEL_ESP_FLAGS_ESN_STATE_OVERLAP)
-                       MLX5_SET(ipsec_obj, obj, esn_overlap, 1);
+               MLX5_SET(ipsec_obj, obj, esn_overlap, attrs->esn_overlap);
        }
 
        MLX5_SET(ipsec_obj, obj, dekn, sa_entry->enc_key_id);
@@ -81,6 +138,10 @@ static int mlx5_create_ipsec_obj(struct mlx5e_ipsec_sa_entry *sa_entry)
        MLX5_SET(general_obj_in_cmd_hdr, in, obj_type,
                 MLX5_GENERAL_OBJECT_TYPES_IPSEC);
 
+       res = &mdev->mlx5e_res.hw_objs;
+       if (attrs->type == XFRM_DEV_OFFLOAD_PACKET)
+               mlx5e_ipsec_packet_setup(obj, res->pdn, attrs);
+
        err = mlx5_cmd_exec(mdev, in, sizeof(in), out, sizeof(out));
        if (!err)
                sa_entry->ipsec_obj_id =
@@ -152,7 +213,7 @@ static int mlx5_modify_ipsec_obj(struct mlx5e_ipsec_sa_entry *sa_entry,
        void *obj;
        int err;
 
-       if (!(attrs->flags & MLX5_ACCEL_ESP_FLAGS_ESN_TRIGGERED))
+       if (!attrs->esn_trigger)
                return 0;
 
        general_obj_types = MLX5_CAP_GEN_64(mdev, general_obj_types);
@@ -183,8 +244,7 @@ static int mlx5_modify_ipsec_obj(struct mlx5e_ipsec_sa_entry *sa_entry,
                   MLX5_MODIFY_IPSEC_BITMASK_ESN_OVERLAP |
                           MLX5_MODIFY_IPSEC_BITMASK_ESN_MSB);
        MLX5_SET(ipsec_obj, obj, esn_msb, attrs->esn);
-       if (attrs->flags & MLX5_ACCEL_ESP_FLAGS_ESN_STATE_OVERLAP)
-               MLX5_SET(ipsec_obj, obj, esn_overlap, 1);
+       MLX5_SET(ipsec_obj, obj, esn_overlap, attrs->esn_overlap);
 
        /* general object fields set */
        MLX5_SET(general_obj_in_cmd_hdr, in, opcode, MLX5_CMD_OP_MODIFY_GENERAL_OBJECT);
@@ -203,3 +263,234 @@ void mlx5_accel_esp_modify_xfrm(struct mlx5e_ipsec_sa_entry *sa_entry,
 
        memcpy(&sa_entry->attrs, attrs, sizeof(sa_entry->attrs));
 }
+
+static void
+mlx5e_ipsec_aso_update_esn(struct mlx5e_ipsec_sa_entry *sa_entry,
+                          const struct mlx5_accel_esp_xfrm_attrs *attrs)
+{
+       struct mlx5_wqe_aso_ctrl_seg data = {};
+
+       data.data_mask_mode = MLX5_ASO_DATA_MASK_MODE_BITWISE_64BIT << 6;
+       data.condition_1_0_operand = MLX5_ASO_ALWAYS_TRUE |
+                                    MLX5_ASO_ALWAYS_TRUE << 4;
+       data.data_offset_condition_operand = MLX5_IPSEC_ASO_REMOVE_FLOW_PKT_CNT_OFFSET;
+       data.bitwise_data = cpu_to_be64(BIT_ULL(54));
+       data.data_mask = data.bitwise_data;
+
+       mlx5e_ipsec_aso_query(sa_entry, &data);
+}
+
+static void mlx5e_ipsec_update_esn_state(struct mlx5e_ipsec_sa_entry *sa_entry,
+                                        u32 mode_param)
+{
+       struct mlx5_accel_esp_xfrm_attrs attrs = {};
+
+       if (mode_param < MLX5E_IPSEC_ESN_SCOPE_MID) {
+               sa_entry->esn_state.esn++;
+               sa_entry->esn_state.overlap = 0;
+       } else {
+               sa_entry->esn_state.overlap = 1;
+       }
+
+       mlx5e_ipsec_build_accel_xfrm_attrs(sa_entry, &attrs);
+       mlx5_accel_esp_modify_xfrm(sa_entry, &attrs);
+       mlx5e_ipsec_aso_update_esn(sa_entry, &attrs);
+}
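
A short reading of the branch above (an interpretation, not taken from the patch itself):

	/*
	 * mode_parameter tracks the low 32-bit sequence space. Seeing it
	 * back below MLX5E_IPSEC_ESN_SCOPE_MID after an event means the
	 * counter wrapped, so the ESN high bits advance and overlap is
	 * cleared; a value still in the upper half only marks overlap.
	 */
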
+
+static void mlx5e_ipsec_handle_event(struct work_struct *_work)
+{
+       struct mlx5e_ipsec_work *work =
+               container_of(_work, struct mlx5e_ipsec_work, work);
+       struct mlx5_accel_esp_xfrm_attrs *attrs;
+       struct mlx5e_ipsec_sa_entry *sa_entry;
+       struct mlx5e_ipsec_aso *aso;
+       struct mlx5e_ipsec *ipsec;
+       int ret;
+
+       sa_entry = xa_load(&work->ipsec->sadb, work->id);
+       if (!sa_entry)
+               goto out;
+
+       ipsec = sa_entry->ipsec;
+       aso = ipsec->aso;
+       attrs = &sa_entry->attrs;
+
+       spin_lock(&sa_entry->x->lock);
+       ret = mlx5e_ipsec_aso_query(sa_entry, NULL);
+       if (ret)
+               goto unlock;
+
+       aso->use_cache = true;
+       if (attrs->esn_trigger &&
+           !MLX5_GET(ipsec_aso, aso->ctx, esn_event_arm)) {
+               u32 mode_param = MLX5_GET(ipsec_aso, aso->ctx, mode_parameter);
+
+               mlx5e_ipsec_update_esn_state(sa_entry, mode_param);
+       }
+
+       if (attrs->soft_packet_limit != XFRM_INF)
+               if (!MLX5_GET(ipsec_aso, aso->ctx, soft_lft_arm) ||
+                   !MLX5_GET(ipsec_aso, aso->ctx, hard_lft_arm) ||
+                   !MLX5_GET(ipsec_aso, aso->ctx, remove_flow_enable))
+                       xfrm_state_check_expire(sa_entry->x);
+       aso->use_cache = false;
+
+unlock:
+       spin_unlock(&sa_entry->x->lock);
+out:
+       kfree(work);
+}
+
+static int mlx5e_ipsec_event(struct notifier_block *nb, unsigned long event,
+                            void *data)
+{
+       struct mlx5e_ipsec *ipsec = container_of(nb, struct mlx5e_ipsec, nb);
+       struct mlx5_eqe_obj_change *object;
+       struct mlx5e_ipsec_work *work;
+       struct mlx5_eqe *eqe = data;
+       u16 type;
+
+       if (event != MLX5_EVENT_TYPE_OBJECT_CHANGE)
+               return NOTIFY_DONE;
+
+       object = &eqe->data.obj_change;
+       type = be16_to_cpu(object->obj_type);
+
+       if (type != MLX5_GENERAL_OBJECT_TYPES_IPSEC)
+               return NOTIFY_DONE;
+
+       work = kmalloc(sizeof(*work), GFP_ATOMIC);
+       if (!work)
+               return NOTIFY_DONE;
+
+       INIT_WORK(&work->work, mlx5e_ipsec_handle_event);
+       work->ipsec = ipsec;
+       work->id = be32_to_cpu(object->obj_id);
+
+       queue_work(ipsec->wq, &work->work);
+       return NOTIFY_OK;
+}
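
The notifier above may be invoked from a context that cannot sleep, which is
why it does nothing beyond a GFP_ATOMIC allocation to capture the object id
before deferring to a workqueue; the deferred handler is then free to take the
state lock and talk to the hardware. A rough userspace analog of that handoff,
with a synchronous run_deferred() standing in for queue_work() (all names in
the sketch are made up):

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

struct work_item {
	uint32_t obj_id;
	void (*fn)(struct work_item *);
};

/* Stand-in for queue_work(): a real workqueue would run this later, in a
 * context that is allowed to sleep and take locks.
 */
static void run_deferred(struct work_item *w)
{
	w->fn(w);
}

static void handle_event(struct work_item *w)
{
	printf("processing object %u in deferred context\n", w->obj_id);
	free(w);		/* the work item owns itself, as in the driver */
}

/* Stand-in for the notifier callback: do the minimum in "atomic" context. */
static int on_event(uint32_t obj_id)
{
	struct work_item *w = malloc(sizeof(*w));

	if (!w)
		return -1;	/* like NOTIFY_DONE: drop the event, never block */
	w->obj_id = obj_id;
	w->fn = handle_event;
	run_deferred(w);
	return 0;
}

int main(void)
{
	return on_event(42) ? 1 : 0;
}
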
+
+int mlx5e_ipsec_aso_init(struct mlx5e_ipsec *ipsec)
+{
+       struct mlx5_core_dev *mdev = ipsec->mdev;
+       struct mlx5e_ipsec_aso *aso;
+       struct mlx5e_hw_objs *res;
+       struct device *pdev;
+       int err;
+
+       aso = kzalloc(sizeof(*ipsec->aso), GFP_KERNEL);
+       if (!aso)
+               return -ENOMEM;
+
+       res = &mdev->mlx5e_res.hw_objs;
+
+       pdev = mlx5_core_dma_dev(mdev);
+       aso->dma_addr = dma_map_single(pdev, aso->ctx, sizeof(aso->ctx),
+                                      DMA_BIDIRECTIONAL);
+       err = dma_mapping_error(pdev, aso->dma_addr);
+       if (err)
+               goto err_dma;
+
+       aso->aso = mlx5_aso_create(mdev, res->pdn);
+       if (IS_ERR(aso->aso)) {
+               err = PTR_ERR(aso->aso);
+               goto err_aso_create;
+       }
+
+       ipsec->nb.notifier_call = mlx5e_ipsec_event;
+       mlx5_notifier_register(mdev, &ipsec->nb);
+
+       ipsec->aso = aso;
+       return 0;
+
+err_aso_create:
+       dma_unmap_single(pdev, aso->dma_addr, sizeof(aso->ctx),
+                        DMA_BIDIRECTIONAL);
+err_dma:
+       kfree(aso);
+       return err;
+}
+
+void mlx5e_ipsec_aso_cleanup(struct mlx5e_ipsec *ipsec)
+{
+       struct mlx5_core_dev *mdev = ipsec->mdev;
+       struct mlx5e_ipsec_aso *aso;
+       struct device *pdev;
+
+       aso = ipsec->aso;
+       pdev = mlx5_core_dma_dev(mdev);
+
+       mlx5_notifier_unregister(mdev, &ipsec->nb);
+       mlx5_aso_destroy(aso->aso);
+       dma_unmap_single(pdev, aso->dma_addr, sizeof(aso->ctx),
+                        DMA_BIDIRECTIONAL);
+       kfree(aso);
+}
+
+static void mlx5e_ipsec_aso_copy(struct mlx5_wqe_aso_ctrl_seg *ctrl,
+                                struct mlx5_wqe_aso_ctrl_seg *data)
+{
+       if (!data)
+               return;
+
+       ctrl->data_mask_mode = data->data_mask_mode;
+       ctrl->condition_1_0_operand = data->condition_1_0_operand;
+       ctrl->condition_1_0_offset = data->condition_1_0_offset;
+       ctrl->data_offset_condition_operand = data->data_offset_condition_operand;
+       ctrl->condition_0_data = data->condition_0_data;
+       ctrl->condition_0_mask = data->condition_0_mask;
+       ctrl->condition_1_data = data->condition_1_data;
+       ctrl->condition_1_mask = data->condition_1_mask;
+       ctrl->bitwise_data = data->bitwise_data;
+       ctrl->data_mask = data->data_mask;
+}
+
+int mlx5e_ipsec_aso_query(struct mlx5e_ipsec_sa_entry *sa_entry,
+                         struct mlx5_wqe_aso_ctrl_seg *data)
+{
+       struct mlx5e_ipsec *ipsec = sa_entry->ipsec;
+       struct mlx5e_ipsec_aso *aso = ipsec->aso;
+       struct mlx5_core_dev *mdev = ipsec->mdev;
+       struct mlx5_wqe_aso_ctrl_seg *ctrl;
+       struct mlx5e_hw_objs *res;
+       struct mlx5_aso_wqe *wqe;
+       u8 ds_cnt;
+
+       lockdep_assert_held(&sa_entry->x->lock);
+       if (aso->use_cache)
+               return 0;
+
+       res = &mdev->mlx5e_res.hw_objs;
+
+       memset(aso->ctx, 0, sizeof(aso->ctx));
+       wqe = mlx5_aso_get_wqe(aso->aso);
+       ds_cnt = DIV_ROUND_UP(sizeof(*wqe), MLX5_SEND_WQE_DS);
+       mlx5_aso_build_wqe(aso->aso, ds_cnt, wqe, sa_entry->ipsec_obj_id,
+                          MLX5_ACCESS_ASO_OPC_MOD_IPSEC);
+
+       ctrl = &wqe->aso_ctrl;
+       ctrl->va_l =
+               cpu_to_be32(lower_32_bits(aso->dma_addr) | ASO_CTRL_READ_EN);
+       ctrl->va_h = cpu_to_be32(upper_32_bits(aso->dma_addr));
+       ctrl->l_key = cpu_to_be32(res->mkey);
+       mlx5e_ipsec_aso_copy(ctrl, data);
+
+       mlx5_aso_post_wqe(aso->aso, false, &wqe->ctrl);
+       return mlx5_aso_poll_cq(aso->aso, false);
+}
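
Note how the control segment carries the host buffer: the 64-bit DMA address
is split across va_l/va_h, and the read-enable flag rides in the low word. A
small sketch of that packing, where READ_EN is an assumed stand-in for
ASO_CTRL_READ_EN and the mapped buffer is assumed aligned enough that the low
address bits are free for flags:

#include <stdint.h>
#include <stdio.h>

#define READ_EN 0x1u	/* assumed stand-in for ASO_CTRL_READ_EN */

static uint32_t lower_32_bits(uint64_t v) { return (uint32_t)v; }
static uint32_t upper_32_bits(uint64_t v) { return (uint32_t)(v >> 32); }

int main(void)
{
	uint64_t dma_addr = 0x0000123456789000ull;	/* example mapping */
	uint32_t va_l = lower_32_bits(dma_addr) | READ_EN;
	uint32_t va_h = upper_32_bits(dma_addr);

	printf("va_l=0x%08x va_h=0x%08x\n", va_l, va_h);
	return 0;
}
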
+
+void mlx5e_ipsec_aso_update_curlft(struct mlx5e_ipsec_sa_entry *sa_entry,
+                                  u64 *packets)
+{
+       struct mlx5e_ipsec *ipsec = sa_entry->ipsec;
+       struct mlx5e_ipsec_aso *aso = ipsec->aso;
+       u64 hard_cnt;
+
+       hard_cnt = MLX5_GET(ipsec_aso, aso->ctx, remove_flow_pkt_cnt);
+       /* HW decreases the limit till it reaches zero to fire an event.
+        * We need to fix the calculation, so the returned count is the total
+        * number of passed packets and not how many are left.
+        */
+       *packets = sa_entry->attrs.hard_packet_limit - hard_cnt;
+}
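
Since the device counts remove_flow_pkt_cnt down from the configured hard
limit toward zero, recovering a "packets passed" figure is the single
subtraction the function above performs. For example:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t hard_packet_limit = 1000000;	/* configured at SA setup */
	uint64_t remaining = 999200;		/* read back from the ASO ctx */

	/* Same fix-up as mlx5e_ipsec_aso_update_curlft(). */
	uint64_t passed = hard_packet_limit - remaining;

	printf("%llu packets passed\n", (unsigned long long)passed);
	return 0;
}
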
index 6859f1c..eab5bc7 100644
@@ -312,27 +312,31 @@ void mlx5e_ipsec_offload_handle_rx_skb(struct net_device *netdev,
                                       struct mlx5_cqe64 *cqe)
 {
        u32 ipsec_meta_data = be32_to_cpu(cqe->ft_metadata);
-       struct mlx5e_priv *priv;
+       struct mlx5e_priv *priv = netdev_priv(netdev);
+       struct mlx5e_ipsec *ipsec = priv->ipsec;
+       struct mlx5e_ipsec_sa_entry *sa_entry;
        struct xfrm_offload *xo;
-       struct xfrm_state *xs;
        struct sec_path *sp;
        u32  sa_handle;
 
        sa_handle = MLX5_IPSEC_METADATA_HANDLE(ipsec_meta_data);
-       priv = netdev_priv(netdev);
        sp = secpath_set(skb);
        if (unlikely(!sp)) {
-               atomic64_inc(&priv->ipsec->sw_stats.ipsec_rx_drop_sp_alloc);
+               atomic64_inc(&ipsec->sw_stats.ipsec_rx_drop_sp_alloc);
                return;
        }
 
-       xs = mlx5e_ipsec_sadb_rx_lookup(priv->ipsec, sa_handle);
-       if (unlikely(!xs)) {
-               atomic64_inc(&priv->ipsec->sw_stats.ipsec_rx_drop_sadb_miss);
+       rcu_read_lock();
+       sa_entry = xa_load(&ipsec->sadb, sa_handle);
+       if (unlikely(!sa_entry)) {
+               rcu_read_unlock();
+               atomic64_inc(&ipsec->sw_stats.ipsec_rx_drop_sadb_miss);
                return;
        }
+       xfrm_state_hold(sa_entry->x);
+       rcu_read_unlock();
 
-       sp->xvec[sp->len++] = xs;
+       sp->xvec[sp->len++] = sa_entry->x;
        sp->olen++;
 
        xo = xfrm_offload(skb);
@@ -349,6 +353,6 @@ void mlx5e_ipsec_offload_handle_rx_skb(struct net_device *netdev,
                xo->status = CRYPTO_INVALID_PACKET_SYNTAX;
                break;
        default:
-               atomic64_inc(&priv->ipsec->sw_stats.ipsec_rx_drop_syndrome);
+               atomic64_inc(&ipsec->sw_stats.ipsec_rx_drop_syndrome);
        }
 }
index 9de8482..e0e36a0 100644
 #include "en.h"
 #include "ipsec.h"
 
+static const struct counter_desc mlx5e_ipsec_hw_stats_desc[] = {
+       { MLX5E_DECLARE_STAT(struct mlx5e_ipsec_hw_stats, ipsec_rx_pkts) },
+       { MLX5E_DECLARE_STAT(struct mlx5e_ipsec_hw_stats, ipsec_rx_bytes) },
+       { MLX5E_DECLARE_STAT(struct mlx5e_ipsec_hw_stats, ipsec_rx_drop_pkts) },
+       { MLX5E_DECLARE_STAT(struct mlx5e_ipsec_hw_stats, ipsec_rx_drop_bytes) },
+       { MLX5E_DECLARE_STAT(struct mlx5e_ipsec_hw_stats, ipsec_tx_pkts) },
+       { MLX5E_DECLARE_STAT(struct mlx5e_ipsec_hw_stats, ipsec_tx_bytes) },
+       { MLX5E_DECLARE_STAT(struct mlx5e_ipsec_hw_stats, ipsec_tx_drop_pkts) },
+       { MLX5E_DECLARE_STAT(struct mlx5e_ipsec_hw_stats, ipsec_tx_drop_bytes) },
+};
+
 static const struct counter_desc mlx5e_ipsec_sw_stats_desc[] = {
        { MLX5E_DECLARE_STAT(struct mlx5e_ipsec_sw_stats, ipsec_rx_drop_sp_alloc) },
        { MLX5E_DECLARE_STAT(struct mlx5e_ipsec_sw_stats, ipsec_rx_drop_sadb_miss) },
@@ -50,8 +61,48 @@ static const struct counter_desc mlx5e_ipsec_sw_stats_desc[] = {
 #define MLX5E_READ_CTR_ATOMIC64(ptr, dsc, i) \
        atomic64_read((atomic64_t *)((char *)(ptr) + (dsc)[i].offset))
 
+#define NUM_IPSEC_HW_COUNTERS ARRAY_SIZE(mlx5e_ipsec_hw_stats_desc)
 #define NUM_IPSEC_SW_COUNTERS ARRAY_SIZE(mlx5e_ipsec_sw_stats_desc)
 
+static MLX5E_DECLARE_STATS_GRP_OP_NUM_STATS(ipsec_hw)
+{
+       if (!priv->ipsec)
+               return 0;
+
+       return NUM_IPSEC_HW_COUNTERS;
+}
+
+static inline MLX5E_DECLARE_STATS_GRP_OP_UPDATE_STATS(ipsec_hw) {}
+
+static MLX5E_DECLARE_STATS_GRP_OP_FILL_STRS(ipsec_hw)
+{
+       unsigned int i;
+
+       if (!priv->ipsec)
+               return idx;
+
+       for (i = 0; i < NUM_IPSEC_HW_COUNTERS; i++)
+               strcpy(data + (idx++) * ETH_GSTRING_LEN,
+                      mlx5e_ipsec_hw_stats_desc[i].format);
+
+       return idx;
+}
+
+static MLX5E_DECLARE_STATS_GRP_OP_FILL_STATS(ipsec_hw)
+{
+       int i;
+
+       if (!priv->ipsec)
+               return idx;
+
+       mlx5e_accel_ipsec_fs_read_stats(priv, &priv->ipsec->hw_stats);
+       for (i = 0; i < NUM_IPSEC_HW_COUNTERS; i++)
+               data[idx++] = MLX5E_READ_CTR_ATOMIC64(&priv->ipsec->hw_stats,
+                                                     mlx5e_ipsec_hw_stats_desc, i);
+
+       return idx;
+}
+
 static MLX5E_DECLARE_STATS_GRP_OP_NUM_STATS(ipsec_sw)
 {
        return priv->ipsec ? NUM_IPSEC_SW_COUNTERS : 0;
@@ -81,4 +132,5 @@ static MLX5E_DECLARE_STATS_GRP_OP_FILL_STATS(ipsec_sw)
        return idx;
 }
 
+MLX5E_DEFINE_STATS_GRP(ipsec_hw, 0);
 MLX5E_DEFINE_STATS_GRP(ipsec_sw, 0);
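
Both stats groups are table driven: each counter_desc pairs an ethtool string
with a byte offset into the stats structure, so the fill callbacks can walk
one array for the names and the same array again for the values. A compact
userspace rendering of the pattern, with a hypothetical structure and
counters, and without the atomic64 accessor the driver wraps around each read:

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

struct hw_stats {
	uint64_t rx_pkts;
	uint64_t tx_pkts;
};

struct counter_desc {
	const char *format;
	size_t offset;
};

static const struct counter_desc desc[] = {
	{ "ipsec_rx_pkts", offsetof(struct hw_stats, rx_pkts) },
	{ "ipsec_tx_pkts", offsetof(struct hw_stats, tx_pkts) },
};

/* Read a counter by offset, as MLX5E_READ_CTR_ATOMIC64 does. */
static uint64_t read_ctr(const struct hw_stats *s, size_t i)
{
	return *(const uint64_t *)((const char *)s + desc[i].offset);
}

int main(void)
{
	struct hw_stats s = { .rx_pkts = 10, .tx_pkts = 7 };
	size_t i;

	for (i = 0; i < sizeof(desc) / sizeof(desc[0]); i++)
		printf("%s: %llu\n", desc[i].format,
		       (unsigned long long)read_ctr(&s, i));
	return 0;
}
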
index 6238864..75b9e15 100644
@@ -85,18 +85,25 @@ static const struct counter_desc sw_rep_stats_desc[] = {
        { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_bytes) },
 };
 
-struct vport_stats {
-       u64 vport_rx_packets;
-       u64 vport_tx_packets;
-       u64 vport_rx_bytes;
-       u64 vport_tx_bytes;
-};
-
 static const struct counter_desc vport_rep_stats_desc[] = {
-       { MLX5E_DECLARE_STAT(struct vport_stats, vport_rx_packets) },
-       { MLX5E_DECLARE_STAT(struct vport_stats, vport_rx_bytes) },
-       { MLX5E_DECLARE_STAT(struct vport_stats, vport_tx_packets) },
-       { MLX5E_DECLARE_STAT(struct vport_stats, vport_tx_bytes) },
+       { MLX5E_DECLARE_STAT(struct mlx5e_rep_stats, vport_rx_packets) },
+       { MLX5E_DECLARE_STAT(struct mlx5e_rep_stats, vport_rx_bytes) },
+       { MLX5E_DECLARE_STAT(struct mlx5e_rep_stats, vport_tx_packets) },
+       { MLX5E_DECLARE_STAT(struct mlx5e_rep_stats, vport_tx_bytes) },
+       { MLX5E_DECLARE_STAT(struct mlx5e_rep_stats,
+                            rx_vport_rdma_unicast_packets) },
+       { MLX5E_DECLARE_STAT(struct mlx5e_rep_stats, rx_vport_rdma_unicast_bytes) },
+       { MLX5E_DECLARE_STAT(struct mlx5e_rep_stats,
+                            tx_vport_rdma_unicast_packets) },
+       { MLX5E_DECLARE_STAT(struct mlx5e_rep_stats, tx_vport_rdma_unicast_bytes) },
+       { MLX5E_DECLARE_STAT(struct mlx5e_rep_stats,
+                            rx_vport_rdma_multicast_packets) },
+       { MLX5E_DECLARE_STAT(struct mlx5e_rep_stats,
+                            rx_vport_rdma_multicast_bytes) },
+       { MLX5E_DECLARE_STAT(struct mlx5e_rep_stats,
+                            tx_vport_rdma_multicast_packets) },
+       { MLX5E_DECLARE_STAT(struct mlx5e_rep_stats,
+                            tx_vport_rdma_multicast_bytes) },
 };
 
 #define NUM_VPORT_REP_SW_COUNTERS ARRAY_SIZE(sw_rep_stats_desc)
@@ -161,33 +168,80 @@ static MLX5E_DECLARE_STATS_GRP_OP_FILL_STATS(vport_rep)
        int i;
 
        for (i = 0; i < NUM_VPORT_REP_HW_COUNTERS; i++)
-               data[idx++] = MLX5E_READ_CTR64_CPU(&priv->stats.vf_vport,
+               data[idx++] = MLX5E_READ_CTR64_CPU(&priv->stats.rep_stats,
                                                   vport_rep_stats_desc, i);
        return idx;
 }
 
 static MLX5E_DECLARE_STATS_GRP_OP_UPDATE_STATS(vport_rep)
 {
+       struct mlx5e_rep_stats *rep_stats = &priv->stats.rep_stats;
+       int outlen = MLX5_ST_SZ_BYTES(query_vport_counter_out);
        struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
        struct mlx5e_rep_priv *rpriv = priv->ppriv;
        struct mlx5_eswitch_rep *rep = rpriv->rep;
-       struct rtnl_link_stats64 *vport_stats;
-       struct ifla_vf_stats vf_stats;
+       u32 *out;
        int err;
 
-       err = mlx5_eswitch_get_vport_stats(esw, rep->vport, &vf_stats);
+       out = kvzalloc(outlen, GFP_KERNEL);
+       if (!out)
+               return;
+
+       err = mlx5_core_query_vport_counter(esw->dev, 1, rep->vport - 1, 0, out);
        if (err) {
                netdev_warn(priv->netdev, "vport %d error %d reading stats\n",
                            rep->vport, err);
+               kvfree(out);
                return;
        }
 
-       vport_stats = &priv->stats.vf_vport;
+       #define MLX5_GET_CTR(p, x) \
+               MLX5_GET64(query_vport_counter_out, p, x)
        /* flip tx/rx as we are reporting the counters for the switch vport */
-       vport_stats->rx_packets = vf_stats.tx_packets;
-       vport_stats->rx_bytes   = vf_stats.tx_bytes;
-       vport_stats->tx_packets = vf_stats.rx_packets;
-       vport_stats->tx_bytes   = vf_stats.rx_bytes;
+       rep_stats->vport_rx_packets =
+               MLX5_GET_CTR(out, transmitted_ib_unicast.packets) +
+               MLX5_GET_CTR(out, transmitted_eth_unicast.packets) +
+               MLX5_GET_CTR(out, transmitted_ib_multicast.packets) +
+               MLX5_GET_CTR(out, transmitted_eth_multicast.packets) +
+               MLX5_GET_CTR(out, transmitted_eth_broadcast.packets);
+
+       rep_stats->vport_tx_packets =
+               MLX5_GET_CTR(out, received_ib_unicast.packets) +
+               MLX5_GET_CTR(out, received_eth_unicast.packets) +
+               MLX5_GET_CTR(out, received_ib_multicast.packets) +
+               MLX5_GET_CTR(out, received_eth_multicast.packets) +
+               MLX5_GET_CTR(out, received_eth_broadcast.packets);
+
+       rep_stats->vport_rx_bytes =
+               MLX5_GET_CTR(out, transmitted_ib_unicast.octets) +
+               MLX5_GET_CTR(out, transmitted_eth_unicast.octets) +
+               MLX5_GET_CTR(out, transmitted_ib_multicast.octets) +
+               MLX5_GET_CTR(out, transmitted_eth_multicast.octets) +
+               MLX5_GET_CTR(out, transmitted_eth_broadcast.octets);
+
+       rep_stats->vport_tx_bytes =
+               MLX5_GET_CTR(out, received_ib_unicast.octets) +
+               MLX5_GET_CTR(out, received_eth_unicast.octets) +
+               MLX5_GET_CTR(out, received_ib_multicast.octets) +
+               MLX5_GET_CTR(out, received_eth_multicast.octets) +
+               MLX5_GET_CTR(out, received_eth_broadcast.octets);
+
+       rep_stats->rx_vport_rdma_unicast_packets =
+               MLX5_GET_CTR(out, transmitted_ib_unicast.packets);
+       rep_stats->tx_vport_rdma_unicast_packets =
+               MLX5_GET_CTR(out, received_ib_unicast.packets);
+       rep_stats->rx_vport_rdma_unicast_bytes =
+               MLX5_GET_CTR(out, transmitted_ib_unicast.octets);
+       rep_stats->tx_vport_rdma_unicast_bytes =
+               MLX5_GET_CTR(out, received_ib_unicast.octets);
+       rep_stats->rx_vport_rdma_multicast_packets =
+               MLX5_GET_CTR(out, transmitted_ib_multicast.packets);
+       rep_stats->tx_vport_rdma_multicast_packets =
+               MLX5_GET_CTR(out, received_ib_multicast.packets);
+       rep_stats->rx_vport_rdma_multicast_bytes =
+               MLX5_GET_CTR(out, transmitted_ib_multicast.octets);
+       rep_stats->tx_vport_rdma_multicast_bytes =
+               MLX5_GET_CTR(out, received_ib_multicast.octets);
+
+       kvfree(out);
 }
 
 static void mlx5e_rep_get_strings(struct net_device *dev,
index 70c4ea3..6687b81 100644
@@ -2480,6 +2480,7 @@ mlx5e_stats_grp_t mlx5e_nic_stats_grps[] = {
        &MLX5E_STATS_GRP(per_prio),
        &MLX5E_STATS_GRP(pme),
 #ifdef CONFIG_MLX5_EN_IPSEC
+       &MLX5E_STATS_GRP(ipsec_hw),
        &MLX5E_STATS_GRP(ipsec_sw),
 #endif
        &MLX5E_STATS_GRP(tls),
index cbc831c..375752d 100644
@@ -463,6 +463,21 @@ struct mlx5e_ptp_cq_stats {
        u64 resync_event;
 };
 
+struct mlx5e_rep_stats {
+       u64 vport_rx_packets;
+       u64 vport_tx_packets;
+       u64 vport_rx_bytes;
+       u64 vport_tx_bytes;
+       u64 rx_vport_rdma_unicast_packets;
+       u64 tx_vport_rdma_unicast_packets;
+       u64 rx_vport_rdma_unicast_bytes;
+       u64 tx_vport_rdma_unicast_bytes;
+       u64 rx_vport_rdma_multicast_packets;
+       u64 tx_vport_rdma_multicast_packets;
+       u64 rx_vport_rdma_multicast_bytes;
+       u64 tx_vport_rdma_multicast_bytes;
+};
+
 struct mlx5e_stats {
        struct mlx5e_sw_stats sw;
        struct mlx5e_qcounter_stats qcnt;
@@ -471,6 +486,7 @@ struct mlx5e_stats {
        struct mlx5e_pport_stats pport;
        struct rtnl_link_stats64 vf_vport;
        struct mlx5e_pcie_stats pcie;
+       struct mlx5e_rep_stats rep_stats;
 };
 
 extern mlx5e_stats_grp_t mlx5e_nic_stats_grps[];
@@ -490,6 +506,7 @@ extern MLX5E_DECLARE_STATS_GRP(per_prio);
 extern MLX5E_DECLARE_STATS_GRP(pme);
 extern MLX5E_DECLARE_STATS_GRP(channels);
 extern MLX5E_DECLARE_STATS_GRP(per_port_buff_congest);
+extern MLX5E_DECLARE_STATS_GRP(ipsec_hw);
 extern MLX5E_DECLARE_STATS_GRP(ipsec_sw);
 extern MLX5E_DECLARE_STATS_GRP(ptp);
 extern MLX5E_DECLARE_STATS_GRP(macsec_hw);
index 10d1609..9af2aa2 100644
@@ -132,6 +132,15 @@ struct mlx5e_tc_attr_to_reg_mapping mlx5e_tc_attr_to_reg_mappings[] = {
        [PACKET_COLOR_TO_REG] = packet_color_to_reg,
 };
 
+struct mlx5e_tc_jump_state {
+       u32 jump_count;
+       bool jump_target;
+       struct mlx5_flow_attr *jumping_attr;
+
+       enum flow_action_id last_id;
+       u32 last_index;
+};
+
 struct mlx5e_tc_table *mlx5e_tc_table_alloc(void)
 {
        struct mlx5e_tc_table *tc;
@@ -160,6 +169,7 @@ static struct lock_class_key tc_ht_lock_key;
 
 static void mlx5e_put_flow_tunnel_id(struct mlx5e_tc_flow *flow);
 static void free_flow_post_acts(struct mlx5e_tc_flow *flow);
+static void mlx5_free_flow_attr(struct mlx5e_tc_flow *flow, struct mlx5_flow_attr *attr);
 
 void
 mlx5e_tc_match_to_reg_match(struct mlx5_flow_spec *spec,
@@ -392,8 +402,9 @@ mlx5_tc_rule_delete(struct mlx5e_priv *priv,
 static bool
 is_flow_meter_action(struct mlx5_flow_attr *attr)
 {
-       return ((attr->action & MLX5_FLOW_CONTEXT_ACTION_EXECUTE_ASO) &&
-               (attr->exe_aso_type == MLX5_EXE_ASO_FLOW_METER));
+       return (((attr->action & MLX5_FLOW_CONTEXT_ACTION_EXECUTE_ASO) &&
+                (attr->exe_aso_type == MLX5_EXE_ASO_FLOW_METER)) ||
+               attr->flags & MLX5_ATTR_FLAG_MTU);
 }
 
 static int
@@ -404,6 +415,7 @@ mlx5e_tc_add_flow_meter(struct mlx5e_priv *priv,
        struct mlx5e_post_meter_priv *post_meter;
        enum mlx5_flow_namespace_type ns_type;
        struct mlx5e_flow_meter_handle *meter;
+       enum mlx5e_post_meter_type type;
 
        meter = mlx5e_tc_meter_replace(priv->mdev, &attr->meter_attr.params);
        if (IS_ERR(meter)) {
@@ -412,8 +424,11 @@ mlx5e_tc_add_flow_meter(struct mlx5e_priv *priv,
        }
 
        ns_type = mlx5e_tc_meter_get_namespace(meter->flow_meters);
-       post_meter = mlx5e_post_meter_init(priv, ns_type, post_act, meter->green_counter,
-                                          meter->red_counter);
+       type = meter->params.mtu ? MLX5E_POST_METER_MTU : MLX5E_POST_METER_RATE;
+       post_meter = mlx5e_post_meter_init(priv, ns_type, post_act,
+                                          type,
+                                          meter->act_counter, meter->drop_counter,
+                                          attr->branch_true, attr->branch_false);
        if (IS_ERR(post_meter)) {
                mlx5_core_err(priv->mdev, "Failed to init post meter\n");
                goto err_meter_init;
@@ -432,9 +447,9 @@ err_meter_init:
 }
 
 static void
-mlx5e_tc_del_flow_meter(struct mlx5_flow_attr *attr)
+mlx5e_tc_del_flow_meter(struct mlx5_eswitch *esw, struct mlx5_flow_attr *attr)
 {
-       mlx5e_post_meter_cleanup(attr->meter_attr.post_meter);
+       mlx5e_post_meter_cleanup(esw, attr->meter_attr.post_meter);
        mlx5e_tc_meter_put(attr->meter_attr.meter);
 }
 
@@ -495,7 +510,7 @@ mlx5e_tc_rule_unoffload(struct mlx5e_priv *priv,
        mlx5_eswitch_del_offloaded_rule(esw, rule, attr);
 
        if (attr->meter_attr.meter)
-               mlx5e_tc_del_flow_meter(attr);
+               mlx5e_tc_del_flow_meter(esw, attr);
 }
 
 int
@@ -606,6 +621,12 @@ int mlx5e_get_flow_namespace(struct mlx5e_tc_flow *flow)
                MLX5_FLOW_NAMESPACE_FDB : MLX5_FLOW_NAMESPACE_KERNEL;
 }
 
+static struct mlx5_core_dev *
+get_flow_counter_dev(struct mlx5e_tc_flow *flow)
+{
+       return mlx5e_is_eswitch_flow(flow) ? flow->attr->esw_attr->counter_dev : flow->priv->mdev;
+}
+
 static struct mod_hdr_tbl *
 get_mod_hdr_table(struct mlx5e_priv *priv, struct mlx5e_tc_flow *flow)
 {
@@ -1719,6 +1740,90 @@ clean_encap_dests(struct mlx5e_priv *priv,
 }
 
 static int
+verify_attr_actions(u32 actions, struct netlink_ext_ack *extack)
+{
+       if (!(actions &
+             (MLX5_FLOW_CONTEXT_ACTION_FWD_DEST | MLX5_FLOW_CONTEXT_ACTION_DROP))) {
+               NL_SET_ERR_MSG_MOD(extack, "Rule must have at least one forward/drop action");
+               return -EOPNOTSUPP;
+       }
+
+       if (!(~actions &
+             (MLX5_FLOW_CONTEXT_ACTION_FWD_DEST | MLX5_FLOW_CONTEXT_ACTION_DROP))) {
+               NL_SET_ERR_MSG_MOD(extack, "Rule cannot support forward+drop action");
+               return -EOPNOTSUPP;
+       }
+
+       if (actions & MLX5_FLOW_CONTEXT_ACTION_MOD_HDR &&
+           actions & MLX5_FLOW_CONTEXT_ACTION_DROP) {
+               NL_SET_ERR_MSG_MOD(extack, "Drop with modify header action is not supported");
+               return -EOPNOTSUPP;
+       }
+
+       return 0;
+}
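
The two masks in verify_attr_actions() together encode "exactly one of
forward/drop": the first test rejects rules with neither bit set, and the
second (note the ~actions) rejects rules with both. A standalone demonstration
of the same checks, with illustrative flag values:

#include <stdio.h>

#define ACT_FWD  (1u << 0)
#define ACT_DROP (1u << 1)

static int verify(unsigned int actions)
{
	if (!(actions & (ACT_FWD | ACT_DROP)))
		return -1;	/* neither forward nor drop */
	if (!(~actions & (ACT_FWD | ACT_DROP)))
		return -1;	/* both forward and drop */
	return 0;
}

int main(void)
{
	printf("none: %d\n", verify(0));			/* rejected */
	printf("fwd:  %d\n", verify(ACT_FWD));			/* accepted */
	printf("drop: %d\n", verify(ACT_DROP));			/* accepted */
	printf("both: %d\n", verify(ACT_FWD | ACT_DROP));	/* rejected */
	return 0;
}
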
+
+static int
+post_process_attr(struct mlx5e_tc_flow *flow,
+                 struct mlx5_flow_attr *attr,
+                 bool is_post_act_attr,
+                 struct netlink_ext_ack *extack)
+{
+       struct mlx5_eswitch *esw = flow->priv->mdev->priv.eswitch;
+       bool vf_tun;
+       int err = 0;
+
+       err = verify_attr_actions(attr->action, extack);
+       if (err)
+               goto err_out;
+
+       err = set_encap_dests(flow->priv, flow, attr, extack, &vf_tun);
+       if (err)
+               goto err_out;
+
+       if (mlx5e_is_eswitch_flow(flow)) {
+               err = mlx5_eswitch_add_vlan_action(esw, attr);
+               if (err)
+                       goto err_out;
+       }
+
+       if (attr->action & MLX5_FLOW_CONTEXT_ACTION_MOD_HDR) {
+               if (vf_tun || is_post_act_attr) {
+                       err = mlx5e_tc_add_flow_mod_hdr(flow->priv, flow, attr);
+                       if (err)
+                               goto err_out;
+               } else {
+                       err = mlx5e_attach_mod_hdr(flow->priv, flow, attr->parse_attr);
+                       if (err)
+                               goto err_out;
+               }
+       }
+
+       if (attr->branch_true &&
+           attr->branch_true->action & MLX5_FLOW_CONTEXT_ACTION_MOD_HDR) {
+               err = mlx5e_tc_add_flow_mod_hdr(flow->priv, flow, attr->branch_true);
+               if (err)
+                       goto err_out;
+       }
+
+       if (attr->branch_false &&
+           attr->branch_false->action & MLX5_FLOW_CONTEXT_ACTION_MOD_HDR) {
+               err = mlx5e_tc_add_flow_mod_hdr(flow->priv, flow, attr->branch_false);
+               if (err)
+                       goto err_out;
+       }
+
+       if (attr->action & MLX5_FLOW_CONTEXT_ACTION_COUNT) {
+               err = alloc_flow_attr_counter(get_flow_counter_dev(flow), attr);
+               if (err)
+                       goto err_out;
+       }
+
+err_out:
+       return err;
+}
+
+static int
 mlx5e_tc_add_fdb_flow(struct mlx5e_priv *priv,
                      struct mlx5e_tc_flow *flow,
                      struct netlink_ext_ack *extack)
@@ -1728,7 +1833,6 @@ mlx5e_tc_add_fdb_flow(struct mlx5e_priv *priv,
        struct mlx5_flow_attr *attr = flow->attr;
        struct mlx5_esw_flow_attr *esw_attr;
        u32 max_prio, max_chain;
-       bool vf_tun;
        int err = 0;
 
        parse_attr = attr->parse_attr;
@@ -1818,32 +1922,10 @@ mlx5e_tc_add_fdb_flow(struct mlx5e_priv *priv,
                esw_attr->int_port = int_port;
        }
 
-       err = set_encap_dests(priv, flow, attr, extack, &vf_tun);
-       if (err)
-               goto err_out;
-
-       err = mlx5_eswitch_add_vlan_action(esw, attr);
+       err = post_process_attr(flow, attr, false, extack);
        if (err)
                goto err_out;
 
-       if (attr->action & MLX5_FLOW_CONTEXT_ACTION_MOD_HDR) {
-               if (vf_tun) {
-                       err = mlx5e_tc_add_flow_mod_hdr(priv, flow, attr);
-                       if (err)
-                               goto err_out;
-               } else {
-                       err = mlx5e_attach_mod_hdr(priv, flow, parse_attr);
-                       if (err)
-                               goto err_out;
-               }
-       }
-
-       if (attr->action & MLX5_FLOW_CONTEXT_ACTION_COUNT) {
-               err = alloc_flow_attr_counter(esw_attr->counter_dev, attr);
-               if (err)
-                       goto err_out;
-       }
-
        /* we get here if one of the following takes place:
         * (1) there's no error
         * (2) there's an encap action and we don't have valid neigh
@@ -1879,6 +1961,16 @@ static bool mlx5_flow_has_geneve_opt(struct mlx5e_tc_flow *flow)
        return !!geneve_tlv_opt_0_data;
 }
 
+static void free_branch_attr(struct mlx5e_tc_flow *flow, struct mlx5_flow_attr *attr)
+{
+       if (!attr)
+               return;
+
+       mlx5_free_flow_attr(flow, attr);
+       kvfree(attr->parse_attr);
+       kfree(attr);
+}
+
 static void mlx5e_tc_del_fdb_flow(struct mlx5e_priv *priv,
                                  struct mlx5e_tc_flow *flow)
 {
@@ -1934,6 +2026,8 @@ static void mlx5e_tc_del_fdb_flow(struct mlx5e_priv *priv,
                mlx5e_detach_decap(priv, flow);
 
        free_flow_post_acts(flow);
+       free_branch_attr(flow, attr->branch_true);
+       free_branch_attr(flow, attr->branch_false);
 
        if (flow->attr->lag.count)
                mlx5_lag_del_mpesw_rule(esw->dev);
@@ -3507,36 +3601,6 @@ actions_match_supported(struct mlx5e_priv *priv,
        ct_clear = flow->attr->ct_attr.ct_action & TCA_CT_ACT_CLEAR;
        ct_flow = flow_flag_test(flow, CT) && !ct_clear;
 
-       if (!(actions &
-             (MLX5_FLOW_CONTEXT_ACTION_FWD_DEST | MLX5_FLOW_CONTEXT_ACTION_DROP))) {
-               NL_SET_ERR_MSG_MOD(extack, "Rule must have at least one forward/drop action");
-               return false;
-       }
-
-       if (!(~actions &
-             (MLX5_FLOW_CONTEXT_ACTION_FWD_DEST | MLX5_FLOW_CONTEXT_ACTION_DROP))) {
-               NL_SET_ERR_MSG_MOD(extack, "Rule cannot support forward+drop action");
-               return false;
-       }
-
-       if (actions & MLX5_FLOW_CONTEXT_ACTION_MOD_HDR &&
-           actions & MLX5_FLOW_CONTEXT_ACTION_DROP) {
-               NL_SET_ERR_MSG_MOD(extack, "Drop with modify header action is not supported");
-               return false;
-       }
-
-       if (!(~actions &
-             (MLX5_FLOW_CONTEXT_ACTION_FWD_DEST | MLX5_FLOW_CONTEXT_ACTION_DROP))) {
-               NL_SET_ERR_MSG_MOD(extack, "Rule cannot support forward+drop action");
-               return false;
-       }
-
-       if (actions & MLX5_FLOW_CONTEXT_ACTION_MOD_HDR &&
-           actions & MLX5_FLOW_CONTEXT_ACTION_DROP) {
-               NL_SET_ERR_MSG_MOD(extack, "Drop with modify header action is not supported");
-               return false;
-       }
-
        if (actions & MLX5_FLOW_CONTEXT_ACTION_MOD_HDR &&
            !modify_header_match_supported(priv, &parse_attr->spec, flow_action,
                                           actions, ct_flow, ct_clear, extack))
@@ -3636,15 +3700,12 @@ mlx5e_clone_flow_attr_for_post_act(struct mlx5_flow_attr *attr,
                attr2->esw_attr->split_count = 0;
        }
 
+       attr2->branch_true = NULL;
+       attr2->branch_false = NULL;
+       attr2->jumping_attr = NULL;
        return attr2;
 }
 
-static struct mlx5_core_dev *
-get_flow_counter_dev(struct mlx5e_tc_flow *flow)
-{
-       return mlx5e_is_eswitch_flow(flow) ? flow->attr->esw_attr->counter_dev : flow->priv->mdev;
-}
-
 struct mlx5_flow_attr *
 mlx5e_tc_get_encap_attr(struct mlx5e_tc_flow *flow)
 {
@@ -3680,28 +3741,15 @@ mlx5e_tc_unoffload_flow_post_acts(struct mlx5e_tc_flow *flow)
 static void
 free_flow_post_acts(struct mlx5e_tc_flow *flow)
 {
-       struct mlx5_core_dev *counter_dev = get_flow_counter_dev(flow);
-       struct mlx5e_post_act *post_act = get_post_action(flow->priv);
        struct mlx5_flow_attr *attr, *tmp;
-       bool vf_tun;
 
        list_for_each_entry_safe(attr, tmp, &flow->attrs, list) {
                if (list_is_last(&attr->list, &flow->attrs))
                        break;
 
-               if (attr->post_act_handle)
-                       mlx5e_tc_post_act_del(post_act, attr->post_act_handle);
-
-               clean_encap_dests(flow->priv, flow, attr, &vf_tun);
-
-               if (attr->action & MLX5_FLOW_CONTEXT_ACTION_COUNT)
-                       mlx5_fc_destroy(counter_dev, attr->counter);
-
-               if (attr->action & MLX5_FLOW_CONTEXT_ACTION_MOD_HDR) {
-                       mlx5e_mod_hdr_dealloc(&attr->parse_attr->mod_hdr_acts);
-                       if (attr->modify_hdr)
-                               mlx5_modify_header_dealloc(flow->priv->mdev, attr->modify_hdr);
-               }
+               mlx5_free_flow_attr(flow, attr);
+               free_branch_attr(flow, attr->branch_true);
+               free_branch_attr(flow, attr->branch_false);
 
                list_del(&attr->list);
                kvfree(attr->parse_attr);
@@ -3754,7 +3802,6 @@ alloc_flow_post_acts(struct mlx5e_tc_flow *flow, struct netlink_ext_ack *extack)
        struct mlx5e_post_act *post_act = get_post_action(flow->priv);
        struct mlx5_flow_attr *attr, *next_attr = NULL;
        struct mlx5e_post_act_handle *handle;
-       bool vf_tun;
        int err;
 
        /* This is going in reverse order as needed.
@@ -3764,7 +3811,9 @@ alloc_flow_post_acts(struct mlx5e_tc_flow *flow, struct netlink_ext_ack *extack)
                if (!next_attr) {
                        /* Set counter action on last post act rule. */
                        attr->action |= MLX5_FLOW_CONTEXT_ACTION_COUNT;
-               } else {
+               }
+
+               if (next_attr && !(attr->flags & MLX5_ATTR_FLAG_TERMINATING)) {
                        err = mlx5e_tc_act_set_next_post_act(flow, attr, next_attr);
                        if (err)
                                goto out_free;
@@ -3776,26 +3825,14 @@ alloc_flow_post_acts(struct mlx5e_tc_flow *flow, struct netlink_ext_ack *extack)
                if (list_is_last(&attr->list, &flow->attrs))
                        break;
 
-               err = set_encap_dests(flow->priv, flow, attr, extack, &vf_tun);
+               err = actions_prepare_mod_hdr_actions(flow->priv, flow, attr, extack);
                if (err)
                        goto out_free;
 
-               err = actions_prepare_mod_hdr_actions(flow->priv, flow, attr, extack);
+               err = post_process_attr(flow, attr, true, extack);
                if (err)
                        goto out_free;
 
-               if (attr->action & MLX5_FLOW_CONTEXT_ACTION_MOD_HDR) {
-                       err = mlx5e_tc_add_flow_mod_hdr(flow->priv, flow, attr);
-                       if (err)
-                               goto out_free;
-               }
-
-               if (attr->action & MLX5_FLOW_CONTEXT_ACTION_COUNT) {
-                       err = alloc_flow_attr_counter(get_flow_counter_dev(flow), attr);
-                       if (err)
-                               goto out_free;
-               }
-
                handle = mlx5e_tc_post_act_add(post_act, attr);
                if (IS_ERR(handle)) {
                        err = PTR_ERR(handle);
@@ -3803,6 +3840,13 @@ alloc_flow_post_acts(struct mlx5e_tc_flow *flow, struct netlink_ext_ack *extack)
                }
 
                attr->post_act_handle = handle;
+
+               if (attr->jumping_attr) {
+                       err = mlx5e_tc_act_set_next_post_act(flow, attr->jumping_attr, attr);
+                       if (err)
+                               goto out_free;
+               }
+
                next_attr = attr;
        }
 
@@ -3822,12 +3866,145 @@ out_free:
 }
 
 static int
+alloc_branch_attr(struct mlx5e_tc_flow *flow,
+                 struct mlx5e_tc_act_branch_ctrl *cond,
+                 struct mlx5_flow_attr **cond_attr,
+                 u32 *jump_count,
+                 struct netlink_ext_ack *extack)
+{
+       struct mlx5_flow_attr *attr;
+       int err = 0;
+
+       *cond_attr = mlx5e_clone_flow_attr_for_post_act(flow->attr,
+                                                       mlx5e_get_flow_namespace(flow));
+       if (!(*cond_attr))
+               return -ENOMEM;
+
+       attr = *cond_attr;
+
+       switch (cond->act_id) {
+       case FLOW_ACTION_DROP:
+               attr->action |= MLX5_FLOW_CONTEXT_ACTION_DROP;
+               break;
+       case FLOW_ACTION_ACCEPT:
+       case FLOW_ACTION_PIPE:
+               attr->action |= MLX5_FLOW_CONTEXT_ACTION_FWD_DEST;
+               attr->dest_ft = mlx5e_tc_post_act_get_ft(get_post_action(flow->priv));
+               break;
+       case FLOW_ACTION_JUMP:
+               if (*jump_count) {
+                       NL_SET_ERR_MSG_MOD(extack, "Cannot offload flows with nested jumps");
+                       err = -EOPNOTSUPP;
+                       goto out_err;
+               }
+               *jump_count = cond->extval;
+               attr->action |= MLX5_FLOW_CONTEXT_ACTION_FWD_DEST;
+               attr->dest_ft = mlx5e_tc_post_act_get_ft(get_post_action(flow->priv));
+               break;
+       default:
+               err = -EOPNOTSUPP;
+               goto out_err;
+       }
+
+       return err;
+out_err:
+       kfree(*cond_attr);
+       *cond_attr = NULL;
+       return err;
+}
+
+static void
+dec_jump_count(struct flow_action_entry *act, struct mlx5e_tc_act *tc_act,
+              struct mlx5_flow_attr *attr, struct mlx5e_priv *priv,
+              struct mlx5e_tc_jump_state *jump_state)
+{
+       if (!jump_state->jump_count)
+               return;
+
+       /* A single tc action can instantiate multiple offload actions (e.g. pedit).
+        * Count the jump only once per tc action, not per expanded offload action.
+        */
+       if (act->id == jump_state->last_id && act->hw_index == jump_state->last_index)
+               return;
+
+       jump_state->last_id = act->id;
+       jump_state->last_index = act->hw_index;
+
+       /* nothing to do for intermediate actions */
+       if (--jump_state->jump_count > 1)
+               return;
+
+       if (jump_state->jump_count == 1) { /* last action in the jump action list */
+
+               /* create a new attribute after this action */
+               jump_state->jump_target = true;
+
+               if (tc_act->is_terminating_action) { /* the branch ends here */
+                       attr->flags |= MLX5_ATTR_FLAG_TERMINATING;
+                       attr->action |= MLX5_FLOW_CONTEXT_ACTION_COUNT;
+               } else { /* the branch continues executing the rest of the actions */
+                       struct mlx5e_post_act *post_act;
+
+                       attr->action |= MLX5_FLOW_CONTEXT_ACTION_FWD_DEST;
+                       post_act = get_post_action(priv);
+                       attr->dest_ft = mlx5e_tc_post_act_get_ft(post_act);
+               }
+       } else if (jump_state->jump_count == 0) { /* first attr after the jump action list */
+               /* This is the post action for the jumping attribute (either red or green)
+                * Use the stored jumping_attr to set the post act id on the jumping attribute
+                */
+               attr->jumping_attr = jump_state->jumping_attr;
+       }
+}
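
A subtlety worth spelling out: the jump count is expressed in tc actions, but
parsing walks the expanded offload actions, and one tc action (pedit, for
instance) can expand into several entries sharing the same id and hw_index.
The (last_id, last_index) pair is what deduplicates them. A stripped-down
model of just that counting, omitting the terminating/post-act branching the
driver performs around the last jumped-over action:

#include <stdio.h>

struct jump_state {
	unsigned int jump_count;
	int last_id;
	int last_index;
};

/* Decrement once per distinct (id, hw_index) pair, mirroring the rule that a
 * jump skips tc actions rather than expanded offload actions.
 */
static void dec_jump_count(struct jump_state *js, int id, int hw_index)
{
	if (!js->jump_count)
		return;
	if (id == js->last_id && hw_index == js->last_index)
		return;		/* same tc action, already counted */
	js->last_id = id;
	js->last_index = hw_index;
	js->jump_count--;
}

int main(void)
{
	struct jump_state js = { .jump_count = 2, .last_id = -1, .last_index = -1 };

	dec_jump_count(&js, 1, 0);	/* first tc action */
	dec_jump_count(&js, 1, 0);	/* same action, expanded again: no-op */
	dec_jump_count(&js, 2, 1);	/* second tc action */
	printf("remaining jumps: %u\n", js.jump_count);	/* prints 0 */
	return 0;
}
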
+
+static int
+parse_branch_ctrl(struct flow_action_entry *act, struct mlx5e_tc_act *tc_act,
+                 struct mlx5e_tc_flow *flow, struct mlx5_flow_attr *attr,
+                 struct mlx5e_tc_jump_state *jump_state,
+                 struct netlink_ext_ack *extack)
+{
+       struct mlx5e_tc_act_branch_ctrl cond_true, cond_false;
+       u32 jump_count = jump_state->jump_count;
+       int err;
+
+       if (!tc_act->get_branch_ctrl)
+               return 0;
+
+       tc_act->get_branch_ctrl(act, &cond_true, &cond_false);
+
+       err = alloc_branch_attr(flow, &cond_true,
+                               &attr->branch_true, &jump_count, extack);
+       if (err)
+               goto out_err;
+
+       if (jump_count)
+               jump_state->jumping_attr = attr->branch_true;
+
+       err = alloc_branch_attr(flow, &cond_false,
+                               &attr->branch_false, &jump_count, extack);
+       if (err)
+               goto err_branch_false;
+
+       if (jump_count && !jump_state->jumping_attr)
+               jump_state->jumping_attr = attr->branch_false;
+
+       jump_state->jump_count = jump_count;
+       return 0;
+
+err_branch_false:
+       free_branch_attr(flow, attr->branch_true);
+out_err:
+       return err;
+}
+
+static int
 parse_tc_actions(struct mlx5e_tc_act_parse_state *parse_state,
                 struct flow_action *flow_action)
 {
        struct netlink_ext_ack *extack = parse_state->extack;
        struct mlx5e_tc_flow_action flow_action_reorder;
        struct mlx5e_tc_flow *flow = parse_state->flow;
+       struct mlx5e_tc_jump_state jump_state = {};
        struct mlx5_flow_attr *attr = flow->attr;
        enum mlx5_flow_namespace_type ns_type;
        struct mlx5e_priv *priv = flow->priv;
@@ -3847,6 +4024,7 @@ parse_tc_actions(struct mlx5e_tc_act_parse_state *parse_state,
        list_add(&attr->list, &flow->attrs);
 
        flow_action_for_each(i, _act, &flow_action_reorder) {
+               jump_state.jump_target = false;
                act = *_act;
                tc_act = mlx5e_tc_act_get(act->id, ns_type);
                if (!tc_act) {
@@ -3864,12 +4042,19 @@ parse_tc_actions(struct mlx5e_tc_act_parse_state *parse_state,
                if (err)
                        goto out_free;
 
+               dec_jump_count(act, tc_act, attr, priv, &jump_state);
+
+               err = parse_branch_ctrl(act, tc_act, flow, attr, &jump_state, extack);
+               if (err)
+                       goto out_free;
+
                parse_state->actions |= attr->action;
 
                /* Split attr for multi table act if not the last act. */
-               if (tc_act->is_multi_table_act &&
+               if (jump_state.jump_target ||
+                   (tc_act->is_multi_table_act &&
                    tc_act->is_multi_table_act(priv, act, attr) &&
-                   i < flow_action_reorder.num_entries - 1) {
+                   i < flow_action_reorder.num_entries - 1)) {
                        err = mlx5e_tc_act_post_parse(parse_state, flow_action, attr, ns_type);
                        if (err)
                                goto out_free;
@@ -3951,6 +4136,10 @@ parse_tc_nic_actions(struct mlx5e_priv *priv,
        if (err)
                return err;
 
+       err = verify_attr_actions(attr->action, extack);
+       if (err)
+               return err;
+
        if (!actions_match_supported(priv, flow_action, parse_state->actions,
                                     parse_attr, flow, extack))
                return -EOPNOTSUPP;
@@ -4188,6 +4377,30 @@ mlx5_alloc_flow_attr(enum mlx5_flow_namespace_type type)
        return attr;
 }
 
+static void
+mlx5_free_flow_attr(struct mlx5e_tc_flow *flow, struct mlx5_flow_attr *attr)
+{
+       struct mlx5_core_dev *counter_dev = get_flow_counter_dev(flow);
+       bool vf_tun;
+
+       if (!attr)
+               return;
+
+       if (attr->post_act_handle)
+               mlx5e_tc_post_act_del(get_post_action(flow->priv), attr->post_act_handle);
+
+       clean_encap_dests(flow->priv, flow, attr, &vf_tun);
+
+       if (attr->action & MLX5_FLOW_CONTEXT_ACTION_COUNT)
+               mlx5_fc_destroy(counter_dev, attr->counter);
+
+       if (attr->action & MLX5_FLOW_CONTEXT_ACTION_MOD_HDR) {
+               mlx5e_mod_hdr_dealloc(&attr->parse_attr->mod_hdr_acts);
+               if (attr->modify_hdr)
+                       mlx5_modify_header_dealloc(flow->priv->mdev, attr->modify_hdr);
+       }
+}
+
 static int
 mlx5e_alloc_flow(struct mlx5e_priv *priv, int attr_size,
                 struct flow_cls_offload *f, unsigned long flow_flags,
@@ -4730,10 +4943,17 @@ static int apply_police_params(struct mlx5e_priv *priv, u64 rate,
        return err;
 }
 
-int mlx5e_policer_validate(const struct flow_action *action,
-                          const struct flow_action_entry *act,
-                          struct netlink_ext_ack *extack)
+static int
+tc_matchall_police_validate(const struct flow_action *action,
+                           const struct flow_action_entry *act,
+                           struct netlink_ext_ack *extack)
 {
+       if (act->police.notexceed.act_id != FLOW_ACTION_CONTINUE) {
+               NL_SET_ERR_MSG_MOD(extack,
+                                  "Offload not supported when conform action is not continue");
+               return -EOPNOTSUPP;
+       }
+
        if (act->police.exceed.act_id != FLOW_ACTION_DROP) {
                NL_SET_ERR_MSG_MOD(extack,
                                   "Offload not supported when exceed action is not drop");
@@ -4784,13 +5004,7 @@ static int scan_tc_matchall_fdb_actions(struct mlx5e_priv *priv,
        flow_action_for_each(i, act, flow_action) {
                switch (act->id) {
                case FLOW_ACTION_POLICE:
-                       if (act->police.notexceed.act_id != FLOW_ACTION_CONTINUE) {
-                               NL_SET_ERR_MSG_MOD(extack,
-                                                  "Offload not supported when conform action is not continue");
-                               return -EOPNOTSUPP;
-                       }
-
-                       err = mlx5e_policer_validate(flow_action, act, extack);
+                       err = tc_matchall_police_validate(flow_action, act, extack);
                        if (err)
                                return err;
 
index 0db41fa..50af70e 100644
@@ -95,6 +95,9 @@ struct mlx5_flow_attr {
                 */
                bool count;
        } lag;
+       struct mlx5_flow_attr *branch_true;
+       struct mlx5_flow_attr *branch_false;
+       struct mlx5_flow_attr *jumping_attr;
        /* keep this union last */
        union {
                DECLARE_FLEX_ARRAY(struct mlx5_esw_flow_attr, esw_attr);
@@ -110,6 +113,8 @@ enum {
        MLX5_ATTR_FLAG_SAMPLE        = BIT(4),
        MLX5_ATTR_FLAG_ACCEPT        = BIT(5),
        MLX5_ATTR_FLAG_CT            = BIT(6),
+       MLX5_ATTR_FLAG_TERMINATING   = BIT(7),
+       MLX5_ATTR_FLAG_MTU           = BIT(8),
 };
 
 /* Returns true if any of the flags that require skipping further TC/NF processing are set. */
index a0242dc..8f7580f 100644
@@ -19,6 +19,7 @@
 #include "diag/fw_tracer.h"
 #include "mlx5_irq.h"
 #include "devlink.h"
+#include "en_accel/ipsec.h"
 
 enum {
        MLX5_EQE_OWNER_INIT_VAL = 0x1,
@@ -578,6 +579,10 @@ static void gather_async_events_mask(struct mlx5_core_dev *dev, u64 mask[4])
        if (MLX5_CAP_MACSEC(dev, log_max_macsec_offload))
                async_event_mask |= (1ull << MLX5_EVENT_TYPE_OBJECT_CHANGE);
 
+       if (mlx5_ipsec_device_caps(dev) & MLX5_IPSEC_CAP_PACKET_OFFLOAD)
+               async_event_mask |=
+                       (1ull << MLX5_EVENT_TYPE_OBJECT_CHANGE);
+
        mask[0] = async_event_mask;
 
        if (MLX5_CAP_GEN(dev, event_cap))
index 2db13c7..3d0bbcc 100644
@@ -12,10 +12,11 @@ enum vnic_diag_counter {
        MLX5_VNIC_DIAG_CQ_OVERRUN,
        MLX5_VNIC_DIAG_INVALID_COMMAND,
        MLX5_VNIC_DIAG_QOUTA_EXCEEDED_COMMAND,
+       MLX5_VNIC_DIAG_RX_STEERING_DISCARD,
 };
 
 static int mlx5_esw_query_vnic_diag(struct mlx5_vport *vport, enum vnic_diag_counter counter,
-                                   u32 *val)
+                                   u64 *val)
 {
        u32 out[MLX5_ST_SZ_DW(query_vnic_env_out)] = {};
        u32 in[MLX5_ST_SZ_DW(query_vnic_env_in)] = {};
@@ -57,6 +58,10 @@ static int mlx5_esw_query_vnic_diag(struct mlx5_vport *vport, enum vnic_diag_cou
        case MLX5_VNIC_DIAG_QOUTA_EXCEEDED_COMMAND:
                *val = MLX5_GET(vnic_diagnostic_statistics, vnic_diag_out, quota_exceeded_command);
                break;
+       case MLX5_VNIC_DIAG_RX_STEERING_DISCARD:
+               *val = MLX5_GET64(vnic_diagnostic_statistics, vnic_diag_out,
+                                 nic_receive_steering_discard);
+               break;
        }
 
        return 0;
@@ -65,14 +70,14 @@ static int mlx5_esw_query_vnic_diag(struct mlx5_vport *vport, enum vnic_diag_cou
 static int __show_vnic_diag(struct seq_file *file, struct mlx5_vport *vport,
                            enum vnic_diag_counter type)
 {
-       u32 val = 0;
+       u64 val = 0;
        int ret;
 
        ret = mlx5_esw_query_vnic_diag(vport, type, &val);
        if (ret)
                return ret;
 
-       seq_printf(file, "%d\n", val);
+       seq_printf(file, "%llu\n", val);
        return 0;
 }
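
The widening from u32 to u64 in this hunk matters because
nic_receive_steering_discard is a 64-bit field (hence MLX5_GET64); funnelling
it through the old u32 plumbing would silently truncate once the counter
crosses 2^32:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t hw_counter = 0x100000005ull;	/* more than 2^32 discards */
	uint32_t truncated = (uint32_t)hw_counter;

	printf("as u32: %u (wrong)\n", truncated);	/* prints 5 */
	printf("as u64: %llu (right)\n",
	       (unsigned long long)hw_counter);
	return 0;
}
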
 
@@ -112,6 +117,11 @@ static int quota_exceeded_command_show(struct seq_file *file, void *priv)
        return __show_vnic_diag(file, file->private, MLX5_VNIC_DIAG_QOUTA_EXCEEDED_COMMAND);
 }
 
+static int rx_steering_discard_show(struct seq_file *file, void *priv)
+{
+       return __show_vnic_diag(file, file->private, MLX5_VNIC_DIAG_RX_STEERING_DISCARD);
+}
+
 DEFINE_SHOW_ATTRIBUTE(total_q_under_processor_handle);
 DEFINE_SHOW_ATTRIBUTE(send_queue_priority_update_flow);
 DEFINE_SHOW_ATTRIBUTE(comp_eq_overrun);
@@ -119,6 +129,7 @@ DEFINE_SHOW_ATTRIBUTE(async_eq_overrun);
 DEFINE_SHOW_ATTRIBUTE(cq_overrun);
 DEFINE_SHOW_ATTRIBUTE(invalid_command);
 DEFINE_SHOW_ATTRIBUTE(quota_exceeded_command);
+DEFINE_SHOW_ATTRIBUTE(rx_steering_discard);
 
 void mlx5_esw_vport_debugfs_destroy(struct mlx5_eswitch *esw, u16 vport_num)
 {
@@ -179,4 +190,9 @@ void mlx5_esw_vport_debugfs_create(struct mlx5_eswitch *esw, u16 vport_num, bool
        if (MLX5_CAP_GEN(esw->dev, quota_exceeded_count))
                debugfs_create_file("quota_exceeded_command", 0444, vnic_diag, vport,
                                    &quota_exceeded_command_fops);
+
+       if (MLX5_CAP_GEN(esw->dev, nic_receive_steering_discard))
+               debugfs_create_file("rx_steering_discard", 0444, vnic_diag, vport,
+                                   &rx_steering_discard_fops);
 }
index 374e3fb..527e4bf 100644
@@ -772,6 +772,41 @@ static void esw_vport_cleanup_acl(struct mlx5_eswitch *esw,
                esw_vport_destroy_offloads_acl_tables(esw, vport);
 }
 
+static int mlx5_esw_vport_caps_get(struct mlx5_eswitch *esw, struct mlx5_vport *vport)
+{
+       int query_out_sz = MLX5_ST_SZ_BYTES(query_hca_cap_out);
+       void *query_ctx;
+       void *hca_caps;
+       int err;
+
+       if (!MLX5_CAP_GEN(esw->dev, vhca_resource_manager))
+               return 0;
+
+       query_ctx = kzalloc(query_out_sz, GFP_KERNEL);
+       if (!query_ctx)
+               return -ENOMEM;
+
+       err = mlx5_vport_get_other_func_cap(esw->dev, vport->vport, query_ctx,
+                                           MLX5_CAP_GENERAL);
+       if (err)
+               goto out_free;
+
+       hca_caps = MLX5_ADDR_OF(query_hca_cap_out, query_ctx, capability);
+       vport->info.roce_enabled = MLX5_GET(cmd_hca_cap, hca_caps, roce);
+
+       memset(query_ctx, 0, query_out_sz);
+       err = mlx5_vport_get_other_func_cap(esw->dev, vport->vport, query_ctx,
+                                           MLX5_CAP_GENERAL_2);
+       if (err)
+               goto out_free;
+
+       hca_caps = MLX5_ADDR_OF(query_hca_cap_out, query_ctx, capability);
+       vport->info.mig_enabled = MLX5_GET(cmd_hca_cap_2, hca_caps, migratable);
+out_free:
+       kfree(query_ctx);
+       return err;
+}
+
 static int esw_vport_setup(struct mlx5_eswitch *esw, struct mlx5_vport *vport)
 {
        u16 vport_num = vport->vport;
@@ -785,6 +820,10 @@ static int esw_vport_setup(struct mlx5_eswitch *esw, struct mlx5_vport *vport)
        if (mlx5_esw_is_manager_vport(esw, vport_num))
                return 0;
 
+       err = mlx5_esw_vport_caps_get(esw, vport);
+       if (err)
+               goto err_caps;
+
        mlx5_modify_vport_admin_state(esw->dev,
                                      MLX5_VPORT_STATE_OP_MOD_ESW_VPORT,
                                      vport_num, 1,
@@ -804,6 +843,10 @@ static int esw_vport_setup(struct mlx5_eswitch *esw, struct mlx5_vport *vport)
                               vport->info.qos, flags);
 
        return 0;
+
+err_caps:
+       esw_vport_cleanup_acl(esw, vport);
+       return err;
 }
 
 /* Don't cleanup vport->info, it's needed to restore vport configuration */
index 42d9df4..5a85a5d 100644
@@ -153,6 +153,8 @@ struct mlx5_vport_info {
        u8                      qos;
        u8                      spoofchk: 1;
        u8                      trusted: 1;
+       u8                      roce_enabled: 1;
+       u8                      mig_enabled: 1;
 };
 
 /* Vport context events */
@@ -508,7 +510,14 @@ int mlx5_devlink_port_function_hw_addr_get(struct devlink_port *port,
 int mlx5_devlink_port_function_hw_addr_set(struct devlink_port *port,
                                           const u8 *hw_addr, int hw_addr_len,
                                           struct netlink_ext_ack *extack);
-
+int mlx5_devlink_port_fn_roce_get(struct devlink_port *port, bool *is_enabled,
+                                 struct netlink_ext_ack *extack);
+int mlx5_devlink_port_fn_roce_set(struct devlink_port *port, bool enable,
+                                 struct netlink_ext_ack *extack);
+int mlx5_devlink_port_fn_migratable_get(struct devlink_port *port, bool *is_enabled,
+                                       struct netlink_ext_ack *extack);
+int mlx5_devlink_port_fn_migratable_set(struct devlink_port *port, bool enable,
+                                       struct netlink_ext_ack *extack);
 void *mlx5_eswitch_get_uplink_priv(struct mlx5_eswitch *esw, u8 rep_type);
 
 int mlx5_eswitch_add_vlan_action(struct mlx5_eswitch *esw,
index 9b6fbb1..e455b21 100644
@@ -50,6 +50,7 @@
 #include "en/mapping.h"
 #include "devlink.h"
 #include "lag/lag.h"
+#include "en/tc/post_meter.h"
 
 #define mlx5_esw_for_each_rep(esw, i, rep) \
        xa_for_each(&((esw)->offloads.vport_reps), i, rep)
@@ -202,6 +203,21 @@ esw_cleanup_decap_indir(struct mlx5_eswitch *esw,
 }
 
 static int
+esw_setup_mtu_dest(struct mlx5_flow_destination *dest,
+                  struct mlx5e_meter_attr *meter,
+                  int i)
+{
+       dest[i].type = MLX5_FLOW_DESTINATION_TYPE_RANGE;
+       dest[i].range.field = MLX5_FLOW_DEST_RANGE_FIELD_PKT_LEN;
+       dest[i].range.min = 0;
+       dest[i].range.max = meter->params.mtu;
+       dest[i].range.hit_ft = mlx5e_post_meter_get_mtu_true_ft(meter->post_meter);
+       dest[i].range.miss_ft = mlx5e_post_meter_get_mtu_false_ft(meter->post_meter);
+
+       return 0;
+}
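
The MTU destination set up above is a range match on packet length: lengths
within [0, mtu] steer to one flow table and everything longer to another,
which is how the post-meter code gets a true/false split without an extra
rule. The shape of that dispatch, with made-up table names standing in for the
post-meter tables:

#include <stdio.h>

struct range_dest {
	unsigned int min, max;	/* inclusive bounds on packet length */
	const char *hit_ft;	/* table for packets inside the range */
	const char *miss_ft;	/* table for packets outside it */
};

static const char *steer(const struct range_dest *d, unsigned int pkt_len)
{
	return (pkt_len >= d->min && pkt_len <= d->max) ? d->hit_ft : d->miss_ft;
}

int main(void)
{
	struct range_dest mtu_dest = {
		.min = 0, .max = 1500,
		.hit_ft = "post_meter_mtu_true",
		.miss_ft = "post_meter_mtu_false",
	};

	printf("64B:   %s\n", steer(&mtu_dest, 64));
	printf("9000B: %s\n", steer(&mtu_dest, 9000));
	return 0;
}
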
+
+static int
 esw_setup_sampler_dest(struct mlx5_flow_destination *dest,
                       struct mlx5_flow_act *flow_act,
                       u32 sampler_id,
@@ -491,6 +507,9 @@ esw_setup_dests(struct mlx5_flow_destination *dest,
        } else if (attr->flags & MLX5_ATTR_FLAG_ACCEPT) {
                esw_setup_accept_dest(dest, flow_act, chains, *i);
                (*i)++;
+       } else if (attr->flags & MLX5_ATTR_FLAG_MTU) {
+               err = esw_setup_mtu_dest(dest, &attr->meter_attr, *i);
+               (*i)++;
        } else if (esw_is_indir_table(esw, attr)) {
                err = esw_setup_indir_table(dest, flow_act, esw, attr, spec, true, i);
        } else if (esw_is_chain_src_port_rewrite(esw, esw_attr)) {
@@ -640,6 +659,11 @@ mlx5_eswitch_add_offloaded_rule(struct mlx5_eswitch *esw,
                goto err_esw_get;
        }
 
+       if (!i) {
+               kfree(dest);
+               dest = NULL;
+       }
+
        if (mlx5_eswitch_termtbl_required(esw, attr, &flow_act, spec))
                rule = mlx5_eswitch_add_termtbl_rule(esw, fdb, spec, esw_attr,
                                                     &flow_act, dest, i);
@@ -3889,7 +3913,7 @@ static int mlx5_esw_query_vport_vhca_id(struct mlx5_eswitch *esw, u16 vport_num,
        if (!query_ctx)
                return -ENOMEM;
 
-       err = mlx5_vport_get_other_func_cap(esw->dev, vport_num, query_ctx);
+       err = mlx5_vport_get_other_func_general_cap(esw->dev, vport_num, query_ctx);
        if (err)
                goto out_free;
 
@@ -4022,3 +4046,212 @@ int mlx5_devlink_port_function_hw_addr_set(struct devlink_port *port,
 
        return mlx5_eswitch_set_vport_mac(esw, vport_num, hw_addr);
 }
+
+static struct mlx5_vport *
+mlx5_devlink_port_fn_get_vport(struct devlink_port *port, struct mlx5_eswitch *esw)
+{
+       u16 vport_num;
+
+       if (!MLX5_CAP_GEN(esw->dev, vhca_resource_manager))
+               return ERR_PTR(-EOPNOTSUPP);
+
+       vport_num = mlx5_esw_devlink_port_index_to_vport_num(port->index);
+       if (!is_port_function_supported(esw, vport_num))
+               return ERR_PTR(-EOPNOTSUPP);
+
+       return mlx5_eswitch_get_vport(esw, vport_num);
+}
+
+int mlx5_devlink_port_fn_migratable_get(struct devlink_port *port, bool *is_enabled,
+                                       struct netlink_ext_ack *extack)
+{
+       struct mlx5_eswitch *esw;
+       struct mlx5_vport *vport;
+       int err = -EOPNOTSUPP;
+
+       esw = mlx5_devlink_eswitch_get(port->devlink);
+       if (IS_ERR(esw))
+               return PTR_ERR(esw);
+
+       if (!MLX5_CAP_GEN(esw->dev, migration)) {
+               NL_SET_ERR_MSG_MOD(extack, "Device doesn't support migration");
+               return err;
+       }
+
+       vport = mlx5_devlink_port_fn_get_vport(port, esw);
+       if (IS_ERR(vport)) {
+               NL_SET_ERR_MSG_MOD(extack, "Invalid port");
+               return PTR_ERR(vport);
+       }
+
+       mutex_lock(&esw->state_lock);
+       if (vport->enabled) {
+               *is_enabled = vport->info.mig_enabled;
+               err = 0;
+       }
+       mutex_unlock(&esw->state_lock);
+       return err;
+}
+
+int mlx5_devlink_port_fn_migratable_set(struct devlink_port *port, bool enable,
+                                       struct netlink_ext_ack *extack)
+{
+       int query_out_sz = MLX5_ST_SZ_BYTES(query_hca_cap_out);
+       struct mlx5_eswitch *esw;
+       struct mlx5_vport *vport;
+       void *query_ctx;
+       void *hca_caps;
+       int err = -EOPNOTSUPP;
+
+       esw = mlx5_devlink_eswitch_get(port->devlink);
+       if (IS_ERR(esw))
+               return PTR_ERR(esw);
+
+       if (!MLX5_CAP_GEN(esw->dev, migration)) {
+               NL_SET_ERR_MSG_MOD(extack, "Device doesn't support migration");
+               return err;
+       }
+
+       vport = mlx5_devlink_port_fn_get_vport(port, esw);
+       if (IS_ERR(vport)) {
+               NL_SET_ERR_MSG_MOD(extack, "Invalid port");
+               return PTR_ERR(vport);
+       }
+
+       mutex_lock(&esw->state_lock);
+       if (!vport->enabled) {
+               NL_SET_ERR_MSG_MOD(extack, "Eswitch vport is disabled");
+               goto out;
+       }
+
+       if (vport->info.mig_enabled == enable) {
+               err = 0;
+               goto out;
+       }
+
+       query_ctx = kzalloc(query_out_sz, GFP_KERNEL);
+       if (!query_ctx) {
+               err = -ENOMEM;
+               goto out;
+       }
+
+       err = mlx5_vport_get_other_func_cap(esw->dev, vport->vport, query_ctx,
+                                           MLX5_CAP_GENERAL_2);
+       if (err) {
+               NL_SET_ERR_MSG_MOD(extack, "Failed getting HCA caps");
+               goto out_free;
+       }
+
+       hca_caps = MLX5_ADDR_OF(query_hca_cap_out, query_ctx, capability);
+       MLX5_SET(cmd_hca_cap_2, hca_caps, migratable, enable);
+
+       err = mlx5_vport_set_other_func_cap(esw->dev, hca_caps, vport->vport,
+                                           MLX5_SET_HCA_CAP_OP_MOD_GENERAL_DEVICE2);
+       if (err) {
+               NL_SET_ERR_MSG_MOD(extack, "Failed setting HCA migratable cap");
+               goto out_free;
+       }
+
+       vport->info.mig_enabled = enable;
+
+out_free:
+       kfree(query_ctx);
+out:
+       mutex_unlock(&esw->state_lock);
+       return err;
+}
+
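+/* The RoCE get/set pair below mirrors the migratable pair above, except
+ * that it reads and writes the roce bit of the general HCA caps.
+ */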
+int mlx5_devlink_port_fn_roce_get(struct devlink_port *port, bool *is_enabled,
+                                 struct netlink_ext_ack *extack)
+{
+       struct mlx5_eswitch *esw;
+       struct mlx5_vport *vport;
+       int err = -EOPNOTSUPP;
+
+       esw = mlx5_devlink_eswitch_get(port->devlink);
+       if (IS_ERR(esw))
+               return PTR_ERR(esw);
+
+       vport = mlx5_devlink_port_fn_get_vport(port, esw);
+       if (IS_ERR(vport)) {
+               NL_SET_ERR_MSG_MOD(extack, "Invalid port");
+               return PTR_ERR(vport);
+       }
+
+       mutex_lock(&esw->state_lock);
+       if (vport->enabled) {
+               *is_enabled = vport->info.roce_enabled;
+               err = 0;
+       }
+       mutex_unlock(&esw->state_lock);
+       return err;
+}
+
+int mlx5_devlink_port_fn_roce_set(struct devlink_port *port, bool enable,
+                                 struct netlink_ext_ack *extack)
+{
+       int query_out_sz = MLX5_ST_SZ_BYTES(query_hca_cap_out);
+       struct mlx5_eswitch *esw;
+       struct mlx5_vport *vport;
+       int err = -EOPNOTSUPP;
+       void *query_ctx;
+       void *hca_caps;
+       u16 vport_num;
+
+       esw = mlx5_devlink_eswitch_get(port->devlink);
+       if (IS_ERR(esw))
+               return PTR_ERR(esw);
+
+       vport = mlx5_devlink_port_fn_get_vport(port, esw);
+       if (IS_ERR(vport)) {
+               NL_SET_ERR_MSG_MOD(extack, "Invalid port");
+               return PTR_ERR(vport);
+       }
+       vport_num = vport->vport;
+
+       mutex_lock(&esw->state_lock);
+       if (!vport->enabled) {
+               NL_SET_ERR_MSG_MOD(extack, "Eswitch vport is disabled");
+               goto out;
+       }
+
+       if (vport->info.roce_enabled == enable) {
+               err = 0;
+               goto out;
+       }
+
+       query_ctx = kzalloc(query_out_sz, GFP_KERNEL);
+       if (!query_ctx) {
+               err = -ENOMEM;
+               goto out;
+       }
+
+       err = mlx5_vport_get_other_func_cap(esw->dev, vport_num, query_ctx,
+                                           MLX5_CAP_GENERAL);
+       if (err) {
+               NL_SET_ERR_MSG_MOD(extack, "Failed getting HCA caps");
+               goto out_free;
+       }
+
+       hca_caps = MLX5_ADDR_OF(query_hca_cap_out, query_ctx, capability);
+       MLX5_SET(cmd_hca_cap, hca_caps, roce, enable);
+
+       err = mlx5_vport_set_other_func_cap(esw->dev, hca_caps, vport_num,
+                                           MLX5_SET_HCA_CAP_OP_MOD_GENERAL_DEVICE);
+       if (err) {
+               NL_SET_ERR_MSG_MOD(extack, "Failed setting HCA roce cap");
+               goto out_free;
+       }
+
+       vport->info.roce_enabled = enable;
+
+out_free:
+       kfree(query_ctx);
+out:
+       mutex_unlock(&esw->state_lock);
+       return err;
+}
index d537492..5a85d8c 100644
 #define ETHTOOL_PRIO_NUM_LEVELS 1
 #define ETHTOOL_NUM_PRIOS 11
 #define ETHTOOL_MIN_LEVEL (KERNEL_MIN_LEVEL + ETHTOOL_NUM_PRIOS)
-/* Promiscuous, Vlan, mac, ttc, inner ttc, {UDP/ANY/aRFS/accel/{esp, esp_err}} */
-#define KERNEL_NIC_PRIO_NUM_LEVELS 7
+/* Promiscuous, Vlan, mac, ttc, inner ttc, {UDP/ANY/aRFS/accel/{esp, esp_err}}, IPsec policy */
+#define KERNEL_NIC_PRIO_NUM_LEVELS 8
 #define KERNEL_NIC_NUM_PRIOS 1
 /* One more level for tc */
 #define KERNEL_MIN_LEVEL (KERNEL_NIC_PRIO_NUM_LEVELS + 1)
 #define LAG_MIN_LEVEL (OFFLOADS_MIN_LEVEL + KERNEL_RX_MACSEC_MIN_LEVEL + 1)
 
 #define KERNEL_TX_IPSEC_NUM_PRIOS  1
-#define KERNEL_TX_IPSEC_NUM_LEVELS 1
+#define KERNEL_TX_IPSEC_NUM_LEVELS 2
 #define KERNEL_TX_IPSEC_MIN_LEVEL        (KERNEL_TX_IPSEC_NUM_LEVELS)
 
 #define KERNEL_TX_MACSEC_NUM_PRIOS  1
@@ -448,7 +448,8 @@ static bool is_fwd_dest_type(enum mlx5_flow_destination_type type)
                type == MLX5_FLOW_DESTINATION_TYPE_UPLINK ||
                type == MLX5_FLOW_DESTINATION_TYPE_VPORT ||
                type == MLX5_FLOW_DESTINATION_TYPE_FLOW_SAMPLER ||
-               type == MLX5_FLOW_DESTINATION_TYPE_TIR;
+               type == MLX5_FLOW_DESTINATION_TYPE_TIR ||
+               type == MLX5_FLOW_DESTINATION_TYPE_RANGE;
 }
 
 static bool check_valid_spec(const struct mlx5_flow_spec *spec)
@@ -1578,7 +1579,13 @@ static bool mlx5_flow_dests_cmp(struct mlx5_flow_destination *d1,
                    (d1->type == MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE_NUM &&
                     d1->ft_num == d2->ft_num) ||
                    (d1->type == MLX5_FLOW_DESTINATION_TYPE_FLOW_SAMPLER &&
-                    d1->sampler_id == d2->sampler_id))
+                    d1->sampler_id == d2->sampler_id) ||
+                   (d1->type == MLX5_FLOW_DESTINATION_TYPE_RANGE &&
+                    d1->range.field == d2->range.field &&
+                    d1->range.hit_ft == d2->range.hit_ft &&
+                    d1->range.miss_ft == d2->range.miss_ft &&
+                    d1->range.min == d2->range.min &&
+                    d1->range.max == d2->range.max))
                        return true;
        }
 
@@ -1962,6 +1969,9 @@ _mlx5_add_flow_rules(struct mlx5_flow_table *ft,
        if (flow_act->fg && ft->autogroup.active)
                return ERR_PTR(-EINVAL);
 
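+       /* A destination array with no valid entries is a malformed request */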
+       if (dest && dest_num <= 0)
+               return ERR_PTR(-EINVAL);
+
        for (i = 0; i < dest_num; i++) {
                if (!dest_is_valid(&dest[i], flow_act, ft))
                        return ERR_PTR(-EINVAL);
index 3af50fd..f137a06 100644
@@ -123,6 +123,7 @@ enum mlx5_flow_steering_mode {
 enum mlx5_flow_steering_capabilty {
        MLX5_FLOW_STEERING_CAP_VLAN_PUSH_ON_RX = 1UL << 0,
        MLX5_FLOW_STEERING_CAP_VLAN_POP_ON_TX = 1UL << 1,
+       MLX5_FLOW_STEERING_CAP_MATCH_RANGES = 1UL << 2,
 };
 
 struct mlx5_flow_steering {
index 0259a14..d9fcb9e 100644
@@ -118,13 +118,41 @@ struct mlx5_fib_event_work {
        };
 };
 
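+/* Return the netdev of the next fib nexthop that is a port of this LAG,
+ * scanning from the nexthop that follows @current_dev (or from the first
+ * nexthop when @current_dev is NULL). Returns NULL when none is found.
+ */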
+static struct net_device *
+mlx5_lag_get_next_fib_dev(struct mlx5_lag *ldev,
+                         struct fib_info *fi,
+                         struct net_device *current_dev)
+{
+       struct net_device *fib_dev;
+       int i, ldev_idx, nhs;
+
+       nhs = fib_info_num_path(fi);
+       i = 0;
+       if (current_dev) {
+               for (; i < nhs; i++) {
+                       fib_dev = fib_info_nh(fi, i)->fib_nh_dev;
+                       if (fib_dev == current_dev) {
+                               i++;
+                               break;
+                       }
+               }
+       }
+       for (; i < nhs; i++) {
+               fib_dev = fib_info_nh(fi, i)->fib_nh_dev;
+               ldev_idx = mlx5_lag_dev_get_netdev_idx(ldev, fib_dev);
+               if (ldev_idx >= 0)
+                       return ldev->pf[ldev_idx].netdev;
+       }
+
+       return NULL;
+}
+
 static void mlx5_lag_fib_route_event(struct mlx5_lag *ldev, unsigned long event,
                                     struct fib_entry_notifier_info *fen_info)
 {
+       struct net_device *nh_dev0, *nh_dev1;
        struct fib_info *fi = fen_info->fi;
        struct lag_mp *mp = &ldev->lag_mp;
-       struct fib_nh *fib_nh0, *fib_nh1;
-       unsigned int nhs;
 
        /* Handle delete event */
        if (event == FIB_EVENT_ENTRY_DEL) {
@@ -140,16 +168,25 @@ static void mlx5_lag_fib_route_event(struct mlx5_lag *ldev, unsigned long event,
            fi->fib_priority >= mp->fib.priority)
                return;
 
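+       /* Find the first two nexthop devices that belong to this LAG */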
+       nh_dev0 = mlx5_lag_get_next_fib_dev(ldev, fi, NULL);
+       nh_dev1 = mlx5_lag_get_next_fib_dev(ldev, fi, nh_dev0);
+
        /* Handle add/replace event */
-       nhs = fib_info_num_path(fi);
-       if (nhs == 1) {
-               if (__mlx5_lag_is_active(ldev)) {
-                       struct fib_nh *nh = fib_info_nh(fi, 0);
-                       struct net_device *nh_dev = nh->fib_nh_dev;
-                       int i = mlx5_lag_dev_get_netdev_idx(ldev, nh_dev);
+       if (!nh_dev0) {
+               if (mp->fib.dst == fen_info->dst && mp->fib.dst_len == fen_info->dst_len)
+                       mp->fib.mfi = NULL;
+               return;
+       }
 
-                       if (i < 0)
-                               return;
+       if (nh_dev0 == nh_dev1) {
+               mlx5_core_warn(ldev->pf[MLX5_LAG_P1].dev,
+                              "Multipath offload doesn't support routes with multiple nexthops of the same device\n");
+               return;
+       }
+
+       if (!nh_dev1) {
+               if (__mlx5_lag_is_active(ldev)) {
+                       int i = mlx5_lag_dev_get_netdev_idx(ldev, nh_dev0);
 
                        i++;
                        mlx5_lag_set_port_affinity(ldev, i);
@@ -159,21 +196,6 @@ static void mlx5_lag_fib_route_event(struct mlx5_lag *ldev, unsigned long event,
                return;
        }
 
-       if (nhs != 2)
-               return;
-
-       /* Verify next hops are ports of the same hca */
-       fib_nh0 = fib_info_nh(fi, 0);
-       fib_nh1 = fib_info_nh(fi, 1);
-       if (!(fib_nh0->fib_nh_dev == ldev->pf[MLX5_LAG_P1].netdev &&
-             fib_nh1->fib_nh_dev == ldev->pf[MLX5_LAG_P2].netdev) &&
-           !(fib_nh0->fib_nh_dev == ldev->pf[MLX5_LAG_P2].netdev &&
-             fib_nh1->fib_nh_dev == ldev->pf[MLX5_LAG_P1].netdev)) {
-               mlx5_core_warn(ldev->pf[MLX5_LAG_P1].dev,
-                              "Multipath offload require two ports of the same HCA\n");
-               return;
-       }
-
        /* First time we see multipath route */
        if (!mp->fib.mfi && !__mlx5_lag_is_active(ldev)) {
                struct lag_tracker tracker;
@@ -268,7 +290,6 @@ static int mlx5_lag_fib_event(struct notifier_block *nb,
        struct mlx5_fib_event_work *fib_work;
        struct fib_entry_notifier_info *fen_info;
        struct fib_nh_notifier_info *fnh_info;
-       struct net_device *fib_dev;
        struct fib_info *fi;
 
        if (info->family != AF_INET)
@@ -285,11 +306,7 @@ static int mlx5_lag_fib_event(struct notifier_block *nb,
                fi = fen_info->fi;
                if (fi->nh)
                        return NOTIFY_DONE;
-               fib_dev = fib_info_nh(fen_info->fi, 0)->fib_nh_dev;
-               if (fib_dev != ldev->pf[MLX5_LAG_P1].netdev &&
-                   fib_dev != ldev->pf[MLX5_LAG_P2].netdev) {
-                       return NOTIFY_DONE;
-               }
+
                fib_work = mlx5_lag_init_fib_work(ldev, event);
                if (!fib_work)
                        return NOTIFY_DONE;
index 0f9e4f0..5a80fb7 100644
@@ -353,12 +353,15 @@ void mlx5_aso_build_wqe(struct mlx5_aso *aso, u8 ds_cnt,
        cseg->general_id = cpu_to_be32(obj_id);
 }
 
-void *mlx5_aso_get_wqe(struct mlx5_aso *aso)
+struct mlx5_aso_wqe *mlx5_aso_get_wqe(struct mlx5_aso *aso)
 {
+       struct mlx5_aso_wqe *wqe;
        u16 pi;
 
        pi = mlx5_wq_cyc_ctr2ix(&aso->wq, aso->pc);
-       return mlx5_wq_cyc_get_wqe(&aso->wq, pi);
+       wqe = mlx5_wq_cyc_get_wqe(&aso->wq, pi);
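+       /* Return a zeroed WQE so callers never see stale descriptor content */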
+       memset(wqe, 0, sizeof(*wqe));
+       return wqe;
 }
 
 void mlx5_aso_post_wqe(struct mlx5_aso *aso, bool with_data,
index 2d40dcf..afb078b 100644
@@ -15,6 +15,7 @@
 #define MLX5_WQE_CTRL_WQE_OPC_MOD_SHIFT 24
 #define MLX5_MACSEC_ASO_DS_CNT (DIV_ROUND_UP(sizeof(struct mlx5_aso_wqe), MLX5_SEND_WQE_DS))
 
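+/* read_enable flag, carried in the low bit of va_l */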
+#define ASO_CTRL_READ_EN BIT(0)
 struct mlx5_wqe_aso_ctrl_seg {
        __be32  va_h;
        __be32  va_l; /* include read_enable */
@@ -71,13 +72,14 @@ enum {
 };
 
 enum {
+       MLX5_ACCESS_ASO_OPC_MOD_IPSEC = 0x0,
        MLX5_ACCESS_ASO_OPC_MOD_FLOW_METER = 0x2,
        MLX5_ACCESS_ASO_OPC_MOD_MACSEC = 0x5,
 };
 
 struct mlx5_aso;
 
-void *mlx5_aso_get_wqe(struct mlx5_aso *aso);
+struct mlx5_aso_wqe *mlx5_aso_get_wqe(struct mlx5_aso *aso);
 void mlx5_aso_build_wqe(struct mlx5_aso *aso, u8 ds_cnt,
                        struct mlx5_aso_wqe *aso_wqe,
                        u32 obj_id, u32 opc_mode);
index a806e3d..029305a 100644
@@ -324,7 +324,10 @@ void mlx5_unload_one_devl_locked(struct mlx5_core_dev *dev);
 int mlx5_load_one(struct mlx5_core_dev *dev, bool recovery);
 int mlx5_load_one_devl_locked(struct mlx5_core_dev *dev, bool recovery);
 
-int mlx5_vport_get_other_func_cap(struct mlx5_core_dev *dev, u16 function_id, void *out);
+int mlx5_vport_set_other_func_cap(struct mlx5_core_dev *dev, const void *hca_cap, u16 function_id,
+                                 u16 opmod);
+#define mlx5_vport_get_other_func_general_cap(dev, fid, out)           \
+       mlx5_vport_get_other_func_cap(dev, fid, out, MLX5_CAP_GENERAL)
 
 void mlx5_events_work_enqueue(struct mlx5_core_dev *dev, struct work_struct *work);
 static inline u32 mlx5_sriov_get_vf_total_msix(struct pci_dev *pdev)
index 662f1d5..6bde18b 100644
@@ -4,6 +4,7 @@
 #include <linux/interrupt.h>
 #include <linux/notifier.h>
 #include <linux/mlx5/driver.h>
+#include <linux/mlx5/vport.h>
 #include "mlx5_core.h"
 #include "mlx5_irq.h"
 #include "pci_irq.h"
@@ -101,7 +102,7 @@ int mlx5_set_msix_vec_count(struct mlx5_core_dev *dev, int function_id,
                goto out;
        }
 
-       ret = mlx5_vport_get_other_func_cap(dev, function_id, query_cap);
+       ret = mlx5_vport_get_other_func_general_cap(dev, function_id, query_cap);
        if (ret)
                goto out;
 
index b1dfad2..ee104cf 100644
@@ -44,6 +44,7 @@ static const char * const action_type_to_str[] = {
        [DR_ACTION_TYP_INSERT_HDR] = "DR_ACTION_TYP_INSERT_HDR",
        [DR_ACTION_TYP_REMOVE_HDR] = "DR_ACTION_TYP_REMOVE_HDR",
        [DR_ACTION_TYP_ASO_FLOW_METER] = "DR_ACTION_TYP_ASO_FLOW_METER",
+       [DR_ACTION_TYP_RANGE] = "DR_ACTION_TYP_RANGE",
        [DR_ACTION_TYP_MAX] = "DR_ACTION_UNKNOWN",
 };
 
@@ -61,6 +62,7 @@ next_action_state[DR_ACTION_DOMAIN_MAX][DR_ACTION_STATE_MAX][DR_ACTION_TYP_MAX]
                        [DR_ACTION_TYP_DROP]            = DR_ACTION_STATE_TERM,
                        [DR_ACTION_TYP_QP]              = DR_ACTION_STATE_TERM,
                        [DR_ACTION_TYP_FT]              = DR_ACTION_STATE_TERM,
+                       [DR_ACTION_TYP_RANGE]           = DR_ACTION_STATE_TERM,
                        [DR_ACTION_TYP_SAMPLER]         = DR_ACTION_STATE_TERM,
                        [DR_ACTION_TYP_TAG]             = DR_ACTION_STATE_NON_TERM,
                        [DR_ACTION_TYP_CTR]             = DR_ACTION_STATE_NON_TERM,
@@ -79,6 +81,7 @@ next_action_state[DR_ACTION_DOMAIN_MAX][DR_ACTION_STATE_MAX][DR_ACTION_TYP_MAX]
                        [DR_ACTION_TYP_DROP]            = DR_ACTION_STATE_TERM,
                        [DR_ACTION_TYP_QP]              = DR_ACTION_STATE_TERM,
                        [DR_ACTION_TYP_FT]              = DR_ACTION_STATE_TERM,
+                       [DR_ACTION_TYP_RANGE]           = DR_ACTION_STATE_TERM,
                        [DR_ACTION_TYP_SAMPLER]         = DR_ACTION_STATE_TERM,
                        [DR_ACTION_TYP_TAG]             = DR_ACTION_STATE_DECAP,
                        [DR_ACTION_TYP_CTR]             = DR_ACTION_STATE_DECAP,
@@ -94,6 +97,7 @@ next_action_state[DR_ACTION_DOMAIN_MAX][DR_ACTION_STATE_MAX][DR_ACTION_TYP_MAX]
                        [DR_ACTION_TYP_DROP]            = DR_ACTION_STATE_TERM,
                        [DR_ACTION_TYP_QP]              = DR_ACTION_STATE_TERM,
                        [DR_ACTION_TYP_FT]              = DR_ACTION_STATE_TERM,
+                       [DR_ACTION_TYP_RANGE]           = DR_ACTION_STATE_TERM,
                        [DR_ACTION_TYP_SAMPLER]         = DR_ACTION_STATE_TERM,
                        [DR_ACTION_TYP_TAG]             = DR_ACTION_STATE_ENCAP,
                        [DR_ACTION_TYP_CTR]             = DR_ACTION_STATE_ENCAP,
@@ -103,6 +107,7 @@ next_action_state[DR_ACTION_DOMAIN_MAX][DR_ACTION_STATE_MAX][DR_ACTION_TYP_MAX]
                        [DR_ACTION_TYP_DROP]            = DR_ACTION_STATE_TERM,
                        [DR_ACTION_TYP_QP]              = DR_ACTION_STATE_TERM,
                        [DR_ACTION_TYP_FT]              = DR_ACTION_STATE_TERM,
+                       [DR_ACTION_TYP_RANGE]           = DR_ACTION_STATE_TERM,
                        [DR_ACTION_TYP_SAMPLER]         = DR_ACTION_STATE_TERM,
                        [DR_ACTION_TYP_TAG]             = DR_ACTION_STATE_MODIFY_HDR,
                        [DR_ACTION_TYP_CTR]             = DR_ACTION_STATE_MODIFY_HDR,
@@ -116,6 +121,7 @@ next_action_state[DR_ACTION_DOMAIN_MAX][DR_ACTION_STATE_MAX][DR_ACTION_TYP_MAX]
                        [DR_ACTION_TYP_DROP]            = DR_ACTION_STATE_TERM,
                        [DR_ACTION_TYP_QP]              = DR_ACTION_STATE_TERM,
                        [DR_ACTION_TYP_FT]              = DR_ACTION_STATE_TERM,
+                       [DR_ACTION_TYP_RANGE]           = DR_ACTION_STATE_TERM,
                        [DR_ACTION_TYP_SAMPLER]         = DR_ACTION_STATE_TERM,
                        [DR_ACTION_TYP_TAG]             = DR_ACTION_STATE_POP_VLAN,
                        [DR_ACTION_TYP_CTR]             = DR_ACTION_STATE_POP_VLAN,
@@ -129,6 +135,7 @@ next_action_state[DR_ACTION_DOMAIN_MAX][DR_ACTION_STATE_MAX][DR_ACTION_TYP_MAX]
                [DR_ACTION_STATE_PUSH_VLAN] = {
                        [DR_ACTION_TYP_QP]              = DR_ACTION_STATE_TERM,
                        [DR_ACTION_TYP_FT]              = DR_ACTION_STATE_TERM,
+                       [DR_ACTION_TYP_RANGE]           = DR_ACTION_STATE_TERM,
                        [DR_ACTION_TYP_SAMPLER]         = DR_ACTION_STATE_TERM,
                        [DR_ACTION_TYP_TAG]             = DR_ACTION_STATE_PUSH_VLAN,
                        [DR_ACTION_TYP_CTR]             = DR_ACTION_STATE_PUSH_VLAN,
@@ -141,6 +148,7 @@ next_action_state[DR_ACTION_DOMAIN_MAX][DR_ACTION_STATE_MAX][DR_ACTION_TYP_MAX]
                        [DR_ACTION_TYP_DROP]            = DR_ACTION_STATE_TERM,
                        [DR_ACTION_TYP_QP]              = DR_ACTION_STATE_TERM,
                        [DR_ACTION_TYP_FT]              = DR_ACTION_STATE_TERM,
+                       [DR_ACTION_TYP_RANGE]           = DR_ACTION_STATE_TERM,
                        [DR_ACTION_TYP_SAMPLER]         = DR_ACTION_STATE_TERM,
                        [DR_ACTION_TYP_TAG]             = DR_ACTION_STATE_NON_TERM,
                        [DR_ACTION_TYP_CTR]             = DR_ACTION_STATE_NON_TERM,
@@ -159,6 +167,7 @@ next_action_state[DR_ACTION_DOMAIN_MAX][DR_ACTION_STATE_MAX][DR_ACTION_TYP_MAX]
                        [DR_ACTION_TYP_DROP]            = DR_ACTION_STATE_TERM,
                        [DR_ACTION_TYP_QP]              = DR_ACTION_STATE_TERM,
                        [DR_ACTION_TYP_FT]              = DR_ACTION_STATE_TERM,
+                       [DR_ACTION_TYP_RANGE]           = DR_ACTION_STATE_TERM,
                        [DR_ACTION_TYP_CTR]             = DR_ACTION_STATE_ASO,
                },
                [DR_ACTION_STATE_TERM] = {
@@ -169,6 +178,7 @@ next_action_state[DR_ACTION_DOMAIN_MAX][DR_ACTION_STATE_MAX][DR_ACTION_TYP_MAX]
                [DR_ACTION_STATE_NO_ACTION] = {
                        [DR_ACTION_TYP_DROP]            = DR_ACTION_STATE_TERM,
                        [DR_ACTION_TYP_FT]              = DR_ACTION_STATE_TERM,
+                       [DR_ACTION_TYP_RANGE]           = DR_ACTION_STATE_TERM,
                        [DR_ACTION_TYP_SAMPLER]         = DR_ACTION_STATE_TERM,
                        [DR_ACTION_TYP_CTR]             = DR_ACTION_STATE_NON_TERM,
                        [DR_ACTION_TYP_L2_TO_TNL_L2]    = DR_ACTION_STATE_ENCAP,
@@ -183,6 +193,7 @@ next_action_state[DR_ACTION_DOMAIN_MAX][DR_ACTION_STATE_MAX][DR_ACTION_TYP_MAX]
                [DR_ACTION_STATE_DECAP] = {
                        [DR_ACTION_TYP_DROP]            = DR_ACTION_STATE_TERM,
                        [DR_ACTION_TYP_FT]              = DR_ACTION_STATE_TERM,
+                       [DR_ACTION_TYP_RANGE]           = DR_ACTION_STATE_TERM,
                        [DR_ACTION_TYP_SAMPLER]         = DR_ACTION_STATE_TERM,
                        [DR_ACTION_TYP_CTR]             = DR_ACTION_STATE_DECAP,
                        [DR_ACTION_TYP_ASO_FLOW_METER]  = DR_ACTION_STATE_ASO,
@@ -190,6 +201,7 @@ next_action_state[DR_ACTION_DOMAIN_MAX][DR_ACTION_STATE_MAX][DR_ACTION_TYP_MAX]
                [DR_ACTION_STATE_ENCAP] = {
                        [DR_ACTION_TYP_DROP]            = DR_ACTION_STATE_TERM,
                        [DR_ACTION_TYP_FT]              = DR_ACTION_STATE_TERM,
+                       [DR_ACTION_TYP_RANGE]           = DR_ACTION_STATE_TERM,
                        [DR_ACTION_TYP_SAMPLER]         = DR_ACTION_STATE_TERM,
                        [DR_ACTION_TYP_CTR]             = DR_ACTION_STATE_ENCAP,
                        [DR_ACTION_TYP_ASO_FLOW_METER]  = DR_ACTION_STATE_ASO,
@@ -197,6 +209,7 @@ next_action_state[DR_ACTION_DOMAIN_MAX][DR_ACTION_STATE_MAX][DR_ACTION_TYP_MAX]
                [DR_ACTION_STATE_MODIFY_HDR] = {
                        [DR_ACTION_TYP_DROP]            = DR_ACTION_STATE_TERM,
                        [DR_ACTION_TYP_FT]              = DR_ACTION_STATE_TERM,
+                       [DR_ACTION_TYP_RANGE]           = DR_ACTION_STATE_TERM,
                        [DR_ACTION_TYP_SAMPLER]         = DR_ACTION_STATE_TERM,
                        [DR_ACTION_TYP_CTR]             = DR_ACTION_STATE_MODIFY_HDR,
                        [DR_ACTION_TYP_L2_TO_TNL_L2]    = DR_ACTION_STATE_ENCAP,
@@ -207,6 +220,7 @@ next_action_state[DR_ACTION_DOMAIN_MAX][DR_ACTION_STATE_MAX][DR_ACTION_TYP_MAX]
                },
                [DR_ACTION_STATE_POP_VLAN] = {
                        [DR_ACTION_TYP_FT]              = DR_ACTION_STATE_TERM,
+                       [DR_ACTION_TYP_RANGE]           = DR_ACTION_STATE_TERM,
                        [DR_ACTION_TYP_SAMPLER]         = DR_ACTION_STATE_TERM,
                        [DR_ACTION_TYP_CTR]             = DR_ACTION_STATE_POP_VLAN,
                        [DR_ACTION_TYP_POP_VLAN]        = DR_ACTION_STATE_POP_VLAN,
@@ -220,6 +234,7 @@ next_action_state[DR_ACTION_DOMAIN_MAX][DR_ACTION_STATE_MAX][DR_ACTION_TYP_MAX]
                [DR_ACTION_STATE_PUSH_VLAN] = {
                        [DR_ACTION_TYP_DROP]            = DR_ACTION_STATE_TERM,
                        [DR_ACTION_TYP_FT]              = DR_ACTION_STATE_TERM,
+                       [DR_ACTION_TYP_RANGE]           = DR_ACTION_STATE_TERM,
                        [DR_ACTION_TYP_SAMPLER]         = DR_ACTION_STATE_TERM,
                        [DR_ACTION_TYP_CTR]             = DR_ACTION_STATE_PUSH_VLAN,
                        [DR_ACTION_TYP_PUSH_VLAN]       = DR_ACTION_STATE_PUSH_VLAN,
@@ -231,6 +246,7 @@ next_action_state[DR_ACTION_DOMAIN_MAX][DR_ACTION_STATE_MAX][DR_ACTION_TYP_MAX]
                [DR_ACTION_STATE_NON_TERM] = {
                        [DR_ACTION_TYP_DROP]            = DR_ACTION_STATE_TERM,
                        [DR_ACTION_TYP_FT]              = DR_ACTION_STATE_TERM,
+                       [DR_ACTION_TYP_RANGE]           = DR_ACTION_STATE_TERM,
                        [DR_ACTION_TYP_SAMPLER]         = DR_ACTION_STATE_TERM,
                        [DR_ACTION_TYP_CTR]             = DR_ACTION_STATE_NON_TERM,
                        [DR_ACTION_TYP_L2_TO_TNL_L2]    = DR_ACTION_STATE_ENCAP,
@@ -250,6 +266,7 @@ next_action_state[DR_ACTION_DOMAIN_MAX][DR_ACTION_STATE_MAX][DR_ACTION_TYP_MAX]
                        [DR_ACTION_TYP_CTR]             = DR_ACTION_STATE_ASO,
                        [DR_ACTION_TYP_DROP]            = DR_ACTION_STATE_TERM,
                        [DR_ACTION_TYP_FT]              = DR_ACTION_STATE_TERM,
+                       [DR_ACTION_TYP_RANGE]           = DR_ACTION_STATE_TERM,
                },
                [DR_ACTION_STATE_TERM] = {
                        [DR_ACTION_TYP_CTR]             = DR_ACTION_STATE_TERM,
@@ -259,6 +276,7 @@ next_action_state[DR_ACTION_DOMAIN_MAX][DR_ACTION_STATE_MAX][DR_ACTION_TYP_MAX]
                [DR_ACTION_STATE_NO_ACTION] = {
                        [DR_ACTION_TYP_DROP]            = DR_ACTION_STATE_TERM,
                        [DR_ACTION_TYP_FT]              = DR_ACTION_STATE_TERM,
+                       [DR_ACTION_TYP_RANGE]           = DR_ACTION_STATE_TERM,
                        [DR_ACTION_TYP_SAMPLER]         = DR_ACTION_STATE_TERM,
                        [DR_ACTION_TYP_CTR]             = DR_ACTION_STATE_NON_TERM,
                        [DR_ACTION_TYP_TNL_L2_TO_L2]    = DR_ACTION_STATE_DECAP,
@@ -276,6 +294,7 @@ next_action_state[DR_ACTION_DOMAIN_MAX][DR_ACTION_STATE_MAX][DR_ACTION_TYP_MAX]
                [DR_ACTION_STATE_DECAP] = {
                        [DR_ACTION_TYP_DROP]            = DR_ACTION_STATE_TERM,
                        [DR_ACTION_TYP_FT]              = DR_ACTION_STATE_TERM,
+                       [DR_ACTION_TYP_RANGE]           = DR_ACTION_STATE_TERM,
                        [DR_ACTION_TYP_CTR]             = DR_ACTION_STATE_DECAP,
                        [DR_ACTION_TYP_SAMPLER]         = DR_ACTION_STATE_TERM,
                        [DR_ACTION_TYP_MODIFY_HDR]      = DR_ACTION_STATE_MODIFY_HDR,
@@ -291,6 +310,7 @@ next_action_state[DR_ACTION_DOMAIN_MAX][DR_ACTION_STATE_MAX][DR_ACTION_TYP_MAX]
                        [DR_ACTION_TYP_DROP]            = DR_ACTION_STATE_TERM,
                        [DR_ACTION_TYP_QP]              = DR_ACTION_STATE_TERM,
                        [DR_ACTION_TYP_FT]              = DR_ACTION_STATE_TERM,
+                       [DR_ACTION_TYP_RANGE]           = DR_ACTION_STATE_TERM,
                        [DR_ACTION_TYP_VPORT]           = DR_ACTION_STATE_TERM,
                        [DR_ACTION_TYP_SAMPLER]         = DR_ACTION_STATE_TERM,
                        [DR_ACTION_TYP_CTR]             = DR_ACTION_STATE_ENCAP,
@@ -299,6 +319,7 @@ next_action_state[DR_ACTION_DOMAIN_MAX][DR_ACTION_STATE_MAX][DR_ACTION_TYP_MAX]
                [DR_ACTION_STATE_MODIFY_HDR] = {
                        [DR_ACTION_TYP_DROP]            = DR_ACTION_STATE_TERM,
                        [DR_ACTION_TYP_FT]              = DR_ACTION_STATE_TERM,
+                       [DR_ACTION_TYP_RANGE]           = DR_ACTION_STATE_TERM,
                        [DR_ACTION_TYP_SAMPLER]         = DR_ACTION_STATE_TERM,
                        [DR_ACTION_TYP_CTR]             = DR_ACTION_STATE_MODIFY_HDR,
                        [DR_ACTION_TYP_VPORT]           = DR_ACTION_STATE_TERM,
@@ -311,6 +332,7 @@ next_action_state[DR_ACTION_DOMAIN_MAX][DR_ACTION_STATE_MAX][DR_ACTION_TYP_MAX]
                [DR_ACTION_STATE_POP_VLAN] = {
                        [DR_ACTION_TYP_DROP]            = DR_ACTION_STATE_TERM,
                        [DR_ACTION_TYP_FT]              = DR_ACTION_STATE_TERM,
+                       [DR_ACTION_TYP_RANGE]           = DR_ACTION_STATE_TERM,
                        [DR_ACTION_TYP_SAMPLER]         = DR_ACTION_STATE_TERM,
                        [DR_ACTION_TYP_POP_VLAN]        = DR_ACTION_STATE_POP_VLAN,
                        [DR_ACTION_TYP_CTR]             = DR_ACTION_STATE_POP_VLAN,
@@ -324,6 +346,7 @@ next_action_state[DR_ACTION_DOMAIN_MAX][DR_ACTION_STATE_MAX][DR_ACTION_TYP_MAX]
                [DR_ACTION_STATE_PUSH_VLAN] = {
                        [DR_ACTION_TYP_DROP]            = DR_ACTION_STATE_TERM,
                        [DR_ACTION_TYP_FT]              = DR_ACTION_STATE_TERM,
+                       [DR_ACTION_TYP_RANGE]           = DR_ACTION_STATE_TERM,
                        [DR_ACTION_TYP_SAMPLER]         = DR_ACTION_STATE_TERM,
                        [DR_ACTION_TYP_PUSH_VLAN]       = DR_ACTION_STATE_PUSH_VLAN,
                        [DR_ACTION_TYP_CTR]             = DR_ACTION_STATE_PUSH_VLAN,
@@ -337,6 +360,7 @@ next_action_state[DR_ACTION_DOMAIN_MAX][DR_ACTION_STATE_MAX][DR_ACTION_TYP_MAX]
                [DR_ACTION_STATE_NON_TERM] = {
                        [DR_ACTION_TYP_DROP]            = DR_ACTION_STATE_TERM,
                        [DR_ACTION_TYP_FT]              = DR_ACTION_STATE_TERM,
+                       [DR_ACTION_TYP_RANGE]           = DR_ACTION_STATE_TERM,
                        [DR_ACTION_TYP_SAMPLER]         = DR_ACTION_STATE_TERM,
                        [DR_ACTION_TYP_CTR]             = DR_ACTION_STATE_NON_TERM,
                        [DR_ACTION_TYP_TNL_L2_TO_L2]    = DR_ACTION_STATE_DECAP,
@@ -354,6 +378,7 @@ next_action_state[DR_ACTION_DOMAIN_MAX][DR_ACTION_STATE_MAX][DR_ACTION_TYP_MAX]
                [DR_ACTION_STATE_ASO] = {
                        [DR_ACTION_TYP_DROP]            = DR_ACTION_STATE_TERM,
                        [DR_ACTION_TYP_FT]              = DR_ACTION_STATE_TERM,
+                       [DR_ACTION_TYP_RANGE]           = DR_ACTION_STATE_TERM,
                        [DR_ACTION_TYP_VPORT]           = DR_ACTION_STATE_TERM,
                        [DR_ACTION_TYP_CTR]             = DR_ACTION_STATE_ASO,
                },
@@ -365,6 +390,7 @@ next_action_state[DR_ACTION_DOMAIN_MAX][DR_ACTION_STATE_MAX][DR_ACTION_TYP_MAX]
                [DR_ACTION_STATE_NO_ACTION] = {
                        [DR_ACTION_TYP_DROP]            = DR_ACTION_STATE_TERM,
                        [DR_ACTION_TYP_FT]              = DR_ACTION_STATE_TERM,
+                       [DR_ACTION_TYP_RANGE]           = DR_ACTION_STATE_TERM,
                        [DR_ACTION_TYP_SAMPLER]         = DR_ACTION_STATE_TERM,
                        [DR_ACTION_TYP_CTR]             = DR_ACTION_STATE_NON_TERM,
                        [DR_ACTION_TYP_MODIFY_HDR]      = DR_ACTION_STATE_MODIFY_HDR,
@@ -380,6 +406,7 @@ next_action_state[DR_ACTION_DOMAIN_MAX][DR_ACTION_STATE_MAX][DR_ACTION_TYP_MAX]
                [DR_ACTION_STATE_DECAP] = {
                        [DR_ACTION_TYP_DROP]            = DR_ACTION_STATE_TERM,
                        [DR_ACTION_TYP_FT]              = DR_ACTION_STATE_TERM,
+                       [DR_ACTION_TYP_RANGE]           = DR_ACTION_STATE_TERM,
                        [DR_ACTION_TYP_CTR]             = DR_ACTION_STATE_DECAP,
                        [DR_ACTION_TYP_VPORT]           = DR_ACTION_STATE_TERM,
                        [DR_ACTION_TYP_SAMPLER]         = DR_ACTION_STATE_TERM,
@@ -388,6 +415,7 @@ next_action_state[DR_ACTION_DOMAIN_MAX][DR_ACTION_STATE_MAX][DR_ACTION_TYP_MAX]
                [DR_ACTION_STATE_ENCAP] = {
                        [DR_ACTION_TYP_DROP]            = DR_ACTION_STATE_TERM,
                        [DR_ACTION_TYP_FT]              = DR_ACTION_STATE_TERM,
+                       [DR_ACTION_TYP_RANGE]           = DR_ACTION_STATE_TERM,
                        [DR_ACTION_TYP_CTR]             = DR_ACTION_STATE_ENCAP,
                        [DR_ACTION_TYP_SAMPLER]         = DR_ACTION_STATE_TERM,
                        [DR_ACTION_TYP_VPORT]           = DR_ACTION_STATE_TERM,
@@ -396,6 +424,7 @@ next_action_state[DR_ACTION_DOMAIN_MAX][DR_ACTION_STATE_MAX][DR_ACTION_TYP_MAX]
                [DR_ACTION_STATE_MODIFY_HDR] = {
                        [DR_ACTION_TYP_DROP]            = DR_ACTION_STATE_TERM,
                        [DR_ACTION_TYP_FT]              = DR_ACTION_STATE_TERM,
+                       [DR_ACTION_TYP_RANGE]           = DR_ACTION_STATE_TERM,
                        [DR_ACTION_TYP_SAMPLER]         = DR_ACTION_STATE_TERM,
                        [DR_ACTION_TYP_CTR]             = DR_ACTION_STATE_MODIFY_HDR,
                        [DR_ACTION_TYP_L2_TO_TNL_L2]    = DR_ACTION_STATE_ENCAP,
@@ -407,6 +436,7 @@ next_action_state[DR_ACTION_DOMAIN_MAX][DR_ACTION_STATE_MAX][DR_ACTION_TYP_MAX]
                },
                [DR_ACTION_STATE_POP_VLAN] = {
                        [DR_ACTION_TYP_FT]              = DR_ACTION_STATE_TERM,
+                       [DR_ACTION_TYP_RANGE]           = DR_ACTION_STATE_TERM,
                        [DR_ACTION_TYP_SAMPLER]         = DR_ACTION_STATE_TERM,
                        [DR_ACTION_TYP_CTR]             = DR_ACTION_STATE_POP_VLAN,
                        [DR_ACTION_TYP_POP_VLAN]        = DR_ACTION_STATE_POP_VLAN,
@@ -421,6 +451,7 @@ next_action_state[DR_ACTION_DOMAIN_MAX][DR_ACTION_STATE_MAX][DR_ACTION_TYP_MAX]
                [DR_ACTION_STATE_PUSH_VLAN] = {
                        [DR_ACTION_TYP_DROP]            = DR_ACTION_STATE_TERM,
                        [DR_ACTION_TYP_FT]              = DR_ACTION_STATE_TERM,
+                       [DR_ACTION_TYP_RANGE]           = DR_ACTION_STATE_TERM,
                        [DR_ACTION_TYP_SAMPLER]         = DR_ACTION_STATE_TERM,
                        [DR_ACTION_TYP_PUSH_VLAN]       = DR_ACTION_STATE_PUSH_VLAN,
                        [DR_ACTION_TYP_CTR]             = DR_ACTION_STATE_PUSH_VLAN,
@@ -433,6 +464,7 @@ next_action_state[DR_ACTION_DOMAIN_MAX][DR_ACTION_STATE_MAX][DR_ACTION_TYP_MAX]
                [DR_ACTION_STATE_NON_TERM] = {
                        [DR_ACTION_TYP_DROP]            = DR_ACTION_STATE_TERM,
                        [DR_ACTION_TYP_FT]              = DR_ACTION_STATE_TERM,
+                       [DR_ACTION_TYP_RANGE]           = DR_ACTION_STATE_TERM,
                        [DR_ACTION_TYP_SAMPLER]         = DR_ACTION_STATE_TERM,
                        [DR_ACTION_TYP_CTR]             = DR_ACTION_STATE_NON_TERM,
                        [DR_ACTION_TYP_MODIFY_HDR]      = DR_ACTION_STATE_MODIFY_HDR,
@@ -452,6 +484,7 @@ next_action_state[DR_ACTION_DOMAIN_MAX][DR_ACTION_STATE_MAX][DR_ACTION_TYP_MAX]
                        [DR_ACTION_TYP_PUSH_VLAN]       = DR_ACTION_STATE_PUSH_VLAN,
                        [DR_ACTION_TYP_DROP]            = DR_ACTION_STATE_TERM,
                        [DR_ACTION_TYP_FT]              = DR_ACTION_STATE_TERM,
+                       [DR_ACTION_TYP_RANGE]           = DR_ACTION_STATE_TERM,
                        [DR_ACTION_TYP_VPORT]           = DR_ACTION_STATE_TERM,
                        [DR_ACTION_TYP_CTR]             = DR_ACTION_STATE_ASO,
                },
@@ -634,6 +667,83 @@ static void dr_action_print_sequence(struct mlx5dr_domain *dmn,
                           actions[i]->action_type);
 }
 
+static int dr_action_get_dest_fw_tbl_addr(struct mlx5dr_matcher *matcher,
+                                         struct mlx5dr_action_dest_tbl *dest_tbl,
+                                         bool is_rx_rule,
+                                         u64 *final_icm_addr)
+{
+       struct mlx5dr_cmd_query_flow_table_details output;
+       struct mlx5dr_domain *dmn = matcher->tbl->dmn;
+       int ret;
+
+       if (!dest_tbl->fw_tbl.rx_icm_addr) {
+               ret = mlx5dr_cmd_query_flow_table(dmn->mdev,
+                                                 dest_tbl->fw_tbl.type,
+                                                 dest_tbl->fw_tbl.id,
+                                                 &output);
+               if (ret) {
+                       mlx5dr_err(dmn,
+                                  "Failed mlx5dr_cmd_query_flow_table, ret: %d\n",
+                                  ret);
+                       return ret;
+               }
+
+               dest_tbl->fw_tbl.tx_icm_addr = output.sw_owner_icm_root_1;
+               dest_tbl->fw_tbl.rx_icm_addr = output.sw_owner_icm_root_0;
+       }
+
+       *final_icm_addr = is_rx_rule ? dest_tbl->fw_tbl.rx_icm_addr :
+                                      dest_tbl->fw_tbl.tx_icm_addr;
+       return 0;
+}
+
+static int dr_action_get_dest_sw_tbl_addr(struct mlx5dr_matcher *matcher,
+                                         struct mlx5dr_action_dest_tbl *dest_tbl,
+                                         bool is_rx_rule,
+                                         u64 *final_icm_addr)
+{
+       struct mlx5dr_domain *dmn = matcher->tbl->dmn;
+       struct mlx5dr_icm_chunk *chunk;
+
+       if (dest_tbl->tbl->dmn != dmn) {
+               mlx5dr_err(dmn,
+                          "Destination table belongs to a different domain\n");
+               return -EINVAL;
+       }
+
+       if (dest_tbl->tbl->level <= matcher->tbl->level) {
+               mlx5_core_dbg_once(dmn->mdev,
+                                  "Connecting table to a lower/same level destination table\n");
+               mlx5dr_dbg(dmn,
+                          "Connecting table at level %d to a destination table at level %d\n",
+                          matcher->tbl->level,
+                          dest_tbl->tbl->level);
+       }
+
+       chunk = is_rx_rule ? dest_tbl->tbl->rx.s_anchor->chunk :
+                            dest_tbl->tbl->tx.s_anchor->chunk;
+
+       *final_icm_addr = mlx5dr_icm_pool_get_chunk_icm_addr(chunk);
+       return 0;
+}
+
+static int dr_action_get_dest_tbl_addr(struct mlx5dr_matcher *matcher,
+                                      struct mlx5dr_action_dest_tbl *dest_tbl,
+                                      bool is_rx_rule,
+                                      u64 *final_icm_addr)
+{
+       if (dest_tbl->is_fw_tbl)
+               return dr_action_get_dest_fw_tbl_addr(matcher,
+                                                     dest_tbl,
+                                                     is_rx_rule,
+                                                     final_icm_addr);
+
+       return dr_action_get_dest_sw_tbl_addr(matcher,
+                                             dest_tbl,
+                                             is_rx_rule,
+                                             final_icm_addr);
+}
+
 #define WITH_VLAN_NUM_HW_ACTIONS 6
 
 int mlx5dr_actions_build_ste_arr(struct mlx5dr_matcher *matcher,
@@ -661,8 +771,6 @@ int mlx5dr_actions_build_ste_arr(struct mlx5dr_matcher *matcher,
        action_domain = dr_action_get_action_domain(dmn->type, nic_dmn->type);
 
        for (i = 0; i < num_actions; i++) {
-               struct mlx5dr_action_dest_tbl *dest_tbl;
-               struct mlx5dr_icm_chunk *chunk;
                struct mlx5dr_action *action;
                int max_actions_type = 1;
                u32 action_type;
@@ -676,50 +784,27 @@ int mlx5dr_actions_build_ste_arr(struct mlx5dr_matcher *matcher,
                        break;
                case DR_ACTION_TYP_FT:
                        dest_action = action;
-                       dest_tbl = action->dest_tbl;
-                       if (!dest_tbl->is_fw_tbl) {
-                               if (dest_tbl->tbl->dmn != dmn) {
-                                       mlx5dr_err(dmn,
-                                                  "Destination table belongs to a different domain\n");
-                                       return -EINVAL;
-                               }
-                               if (dest_tbl->tbl->level <= matcher->tbl->level) {
-                                       mlx5_core_dbg_once(dmn->mdev,
-                                                          "Connecting table to a lower/same level destination table\n");
-                                       mlx5dr_dbg(dmn,
-                                                  "Connecting table at level %d to a destination table at level %d\n",
-                                                  matcher->tbl->level,
-                                                  dest_tbl->tbl->level);
-                               }
-                               chunk = rx_rule ? dest_tbl->tbl->rx.s_anchor->chunk :
-                                       dest_tbl->tbl->tx.s_anchor->chunk;
-                               attr.final_icm_addr = mlx5dr_icm_pool_get_chunk_icm_addr(chunk);
-                       } else {
-                               struct mlx5dr_cmd_query_flow_table_details output;
-                               int ret;
-
-                               /* get the relevant addresses */
-                               if (!action->dest_tbl->fw_tbl.rx_icm_addr) {
-                                       ret = mlx5dr_cmd_query_flow_table(dmn->mdev,
-                                                                         dest_tbl->fw_tbl.type,
-                                                                         dest_tbl->fw_tbl.id,
-                                                                         &output);
-                                       if (!ret) {
-                                               dest_tbl->fw_tbl.tx_icm_addr =
-                                                       output.sw_owner_icm_root_1;
-                                               dest_tbl->fw_tbl.rx_icm_addr =
-                                                       output.sw_owner_icm_root_0;
-                                       } else {
-                                               mlx5dr_err(dmn,
-                                                          "Failed mlx5_cmd_query_flow_table ret: %d\n",
-                                                          ret);
-                                               return ret;
-                                       }
-                               }
-                               attr.final_icm_addr = rx_rule ?
-                                       dest_tbl->fw_tbl.rx_icm_addr :
-                                       dest_tbl->fw_tbl.tx_icm_addr;
-                       }
+                       ret = dr_action_get_dest_tbl_addr(matcher, action->dest_tbl,
+                                                         rx_rule, &attr.final_icm_addr);
+                       if (ret)
+                               return ret;
+                       break;
+               case DR_ACTION_TYP_RANGE:
+                       ret = dr_action_get_dest_tbl_addr(matcher,
+                                                         action->range->hit_tbl_action->dest_tbl,
+                                                         rx_rule, &attr.final_icm_addr);
+                       if (ret)
+                               return ret;
+
+                       ret = dr_action_get_dest_tbl_addr(matcher,
+                                                         action->range->miss_tbl_action->dest_tbl,
+                                                         rx_rule, &attr.range.miss_icm_addr);
+                       if (ret)
+                               return ret;
+
+                       attr.range.definer_id = action->range->definer_id;
+                       attr.range.min = action->range->min;
+                       attr.range.max = action->range->max;
                        break;
                case DR_ACTION_TYP_QP:
                        mlx5dr_info(dmn, "Domain doesn't support QP\n");
@@ -866,6 +951,7 @@ static unsigned int action_size[DR_ACTION_TYP_MAX] = {
        [DR_ACTION_TYP_REMOVE_HDR]   = sizeof(struct mlx5dr_action_reformat),
        [DR_ACTION_TYP_SAMPLER]      = sizeof(struct mlx5dr_action_sampler),
        [DR_ACTION_TYP_ASO_FLOW_METER] = sizeof(struct mlx5dr_action_aso_flow_meter),
+       [DR_ACTION_TYP_RANGE]        = sizeof(struct mlx5dr_action_range),
 };
 
 static struct mlx5dr_action *
@@ -933,6 +1019,123 @@ dec_ref:
        return NULL;
 }
 
+static void dr_action_range_definer_fill(u16 *format_id,
+                                        u8 *dw_selectors,
+                                        u8 *byte_selectors,
+                                        u8 *match_mask)
+{
+       int i;
+
+       *format_id = MLX5_IFC_DEFINER_FORMAT_ID_SELECT;
+
+       dw_selectors[0] = MLX5_IFC_DEFINER_FORMAT_OFFSET_OUTER_ETH_PKT_LEN / 4;
+
+       for (i = 1; i < MLX5_IFC_DEFINER_DW_SELECTORS_NUM; i++)
+               dw_selectors[i] = MLX5_IFC_DEFINER_FORMAT_OFFSET_UNUSED;
+
+       for (i = 0; i < MLX5_IFC_DEFINER_BYTE_SELECTORS_NUM; i++)
+               byte_selectors[i] = MLX5_IFC_DEFINER_FORMAT_OFFSET_UNUSED;
+
+       MLX5_SET(match_definer_match_mask, match_mask,
+                match_dw_0, 0xffffUL << 16);
+}
+
+static int dr_action_create_range_definer(struct mlx5dr_action *action)
+{
+       u8 match_mask[MLX5_FLD_SZ_BYTES(match_definer, match_mask)] = {};
+       u8 byte_selectors[MLX5_IFC_DEFINER_BYTE_SELECTORS_NUM] = {};
+       u8 dw_selectors[MLX5_IFC_DEFINER_DW_SELECTORS_NUM] = {};
+       struct mlx5dr_domain *dmn = action->range->dmn;
+       u32 definer_id;
+       u16 format_id;
+       int ret;
+
+       dr_action_range_definer_fill(&format_id,
+                                    dw_selectors,
+                                    byte_selectors,
+                                    match_mask);
+
+       ret = mlx5dr_definer_get(dmn, format_id,
+                                dw_selectors, byte_selectors,
+                                match_mask, &definer_id);
+       if (ret)
+               return ret;
+
+       action->range->definer_id = definer_id;
+       return 0;
+}
+
+static void dr_action_destroy_range_definer(struct mlx5dr_action *action)
+{
+       mlx5dr_definer_put(action->range->dmn, action->range->definer_id);
+}
+
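+/* Create a destination action that checks the packet length against
+ * [min, max] and steers to @hit_ft on a hit and to @miss_ft otherwise.
+ * Only the packet-length field with 16-bit bounds is supported.
+ */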
+struct mlx5dr_action *
+mlx5dr_action_create_dest_match_range(struct mlx5dr_domain *dmn,
+                                     u32 field,
+                                     struct mlx5_flow_table *hit_ft,
+                                     struct mlx5_flow_table *miss_ft,
+                                     u32 min,
+                                     u32 max)
+{
+       struct mlx5dr_action *action;
+       int ret;
+
+       if (!mlx5dr_supp_match_ranges(dmn->mdev)) {
+               mlx5dr_dbg(dmn, "SELECT definer support is needed for match range\n");
+               return NULL;
+       }
+
+       if (field != MLX5_FLOW_DEST_RANGE_FIELD_PKT_LEN ||
+           min > 0xffff || max > 0xffff) {
+               mlx5dr_err(dmn, "Invalid match range parameters\n");
+               return NULL;
+       }
+
+       action = dr_action_create_generic(DR_ACTION_TYP_RANGE);
+       if (!action)
+               return NULL;
+
+       action->range->hit_tbl_action =
+               mlx5dr_is_fw_table(hit_ft) ?
+                       mlx5dr_action_create_dest_flow_fw_table(dmn, hit_ft) :
+                       mlx5dr_action_create_dest_table(hit_ft->fs_dr_table.dr_table);
+
+       if (!action->range->hit_tbl_action)
+               goto free_action;
+
+       action->range->miss_tbl_action =
+               mlx5dr_is_fw_table(miss_ft) ?
+                       mlx5dr_action_create_dest_flow_fw_table(dmn, miss_ft) :
+                       mlx5dr_action_create_dest_table(miss_ft->fs_dr_table.dr_table);
+
+       if (!action->range->miss_tbl_action)
+               goto free_hit_tbl_action;
+
+       action->range->min = min;
+       action->range->max = max;
+       action->range->dmn = dmn;
+
+       ret = dr_action_create_range_definer(action);
+       if (ret)
+               goto free_miss_tbl_action;
+
+       /* No need to increase refcount on domain for this action,
+        * the hit/miss table actions will do it internally.
+        */
+
+       return action;
+
+free_miss_tbl_action:
+       mlx5dr_action_destroy(action->range->miss_tbl_action);
+free_hit_tbl_action:
+       mlx5dr_action_destroy(action->range->hit_tbl_action);
+free_action:
+       kfree(action);
+
+       return NULL;
+}
+
 struct mlx5dr_action *
 mlx5dr_action_create_mult_dest_tbl(struct mlx5dr_domain *dmn,
                                   struct mlx5dr_action_dest *dests,
@@ -1980,6 +2183,11 @@ int mlx5dr_action_destroy(struct mlx5dr_action *action)
        case DR_ACTION_TYP_ASO_FLOW_METER:
                refcount_dec(&action->aso->dmn->refcount);
                break;
+       case DR_ACTION_TYP_RANGE:
+               dr_action_destroy_range_definer(action);
+               mlx5dr_action_destroy(action->range->miss_tbl_action);
+               mlx5dr_action_destroy(action->range->hit_tbl_action);
+               break;
        default:
                break;
        }
index b4739ea..07b6a6d 100644
@@ -564,6 +564,83 @@ void mlx5dr_cmd_destroy_reformat_ctx(struct mlx5_core_dev *mdev,
        mlx5_cmd_exec_in(mdev, dealloc_packet_reformat_context, in);
 }
 
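+/* Only the SELECT definer format carries DW/byte selectors; for any other
+ * format ID there is nothing to program here.
+ */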
+static void dr_cmd_set_definer_format(void *ptr, u16 format_id,
+                                     u8 *dw_selectors,
+                                     u8 *byte_selectors)
+{
+       if (format_id != MLX5_IFC_DEFINER_FORMAT_ID_SELECT)
+               return;
+
+       MLX5_SET(match_definer, ptr, format_select_dw0, dw_selectors[0]);
+       MLX5_SET(match_definer, ptr, format_select_dw1, dw_selectors[1]);
+       MLX5_SET(match_definer, ptr, format_select_dw2, dw_selectors[2]);
+       MLX5_SET(match_definer, ptr, format_select_dw3, dw_selectors[3]);
+       MLX5_SET(match_definer, ptr, format_select_dw4, dw_selectors[4]);
+       MLX5_SET(match_definer, ptr, format_select_dw5, dw_selectors[5]);
+       MLX5_SET(match_definer, ptr, format_select_dw6, dw_selectors[6]);
+       MLX5_SET(match_definer, ptr, format_select_dw7, dw_selectors[7]);
+       MLX5_SET(match_definer, ptr, format_select_dw8, dw_selectors[8]);
+
+       MLX5_SET(match_definer, ptr, format_select_byte0, byte_selectors[0]);
+       MLX5_SET(match_definer, ptr, format_select_byte1, byte_selectors[1]);
+       MLX5_SET(match_definer, ptr, format_select_byte2, byte_selectors[2]);
+       MLX5_SET(match_definer, ptr, format_select_byte3, byte_selectors[3]);
+       MLX5_SET(match_definer, ptr, format_select_byte4, byte_selectors[4]);
+       MLX5_SET(match_definer, ptr, format_select_byte5, byte_selectors[5]);
+       MLX5_SET(match_definer, ptr, format_select_byte6, byte_selectors[6]);
+       MLX5_SET(match_definer, ptr, format_select_byte7, byte_selectors[7]);
+}
+
+int mlx5dr_cmd_create_definer(struct mlx5_core_dev *mdev,
+                             u16 format_id,
+                             u8 *dw_selectors,
+                             u8 *byte_selectors,
+                             u8 *match_mask,
+                             u32 *definer_id)
+{
+       u32 out[MLX5_ST_SZ_DW(general_obj_out_cmd_hdr)] = {};
+       u32 in[MLX5_ST_SZ_DW(create_match_definer_in)] = {};
+       void *ptr;
+       int err;
+
+       ptr = MLX5_ADDR_OF(create_match_definer_in, in,
+                          general_obj_in_cmd_hdr);
+       MLX5_SET(general_obj_in_cmd_hdr, ptr, opcode,
+                MLX5_CMD_OP_CREATE_GENERAL_OBJECT);
+       MLX5_SET(general_obj_in_cmd_hdr, ptr, obj_type,
+                MLX5_OBJ_TYPE_MATCH_DEFINER);
+
+       ptr = MLX5_ADDR_OF(create_match_definer_in, in, obj_context);
+       MLX5_SET(match_definer, ptr, format_id, format_id);
+
+       dr_cmd_set_definer_format(ptr, format_id,
+                                 dw_selectors, byte_selectors);
+
+       ptr = MLX5_ADDR_OF(match_definer, ptr, match_mask);
+       memcpy(ptr, match_mask, MLX5_FLD_SZ_BYTES(match_definer, match_mask));
+
+       err = mlx5_cmd_exec(mdev, in, sizeof(in), out, sizeof(out));
+       if (err)
+               return err;
+
+       *definer_id = MLX5_GET(general_obj_out_cmd_hdr, out, obj_id);
+
+       return 0;
+}
+
+void
+mlx5dr_cmd_destroy_definer(struct mlx5_core_dev *mdev, u32 definer_id)
+{
+       u32 in[MLX5_ST_SZ_DW(general_obj_in_cmd_hdr)] = {};
+       u32 out[MLX5_ST_SZ_DW(general_obj_out_cmd_hdr)];
+
+       MLX5_SET(general_obj_in_cmd_hdr, in, opcode, MLX5_CMD_OP_DESTROY_GENERAL_OBJECT);
+       MLX5_SET(general_obj_in_cmd_hdr, in, obj_type, MLX5_OBJ_TYPE_MATCH_DEFINER);
+       MLX5_SET(general_obj_in_cmd_hdr, in, obj_id, definer_id);
+
+       mlx5_cmd_exec(mdev, in, sizeof(in), out, sizeof(out));
+}
+
 int mlx5dr_cmd_query_gid(struct mlx5_core_dev *mdev, u8 vhca_port_num,
                         u16 index, struct mlx5dr_cmd_gid_attr *attr)
 {
index 7adcf0e..db81d88 100644
@@ -49,7 +49,8 @@ enum dr_dump_rec_type {
        DR_DUMP_REC_TYPE_ACTION_POP_VLAN = 3413,
        DR_DUMP_REC_TYPE_ACTION_SAMPLER = 3415,
        DR_DUMP_REC_TYPE_ACTION_INSERT_HDR = 3420,
-       DR_DUMP_REC_TYPE_ACTION_REMOVE_HDR = 3421
+       DR_DUMP_REC_TYPE_ACTION_REMOVE_HDR = 3421,
+       DR_DUMP_REC_TYPE_ACTION_MATCH_RANGE = 3425,
 };
 
 void mlx5dr_dbg_tbl_add(struct mlx5dr_table *tbl)
@@ -107,6 +108,8 @@ dr_dump_rule_action_mem(struct seq_file *file, const u64 rule_id,
 {
        struct mlx5dr_action *action = action_mem->action;
        const u64 action_id = DR_DBG_PTR_TO_ID(action);
+       u64 hit_tbl_ptr, miss_tbl_ptr;
+       u32 hit_tbl_id, miss_tbl_id;
 
        switch (action->action_type) {
        case DR_ACTION_TYP_DROP:
@@ -198,6 +201,30 @@ dr_dump_rule_action_mem(struct seq_file *file, const u64 rule_id,
                           action->sampler->rx_icm_addr,
                           action->sampler->tx_icm_addr);
                break;
+       case DR_ACTION_TYP_RANGE:
+               if (action->range->hit_tbl_action->dest_tbl->is_fw_tbl) {
+                       hit_tbl_id = action->range->hit_tbl_action->dest_tbl->fw_tbl.id;
+                       hit_tbl_ptr = 0;
+               } else {
+                       hit_tbl_id = action->range->hit_tbl_action->dest_tbl->tbl->table_id;
+                       hit_tbl_ptr =
+                               DR_DBG_PTR_TO_ID(action->range->hit_tbl_action->dest_tbl->tbl);
+               }
+
+               if (action->range->miss_tbl_action->dest_tbl->is_fw_tbl) {
+                       miss_tbl_id = action->range->miss_tbl_action->dest_tbl->fw_tbl.id;
+                       miss_tbl_ptr = 0;
+               } else {
+                       miss_tbl_id = action->range->miss_tbl_action->dest_tbl->tbl->table_id;
+                       miss_tbl_ptr =
+                               DR_DBG_PTR_TO_ID(action->range->miss_tbl_action->dest_tbl->tbl);
+               }
+
+               seq_printf(file, "%d,0x%llx,0x%llx,0x%x,0x%llx,0x%x,0x%llx,0x%x\n",
+                          DR_DUMP_REC_TYPE_ACTION_MATCH_RANGE, action_id, rule_id,
+                          hit_tbl_id, hit_tbl_ptr, miss_tbl_id, miss_tbl_ptr,
+                          action->range->definer_id);
+               break;
        default:
                return 0;
        }
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_definer.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_definer.c
new file mode 100644
index 0000000..d5ea977
--- /dev/null
@@ -0,0 +1,151 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+// Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+
+#include "dr_types.h"
+#include "dr_ste.h"
+
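+/* A cached definer general object. Definers are deduplicated per domain:
+ * an identical format/selectors/mask combination reuses one refcounted
+ * object, indexed in dmn->definers_xa by definer ID.
+ */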
+struct dr_definer_object {
+       u32 id;
+       u16 format_id;
+       u8 dw_selectors[MLX5_IFC_DEFINER_DW_SELECTORS_NUM];
+       u8 byte_selectors[MLX5_IFC_DEFINER_BYTE_SELECTORS_NUM];
+       u8 match_mask[DR_STE_SIZE_MATCH_TAG];
+       refcount_t refcount;
+};
+
+static bool dr_definer_compare(struct dr_definer_object *definer,
+                              u16 format_id, u8 *dw_selectors,
+                              u8 *byte_selectors, u8 *match_mask)
+{
+       int i;
+
+       if (definer->format_id != format_id)
+               return false;
+
+       for (i = 0; i < MLX5_IFC_DEFINER_DW_SELECTORS_NUM; i++)
+               if (definer->dw_selectors[i] != dw_selectors[i])
+                       return false;
+
+       for (i = 0; i < MLX5_IFC_DEFINER_BYTE_SELECTORS_NUM; i++)
+               if (definer->byte_selectors[i] != byte_selectors[i])
+                       return false;
+
+       if (memcmp(definer->match_mask, match_mask, DR_STE_SIZE_MATCH_TAG))
+               return false;
+
+       return true;
+}
+
+static struct dr_definer_object *
+dr_definer_find_obj(struct mlx5dr_domain *dmn, u16 format_id,
+                   u8 *dw_selectors, u8 *byte_selectors, u8 *match_mask)
+{
+       struct dr_definer_object *definer_obj;
+       unsigned long id;
+
+       xa_for_each(&dmn->definers_xa, id, definer_obj) {
+               if (dr_definer_compare(definer_obj, format_id,
+                                      dw_selectors, byte_selectors,
+                                      match_mask))
+                       return definer_obj;
+       }
+
+       return NULL;
+}
+
+static struct dr_definer_object *
+dr_definer_create_obj(struct mlx5dr_domain *dmn, u16 format_id,
+                     u8 *dw_selectors, u8 *byte_selectors, u8 *match_mask)
+{
+       struct dr_definer_object *definer_obj;
+       int ret = 0;
+
+       definer_obj = kzalloc(sizeof(*definer_obj), GFP_KERNEL);
+       if (!definer_obj)
+               return NULL;
+
+       ret = mlx5dr_cmd_create_definer(dmn->mdev,
+                                       format_id,
+                                       dw_selectors,
+                                       byte_selectors,
+                                       match_mask,
+                                       &definer_obj->id);
+       if (ret)
+               goto err_free_definer_obj;
+
+       /* A definer ID can be up to 32 bits wide, but the STE format
+        * supports only definers with 8-bit IDs.
+        */
+       if (definer_obj->id > 0xff) {
+               mlx5dr_err(dmn, "Unsupported definer ID (%d)\n", definer_obj->id);
+               goto err_destroy_definer;
+       }
+
+       definer_obj->format_id = format_id;
+       memcpy(definer_obj->dw_selectors, dw_selectors, sizeof(definer_obj->dw_selectors));
+       memcpy(definer_obj->byte_selectors, byte_selectors, sizeof(definer_obj->byte_selectors));
+       memcpy(definer_obj->match_mask, match_mask, sizeof(definer_obj->match_mask));
+
+       refcount_set(&definer_obj->refcount, 1);
+
+       ret = xa_insert(&dmn->definers_xa, definer_obj->id, definer_obj, GFP_KERNEL);
+       if (ret) {
+               mlx5dr_dbg(dmn, "Couldn't insert new definer into xarray (%d)\n", ret);
+               goto err_destroy_definer;
+       }
+
+       return definer_obj;
+
+err_destroy_definer:
+       mlx5dr_cmd_destroy_definer(dmn->mdev, definer_obj->id);
+err_free_definer_obj:
+       kfree(definer_obj);
+
+       return NULL;
+}
+
+static void dr_definer_destroy_obj(struct mlx5dr_domain *dmn,
+                                  struct dr_definer_object *definer_obj)
+{
+       mlx5dr_cmd_destroy_definer(dmn->mdev, definer_obj->id);
+       xa_erase(&dmn->definers_xa, definer_obj->id);
+       kfree(definer_obj);
+}
+
+int mlx5dr_definer_get(struct mlx5dr_domain *dmn, u16 format_id,
+                      u8 *dw_selectors, u8 *byte_selectors,
+                      u8 *match_mask, u32 *definer_id)
+{
+       struct dr_definer_object *definer_obj;
+       int ret = 0;
+
+       definer_obj = dr_definer_find_obj(dmn, format_id, dw_selectors,
+                                         byte_selectors, match_mask);
+       if (!definer_obj) {
+               definer_obj = dr_definer_create_obj(dmn, format_id,
+                                                   dw_selectors, byte_selectors,
+                                                   match_mask);
+               if (!definer_obj)
+                       return -ENOMEM;
+       } else {
+               refcount_inc(&definer_obj->refcount);
+       }
+
+       *definer_id = definer_obj->id;
+
+       return ret;
+}
+
+void mlx5dr_definer_put(struct mlx5dr_domain *dmn, u32 definer_id)
+{
+       struct dr_definer_object *definer_obj;
+
+       definer_obj = xa_load(&dmn->definers_xa, definer_id);
+       if (!definer_obj) {
+               mlx5dr_err(dmn, "Definer ID %d not found\n", definer_id);
+               return;
+       }
+
+       if (refcount_dec_and_test(&definer_obj->refcount))
+               dr_definer_destroy_obj(dmn, definer_obj);
+}
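
A minimal caller-side sketch of the get/put pairing introduced above; the local variables and the surrounding flow are illustrative, not taken from the patch:

	u8 dw_sel[MLX5_IFC_DEFINER_DW_SELECTORS_NUM] = {};
	u8 byte_sel[MLX5_IFC_DEFINER_BYTE_SELECTORS_NUM] = {};
	u8 mask[DR_STE_SIZE_MATCH_TAG] = {};
	u32 definer_id;
	int err;

	/* reuses a matching cached definer (refcount++) or creates
	 * a new FW definer object and caches it in definers_xa
	 */
	err = mlx5dr_definer_get(dmn, format_id, dw_sel, byte_sel,
				 mask, &definer_id);
	if (err)
		return err;

	/* ... build STEs that reference definer_id ... */

	/* last put destroys the FW object and frees the cache entry */
	mlx5dr_definer_put(dmn, definer_id);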
index 9a98362..5b8bb2c 100644 (file)
@@ -425,10 +425,11 @@ mlx5dr_domain_create(struct mlx5_core_dev *mdev, enum mlx5dr_domain_type type)
        refcount_set(&dmn->refcount, 1);
        mutex_init(&dmn->info.rx.mutex);
        mutex_init(&dmn->info.tx.mutex);
+       xa_init(&dmn->definers_xa);
 
        if (dr_domain_caps_init(mdev, dmn)) {
                mlx5dr_err(dmn, "Failed init domain, no caps\n");
-               goto free_domain;
+               goto def_xa_destroy;
        }
 
        dmn->info.max_log_action_icm_sz = DR_CHUNK_SIZE_4K;
@@ -453,7 +454,8 @@ mlx5dr_domain_create(struct mlx5_core_dev *mdev, enum mlx5dr_domain_type type)
 
 uninit_caps:
        dr_domain_caps_uninit(dmn);
-free_domain:
+def_xa_destroy:
+       xa_destroy(&dmn->definers_xa);
        kfree(dmn);
        return NULL;
 }
@@ -493,6 +495,7 @@ int mlx5dr_domain_destroy(struct mlx5dr_domain *dmn)
        dr_domain_uninit_csum_recalc_fts(dmn);
        dr_domain_uninit_resources(dmn);
        dr_domain_caps_uninit(dmn);
+       xa_destroy(&dmn->definers_xa);
        mutex_destroy(&dmn->info.tx.mutex);
        mutex_destroy(&dmn->info.rx.mutex);
        kfree(dmn);
index 7879991..74cbe53 100644 (file)
@@ -35,16 +35,28 @@ static int dr_rule_append_to_miss_list(struct mlx5dr_domain *dmn,
        return 0;
 }
 
+static void dr_rule_set_last_ste_miss_addr(struct mlx5dr_matcher *matcher,
+                                          struct mlx5dr_matcher_rx_tx *nic_matcher,
+                                          u8 *hw_ste)
+{
+       struct mlx5dr_ste_ctx *ste_ctx = matcher->tbl->dmn->ste_ctx;
+       u64 icm_addr;
+
+       if (mlx5dr_ste_is_miss_addr_set(ste_ctx, hw_ste))
+               return;
+
+       icm_addr = mlx5dr_icm_pool_get_chunk_icm_addr(nic_matcher->e_anchor->chunk);
+       mlx5dr_ste_set_miss_addr(ste_ctx, hw_ste, icm_addr);
+}
+
 static struct mlx5dr_ste *
 dr_rule_create_collision_htbl(struct mlx5dr_matcher *matcher,
                              struct mlx5dr_matcher_rx_tx *nic_matcher,
                              u8 *hw_ste)
 {
        struct mlx5dr_domain *dmn = matcher->tbl->dmn;
-       struct mlx5dr_ste_ctx *ste_ctx = dmn->ste_ctx;
        struct mlx5dr_ste_htbl *new_htbl;
        struct mlx5dr_ste *ste;
-       u64 icm_addr;
 
        /* Create new table for miss entry */
        new_htbl = mlx5dr_ste_htbl_alloc(dmn->ste_icm_pool,
@@ -58,8 +70,7 @@ dr_rule_create_collision_htbl(struct mlx5dr_matcher *matcher,
 
        /* One and only entry, never grows */
        ste = new_htbl->chunk->ste_arr;
-       icm_addr = mlx5dr_icm_pool_get_chunk_icm_addr(nic_matcher->e_anchor->chunk);
-       mlx5dr_ste_set_miss_addr(ste_ctx, hw_ste, icm_addr);
+       dr_rule_set_last_ste_miss_addr(matcher, nic_matcher, hw_ste);
        mlx5dr_htbl_get(new_htbl);
 
        return ste;
@@ -241,7 +252,6 @@ dr_rule_rehash_copy_ste(struct mlx5dr_matcher *matcher,
        bool use_update_list = false;
        u8 hw_ste[DR_STE_SIZE] = {};
        struct mlx5dr_ste *new_ste;
-       u64 icm_addr;
        int new_idx;
        u8 sb_idx;
 
@@ -250,9 +260,8 @@ dr_rule_rehash_copy_ste(struct mlx5dr_matcher *matcher,
        mlx5dr_ste_set_bit_mask(hw_ste, nic_matcher->ste_builder[sb_idx].bit_mask);
 
        /* Copy STE control and tag */
-       icm_addr = mlx5dr_icm_pool_get_chunk_icm_addr(nic_matcher->e_anchor->chunk);
        memcpy(hw_ste, mlx5dr_ste_get_hw_ste(cur_ste), DR_STE_SIZE_REDUCED);
-       mlx5dr_ste_set_miss_addr(dmn->ste_ctx, hw_ste, icm_addr);
+       dr_rule_set_last_ste_miss_addr(matcher, nic_matcher, hw_ste);
 
        new_idx = mlx5dr_ste_calc_hash_index(hw_ste, new_htbl);
        new_ste = &new_htbl->chunk->ste_arr[new_idx];
@@ -773,7 +782,6 @@ static int dr_rule_handle_empty_entry(struct mlx5dr_matcher *matcher,
 {
        struct mlx5dr_domain *dmn = matcher->tbl->dmn;
        struct mlx5dr_ste_send_info *ste_info;
-       u64 icm_addr;
 
        /* Take ref on table, only on first time this ste is used */
        mlx5dr_htbl_get(cur_htbl);
@@ -781,8 +789,7 @@ static int dr_rule_handle_empty_entry(struct mlx5dr_matcher *matcher,
        /* new entry -> new branch */
        list_add_tail(&ste->miss_list_node, miss_list);
 
-       icm_addr = mlx5dr_icm_pool_get_chunk_icm_addr(nic_matcher->e_anchor->chunk);
-       mlx5dr_ste_set_miss_addr(dmn->ste_ctx, hw_ste, icm_addr);
+       dr_rule_set_last_ste_miss_addr(matcher, nic_matcher, hw_ste);
 
        ste->ste_chain_location = ste_location;
 
index 9e19a8d..1e15f60 100644 (file)
@@ -90,6 +90,16 @@ static void dr_ste_set_always_miss(struct dr_hw_ste_format *hw_ste)
        hw_ste->mask[0] = 0;
 }
 
+bool mlx5dr_ste_is_miss_addr_set(struct mlx5dr_ste_ctx *ste_ctx,
+                                u8 *hw_ste_p)
+{
+       if (!ste_ctx->is_miss_addr_set)
+               return false;
+
+       /* check if miss address is already set for this type of STE */
+       return ste_ctx->is_miss_addr_set(hw_ste_p);
+}
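
Note that STE contexts which leave the new callback unset simply report false here, so their callers keep programming the miss address unconditionally; in this series only the v1-based contexts (ste_ctx_v1 and ste_ctx_v2, further down) wire up is_miss_addr_set.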
+
 void mlx5dr_ste_set_miss_addr(struct mlx5dr_ste_ctx *ste_ctx,
                              u8 *hw_ste_p, u64 miss_addr)
 {
index 17513ba..7075142 100644 (file)
@@ -151,6 +151,7 @@ struct mlx5dr_ste_ctx {
                         bool is_rx, u16 gvmi);
        void (*set_next_lu_type)(u8 *hw_ste_p, u16 lu_type);
        u16  (*get_next_lu_type)(u8 *hw_ste_p);
+       bool (*is_miss_addr_set)(u8 *hw_ste_p);
        void (*set_miss_addr)(u8 *hw_ste_p, u64 miss_addr);
        u64  (*get_miss_addr)(u8 *hw_ste_p);
        void (*set_hit_addr)(u8 *hw_ste_p, u64 icm_addr, u32 ht_size);
index ee677a5..084145f 100644 (file)
@@ -13,6 +13,7 @@ enum dr_ste_v1_entry_format {
        DR_STE_V1_TYPE_BWC_BYTE = 0x0,
        DR_STE_V1_TYPE_BWC_DW   = 0x1,
        DR_STE_V1_TYPE_MATCH    = 0x2,
+       DR_STE_V1_TYPE_MATCH_RANGES = 0x7,
 };
 
 /* Lookup type is built from 2B: [ Definer mode 1B ][ Definer index 1B ] */
@@ -267,6 +268,16 @@ static void dr_ste_v1_set_entry_type(u8 *hw_ste_p, u8 entry_type)
        MLX5_SET(ste_match_bwc_v1, hw_ste_p, entry_format, entry_type);
 }
 
+bool dr_ste_v1_is_miss_addr_set(u8 *hw_ste_p)
+{
+       u8 entry_type = MLX5_GET(ste_match_bwc_v1, hw_ste_p, entry_format);
+
+       /* Unlike a MATCH STE, a MATCH_RANGES STE carries both its hit and
+        * miss addresses as part of the action, so both are already set
+        * during STE init.
+        */
+       return entry_type == DR_STE_V1_TYPE_MATCH_RANGES;
+}
+
 void dr_ste_v1_set_miss_addr(u8 *hw_ste_p, u64 miss_addr)
 {
        u64 index = miss_addr >> 6;
@@ -520,6 +531,27 @@ static void dr_ste_v1_set_aso_flow_meter(u8 *d_action,
                 init_color);
 }
 
+static void dr_ste_v1_set_match_range_pkt_len(u8 *hw_ste_p, u32 definer_id,
+                                             u32 min, u32 max)
+{
+       MLX5_SET(ste_match_ranges_v1, hw_ste_p, match_definer_ctx_idx, definer_id);
+
+       /* When the STE is sent, its mask and tag are swapped in
+        * dr_ste_v1_prepare_for_postsend(). This, however, is a match-range
+        * STE, which has no mask, so its mask/tag must not be swapped.
+        * Since the common utility functions are used to send this STE, we
+        * need to compensate for that swap: place the values in the
+        * locations they will be flipped from when written to ICM.
+        *
+        * min/max_value_2 corresponds to match_dw_0 in its definer.
+        * To survive the mask/tag swap, write min/max_2 into min/max_0.
+        *
+        * Pkt len is 2 bytes, stored in the upper half of the DW.
+        */
+       MLX5_SET(ste_match_ranges_v1, hw_ste_p, min_value_0, min << 16);
+       MLX5_SET(ste_match_ranges_v1, hw_ste_p, max_value_0, max << 16);
+}
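
For illustration, a packet-length range of 64..1518 bytes (arbitrary values) lands in the upper halves of the min/max DWs:

	MLX5_SET(ste_match_ranges_v1, hw_ste_p, min_value_0,   64 << 16); /* 0x00400000 */
	MLX5_SET(ste_match_ranges_v1, hw_ste_p, max_value_0, 1518 << 16); /* 0x05ee0000 */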
+
 static void dr_ste_v1_arr_init_next_match(u8 **last_ste,
                                          u32 *added_stes,
                                          u16 gvmi)
@@ -535,6 +567,14 @@ static void dr_ste_v1_arr_init_next_match(u8 **last_ste,
        memset(action, 0, MLX5_FLD_SZ_BYTES(ste_mask_and_match_v1, action));
 }
 
+static void dr_ste_v1_arr_init_next_match_range(u8 **last_ste,
+                                               u32 *added_stes,
+                                               u16 gvmi)
+{
+       dr_ste_v1_arr_init_next_match(last_ste, added_stes, gvmi);
+       dr_ste_v1_set_entry_type(*last_ste, DR_STE_V1_TYPE_MATCH_RANGES);
+}
+
 void dr_ste_v1_set_actions_tx(struct mlx5dr_domain *dmn,
                              u8 *action_type_set,
                              u32 actions_caps,
@@ -670,6 +710,20 @@ void dr_ste_v1_set_actions_tx(struct mlx5dr_domain *dmn,
                action += DR_STE_ACTION_DOUBLE_SZ;
        }
 
+       if (action_type_set[DR_ACTION_TYP_RANGE]) {
+               /* a match-ranges action requires a new STE of its own type */
+               dr_ste_v1_arr_init_next_match_range(&last_ste, added_stes, attr->gvmi);
+               dr_ste_v1_set_miss_addr(last_ste, attr->range.miss_icm_addr);
+
+               /* we do not support setting any action on the match ranges STE */
+               action_sz = 0;
+
+               dr_ste_v1_set_match_range_pkt_len(last_ste,
+                                                 attr->range.definer_id,
+                                                 attr->range.min,
+                                                 attr->range.max);
+       }
+
        dr_ste_v1_set_hit_gvmi(last_ste, attr->hit_gvmi);
        dr_ste_v1_set_hit_addr(last_ste, attr->final_icm_addr, 1);
 }
@@ -858,6 +912,20 @@ void dr_ste_v1_set_actions_rx(struct mlx5dr_domain *dmn,
                action += DR_STE_ACTION_DOUBLE_SZ;
        }
 
+       if (action_type_set[DR_ACTION_TYP_RANGE]) {
+               /* match ranges requires a new STE of its own type */
+               dr_ste_v1_arr_init_next_match_range(&last_ste, added_stes, attr->gvmi);
+               dr_ste_v1_set_miss_addr(last_ste, attr->range.miss_icm_addr);
+
+               /* we do not support setting any action on the match ranges STE */
+               action_sz = 0;
+
+               dr_ste_v1_set_match_range_pkt_len(last_ste,
+                                                 attr->range.definer_id,
+                                                 attr->range.min,
+                                                 attr->range.max);
+       }
+
        dr_ste_v1_set_hit_gvmi(last_ste, attr->hit_gvmi);
        dr_ste_v1_set_hit_addr(last_ste, attr->final_icm_addr, 1);
 }
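
A rough picture of the STE chain this produces when a range action is present (a sketch, not authoritative):

	[ match STE + actions ] --> [ MATCH_RANGES STE: definer_id, pkt-len min/max ]
	                                hit  --> attr->final_icm_addr
	                                miss --> attr->range.miss_icm_addr

The hit address is filled in by the common dr_ste_v1_set_hit_addr() tail above, while the miss address was set explicitly when the range STE was initialized.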
@@ -2144,6 +2212,7 @@ static struct mlx5dr_ste_ctx ste_ctx_v1 = {
        .ste_init                       = &dr_ste_v1_init,
        .set_next_lu_type               = &dr_ste_v1_set_next_lu_type,
        .get_next_lu_type               = &dr_ste_v1_get_next_lu_type,
+       .is_miss_addr_set               = &dr_ste_v1_is_miss_addr_set,
        .set_miss_addr                  = &dr_ste_v1_set_miss_addr,
        .get_miss_addr                  = &dr_ste_v1_get_miss_addr,
        .set_hit_addr                   = &dr_ste_v1_set_hit_addr,
index 8a1d497..b5c0f0f 100644 (file)
@@ -7,6 +7,7 @@
 #include "dr_types.h"
 #include "dr_ste.h"
 
+bool dr_ste_v1_is_miss_addr_set(u8 *hw_ste_p);
 void dr_ste_v1_set_miss_addr(u8 *hw_ste_p, u64 miss_addr);
 u64 dr_ste_v1_get_miss_addr(u8 *hw_ste_p);
 void dr_ste_v1_set_byte_mask(u8 *hw_ste_p, u16 byte_mask);
index c60fddd..cf1a3c9 100644 (file)
@@ -202,6 +202,7 @@ static struct mlx5dr_ste_ctx ste_ctx_v2 = {
        .ste_init                       = &dr_ste_v1_init,
        .set_next_lu_type               = &dr_ste_v1_set_next_lu_type,
        .get_next_lu_type               = &dr_ste_v1_get_next_lu_type,
+       .is_miss_addr_set               = &dr_ste_v1_is_miss_addr_set,
        .set_miss_addr                  = &dr_ste_v1_set_miss_addr,
        .get_miss_addr                  = &dr_ste_v1_get_miss_addr,
        .set_hit_addr                   = &dr_ste_v1_set_hit_addr,
index 41a37b9..2b769dc 100644 (file)
@@ -81,6 +81,7 @@ mlx5dr_icm_next_higher_chunk(enum mlx5dr_icm_chunk_size chunk)
 enum {
        DR_STE_SIZE = 64,
        DR_STE_SIZE_CTRL = 32,
+       DR_STE_SIZE_MATCH_TAG = 32,
        DR_STE_SIZE_TAG = 16,
        DR_STE_SIZE_MASK = 16,
        DR_STE_SIZE_REDUCED = DR_STE_SIZE - DR_STE_SIZE_MASK,
@@ -128,6 +129,7 @@ enum mlx5dr_action_type {
        DR_ACTION_TYP_REMOVE_HDR,
        DR_ACTION_TYP_SAMPLER,
        DR_ACTION_TYP_ASO_FLOW_METER,
+       DR_ACTION_TYP_RANGE,
        DR_ACTION_TYP_MAX,
 };
 
@@ -237,6 +239,7 @@ static inline void mlx5dr_htbl_get(struct mlx5dr_ste_htbl *htbl)
 
 /* STE utils */
 u32 mlx5dr_ste_calc_hash_index(u8 *hw_ste_p, struct mlx5dr_ste_htbl *htbl);
+bool mlx5dr_ste_is_miss_addr_set(struct mlx5dr_ste_ctx *ste_ctx, u8 *hw_ste_p);
 void mlx5dr_ste_set_miss_addr(struct mlx5dr_ste_ctx *ste_ctx,
                              u8 *hw_ste, u64 miss_addr);
 void mlx5dr_ste_set_hit_addr(struct mlx5dr_ste_ctx *ste_ctx,
@@ -281,6 +284,13 @@ struct mlx5dr_ste_actions_attr {
                u8 dest_reg_id;
                u8 init_color;
        } aso_flow_meter;
+
+       struct {
+               u64     miss_icm_addr;
+               u32     definer_id;
+               u32     min;
+               u32     max;
+       } range;
 };
 
 void mlx5dr_ste_set_actions_rx(struct mlx5dr_ste_ctx *ste_ctx,
@@ -924,6 +934,7 @@ struct mlx5dr_domain {
        struct mlx5dr_ste_ctx *ste_ctx;
        struct list_head dbg_tbl_list;
        struct mlx5dr_dbg_dump_info dump_info;
+       struct xarray definers_xa;
 };
 
 struct mlx5dr_table_rx_tx {
@@ -1026,6 +1037,15 @@ struct mlx5dr_action_dest_tbl {
        };
 };
 
+struct mlx5dr_action_range {
+       struct mlx5dr_domain *dmn;
+       struct mlx5dr_action *hit_tbl_action;
+       struct mlx5dr_action *miss_tbl_action;
+       u32 definer_id;
+       u32 min;
+       u32 max;
+};
+
 struct mlx5dr_action_ctr {
        u32 ctr_id;
        u32 offset;
@@ -1072,6 +1092,7 @@ struct mlx5dr_action {
                struct mlx5dr_action_push_vlan *push_vlan;
                struct mlx5dr_action_flow_tag *flow_tag;
                struct mlx5dr_action_aso_flow_meter *aso;
+               struct mlx5dr_action_range *range;
        };
 };
 
@@ -1295,6 +1316,14 @@ int mlx5dr_cmd_create_reformat_ctx(struct mlx5_core_dev *mdev,
                                   u32 *reformat_id);
 void mlx5dr_cmd_destroy_reformat_ctx(struct mlx5_core_dev *mdev,
                                     u32 reformat_id);
+int mlx5dr_cmd_create_definer(struct mlx5_core_dev *mdev,
+                             u16 format_id,
+                             u8 *dw_selectors,
+                             u8 *byte_selectors,
+                             u8 *match_mask,
+                             u32 *definer_id);
+void mlx5dr_cmd_destroy_definer(struct mlx5_core_dev *mdev,
+                               u32 definer_id);
 
 struct mlx5dr_cmd_gid_attr {
        u8 gid[16];
@@ -1483,4 +1512,18 @@ int mlx5dr_fw_create_md_tbl(struct mlx5dr_domain *dmn,
                            u32 flow_source);
 void mlx5dr_fw_destroy_md_tbl(struct mlx5dr_domain *dmn, u32 tbl_id,
                              u32 group_id);
+
+static inline bool mlx5dr_is_fw_table(struct mlx5_flow_table *ft)
+{
+       return !ft->fs_dr_table.dr_table;
+}
+
+static inline bool mlx5dr_supp_match_ranges(struct mlx5_core_dev *dev)
+{
+       return (MLX5_CAP_GEN(dev, steering_format_version) >=
+               MLX5_STEERING_FORMAT_CONNECTX_6DX) &&
+              (MLX5_CAP_GEN_64(dev, match_definer_format_supported) &
+                       (1ULL << MLX5_IFC_DEFINER_FORMAT_ID_SELECT));
+}
+
 #endif  /* _DR_TYPES_H_ */
index 13b6d47..9846537 100644 (file)
@@ -7,10 +7,11 @@
 #include "fs_cmd.h"
 #include "mlx5dr.h"
 #include "fs_dr.h"
+#include "dr_types.h"
 
-static bool mlx5_dr_is_fw_table(u32 flags)
+static bool dr_is_fw_term_table(struct mlx5_flow_table *ft)
 {
-       if (flags & MLX5_FLOW_TABLE_TERMINATION)
+       if (ft->flags & MLX5_FLOW_TABLE_TERMINATION)
                return true;
 
        return false;
@@ -69,7 +70,7 @@ static int mlx5_cmd_dr_create_flow_table(struct mlx5_flow_root_namespace *ns,
        u32 flags;
        int err;
 
-       if (mlx5_dr_is_fw_table(ft->flags))
+       if (dr_is_fw_term_table(ft))
                return mlx5_fs_cmd_get_fw_cmds()->create_flow_table(ns, ft,
                                                                    ft_attr,
                                                                    next_ft);
@@ -109,7 +110,7 @@ static int mlx5_cmd_dr_destroy_flow_table(struct mlx5_flow_root_namespace *ns,
        struct mlx5dr_action *action = ft->fs_dr_table.miss_action;
        int err;
 
-       if (mlx5_dr_is_fw_table(ft->flags))
+       if (dr_is_fw_term_table(ft))
                return mlx5_fs_cmd_get_fw_cmds()->destroy_flow_table(ns, ft);
 
        err = mlx5dr_table_destroy(ft->fs_dr_table.dr_table);
@@ -134,7 +135,7 @@ static int mlx5_cmd_dr_modify_flow_table(struct mlx5_flow_root_namespace *ns,
                                         struct mlx5_flow_table *ft,
                                         struct mlx5_flow_table *next_ft)
 {
-       if (mlx5_dr_is_fw_table(ft->flags))
+       if (dr_is_fw_term_table(ft))
                return mlx5_fs_cmd_get_fw_cmds()->modify_flow_table(ns, ft, next_ft);
 
        return set_miss_action(ns, ft, next_ft);
@@ -153,7 +154,7 @@ static int mlx5_cmd_dr_create_flow_group(struct mlx5_flow_root_namespace *ns,
                                            match_criteria_enable);
        struct mlx5dr_match_parameters mask;
 
-       if (mlx5_dr_is_fw_table(ft->flags))
+       if (dr_is_fw_term_table(ft))
                return mlx5_fs_cmd_get_fw_cmds()->create_flow_group(ns, ft, in,
                                                                    fg);
 
@@ -178,7 +179,7 @@ static int mlx5_cmd_dr_destroy_flow_group(struct mlx5_flow_root_namespace *ns,
                                          struct mlx5_flow_table *ft,
                                          struct mlx5_flow_group *fg)
 {
-       if (mlx5_dr_is_fw_table(ft->flags))
+       if (dr_is_fw_term_table(ft))
                return mlx5_fs_cmd_get_fw_cmds()->destroy_flow_group(ns, ft, fg);
 
        return mlx5dr_matcher_destroy(fg->fs_dr_matcher.dr_matcher);
@@ -209,11 +210,22 @@ static struct mlx5dr_action *create_ft_action(struct mlx5dr_domain *domain,
 {
        struct mlx5_flow_table *dest_ft = dst->dest_attr.ft;
 
-       if (mlx5_dr_is_fw_table(dest_ft->flags))
+       if (mlx5dr_is_fw_table(dest_ft))
                return mlx5dr_action_create_dest_flow_fw_table(domain, dest_ft);
        return mlx5dr_action_create_dest_table(dest_ft->fs_dr_table.dr_table);
 }
 
+static struct mlx5dr_action *create_range_action(struct mlx5dr_domain *domain,
+                                                struct mlx5_flow_rule *dst)
+{
+       return mlx5dr_action_create_dest_match_range(domain,
+                                                    dst->dest_attr.range.field,
+                                                    dst->dest_attr.range.hit_ft,
+                                                    dst->dest_attr.range.miss_ft,
+                                                    dst->dest_attr.range.min,
+                                                    dst->dest_attr.range.max);
+}
+
 static struct mlx5dr_action *create_action_push_vlan(struct mlx5dr_domain *domain,
                                                     struct mlx5_fs_vlan *vlan)
 {
@@ -260,7 +272,7 @@ static int mlx5_cmd_dr_create_fte(struct mlx5_flow_root_namespace *ns,
        int err = 0;
        int i;
 
-       if (mlx5_dr_is_fw_table(ft->flags))
+       if (dr_is_fw_term_table(ft))
                return mlx5_fs_cmd_get_fw_cmds()->create_fte(ns, ft, group, fte);
 
        actions = kcalloc(MLX5_FLOW_CONTEXT_ACTION_MAX, sizeof(*actions),
@@ -467,6 +479,15 @@ static int mlx5_cmd_dr_create_fte(struct mlx5_flow_root_namespace *ns,
                                fs_dr_actions[fs_dr_num_actions++] = tmp_action;
                                term_actions[num_term_actions++].dest = tmp_action;
                                break;
+                       case MLX5_FLOW_DESTINATION_TYPE_RANGE:
+                               tmp_action = create_range_action(domain, dst);
+                               if (!tmp_action) {
+                                       err = -ENOMEM;
+                                       goto free_actions;
+                               }
+                               fs_dr_actions[fs_dr_num_actions++] = tmp_action;
+                               term_actions[num_term_actions++].dest = tmp_action;
+                               break;
                        default:
                                err = -EOPNOTSUPP;
                                goto free_actions;
@@ -702,7 +723,7 @@ static int mlx5_cmd_dr_delete_fte(struct mlx5_flow_root_namespace *ns,
        int err;
        int i;
 
-       if (mlx5_dr_is_fw_table(ft->flags))
+       if (dr_is_fw_term_table(ft))
                return mlx5_fs_cmd_get_fw_cmds()->delete_fte(ns, ft, fte);
 
        err = mlx5dr_rule_destroy(rule->dr_rule);
@@ -727,7 +748,7 @@ static int mlx5_cmd_dr_update_fte(struct mlx5_flow_root_namespace *ns,
        struct fs_fte fte_tmp = {};
        int ret;
 
-       if (mlx5_dr_is_fw_table(ft->flags))
+       if (dr_is_fw_term_table(ft))
                return mlx5_fs_cmd_get_fw_cmds()->update_fte(ns, ft, group, modify_mask, fte);
 
        /* Backup current dr rule details */
@@ -780,11 +801,19 @@ static int mlx5_cmd_dr_destroy_ns(struct mlx5_flow_root_namespace *ns)
 static u32 mlx5_cmd_dr_get_capabilities(struct mlx5_flow_root_namespace *ns,
                                        enum fs_flow_table_type ft_type)
 {
+       u32 steering_caps = 0;
+
        if (ft_type != FS_FT_FDB ||
            MLX5_CAP_GEN(ns->dev, steering_format_version) == MLX5_STEERING_FORMAT_CONNECTX_5)
                return 0;
 
-       return MLX5_FLOW_STEERING_CAP_VLAN_PUSH_ON_RX | MLX5_FLOW_STEERING_CAP_VLAN_POP_ON_TX;
+       steering_caps |= MLX5_FLOW_STEERING_CAP_VLAN_PUSH_ON_RX;
+       steering_caps |= MLX5_FLOW_STEERING_CAP_VLAN_POP_ON_TX;
+
+       if (mlx5dr_supp_match_ranges(ns->dev))
+               steering_caps |= MLX5_FLOW_STEERING_CAP_MATCH_RANGES;
+
+       return steering_caps;
 }
 
 bool mlx5_fs_dr_is_supported(struct mlx5_core_dev *dev)
index 34c2bd1..790a17d 100644 (file)
@@ -165,6 +165,41 @@ struct mlx5_ifc_ste_mask_and_match_v1_bits {
        u8         action[0x60];
 };
 
+struct mlx5_ifc_ste_match_ranges_v1_bits {
+       u8         entry_format[0x8];
+       u8         counter_id[0x18];
+
+       u8         miss_address_63_48[0x10];
+       u8         match_definer_ctx_idx[0x8];
+       u8         miss_address_39_32[0x8];
+
+       u8         miss_address_31_6[0x1a];
+       u8         reserved_at_5a[0x1];
+       u8         match_polarity[0x1];
+       u8         reparse[0x1];
+       u8         reserved_at_5d[0x3];
+
+       u8         next_table_base_63_48[0x10];
+       u8         hash_definer_ctx_idx[0x8];
+       u8         next_table_base_39_32_size[0x8];
+
+       u8         next_table_base_31_5_size[0x1b];
+       u8         hash_type[0x2];
+       u8         hash_after_actions[0x1];
+       u8         reserved_at_9e[0x2];
+
+       u8         action[0x60];
+
+       u8         max_value_0[0x20];
+       u8         min_value_0[0x20];
+       u8         max_value_1[0x20];
+       u8         min_value_1[0x20];
+       u8         max_value_2[0x20];
+       u8         min_value_2[0x20];
+       u8         max_value_3[0x20];
+       u8         min_value_3[0x20];
+};
+
 struct mlx5_ifc_ste_eth_l2_src_v1_bits {
        u8         reserved_at_0[0x1];
        u8         sx_sniffer[0x1];
index 84ed777..9afd268 100644 (file)
@@ -140,8 +140,21 @@ mlx5dr_action_create_aso(struct mlx5dr_domain *dmn,
                         u8 init_color,
                         u8 meter_id);
 
+struct mlx5dr_action *
+mlx5dr_action_create_dest_match_range(struct mlx5dr_domain *dmn,
+                                     u32 field,
+                                     struct mlx5_flow_table *hit_ft,
+                                     struct mlx5_flow_table *miss_ft,
+                                     u32 min,
+                                     u32 max);
+
 int mlx5dr_action_destroy(struct mlx5dr_action *action);
 
+int mlx5dr_definer_get(struct mlx5dr_domain *dmn, u16 format_id,
+                      u8 *dw_selectors, u8 *byte_selectors,
+                      u8 *match_mask, u32 *definer_id);
+void mlx5dr_definer_put(struct mlx5dr_domain *dmn, u32 definer_id);
+
 static inline bool
 mlx5dr_is_supported(struct mlx5_core_dev *dev)
 {
index d5c3173..ba7e3df 100644 (file)
@@ -1160,14 +1160,40 @@ u64 mlx5_query_nic_system_image_guid(struct mlx5_core_dev *mdev)
 }
 EXPORT_SYMBOL_GPL(mlx5_query_nic_system_image_guid);
 
-int mlx5_vport_get_other_func_cap(struct mlx5_core_dev *dev, u16 function_id, void *out)
+int mlx5_vport_get_other_func_cap(struct mlx5_core_dev *dev, u16 function_id, void *out,
+                                 u16 opmod)
 {
-       u16 opmod = (MLX5_CAP_GENERAL << 1) | (HCA_CAP_OPMOD_GET_MAX & 0x01);
        u8 in[MLX5_ST_SZ_BYTES(query_hca_cap_in)] = {};
 
+       opmod = (opmod << 1) | (HCA_CAP_OPMOD_GET_MAX & 0x01);
        MLX5_SET(query_hca_cap_in, in, opcode, MLX5_CMD_OP_QUERY_HCA_CAP);
        MLX5_SET(query_hca_cap_in, in, op_mod, opmod);
        MLX5_SET(query_hca_cap_in, in, function_id, function_id);
        MLX5_SET(query_hca_cap_in, in, other_function, true);
        return mlx5_cmd_exec_inout(dev, query_hca_cap, in, out);
 }
+EXPORT_SYMBOL_GPL(mlx5_vport_get_other_func_cap);
+
+int mlx5_vport_set_other_func_cap(struct mlx5_core_dev *dev, const void *hca_cap,
+                                 u16 function_id, u16 opmod)
+{
+       int set_sz = MLX5_ST_SZ_BYTES(set_hca_cap_in);
+       void *set_hca_cap;
+       void *set_ctx;
+       int ret;
+
+       set_ctx = kzalloc(set_sz, GFP_KERNEL);
+       if (!set_ctx)
+               return -ENOMEM;
+
+       MLX5_SET(set_hca_cap_in, set_ctx, opcode, MLX5_CMD_OP_SET_HCA_CAP);
+       MLX5_SET(set_hca_cap_in, set_ctx, op_mod, opmod << 1);
+       set_hca_cap = MLX5_ADDR_OF(set_hca_cap_in, set_ctx, capability);
+       memcpy(set_hca_cap, hca_cap, MLX5_ST_SZ_BYTES(cmd_hca_cap));
+       MLX5_SET(set_hca_cap_in, set_ctx, function_id, function_id);
+       MLX5_SET(set_hca_cap_in, set_ctx, other_function, true);
+       ret = mlx5_cmd_exec_in(dev, set_hca_cap, set_ctx);
+
+       kfree(set_ctx);
+       return ret;
+}
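
A hedged usage sketch of the reworked query/set pair; error handling is trimmed and the op_mod constants are assumed to come from mlx5_ifc.h:

	void *query_ctx = kzalloc(MLX5_ST_SZ_BYTES(query_hca_cap_out), GFP_KERNEL);
	void *hca_caps;
	int err;

	/* read the other function's general caps */
	err = mlx5_vport_get_other_func_cap(dev, function_id, query_ctx,
					    MLX5_CAP_GENERAL);
	/* ... modify a capability bit in place ... */
	hca_caps = MLX5_ADDR_OF(query_hca_cap_out, query_ctx, capability);
	err = mlx5_vport_set_other_func_cap(dev, hca_caps, function_id,
					    MLX5_SET_HCA_CAP_OP_MOD_GENERAL_DEVICE);
	kfree(query_ctx);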
index a2ee695..3340b4a 100644 (file)
@@ -363,93 +363,7 @@ static const struct mlxsw_sp_ipip_ops mlxsw_sp_ipip_gre4_ops = {
 };
 
 static struct mlxsw_sp_ipip_parms
-mlxsw_sp1_ipip_netdev_parms_init_gre6(const struct net_device *ol_dev)
-{
-       struct mlxsw_sp_ipip_parms parms = {0};
-
-       WARN_ON_ONCE(1);
-       return parms;
-}
-
-static int
-mlxsw_sp1_ipip_nexthop_update_gre6(struct mlxsw_sp *mlxsw_sp, u32 adj_index,
-                                  struct mlxsw_sp_ipip_entry *ipip_entry,
-                                  bool force, char *ratr_pl)
-{
-       WARN_ON_ONCE(1);
-       return -EINVAL;
-}
-
-static int
-mlxsw_sp1_ipip_decap_config_gre6(struct mlxsw_sp *mlxsw_sp,
-                                struct mlxsw_sp_ipip_entry *ipip_entry,
-                                u32 tunnel_index)
-{
-       WARN_ON_ONCE(1);
-       return -EINVAL;
-}
-
-static bool mlxsw_sp1_ipip_can_offload_gre6(const struct mlxsw_sp *mlxsw_sp,
-                                           const struct net_device *ol_dev)
-{
-       return false;
-}
-
-static struct mlxsw_sp_rif_ipip_lb_config
-mlxsw_sp1_ipip_ol_loopback_config_gre6(struct mlxsw_sp *mlxsw_sp,
-                                      const struct net_device *ol_dev)
-{
-       struct mlxsw_sp_rif_ipip_lb_config config = {0};
-
-       WARN_ON_ONCE(1);
-       return config;
-}
-
-static int
-mlxsw_sp1_ipip_ol_netdev_change_gre6(struct mlxsw_sp *mlxsw_sp,
-                                    struct mlxsw_sp_ipip_entry *ipip_entry,
-                                    struct netlink_ext_ack *extack)
-{
-       WARN_ON_ONCE(1);
-       return -EINVAL;
-}
-
-static int
-mlxsw_sp1_ipip_rem_addr_set_gre6(struct mlxsw_sp *mlxsw_sp,
-                                struct mlxsw_sp_ipip_entry *ipip_entry)
-{
-       WARN_ON_ONCE(1);
-       return -EINVAL;
-}
-
-static void
-mlxsw_sp1_ipip_rem_addr_unset_gre6(struct mlxsw_sp *mlxsw_sp,
-                                  const struct mlxsw_sp_ipip_entry *ipip_entry)
-{
-       WARN_ON_ONCE(1);
-}
-
-static const struct mlxsw_sp_ipip_ops mlxsw_sp1_ipip_gre6_ops = {
-       .dev_type = ARPHRD_IP6GRE,
-       .ul_proto = MLXSW_SP_L3_PROTO_IPV6,
-       .inc_parsing_depth = true,
-       .parms_init = mlxsw_sp1_ipip_netdev_parms_init_gre6,
-       .nexthop_update = mlxsw_sp1_ipip_nexthop_update_gre6,
-       .decap_config = mlxsw_sp1_ipip_decap_config_gre6,
-       .can_offload = mlxsw_sp1_ipip_can_offload_gre6,
-       .ol_loopback_config = mlxsw_sp1_ipip_ol_loopback_config_gre6,
-       .ol_netdev_change = mlxsw_sp1_ipip_ol_netdev_change_gre6,
-       .rem_ip_addr_set = mlxsw_sp1_ipip_rem_addr_set_gre6,
-       .rem_ip_addr_unset = mlxsw_sp1_ipip_rem_addr_unset_gre6,
-};
-
-const struct mlxsw_sp_ipip_ops *mlxsw_sp1_ipip_ops_arr[] = {
-       [MLXSW_SP_IPIP_TYPE_GRE4] = &mlxsw_sp_ipip_gre4_ops,
-       [MLXSW_SP_IPIP_TYPE_GRE6] = &mlxsw_sp1_ipip_gre6_ops,
-};
-
-static struct mlxsw_sp_ipip_parms
-mlxsw_sp2_ipip_netdev_parms_init_gre6(const struct net_device *ol_dev)
+mlxsw_sp_ipip_netdev_parms_init_gre6(const struct net_device *ol_dev)
 {
        struct __ip6_tnl_parm parms = mlxsw_sp_ipip_netdev_parms6(ol_dev);
 
@@ -464,9 +378,9 @@ mlxsw_sp2_ipip_netdev_parms_init_gre6(const struct net_device *ol_dev)
 }
 
 static int
-mlxsw_sp2_ipip_nexthop_update_gre6(struct mlxsw_sp *mlxsw_sp, u32 adj_index,
-                                  struct mlxsw_sp_ipip_entry *ipip_entry,
-                                  bool force, char *ratr_pl)
+mlxsw_sp_ipip_nexthop_update_gre6(struct mlxsw_sp *mlxsw_sp, u32 adj_index,
+                                 struct mlxsw_sp_ipip_entry *ipip_entry,
+                                 bool force, char *ratr_pl)
 {
        u16 rif_index = mlxsw_sp_ipip_lb_rif_index(ipip_entry->ol_lb);
        enum mlxsw_reg_ratr_op op;
@@ -482,9 +396,9 @@ mlxsw_sp2_ipip_nexthop_update_gre6(struct mlxsw_sp *mlxsw_sp, u32 adj_index,
 }
 
 static int
-mlxsw_sp2_ipip_decap_config_gre6(struct mlxsw_sp *mlxsw_sp,
-                                struct mlxsw_sp_ipip_entry *ipip_entry,
-                                u32 tunnel_index)
+mlxsw_sp_ipip_decap_config_gre6(struct mlxsw_sp *mlxsw_sp,
+                               struct mlxsw_sp_ipip_entry *ipip_entry,
+                               u32 tunnel_index)
 {
        u16 rif_index = mlxsw_sp_ipip_lb_rif_index(ipip_entry->ol_lb);
        u16 ul_rif_id = mlxsw_sp_ipip_lb_ul_rif_id(ipip_entry->ol_lb);
@@ -519,8 +433,8 @@ mlxsw_sp2_ipip_decap_config_gre6(struct mlxsw_sp *mlxsw_sp,
        return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(rtdp), rtdp_pl);
 }
 
-static bool mlxsw_sp2_ipip_can_offload_gre6(const struct mlxsw_sp *mlxsw_sp,
-                                           const struct net_device *ol_dev)
+static bool mlxsw_sp_ipip_can_offload_gre6(const struct mlxsw_sp *mlxsw_sp,
+                                          const struct net_device *ol_dev)
 {
        struct __ip6_tnl_parm tparm = mlxsw_sp_ipip_netdev_parms6(ol_dev);
        bool inherit_tos = tparm.flags & IP6_TNL_F_USE_ORIG_TCLASS;
@@ -534,8 +448,8 @@ static bool mlxsw_sp2_ipip_can_offload_gre6(const struct mlxsw_sp *mlxsw_sp,
 }
 
 static struct mlxsw_sp_rif_ipip_lb_config
-mlxsw_sp2_ipip_ol_loopback_config_gre6(struct mlxsw_sp *mlxsw_sp,
-                                      const struct net_device *ol_dev)
+mlxsw_sp_ipip_ol_loopback_config_gre6(struct mlxsw_sp *mlxsw_sp,
+                                     const struct net_device *ol_dev)
 {
        struct __ip6_tnl_parm parms = mlxsw_sp_ipip_netdev_parms6(ol_dev);
        enum mlxsw_reg_ritr_loopback_ipip_type lb_ipipt;
@@ -553,20 +467,20 @@ mlxsw_sp2_ipip_ol_loopback_config_gre6(struct mlxsw_sp *mlxsw_sp,
 }
 
 static int
-mlxsw_sp2_ipip_ol_netdev_change_gre6(struct mlxsw_sp *mlxsw_sp,
-                                    struct mlxsw_sp_ipip_entry *ipip_entry,
-                                    struct netlink_ext_ack *extack)
+mlxsw_sp_ipip_ol_netdev_change_gre6(struct mlxsw_sp *mlxsw_sp,
+                                   struct mlxsw_sp_ipip_entry *ipip_entry,
+                                   struct netlink_ext_ack *extack)
 {
        struct mlxsw_sp_ipip_parms new_parms;
 
-       new_parms = mlxsw_sp2_ipip_netdev_parms_init_gre6(ipip_entry->ol_dev);
+       new_parms = mlxsw_sp_ipip_netdev_parms_init_gre6(ipip_entry->ol_dev);
        return mlxsw_sp_ipip_ol_netdev_change_gre(mlxsw_sp, ipip_entry,
                                                  &new_parms, extack);
 }
 
 static int
-mlxsw_sp2_ipip_rem_addr_set_gre6(struct mlxsw_sp *mlxsw_sp,
-                                struct mlxsw_sp_ipip_entry *ipip_entry)
+mlxsw_sp_ipip_rem_addr_set_gre6(struct mlxsw_sp *mlxsw_sp,
+                               struct mlxsw_sp_ipip_entry *ipip_entry)
 {
        return mlxsw_sp_ipv6_addr_kvdl_index_get(mlxsw_sp,
                                                 &ipip_entry->parms.daddr.addr6,
@@ -574,24 +488,44 @@ mlxsw_sp2_ipip_rem_addr_set_gre6(struct mlxsw_sp *mlxsw_sp,
 }
 
 static void
-mlxsw_sp2_ipip_rem_addr_unset_gre6(struct mlxsw_sp *mlxsw_sp,
-                                  const struct mlxsw_sp_ipip_entry *ipip_entry)
+mlxsw_sp_ipip_rem_addr_unset_gre6(struct mlxsw_sp *mlxsw_sp,
+                                 const struct mlxsw_sp_ipip_entry *ipip_entry)
 {
        mlxsw_sp_ipv6_addr_put(mlxsw_sp, &ipip_entry->parms.daddr.addr6);
 }
 
+static const struct mlxsw_sp_ipip_ops mlxsw_sp1_ipip_gre6_ops = {
+       .dev_type = ARPHRD_IP6GRE,
+       .ul_proto = MLXSW_SP_L3_PROTO_IPV6,
+       .inc_parsing_depth = true,
+       .double_rif_entry = true,
+       .parms_init = mlxsw_sp_ipip_netdev_parms_init_gre6,
+       .nexthop_update = mlxsw_sp_ipip_nexthop_update_gre6,
+       .decap_config = mlxsw_sp_ipip_decap_config_gre6,
+       .can_offload = mlxsw_sp_ipip_can_offload_gre6,
+       .ol_loopback_config = mlxsw_sp_ipip_ol_loopback_config_gre6,
+       .ol_netdev_change = mlxsw_sp_ipip_ol_netdev_change_gre6,
+       .rem_ip_addr_set = mlxsw_sp_ipip_rem_addr_set_gre6,
+       .rem_ip_addr_unset = mlxsw_sp_ipip_rem_addr_unset_gre6,
+};
+
+const struct mlxsw_sp_ipip_ops *mlxsw_sp1_ipip_ops_arr[] = {
+       [MLXSW_SP_IPIP_TYPE_GRE4] = &mlxsw_sp_ipip_gre4_ops,
+       [MLXSW_SP_IPIP_TYPE_GRE6] = &mlxsw_sp1_ipip_gre6_ops,
+};
+
 static const struct mlxsw_sp_ipip_ops mlxsw_sp2_ipip_gre6_ops = {
        .dev_type = ARPHRD_IP6GRE,
        .ul_proto = MLXSW_SP_L3_PROTO_IPV6,
        .inc_parsing_depth = true,
-       .parms_init = mlxsw_sp2_ipip_netdev_parms_init_gre6,
-       .nexthop_update = mlxsw_sp2_ipip_nexthop_update_gre6,
-       .decap_config = mlxsw_sp2_ipip_decap_config_gre6,
-       .can_offload = mlxsw_sp2_ipip_can_offload_gre6,
-       .ol_loopback_config = mlxsw_sp2_ipip_ol_loopback_config_gre6,
-       .ol_netdev_change = mlxsw_sp2_ipip_ol_netdev_change_gre6,
-       .rem_ip_addr_set = mlxsw_sp2_ipip_rem_addr_set_gre6,
-       .rem_ip_addr_unset = mlxsw_sp2_ipip_rem_addr_unset_gre6,
+       .parms_init = mlxsw_sp_ipip_netdev_parms_init_gre6,
+       .nexthop_update = mlxsw_sp_ipip_nexthop_update_gre6,
+       .decap_config = mlxsw_sp_ipip_decap_config_gre6,
+       .can_offload = mlxsw_sp_ipip_can_offload_gre6,
+       .ol_loopback_config = mlxsw_sp_ipip_ol_loopback_config_gre6,
+       .ol_netdev_change = mlxsw_sp_ipip_ol_netdev_change_gre6,
+       .rem_ip_addr_set = mlxsw_sp_ipip_rem_addr_set_gre6,
+       .rem_ip_addr_unset = mlxsw_sp_ipip_rem_addr_unset_gre6,
 };
 
 const struct mlxsw_sp_ipip_ops *mlxsw_sp2_ipip_ops_arr[] = {
index 8cc259d..a35f009 100644 (file)
@@ -49,6 +49,7 @@ struct mlxsw_sp_ipip_ops {
        int dev_type;
        enum mlxsw_sp_l3proto ul_proto; /* Underlay. */
        bool inc_parsing_depth;
+       bool double_rif_entry;
 
        struct mlxsw_sp_ipip_parms
        (*parms_init)(const struct net_device *ol_dev);
index 48f1fa6..c22c3ac 100644 (file)
@@ -18,6 +18,7 @@
 #include <linux/jhash.h>
 #include <linux/net_namespace.h>
 #include <linux/mutex.h>
+#include <linux/genalloc.h>
 #include <net/netevent.h>
 #include <net/neighbour.h>
 #include <net/arp.h>
@@ -59,6 +60,7 @@ struct mlxsw_sp_rif {
        int mtu;
        u16 rif_index;
        u8 mac_profile_id;
+       u8 rif_entries;
        u16 vr_id;
        const struct mlxsw_sp_rif_ops *ops;
        struct mlxsw_sp *mlxsw_sp;
@@ -77,6 +79,7 @@ struct mlxsw_sp_rif_params {
        };
        u16 vid;
        bool lag;
+       bool double_entry;
 };
 
 struct mlxsw_sp_rif_subport {
@@ -1068,6 +1071,7 @@ mlxsw_sp_ipip_ol_ipip_lb_create(struct mlxsw_sp *mlxsw_sp,
        lb_params = (struct mlxsw_sp_rif_params_ipip_lb) {
                .common.dev = ol_dev,
                .common.lag = false,
+               .common.double_entry = ipip_ops->double_rif_entry,
                .lb_config = ipip_ops->ol_loopback_config(mlxsw_sp, ol_dev),
        };
 
@@ -7826,18 +7830,26 @@ mlxsw_sp_dev_rif_type(const struct mlxsw_sp *mlxsw_sp,
        return mlxsw_sp_fid_type_rif_type(mlxsw_sp, type);
 }
 
-static int mlxsw_sp_rif_index_alloc(struct mlxsw_sp *mlxsw_sp, u16 *p_rif_index)
+static int mlxsw_sp_rif_index_alloc(struct mlxsw_sp *mlxsw_sp, u16 *p_rif_index,
+                                   u8 rif_entries)
 {
-       int i;
+       *p_rif_index = gen_pool_alloc(mlxsw_sp->router->rifs_table,
+                                     rif_entries);
+       if (*p_rif_index == 0)
+               return -ENOBUFS;
+       *p_rif_index -= MLXSW_SP_ROUTER_GENALLOC_OFFSET;
 
-       for (i = 0; i < MLXSW_CORE_RES_GET(mlxsw_sp->core, MAX_RIFS); i++) {
-               if (!mlxsw_sp->router->rifs[i]) {
-                       *p_rif_index = i;
-                       return 0;
-               }
-       }
+       /* RIF indexes must be aligned to the allocation size. */
+       WARN_ON_ONCE(*p_rif_index % rif_entries);
 
-       return -ENOBUFS;
+       return 0;
+}
+
+static void mlxsw_sp_rif_index_free(struct mlxsw_sp *mlxsw_sp, u16 rif_index,
+                                   u8 rif_entries)
+{
+       gen_pool_free(mlxsw_sp->router->rifs_table,
+                     MLXSW_SP_ROUTER_GENALLOC_OFFSET + rif_index, rif_entries);
 }
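
Two allocator details are worth spelling out. gen_pool_alloc() signals failure with 0, but RIF index 0 is valid, hence the pool is seeded at MLXSW_SP_ROUTER_GENALLOC_OFFSET and the offset is subtracted again after a successful allocation. And the first-fit-order-align algorithm chosen in mlxsw_sp_rifs_table_init() below returns size-aligned blocks, which is exactly what the WARN_ON_ONCE() above relies on. Illustrative values:

	/* pool spans [0x100, 0x100 + MAX_RIFS) */
	addr = gen_pool_alloc(rifs_table, 2);	/* 2-entry RIF, 2-aligned */
	if (!addr)
		return -ENOBUFS;		/* pool exhausted */
	rif_index = addr - MLXSW_SP_ROUTER_GENALLOC_OFFSET;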
 
 static struct mlxsw_sp_rif *mlxsw_sp_rif_alloc(size_t rif_size, u16 rif_index,
@@ -8081,6 +8093,7 @@ mlxsw_sp_rif_create(struct mlxsw_sp *mlxsw_sp,
                    const struct mlxsw_sp_rif_params *params,
                    struct netlink_ext_ack *extack)
 {
+       u8 rif_entries = params->double_entry ? 2 : 1;
        u32 tb_id = l3mdev_fib_table(params->dev);
        const struct mlxsw_sp_rif_ops *ops;
        struct mlxsw_sp_fid *fid = NULL;
@@ -8098,7 +8111,7 @@ mlxsw_sp_rif_create(struct mlxsw_sp *mlxsw_sp,
                return ERR_CAST(vr);
        vr->rif_count++;
 
-       err = mlxsw_sp_rif_index_alloc(mlxsw_sp, &rif_index);
+       err = mlxsw_sp_rif_index_alloc(mlxsw_sp, &rif_index, rif_entries);
        if (err) {
                NL_SET_ERR_MSG_MOD(extack, "Exceeded number of supported router interfaces");
                goto err_rif_index_alloc;
@@ -8113,6 +8126,7 @@ mlxsw_sp_rif_create(struct mlxsw_sp *mlxsw_sp,
        mlxsw_sp->router->rifs[rif_index] = rif;
        rif->mlxsw_sp = mlxsw_sp;
        rif->ops = ops;
+       rif->rif_entries = rif_entries;
 
        if (ops->fid_get) {
                fid = ops->fid_get(rif, extack);
@@ -8146,7 +8160,7 @@ mlxsw_sp_rif_create(struct mlxsw_sp *mlxsw_sp,
                mlxsw_sp_rif_counters_alloc(rif);
        }
 
-       atomic_inc(&mlxsw_sp->router->rifs_count);
+       atomic_add(rif_entries, &mlxsw_sp->router->rifs_count);
        return rif;
 
 err_stats_enable:
@@ -8162,6 +8176,7 @@ err_fid_get:
        dev_put(rif->dev);
        kfree(rif);
 err_rif_alloc:
+       mlxsw_sp_rif_index_free(mlxsw_sp, rif_index, rif_entries);
 err_rif_index_alloc:
        vr->rif_count--;
        mlxsw_sp_vr_put(mlxsw_sp, vr);
@@ -8173,10 +8188,12 @@ static void mlxsw_sp_rif_destroy(struct mlxsw_sp_rif *rif)
        const struct mlxsw_sp_rif_ops *ops = rif->ops;
        struct mlxsw_sp *mlxsw_sp = rif->mlxsw_sp;
        struct mlxsw_sp_fid *fid = rif->fid;
+       u8 rif_entries = rif->rif_entries;
+       u16 rif_index = rif->rif_index;
        struct mlxsw_sp_vr *vr;
        int i;
 
-       atomic_dec(&mlxsw_sp->router->rifs_count);
+       atomic_sub(rif_entries, &mlxsw_sp->router->rifs_count);
        mlxsw_sp_router_rif_gone_sync(mlxsw_sp, rif);
        vr = &mlxsw_sp->router->vrs[rif->vr_id];
 
@@ -8198,6 +8215,7 @@ static void mlxsw_sp_rif_destroy(struct mlxsw_sp_rif *rif)
        mlxsw_sp->router->rifs[rif->rif_index] = NULL;
        dev_put(rif->dev);
        kfree(rif);
+       mlxsw_sp_rif_index_free(mlxsw_sp, rif_index, rif_entries);
        vr->rif_count--;
        mlxsw_sp_vr_put(mlxsw_sp, vr);
 }
@@ -9771,42 +9789,51 @@ mlxsw_sp_ul_rif_create(struct mlxsw_sp *mlxsw_sp, struct mlxsw_sp_vr *vr,
                       struct netlink_ext_ack *extack)
 {
        struct mlxsw_sp_rif *ul_rif;
+       u8 rif_entries = 1;
        u16 rif_index;
        int err;
 
-       err = mlxsw_sp_rif_index_alloc(mlxsw_sp, &rif_index);
+       err = mlxsw_sp_rif_index_alloc(mlxsw_sp, &rif_index, rif_entries);
        if (err) {
                NL_SET_ERR_MSG_MOD(extack, "Exceeded number of supported router interfaces");
                return ERR_PTR(err);
        }
 
        ul_rif = mlxsw_sp_rif_alloc(sizeof(*ul_rif), rif_index, vr->id, NULL);
-       if (!ul_rif)
-               return ERR_PTR(-ENOMEM);
+       if (!ul_rif) {
+               err = -ENOMEM;
+               goto err_rif_alloc;
+       }
 
        mlxsw_sp->router->rifs[rif_index] = ul_rif;
        ul_rif->mlxsw_sp = mlxsw_sp;
+       ul_rif->rif_entries = rif_entries;
        err = mlxsw_sp_rif_ipip_lb_ul_rif_op(ul_rif, true);
        if (err)
                goto ul_rif_op_err;
 
-       atomic_inc(&mlxsw_sp->router->rifs_count);
+       atomic_add(rif_entries, &mlxsw_sp->router->rifs_count);
        return ul_rif;
 
 ul_rif_op_err:
        mlxsw_sp->router->rifs[rif_index] = NULL;
        kfree(ul_rif);
+err_rif_alloc:
+       mlxsw_sp_rif_index_free(mlxsw_sp, rif_index, rif_entries);
        return ERR_PTR(err);
 }
 
 static void mlxsw_sp_ul_rif_destroy(struct mlxsw_sp_rif *ul_rif)
 {
        struct mlxsw_sp *mlxsw_sp = ul_rif->mlxsw_sp;
+       u8 rif_entries = ul_rif->rif_entries;
+       u16 rif_index = ul_rif->rif_index;
 
-       atomic_dec(&mlxsw_sp->router->rifs_count);
+       atomic_sub(rif_entries, &mlxsw_sp->router->rifs_count);
        mlxsw_sp_rif_ipip_lb_ul_rif_op(ul_rif, false);
        mlxsw_sp->router->rifs[ul_rif->rif_index] = NULL;
        kfree(ul_rif);
+       mlxsw_sp_rif_index_free(mlxsw_sp, rif_index, rif_entries);
 }
 
 static struct mlxsw_sp_rif *
@@ -9940,11 +9967,43 @@ static const struct mlxsw_sp_rif_ops *mlxsw_sp2_rif_ops_arr[] = {
        [MLXSW_SP_RIF_TYPE_IPIP_LB]     = &mlxsw_sp2_rif_ipip_lb_ops,
 };
 
+static int mlxsw_sp_rifs_table_init(struct mlxsw_sp *mlxsw_sp)
+{
+       struct gen_pool *rifs_table;
+       int err;
+
+       rifs_table = gen_pool_create(0, -1);
+       if (!rifs_table)
+               return -ENOMEM;
+
+       gen_pool_set_algo(rifs_table, gen_pool_first_fit_order_align,
+                         NULL);
+
+       err = gen_pool_add(rifs_table, MLXSW_SP_ROUTER_GENALLOC_OFFSET,
+                          MLXSW_CORE_RES_GET(mlxsw_sp->core, MAX_RIFS), -1);
+       if (err)
+               goto err_gen_pool_add;
+
+       mlxsw_sp->router->rifs_table = rifs_table;
+
+       return 0;
+
+err_gen_pool_add:
+       gen_pool_destroy(rifs_table);
+       return err;
+}
+
+static void mlxsw_sp_rifs_table_fini(struct mlxsw_sp *mlxsw_sp)
+{
+       gen_pool_destroy(mlxsw_sp->router->rifs_table);
+}
+
 static int mlxsw_sp_rifs_init(struct mlxsw_sp *mlxsw_sp)
 {
        u64 max_rifs = MLXSW_CORE_RES_GET(mlxsw_sp->core, MAX_RIFS);
        struct devlink *devlink = priv_to_devlink(mlxsw_sp->core);
        struct mlxsw_core *core = mlxsw_sp->core;
+       int err;
 
        if (!MLXSW_CORE_RES_VALID(core, MAX_RIF_MAC_PROFILES))
                return -EIO;
@@ -9957,6 +10016,10 @@ static int mlxsw_sp_rifs_init(struct mlxsw_sp *mlxsw_sp)
        if (!mlxsw_sp->router->rifs)
                return -ENOMEM;
 
+       err = mlxsw_sp_rifs_table_init(mlxsw_sp);
+       if (err)
+               goto err_rifs_table_init;
+
        idr_init(&mlxsw_sp->router->rif_mac_profiles_idr);
        atomic_set(&mlxsw_sp->router->rif_mac_profiles_count, 0);
        atomic_set(&mlxsw_sp->router->rifs_count, 0);
@@ -9970,6 +10033,10 @@ static int mlxsw_sp_rifs_init(struct mlxsw_sp *mlxsw_sp)
                                       mlxsw_sp);
 
        return 0;
+
+err_rifs_table_init:
+       kfree(mlxsw_sp->router->rifs);
+       return err;
 }
 
 static void mlxsw_sp_rifs_fini(struct mlxsw_sp *mlxsw_sp)
@@ -9986,6 +10053,7 @@ static void mlxsw_sp_rifs_fini(struct mlxsw_sp *mlxsw_sp)
                                         MLXSW_SP_RESOURCE_RIF_MAC_PROFILES);
        WARN_ON(!idr_is_empty(&mlxsw_sp->router->rif_mac_profiles_idr));
        idr_destroy(&mlxsw_sp->router->rif_mac_profiles_idr);
+       mlxsw_sp_rifs_table_fini(mlxsw_sp);
        kfree(mlxsw_sp->router->rifs);
 }
 
index c5dfb97..37d6e4c 100644 (file)
@@ -15,8 +15,12 @@ struct mlxsw_sp_router_nve_decap {
        u8 valid:1;
 };
 
+/* gen_pool_alloc() returns 0 when allocation fails, so use an offset */
+#define MLXSW_SP_ROUTER_GENALLOC_OFFSET 0x100
+
 struct mlxsw_sp_router {
        struct mlxsw_sp *mlxsw_sp;
+       struct gen_pool *rifs_table;
        struct mlxsw_sp_rif **rifs;
        struct idr rif_mac_profiles_idr;
        atomic_t rif_mac_profiles_count;
index 81a8ccc..5693784 100644 (file)
@@ -359,7 +359,7 @@ static int regmap_encx24j600_phy_reg_read(void *context, unsigned int reg,
                goto err_out;
 
        usleep_range(26, 100);
-       while ((ret = regmap_read(ctx->regmap, MISTAT, &mistat) != 0) &&
+       while (((ret = regmap_read(ctx->regmap, MISTAT, &mistat)) == 0) &&
               (mistat & BUSY))
                cpu_relax();
 
@@ -397,7 +397,7 @@ static int regmap_encx24j600_phy_reg_write(void *context, unsigned int reg,
                goto err_out;
 
        usleep_range(26, 100);
-       while ((ret = regmap_read(ctx->regmap, MISTAT, &mistat) != 0) &&
+       while (((ret = regmap_read(ctx->regmap, MISTAT, &mistat)) == 0) &&
               (mistat & BUSY))
                cpu_relax();
 
index f609298..cadde20 100644 (file)
@@ -443,11 +443,22 @@ static int lan966x_port_ioctl(struct net_device *dev, struct ifreq *ifr,
                              int cmd)
 {
        struct lan966x_port *port = netdev_priv(dev);
+       int err;
+
+       if (cmd == SIOCSHWTSTAMP) {
+               err = lan966x_ptp_setup_traps(port, ifr);
+               if (err)
+                       return err;
+       }
 
        if (!phy_has_hwtstamp(dev->phydev) && port->lan966x->ptp) {
                switch (cmd) {
                case SIOCSHWTSTAMP:
-                       return lan966x_ptp_hwtstamp_set(port, ifr);
+                       err = lan966x_ptp_hwtstamp_set(port, ifr);
+                       if (err)
+                               lan966x_ptp_del_traps(port);
+
+                       return err;
                case SIOCGHWTSTAMP:
                        return lan966x_ptp_hwtstamp_get(port, ifr);
                }
@@ -456,7 +467,11 @@ static int lan966x_port_ioctl(struct net_device *dev, struct ifreq *ifr,
        if (!dev->phydev)
                return -ENODEV;
 
-       return phy_mii_ioctl(dev->phydev, ifr, cmd);
+       err = phy_mii_ioctl(dev->phydev, ifr, cmd);
+       if (err && cmd == SIOCSHWTSTAMP)
+               lan966x_ptp_del_traps(port);
+
+       return err;
 }
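
In outline, the reworked SIOCSHWTSTAMP path now behaves as follows (pseudocode summarizing the flow above, not a new API):

	lan966x_ptp_setup_traps(port, ifr);	/* install/adjust VCAP traps first */
	err = hwtstamp set via PHY or MAC path;
	if (err)
		lan966x_ptp_del_traps(port);	/* roll the traps back on failure */
	return err;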
 
 static const struct net_device_ops lan966x_port_netdev_ops = {
index f2e45da..3491f19 100644 (file)
 #define SE_IDX_QUEUE                   0  /* 0-79 : Queue scheduler elements */
 #define SE_IDX_PORT                    80 /* 80-89 : Port scheduler elements */
 
+#define LAN966X_VCAP_CID_IS2_L0 VCAP_CID_INGRESS_STAGE2_L0 /* IS2 lookup 0 */
+#define LAN966X_VCAP_CID_IS2_L1 VCAP_CID_INGRESS_STAGE2_L1 /* IS2 lookup 1 */
+#define LAN966X_VCAP_CID_IS2_MAX (VCAP_CID_INGRESS_STAGE2_L2 - 1) /* IS2 Max */
+
 /* MAC table entry types.
  * ENTRYTYPE_NORMAL is subject to aging.
  * ENTRYTYPE_LOCKED is not subject to aging.
@@ -116,6 +120,14 @@ enum lan966x_fdma_action {
        FDMA_REDIRECT,
 };
 
+/* Controls how PORT_MASK is applied */
+enum LAN966X_PORT_MASK_MODE {
+       LAN966X_PMM_NO_ACTION,
+       LAN966X_PMM_REPLACE,
+       LAN966X_PMM_FORWARDING,
+       LAN966X_PMM_REDIRECT,
+};
+
 struct lan966x_port;
 
 struct lan966x_db {
@@ -473,6 +485,8 @@ irqreturn_t lan966x_ptp_irq_handler(int irq, void *args);
 irqreturn_t lan966x_ptp_ext_irq_handler(int irq, void *args);
 u32 lan966x_ptp_get_period_ps(void);
 int lan966x_ptp_gettime64(struct ptp_clock_info *ptp, struct timespec64 *ts);
+int lan966x_ptp_setup_traps(struct lan966x_port *port, struct ifreq *ifr);
+int lan966x_ptp_del_traps(struct lan966x_port *port);
 
 int lan966x_fdma_xmit(struct sk_buff *skb, __be32 *ifh, struct net_device *dev);
 int lan966x_fdma_xmit_xdpf(struct lan966x_port *port,
index e5a2bbe..f9ebfaa 100644 (file)
@@ -3,6 +3,8 @@
 #include <linux/ptp_classify.h>
 
 #include "lan966x_main.h"
+#include "vcap_api.h"
+#include "vcap_api_client.h"
 
 #define LAN966X_MAX_PTP_ID     512
 
 
 #define TOD_ACC_PIN            0x7
 
+/* This is the base rule ID for the PTP rules that are added to the VCAP to
+ * trap frames to the CPU. It must be bigger than the maximum number of
+ * entries that can exist in the VCAP.
+ */
+#define LAN966X_VCAP_PTP_RULE_ID       1000000
+#define LAN966X_VCAP_L2_PTP_TRAP       (LAN966X_VCAP_PTP_RULE_ID + 0)
+#define LAN966X_VCAP_IPV4_EV_PTP_TRAP  (LAN966X_VCAP_PTP_RULE_ID + 1)
+#define LAN966X_VCAP_IPV4_GEN_PTP_TRAP (LAN966X_VCAP_PTP_RULE_ID + 2)
+#define LAN966X_VCAP_IPV6_EV_PTP_TRAP  (LAN966X_VCAP_PTP_RULE_ID + 3)
+#define LAN966X_VCAP_IPV6_GEN_PTP_TRAP (LAN966X_VCAP_PTP_RULE_ID + 4)
+
 enum {
        PTP_PIN_ACTION_IDLE = 0,
        PTP_PIN_ACTION_LOAD,
@@ -35,19 +48,226 @@ static u64 lan966x_ptp_get_nominal_value(void)
        return 0x304d4873ecade305;
 }
 
+static int lan966x_ptp_add_trap(struct lan966x_port *port,
+                               int (*add_ptp_key)(struct vcap_rule *vrule,
+                                                  struct lan966x_port*),
+                               u32 rule_id,
+                               u16 proto)
+{
+       struct lan966x *lan966x = port->lan966x;
+       struct vcap_rule *vrule;
+       int err;
+
+       vrule = vcap_get_rule(lan966x->vcap_ctrl, rule_id);
+       if (vrule) {
+               u32 value, mask;
+
+               /* Just modify the ingress port mask and exit */
+               vcap_rule_get_key_u32(vrule, VCAP_KF_IF_IGR_PORT_MASK,
+                                     &value, &mask);
+               mask &= ~BIT(port->chip_port);
+               vcap_rule_mod_key_u32(vrule, VCAP_KF_IF_IGR_PORT_MASK,
+                                     value, mask);
+
+               err = vcap_mod_rule(vrule);
+               goto free_rule;
+       }
+
+       vrule = vcap_alloc_rule(lan966x->vcap_ctrl, port->dev,
+                               LAN966X_VCAP_CID_IS2_L0,
+                               VCAP_USER_PTP, 0, rule_id);
+       if (IS_ERR(vrule))
+               return PTR_ERR(vrule);
+
+       err = add_ptp_key(vrule, port);
+       if (err)
+               goto free_rule;
+
+       err = vcap_set_rule_set_actionset(vrule, VCAP_AFS_BASE_TYPE);
+       err |= vcap_rule_add_action_bit(vrule, VCAP_AF_CPU_COPY_ENA, VCAP_BIT_1);
+       err |= vcap_rule_add_action_u32(vrule, VCAP_AF_MASK_MODE, LAN966X_PMM_REPLACE);
+       err |= vcap_val_rule(vrule, proto);
+       if (err)
+               goto free_rule;
+
+       err = vcap_add_rule(vrule);
+
+free_rule:
+       /* Free the local copy of the rule */
+       vcap_free_rule(vrule);
+       return err;
+}
+
+static int lan966x_ptp_del_trap(struct lan966x_port *port,
+                               u32 rule_id)
+{
+       struct lan966x *lan966x = port->lan966x;
+       struct vcap_rule *vrule;
+       u32 value, mask;
+       int err;
+
+       vrule = vcap_get_rule(lan966x->vcap_ctrl, rule_id);
+       if (!vrule)
+               return -ENOENT;
+
+       vcap_rule_get_key_u32(vrule, VCAP_KF_IF_IGR_PORT_MASK, &value, &mask);
+       mask |= BIT(port->chip_port);
+
+       /* No other port requires this trap, so it is safe to remove it */
+       if (mask == GENMASK(lan966x->num_phys_ports, 0)) {
+               err = vcap_del_rule(lan966x->vcap_ctrl, port->dev, rule_id);
+               goto free_rule;
+       }
+
+       vcap_rule_mod_key_u32(vrule, VCAP_KF_IF_IGR_PORT_MASK, value, mask);
+       err = vcap_mod_rule(vrule);
+
+free_rule:
+       vcap_free_rule(vrule);
+       return err;
+}
+
+static int lan966x_ptp_add_l2_key(struct vcap_rule *vrule,
+                                 struct lan966x_port *port)
+{
+       return vcap_rule_add_key_u32(vrule, VCAP_KF_ETYPE, ETH_P_1588, ~0);
+}
+
+static int lan966x_ptp_add_ip_event_key(struct vcap_rule *vrule,
+                                       struct lan966x_port *port)
+{
+       return vcap_rule_add_key_u32(vrule, VCAP_KF_L4_DPORT, PTP_EV_PORT, ~0) ||
+              vcap_rule_add_key_bit(vrule, VCAP_KF_TCP_IS, VCAP_BIT_0);
+}
+
+static int lan966x_ptp_add_ip_general_key(struct vcap_rule *vrule,
+                                         struct lan966x_port *port)
+{
+       return vcap_rule_add_key_u32(vrule, VCAP_KF_L4_DPORT, PTP_GEN_PORT, ~0) ||
+              vcap_rule_add_key_bit(vrule, VCAP_KF_TCP_IS, VCAP_BIT_0);
+}
+
+static int lan966x_ptp_add_l2_rule(struct lan966x_port *port)
+{
+       return lan966x_ptp_add_trap(port, lan966x_ptp_add_l2_key,
+                                   LAN966X_VCAP_L2_PTP_TRAP, ETH_P_ALL);
+}
+
+static int lan966x_ptp_add_ipv4_rules(struct lan966x_port *port)
+{
+       int err;
+
+       err = lan966x_ptp_add_trap(port, lan966x_ptp_add_ip_event_key,
+                                  LAN966X_VCAP_IPV4_EV_PTP_TRAP, ETH_P_IP);
+       if (err)
+               return err;
+
+       err = lan966x_ptp_add_trap(port, lan966x_ptp_add_ip_general_key,
+                                  LAN966X_VCAP_IPV4_GEN_PTP_TRAP, ETH_P_IP);
+       if (err)
+               lan966x_ptp_del_trap(port, LAN966X_VCAP_IPV4_EV_PTP_TRAP);
+
+       return err;
+}
+
+static int lan966x_ptp_add_ipv6_rules(struct lan966x_port *port)
+{
+       int err;
+
+       err = lan966x_ptp_add_trap(port, lan966x_ptp_add_ip_event_key,
+                                  LAN966X_VCAP_IPV6_EV_PTP_TRAP, ETH_P_IPV6);
+       if (err)
+               return err;
+
+       err = lan966x_ptp_add_trap(port, lan966x_ptp_add_ip_general_key,
+                                  LAN966X_VCAP_IPV6_GEN_PTP_TRAP, ETH_P_IPV6);
+       if (err)
+               lan966x_ptp_del_trap(port, LAN966X_VCAP_IPV6_EV_PTP_TRAP);
+
+       return err;
+}
+
+static int lan966x_ptp_del_l2_rule(struct lan966x_port *port)
+{
+       return lan966x_ptp_del_trap(port, LAN966X_VCAP_L2_PTP_TRAP);
+}
+
+static int lan966x_ptp_del_ipv4_rules(struct lan966x_port *port)
+{
+       int err;
+
+       err = lan966x_ptp_del_trap(port, LAN966X_VCAP_IPV4_EV_PTP_TRAP);
+       err |= lan966x_ptp_del_trap(port, LAN966X_VCAP_IPV4_GEN_PTP_TRAP);
+
+       return err;
+}
+
+static int lan966x_ptp_del_ipv6_rules(struct lan966x_port *port)
+{
+       int err;
+
+       err = lan966x_ptp_del_trap(port, LAN966X_VCAP_IPV6_EV_PTP_TRAP);
+       err |= lan966x_ptp_del_trap(port, LAN966X_VCAP_IPV6_GEN_PTP_TRAP);
+
+       return err;
+}
+
+static int lan966x_ptp_add_traps(struct lan966x_port *port)
+{
+       int err;
+
+       err = lan966x_ptp_add_l2_rule(port);
+       if (err)
+               goto err_l2;
+
+       err = lan966x_ptp_add_ipv4_rules(port);
+       if (err)
+               goto err_ipv4;
+
+       err = lan966x_ptp_add_ipv6_rules(port);
+       if (err)
+               goto err_ipv6;
+
+       return err;
+
+err_ipv6:
+       lan966x_ptp_del_ipv4_rules(port);
+err_ipv4:
+       lan966x_ptp_del_l2_rule(port);
+err_l2:
+       return err;
+}
+
+int lan966x_ptp_del_traps(struct lan966x_port *port)
+{
+       int err;
+
+       err = lan966x_ptp_del_l2_rule(port);
+       err |= lan966x_ptp_del_ipv4_rules(port);
+       err |= lan966x_ptp_del_ipv6_rules(port);
+
+       return err;
+}
+
+int lan966x_ptp_setup_traps(struct lan966x_port *port, struct ifreq *ifr)
+{
+       struct hwtstamp_config cfg;
+
+       if (copy_from_user(&cfg, ifr->ifr_data, sizeof(cfg)))
+               return -EFAULT;
+
+       if (cfg.rx_filter == HWTSTAMP_FILTER_NONE)
+               return lan966x_ptp_del_traps(port);
+       else
+               return lan966x_ptp_add_traps(port);
+}
+
 int lan966x_ptp_hwtstamp_set(struct lan966x_port *port, struct ifreq *ifr)
 {
        struct lan966x *lan966x = port->lan966x;
        struct hwtstamp_config cfg;
        struct lan966x_phc *phc;
 
-       /* For now don't allow to run ptp on ports that are part of a bridge,
-        * because in case of transparent clock the HW will still forward the
-        * frames, so there would be duplicate frames
-        */
-       if (lan966x->bridge_mask & BIT(port->chip_port))
-               return -EINVAL;
-
        if (copy_from_user(&cfg, ifr->ifr_data, sizeof(cfg)))
                return -EFAULT;
 
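For context, a minimal sketch of how the two new entry points are meant to be combined on the hwtstamp ioctl path. The real call site lives outside this hunk, so the handler name below is illustrative:

/* Illustrative only: an ioctl-style handler that installs the VCAP
 * traps before programming the timestamp configuration, and rolls
 * the traps back if the configuration fails.
 */
static int example_hwtstamp_ioctl(struct lan966x_port *port, struct ifreq *ifr)
{
	int err;

	err = lan966x_ptp_setup_traps(port, ifr);
	if (err)
		return err;

	err = lan966x_ptp_hwtstamp_set(port, ifr);
	if (err)
		lan966x_ptp_del_traps(port);

	return err;
}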
index 04a2afd..ba3fa91 100644 (file)
@@ -4,14 +4,6 @@
 #include "vcap_api.h"
 #include "vcap_api_client.h"
 
-/* Controls how PORT_MASK is applied */
-enum LAN966X_PORT_MASK_MODE {
-       LAN966X_PMM_NO_ACTION,
-       LAN966X_PMM_REPLACE,
-       LAN966X_PMM_FORWARDING,
-       LAN966X_PMM_REDIRECT,
-};
-
 struct lan966x_tc_flower_parse_usage {
        struct flow_cls_offload *f;
        struct flow_rule *frule;
index 44f40d9..d8dc9fb 100644 (file)
@@ -5,10 +5,6 @@
 #include "vcap_api.h"
 #include "vcap_api_client.h"
 
-#define LAN966X_VCAP_CID_IS2_L0 VCAP_CID_INGRESS_STAGE2_L0 /* IS2 lookup 0 */
-#define LAN966X_VCAP_CID_IS2_L1 VCAP_CID_INGRESS_STAGE2_L1 /* IS2 lookup 1 */
-#define LAN966X_VCAP_CID_IS2_MAX (VCAP_CID_INGRESS_STAGE2_L2 - 1) /* IS2 Max */
-
 #define STREAMSIZE (64 * 4)
 
 #define LAN966X_IS2_LOOKUPS 2
@@ -219,9 +215,12 @@ static void lan966x_vcap_add_default_fields(struct net_device *dev,
                                            struct vcap_rule *rule)
 {
        struct lan966x_port *port = netdev_priv(dev);
+       u32 value, mask;
 
-       vcap_rule_add_key_u32(rule, VCAP_KF_IF_IGR_PORT_MASK, 0,
-                             ~BIT(port->chip_port));
+       if (vcap_rule_get_key_u32(rule, VCAP_KF_IF_IGR_PORT_MASK,
+                                 &value, &mask))
+               vcap_rule_add_key_u32(rule, VCAP_KF_IF_IGR_PORT_MASK, 0,
+                                     ~BIT(port->chip_port));
 
        if (lan966x_vcap_is_first_chain(rule))
                vcap_rule_add_key_bit(rule, VCAP_KF_LOOKUP_FIRST_IS,
index 66360c8..141897d 100644 (file)
@@ -317,7 +317,7 @@ int sparx5_fdma_xmit(struct sparx5 *sparx5, u32 *ifh, struct sk_buff *skb)
        next_dcb_hw = sparx5_fdma_next_dcb(tx, tx->curr_entry);
        db_hw = &next_dcb_hw->db[0];
        if (!(db_hw->status & FDMA_DCB_STATUS_DONE))
-               tx->dropped++;
+               return -EINVAL;
        db = list_first_entry(&tx->db_list, struct sparx5_db, list);
        list_move_tail(&db->list, &tx->db_list);
        next_dcb_hw->nextptr = FDMA_DCB_INVALID_DATA;
index f8382d3..d25f4f0 100644 (file)
@@ -897,6 +897,8 @@ static int mchp_sparx5_probe(struct platform_device *pdev)
 
 cleanup_ports:
        sparx5_cleanup_ports(sparx5);
+       if (sparx5->mact_queue)
+               destroy_workqueue(sparx5->mact_queue);
 cleanup_config:
        kfree(configs);
 cleanup_pnode:
@@ -923,6 +925,7 @@ static int mchp_sparx5_remove(struct platform_device *pdev)
        sparx5_vcap_destroy(sparx5);
        /* Unregister netdevs */
        sparx5_unregister_notifier_blocks(sparx5);
+       destroy_workqueue(sparx5->mact_queue);
 
        return 0;
 }
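Both hunks above close the same leak: the MAC table workqueue was not destroyed on the probe error path or on driver removal. A minimal sketch of the unwind pattern being applied, with hypothetical helper names:

/* Hypothetical probe skeleton: resources are released in reverse
 * order of creation, so the workqueue created first is destroyed
 * last on the error path.
 */
static int example_register_ports(struct platform_device *pdev); /* hypothetical */

static int example_probe(struct platform_device *pdev)
{
	struct workqueue_struct *wq;
	int err;

	wq = create_singlethread_workqueue("example");
	if (!wq)
		return -ENOMEM;

	err = example_register_ports(pdev);
	if (err)
		goto err_ports;

	return 0;

err_ports:
	destroy_workqueue(wq);	/* the call the fix adds to cleanup_ports */
	return err;
}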
index 83c16ca..6db6ac6 100644 (file)
@@ -234,9 +234,8 @@ netdev_tx_t sparx5_port_xmit_impl(struct sk_buff *skb, struct net_device *dev)
        sparx5_set_port_ifh(ifh, port->portno);
 
        if (sparx5->ptp && skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP) {
-               ret = sparx5_ptp_txtstamp_request(port, skb);
-               if (ret)
-                       return ret;
+               if (sparx5_ptp_txtstamp_request(port, skb) < 0)
+                       return NETDEV_TX_BUSY;
 
                sparx5_set_port_ifh_rew_op(ifh, SPARX5_SKB_CB(skb)->rew_op);
                sparx5_set_port_ifh_pdu_type(ifh, SPARX5_SKB_CB(skb)->pdu_type);
@@ -250,23 +249,31 @@ netdev_tx_t sparx5_port_xmit_impl(struct sk_buff *skb, struct net_device *dev)
        else
                ret = sparx5_inject(sparx5, ifh, skb, dev);
 
-       if (ret == NETDEV_TX_OK) {
-               stats->tx_bytes += skb->len;
-               stats->tx_packets++;
+       if (ret == -EBUSY)
+               goto busy;
+       if (ret < 0)
+               goto drop;
 
-               if (skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP &&
-                   SPARX5_SKB_CB(skb)->rew_op == IFH_REW_OP_TWO_STEP_PTP)
-                       return ret;
+       stats->tx_bytes += skb->len;
+       stats->tx_packets++;
+       sparx5->tx.packets++;
 
-               dev_kfree_skb_any(skb);
-       } else {
-               stats->tx_dropped++;
+       if (skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP &&
+           SPARX5_SKB_CB(skb)->rew_op == IFH_REW_OP_TWO_STEP_PTP)
+               return NETDEV_TX_OK;
 
-               if (skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP &&
-                   SPARX5_SKB_CB(skb)->rew_op == IFH_REW_OP_TWO_STEP_PTP)
-                       sparx5_ptp_txtstamp_release(port, skb);
-       }
-       return ret;
+       dev_consume_skb_any(skb);
+       return NETDEV_TX_OK;
+drop:
+       stats->tx_dropped++;
+       sparx5->tx.dropped++;
+       dev_kfree_skb_any(skb);
+       return NETDEV_TX_OK;
+busy:
+       if (skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP &&
+           SPARX5_SKB_CB(skb)->rew_op == IFH_REW_OP_TWO_STEP_PTP)
+               sparx5_ptp_txtstamp_release(port, skb);
+       return NETDEV_TX_BUSY;
 }
 
 static enum hrtimer_restart sparx5_injection_timeout(struct hrtimer *tmr)
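The rewritten exit paths above follow the standard TX return convention: NETDEV_TX_OK means the driver consumed the skb (whether it was transmitted or dropped), while NETDEV_TX_BUSY hands the skb back to the stack for requeueing, so it must not be freed on that path. A minimal sketch, with hypothetical hardware helpers:

static bool example_ring_full(struct net_device *dev);			 /* hypothetical */
static int example_hw_send(struct net_device *dev, struct sk_buff *skb); /* hypothetical */

static netdev_tx_t example_xmit(struct sk_buff *skb, struct net_device *dev)
{
	if (example_ring_full(dev))
		return NETDEV_TX_BUSY;		/* stack keeps the skb */

	if (example_hw_send(dev, skb) < 0) {
		dev_kfree_skb_any(skb);		/* drop: skb is ours to free */
		return NETDEV_TX_OK;
	}

	dev_consume_skb_any(skb);		/* success, not an error drop */
	return NETDEV_TX_OK;
}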
index f2435d7..664aae3 100644 (file)
@@ -169,6 +169,227 @@ static void vcap_encode_typegroups(u32 *stream, int sw_width,
        }
 }
 
+static bool vcap_bitarray_zero(int width, u8 *value)
+{
+       int bytes = DIV_ROUND_UP(width, BITS_PER_BYTE);
+       u8 total = 0, bmask = 0xff;
+       int rwidth = width;
+       int idx;
+
+       for (idx = 0; idx < bytes; ++idx, rwidth -= BITS_PER_BYTE) {
+               if (rwidth && rwidth < BITS_PER_BYTE)
+                       bmask = (1 << rwidth) - 1;
+               total |= value[idx] & bmask;
+       }
+       return total == 0;
+}
+
+static bool vcap_get_bit(u32 *stream, struct vcap_stream_iter *itr)
+{
+       u32 mask = BIT(itr->reg_bitpos);
+       u32 *p = &stream[itr->reg_idx];
+
+       return !!(*p & mask);
+}
+
+static void vcap_decode_field(u32 *stream, struct vcap_stream_iter *itr,
+                             int width, u8 *value)
+{
+       int idx;
+
+       /* Loop over the field value bits and get the field bits and
+        * set them in the output value byte array
+        */
+       for (idx = 0; idx < width; idx++) {
+               u8 bidx = idx & 0x7;
+
+               /* Decode one field value bit */
+               if (vcap_get_bit(stream, itr))
+                       *value |= 1 << bidx;
+               vcap_iter_next(itr);
+               if (bidx == 7)
+                       value++;
+       }
+}
+
+/* Verify that the type id in the stream matches the type id of the keyset */
+static bool vcap_verify_keystream_keyset(struct vcap_control *vctrl,
+                                        enum vcap_type vt,
+                                        u32 *keystream,
+                                        u32 *mskstream,
+                                        enum vcap_keyfield_set keyset)
+{
+       const struct vcap_info *vcap = &vctrl->vcaps[vt];
+       const struct vcap_field *typefld;
+       const struct vcap_typegroup *tgt;
+       const struct vcap_field *fields;
+       struct vcap_stream_iter iter;
+       const struct vcap_set *info;
+       u32 value = 0;
+       u32 mask = 0;
+
+       if (vcap_keyfield_count(vctrl, vt, keyset) == 0)
+               return false;
+
+       info = vcap_keyfieldset(vctrl, vt, keyset);
+       /* Check that the keyset is valid */
+       if (!info)
+               return false;
+
+       /* a type_id of value -1 means that there is no type field */
+       if (info->type_id == (u8)-1)
+               return true;
+
+       /* Get a valid typegroup for the specific keyset */
+       tgt = vcap_keyfield_typegroup(vctrl, vt, keyset);
+       if (!tgt)
+               return false;
+
+       fields = vcap_keyfields(vctrl, vt, keyset);
+       if (!fields)
+               return false;
+
+       typefld = &fields[VCAP_KF_TYPE];
+       vcap_iter_init(&iter, vcap->sw_width, tgt, typefld->offset);
+       vcap_decode_field(mskstream, &iter, typefld->width, (u8 *)&mask);
+       /* no type info if there are no mask bits */
+       if (vcap_bitarray_zero(typefld->width, (u8 *)&mask))
+               return false;
+
+       /* Get the value of the type field in the stream and compare to the
+        * one defined in the vcap keyset
+        */
+       vcap_iter_init(&iter, vcap->sw_width, tgt, typefld->offset);
+       vcap_decode_field(keystream, &iter, typefld->width, (u8 *)&value);
+
+       return (value & mask) == (info->type_id & mask);
+}
+
+/* Verify that the typegroup bits have the correct values */
+static int vcap_verify_typegroups(u32 *stream, int sw_width,
+                                 const struct vcap_typegroup *tgt, bool mask,
+                                 int sw_max)
+{
+       struct vcap_stream_iter iter;
+       int sw_cnt, idx;
+
+       vcap_iter_set(&iter, sw_width, tgt, 0);
+       sw_cnt = 0;
+       while (iter.tg->width) {
+               u32 value = 0;
+               u32 tg_value = iter.tg->value;
+
+               if (mask)
+                       tg_value = (1 << iter.tg->width) - 1;
+               /* Set position to current typegroup bit */
+               iter.offset = iter.tg->offset;
+               vcap_iter_update(&iter);
+               for (idx = 0; idx < iter.tg->width; idx++) {
+                       /* Decode one typegroup bit */
+                       if (vcap_get_bit(stream, &iter))
+                               value |= 1 << idx;
+                       iter.offset++;
+                       vcap_iter_update(&iter);
+               }
+               if (value != tg_value)
+                       return -EINVAL;
+               iter.tg++; /* next typegroup */
+               sw_cnt++;
+               /* Stop checking more typegroups */
+               if (sw_max && sw_cnt >= sw_max)
+                       break;
+       }
+       return 0;
+}
+
+/* Find the subword width of the key typegroup that matches the stream data */
+static int vcap_find_keystream_typegroup_sw(struct vcap_control *vctrl,
+                                           enum vcap_type vt, u32 *stream,
+                                           bool mask, int sw_max)
+{
+       const struct vcap_typegroup **tgt;
+       int sw_idx, res;
+
+       tgt = vctrl->vcaps[vt].keyfield_set_typegroups;
+       /* Try the longest subword match first */
+       for (sw_idx = vctrl->vcaps[vt].sw_count; sw_idx >= 0; sw_idx--) {
+               if (!tgt[sw_idx])
+                       continue;
+
+               res = vcap_verify_typegroups(stream, vctrl->vcaps[vt].sw_width,
+                                            tgt[sw_idx], mask, sw_max);
+               if (res == 0)
+                       return sw_idx;
+       }
+       return -EINVAL;
+}
+
+/* Verify that the typegroup information, subword count, keyset and type id
+ * are in sync and correct, return the list of matching keysets
+ */
+int
+vcap_find_keystream_keysets(struct vcap_control *vctrl,
+                           enum vcap_type vt,
+                           u32 *keystream,
+                           u32 *mskstream,
+                           bool mask, int sw_max,
+                           struct vcap_keyset_list *kslist)
+{
+       const struct vcap_set *keyfield_set;
+       int sw_count, idx;
+
+       sw_count = vcap_find_keystream_typegroup_sw(vctrl, vt, keystream, mask,
+                                                   sw_max);
+       if (sw_count < 0)
+               return sw_count;
+
+       keyfield_set = vctrl->vcaps[vt].keyfield_set;
+       for (idx = 0; idx < vctrl->vcaps[vt].keyfield_set_size; ++idx) {
+               if (keyfield_set[idx].sw_per_item != sw_count)
+                       continue;
+
+               if (vcap_verify_keystream_keyset(vctrl, vt, keystream,
+                                                mskstream, idx))
+                       vcap_keyset_list_add(kslist, idx);
+       }
+       if (kslist->cnt > 0)
+               return 0;
+       return -EINVAL;
+}
+EXPORT_SYMBOL_GPL(vcap_find_keystream_keysets);
+
+/* Read key data from a VCAP address and discover if there are any rule keysets
+ * here
+ */
+int vcap_addr_keysets(struct vcap_control *vctrl,
+                     struct net_device *ndev,
+                     struct vcap_admin *admin,
+                     int addr,
+                     struct vcap_keyset_list *kslist)
+{
+       enum vcap_type vt = admin->vtype;
+       int keyset_sw_regs, idx;
+       u32 key = 0, mask = 0;
+
+       /* Read the cache at the specified address */
+       keyset_sw_regs = DIV_ROUND_UP(vctrl->vcaps[vt].sw_width, 32);
+       vctrl->ops->update(ndev, admin, VCAP_CMD_READ, VCAP_SEL_ALL, addr);
+       vctrl->ops->cache_read(ndev, admin, VCAP_SEL_ENTRY, 0,
+                              keyset_sw_regs);
+       /* Skip uninitialized key/mask entries */
+       for (idx = 0; idx < keyset_sw_regs; ++idx) {
+               key |= ~admin->cache.keystream[idx];
+               mask |= admin->cache.maskstream[idx];
+       }
+       if (key == 0 && mask == 0)
+               return -EINVAL;
+       /* Decode and locate the keysets */
+       return vcap_find_keystream_keysets(vctrl, vt, admin->cache.keystream,
+                                          admin->cache.maskstream, false, 0,
+                                          kslist);
+}
+EXPORT_SYMBOL_GPL(vcap_addr_keysets);
+
 /* Return the list of keyfields for the keyset */
 const struct vcap_field *vcap_keyfields(struct vcap_control *vctrl,
                                        enum vcap_type vt,
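Callers of the newly exported vcap_addr_keysets() and vcap_find_keystream_keysets() supply the backing storage for the match list themselves, as vcap_decode_keyset() further down does. A minimal sketch of the calling convention (the function name is illustrative):

static int example_probe_addr(struct vcap_control *vctrl,
			      struct net_device *ndev,
			      struct vcap_admin *admin, int addr)
{
	enum vcap_keyfield_set keysets[10];
	struct vcap_keyset_list matches = {
		.keysets = keysets,
		.max = ARRAY_SIZE(keysets),
		.cnt = 0,
	};
	int err;

	/* Reads the cache at addr and decodes the candidate keysets */
	err = vcap_addr_keysets(vctrl, ndev, admin, addr, &matches);
	if (err)
		return err;	/* no valid keyset at this address */

	/* matches.keysets[0..matches.cnt - 1] hold the candidates */
	return 0;
}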
@@ -618,6 +839,517 @@ struct vcap_rule_internal *vcap_dup_rule(struct vcap_rule_internal *ri)
        return duprule;
 }
 
+static void vcap_apply_width(u8 *dst, int width, int bytes)
+{
+       u8 bmask;
+       int idx;
+
+       for (idx = 0; idx < bytes; idx++) {
+               if (width <= 0)
+                       bmask = 0;
+               else if (width < 8)
+                       bmask = (1 << width) - 1;
+               else
+                       bmask = ~0;
+               dst[idx] &= bmask;
+               width -= 8;
+       }
+}
+
+static void vcap_copy_from_w32be(u8 *dst, u8 *src, int size, int width)
+{
+       int idx, ridx, wstart, nidx;
+       int tail_bytes = (((size + 4) >> 2) << 2) - size;
+
+       for (idx = 0, ridx = size - 1; idx < size; ++idx, --ridx) {
+               wstart = (idx >> 2) << 2;
+               nidx = wstart + 3 - (idx & 0x3);
+               if (nidx >= size)
+                       nidx -= tail_bytes;
+               dst[nidx] = src[ridx];
+       }
+
+       vcap_apply_width(dst, width, size);
+}
+
+static void vcap_copy_action_bit_field(struct vcap_u1_action *field, u8 *value)
+{
+       field->value = (*value) & 0x1;
+}
+
+static void vcap_copy_limited_actionfield(u8 *dstvalue, u8 *srcvalue,
+                                         int width, int bytes)
+{
+       memcpy(dstvalue, srcvalue, bytes);
+       vcap_apply_width(dstvalue, width, bytes);
+}
+
+static void vcap_copy_to_client_actionfield(struct vcap_rule_internal *ri,
+                                           struct vcap_client_actionfield *field,
+                                           u8 *value, u16 width)
+{
+       int field_size = actionfield_size_table[field->ctrl.type];
+
+       if (ri->admin->w32be) {
+               switch (field->ctrl.type) {
+               case VCAP_FIELD_BIT:
+                       vcap_copy_action_bit_field(&field->data.u1, value);
+                       break;
+               case VCAP_FIELD_U32:
+                       vcap_copy_limited_actionfield((u8 *)&field->data.u32.value,
+                                                     value,
+                                                     width, field_size);
+                       break;
+               case VCAP_FIELD_U48:
+                       vcap_copy_from_w32be(field->data.u48.value, value,
+                                            field_size, width);
+                       break;
+               case VCAP_FIELD_U56:
+                       vcap_copy_from_w32be(field->data.u56.value, value,
+                                            field_size, width);
+                       break;
+               case VCAP_FIELD_U64:
+                       vcap_copy_from_w32be(field->data.u64.value, value,
+                                            field_size, width);
+                       break;
+               case VCAP_FIELD_U72:
+                       vcap_copy_from_w32be(field->data.u72.value, value,
+                                            field_size, width);
+                       break;
+               case VCAP_FIELD_U112:
+                       vcap_copy_from_w32be(field->data.u112.value, value,
+                                            field_size, width);
+                       break;
+               case VCAP_FIELD_U128:
+                       vcap_copy_from_w32be(field->data.u128.value, value,
+                                            field_size, width);
+                       break;
+               }
+       } else {
+               switch (field->ctrl.type) {
+               case VCAP_FIELD_BIT:
+                       vcap_copy_action_bit_field(&field->data.u1, value);
+                       break;
+               case VCAP_FIELD_U32:
+                       vcap_copy_limited_actionfield((u8 *)&field->data.u32.value,
+                                                     value,
+                                                     width, field_size);
+                       break;
+               case VCAP_FIELD_U48:
+                       vcap_copy_limited_actionfield(field->data.u48.value,
+                                                     value,
+                                                     width, field_size);
+                       break;
+               case VCAP_FIELD_U56:
+                       vcap_copy_limited_actionfield(field->data.u56.value,
+                                                     value,
+                                                     width, field_size);
+                       break;
+               case VCAP_FIELD_U64:
+                       vcap_copy_limited_actionfield(field->data.u64.value,
+                                                     value,
+                                                     width, field_size);
+                       break;
+               case VCAP_FIELD_U72:
+                       vcap_copy_limited_actionfield(field->data.u72.value,
+                                                     value,
+                                                     width, field_size);
+                       break;
+               case VCAP_FIELD_U112:
+                       vcap_copy_limited_actionfield(field->data.u112.value,
+                                                     value,
+                                                     width, field_size);
+                       break;
+               case VCAP_FIELD_U128:
+                       vcap_copy_limited_actionfield(field->data.u128.value,
+                                                     value,
+                                                     width, field_size);
+                       break;
+               }
+       }
+}
+
+static void vcap_copy_key_bit_field(struct vcap_u1_key *field,
+                                   u8 *value, u8 *mask)
+{
+       field->value = (*value) & 0x1;
+       field->mask = (*mask) & 0x1;
+}
+
+static void vcap_copy_limited_keyfield(u8 *dstvalue, u8 *dstmask,
+                                      u8 *srcvalue, u8 *srcmask,
+                                      int width, int bytes)
+{
+       memcpy(dstvalue, srcvalue, bytes);
+       vcap_apply_width(dstvalue, width, bytes);
+       memcpy(dstmask, srcmask, bytes);
+       vcap_apply_width(dstmask, width, bytes);
+}
+
+static void vcap_copy_to_client_keyfield(struct vcap_rule_internal *ri,
+                                        struct vcap_client_keyfield *field,
+                                        u8 *value, u8 *mask, u16 width)
+{
+       int field_size = keyfield_size_table[field->ctrl.type] / 2;
+
+       if (ri->admin->w32be) {
+               switch (field->ctrl.type) {
+               case VCAP_FIELD_BIT:
+                       vcap_copy_key_bit_field(&field->data.u1, value, mask);
+                       break;
+               case VCAP_FIELD_U32:
+                       vcap_copy_limited_keyfield((u8 *)&field->data.u32.value,
+                                                  (u8 *)&field->data.u32.mask,
+                                                  value, mask,
+                                                  width, field_size);
+                       break;
+               case VCAP_FIELD_U48:
+                       vcap_copy_from_w32be(field->data.u48.value, value,
+                                            field_size, width);
+                       vcap_copy_from_w32be(field->data.u48.mask,  mask,
+                                            field_size, width);
+                       break;
+               case VCAP_FIELD_U56:
+                       vcap_copy_from_w32be(field->data.u56.value, value,
+                                            field_size, width);
+                       vcap_copy_from_w32be(field->data.u56.mask,  mask,
+                                            field_size, width);
+                       break;
+               case VCAP_FIELD_U64:
+                       vcap_copy_from_w32be(field->data.u64.value, value,
+                                            field_size, width);
+                       vcap_copy_from_w32be(field->data.u64.mask,  mask,
+                                            field_size, width);
+                       break;
+               case VCAP_FIELD_U72:
+                       vcap_copy_from_w32be(field->data.u72.value, value,
+                                            field_size, width);
+                       vcap_copy_from_w32be(field->data.u72.mask,  mask,
+                                            field_size, width);
+                       break;
+               case VCAP_FIELD_U112:
+                       vcap_copy_from_w32be(field->data.u112.value, value,
+                                            field_size, width);
+                       vcap_copy_from_w32be(field->data.u112.mask,  mask,
+                                            field_size, width);
+                       break;
+               case VCAP_FIELD_U128:
+                       vcap_copy_from_w32be(field->data.u128.value, value,
+                                            field_size, width);
+                       vcap_copy_from_w32be(field->data.u128.mask,  mask,
+                                            field_size, width);
+                       break;
+               }
+       } else {
+               switch (field->ctrl.type) {
+               case VCAP_FIELD_BIT:
+                       vcap_copy_key_bit_field(&field->data.u1, value, mask);
+                       break;
+               case VCAP_FIELD_U32:
+                       vcap_copy_limited_keyfield((u8 *)&field->data.u32.value,
+                                                  (u8 *)&field->data.u32.mask,
+                                                  value, mask,
+                                                  width, field_size);
+                       break;
+               case VCAP_FIELD_U48:
+                       vcap_copy_limited_keyfield(field->data.u48.value,
+                                                  field->data.u48.mask,
+                                                  value, mask,
+                                                  width, field_size);
+                       break;
+               case VCAP_FIELD_U56:
+                       vcap_copy_limited_keyfield(field->data.u56.value,
+                                                  field->data.u56.mask,
+                                                  value, mask,
+                                                  width, field_size);
+                       break;
+               case VCAP_FIELD_U64:
+                       vcap_copy_limited_keyfield(field->data.u64.value,
+                                                  field->data.u64.mask,
+                                                  value, mask,
+                                                  width, field_size);
+                       break;
+               case VCAP_FIELD_U72:
+                       vcap_copy_limited_keyfield(field->data.u72.value,
+                                                  field->data.u72.mask,
+                                                  value, mask,
+                                                  width, field_size);
+                       break;
+               case VCAP_FIELD_U112:
+                       vcap_copy_limited_keyfield(field->data.u112.value,
+                                                  field->data.u112.mask,
+                                                  value, mask,
+                                                  width, field_size);
+                       break;
+               case VCAP_FIELD_U128:
+                       vcap_copy_limited_keyfield(field->data.u128.value,
+                                                  field->data.u128.mask,
+                                                  value, mask,
+                                                  width, field_size);
+                       break;
+               }
+       }
+}
+
+static void vcap_rule_alloc_keyfield(struct vcap_rule_internal *ri,
+                                    const struct vcap_field *keyfield,
+                                    enum vcap_key_field key,
+                                    u8 *value, u8 *mask)
+{
+       struct vcap_client_keyfield *field;
+
+       field = kzalloc(sizeof(*field), GFP_KERNEL);
+       if (!field)
+               return;
+       INIT_LIST_HEAD(&field->ctrl.list);
+       field->ctrl.key = key;
+       field->ctrl.type = keyfield->type;
+       vcap_copy_to_client_keyfield(ri, field, value, mask, keyfield->width);
+       list_add_tail(&field->ctrl.list, &ri->data.keyfields);
+}
+
+/* Verify that the actionset is valid for this VCAP instance and that
+ * the stream matches it
+ */
+static bool
+vcap_verify_actionstream_actionset(struct vcap_control *vctrl,
+                                  enum vcap_type vt,
+                                  u32 *actionstream,
+                                  enum vcap_actionfield_set actionset)
+{
+       const struct vcap_typegroup *tgt;
+       const struct vcap_field *fields;
+       const struct vcap_set *info;
+
+       if (vcap_actionfield_count(vctrl, vt, actionset) == 0)
+               return false;
+
+       info = vcap_actionfieldset(vctrl, vt, actionset);
+       /* Check that the actionset is valid */
+       if (!info)
+               return false;
+
+       /* a type_id of value -1 means that there is no type field */
+       if (info->type_id == (u8)-1)
+               return true;
+
+       /* Get a valid typegroup for the specific actionset */
+       tgt = vcap_actionfield_typegroup(vctrl, vt, actionset);
+       if (!tgt)
+               return false;
+
+       fields = vcap_actionfields(vctrl, vt, actionset);
+       if (!fields)
+               return false;
+
+       /* Later this will be expanded with a check of the type id */
+       return true;
+}
+
+/* Find the subword width of the action typegroup that matches the stream data
+ */
+static int vcap_find_actionstream_typegroup_sw(struct vcap_control *vctrl,
+                                              enum vcap_type vt, u32 *stream,
+                                              int sw_max)
+{
+       const struct vcap_typegroup **tgt;
+       int sw_idx, res;
+
+       tgt = vctrl->vcaps[vt].actionfield_set_typegroups;
+       /* Try the longest subword match first */
+       for (sw_idx = vctrl->vcaps[vt].sw_count; sw_idx >= 0; sw_idx--) {
+               if (!tgt[sw_idx])
+                       continue;
+               res = vcap_verify_typegroups(stream, vctrl->vcaps[vt].act_width,
+                                            tgt[sw_idx], false, sw_max);
+               if (res == 0)
+                       return sw_idx;
+       }
+       return -EINVAL;
+}
+
+/* Verify that the typegroup information, subword count, actionset and type id
+ * are in sync and correct, return the actionset
+ */
+static enum vcap_actionfield_set
+vcap_find_actionstream_actionset(struct vcap_control *vctrl,
+                                enum vcap_type vt,
+                                u32 *stream,
+                                int sw_max)
+{
+       const struct vcap_set *actionfield_set;
+       int sw_count, idx;
+       bool res;
+
+       sw_count = vcap_find_actionstream_typegroup_sw(vctrl, vt, stream,
+                                                      sw_max);
+       if (sw_count < 0)
+               return sw_count;
+
+       actionfield_set = vctrl->vcaps[vt].actionfield_set;
+       for (idx = 0; idx < vctrl->vcaps[vt].actionfield_set_size; ++idx) {
+               if (actionfield_set[idx].sw_per_item != sw_count)
+                       continue;
+
+               res = vcap_verify_actionstream_actionset(vctrl, vt,
+                                                        stream, idx);
+               if (res)
+                       return idx;
+       }
+       return -EINVAL;
+}
+
+/* Store action value in an element in a list for the client */
+static void vcap_rule_alloc_actionfield(struct vcap_rule_internal *ri,
+                                       const struct vcap_field *actionfield,
+                                       enum vcap_action_field action,
+                                       u8 *value)
+{
+       struct vcap_client_actionfield *field;
+
+       field = kzalloc(sizeof(*field), GFP_KERNEL);
+       if (!field)
+               return;
+       INIT_LIST_HEAD(&field->ctrl.list);
+       field->ctrl.action = action;
+       field->ctrl.type = actionfield->type;
+       vcap_copy_to_client_actionfield(ri, field, value, actionfield->width);
+       list_add_tail(&field->ctrl.list, &ri->data.actionfields);
+}
+
+static int vcap_decode_actionset(struct vcap_rule_internal *ri)
+{
+       struct vcap_control *vctrl = ri->vctrl;
+       struct vcap_admin *admin = ri->admin;
+       const struct vcap_field *actionfield;
+       enum vcap_actionfield_set actionset;
+       enum vcap_type vt = admin->vtype;
+       const struct vcap_typegroup *tgt;
+       struct vcap_stream_iter iter;
+       int idx, res, actfield_count;
+       u32 *actstream;
+       u8 value[16];
+
+       actstream = admin->cache.actionstream;
+       res = vcap_find_actionstream_actionset(vctrl, vt, actstream, 0);
+       if (res < 0) {
+               pr_err("%s:%d: could not find valid actionset: %d\n",
+                      __func__, __LINE__, res);
+               return -EINVAL;
+       }
+       actionset = res;
+       actfield_count = vcap_actionfield_count(vctrl, vt, actionset);
+       actionfield = vcap_actionfields(vctrl, vt, actionset);
+       tgt = vcap_actionfield_typegroup(vctrl, vt, actionset);
+       /* Start decoding the stream */
+       for (idx = 0; idx < actfield_count; ++idx) {
+               if (actionfield[idx].width <= 0)
+                       continue;
+               /* Get the action */
+               memset(value, 0, DIV_ROUND_UP(actionfield[idx].width, 8));
+               vcap_iter_init(&iter, vctrl->vcaps[vt].act_width, tgt,
+                              actionfield[idx].offset);
+               vcap_decode_field(actstream, &iter, actionfield[idx].width,
+                                 value);
+               /* Skip if no bits are set */
+               if (vcap_bitarray_zero(actionfield[idx].width, value))
+                       continue;
+               vcap_rule_alloc_actionfield(ri, &actionfield[idx], idx, value);
+               /* Later the action id will also be checked */
+       }
+       return vcap_set_rule_set_actionset((struct vcap_rule *)ri, actionset);
+}
+
+static int vcap_decode_keyset(struct vcap_rule_internal *ri)
+{
+       struct vcap_control *vctrl = ri->vctrl;
+       struct vcap_stream_iter kiter, miter;
+       struct vcap_admin *admin = ri->admin;
+       enum vcap_keyfield_set keysets[10];
+       const struct vcap_field *keyfield;
+       enum vcap_type vt = admin->vtype;
+       const struct vcap_typegroup *tgt;
+       struct vcap_keyset_list matches;
+       enum vcap_keyfield_set keyset;
+       int idx, res, keyfield_count;
+       u32 *maskstream;
+       u32 *keystream;
+       u8 value[16];
+       u8 mask[16];
+
+       keystream = admin->cache.keystream;
+       maskstream = admin->cache.maskstream;
+       matches.keysets = keysets;
+       matches.cnt = 0;
+       matches.max = ARRAY_SIZE(keysets);
+       res = vcap_find_keystream_keysets(vctrl, vt, keystream, maskstream,
+                                         false, 0, &matches);
+       if (res < 0) {
+               pr_err("%s:%d: could not find valid keysets: %d\n",
+                      __func__, __LINE__, res);
+               return -EINVAL;
+       }
+       keyset = matches.keysets[0];
+       keyfield_count = vcap_keyfield_count(vctrl, vt, keyset);
+       keyfield = vcap_keyfields(vctrl, vt, keyset);
+       tgt = vcap_keyfield_typegroup(vctrl, vt, keyset);
+       /* Start decoding the streams */
+       for (idx = 0; idx < keyfield_count; ++idx) {
+               if (keyfield[idx].width <= 0)
+                       continue;
+               /* First get the mask */
+               memset(mask, 0, DIV_ROUND_UP(keyfield[idx].width, 8));
+               vcap_iter_init(&miter, vctrl->vcaps[vt].sw_width, tgt,
+                              keyfield[idx].offset);
+               vcap_decode_field(maskstream, &miter, keyfield[idx].width,
+                                 mask);
+               /* Skip if no mask bits are set */
+               if (vcap_bitarray_zero(keyfield[idx].width, mask))
+                       continue;
+               /* Get the key */
+               memset(value, 0, DIV_ROUND_UP(keyfield[idx].width, 8));
+               vcap_iter_init(&kiter, vctrl->vcaps[vt].sw_width, tgt,
+                              keyfield[idx].offset);
+               vcap_decode_field(keystream, &kiter, keyfield[idx].width,
+                                 value);
+               vcap_rule_alloc_keyfield(ri, &keyfield[idx], idx, value, mask);
+       }
+       return vcap_set_rule_set_keyset((struct vcap_rule *)ri, keyset);
+}
+
+/* Read VCAP content into the VCAP cache */
+static int vcap_read_rule(struct vcap_rule_internal *ri)
+{
+       struct vcap_admin *admin = ri->admin;
+       int sw_idx, ent_idx = 0, act_idx = 0;
+       u32 addr = ri->addr;
+
+       if (!ri->size || !ri->keyset_sw_regs || !ri->actionset_sw_regs) {
+               pr_err("%s:%d: rule is empty\n", __func__, __LINE__);
+               return -EINVAL;
+       }
+       vcap_erase_cache(ri);
+       /* Use the values in the streams to read the VCAP cache */
+       for (sw_idx = 0; sw_idx < ri->size; sw_idx++, addr++) {
+               ri->vctrl->ops->update(ri->ndev, admin, VCAP_CMD_READ,
+                                      VCAP_SEL_ALL, addr);
+               ri->vctrl->ops->cache_read(ri->ndev, admin,
+                                          VCAP_SEL_ENTRY, ent_idx,
+                                          ri->keyset_sw_regs);
+               ri->vctrl->ops->cache_read(ri->ndev, admin,
+                                          VCAP_SEL_ACTION, act_idx,
+                                          ri->actionset_sw_regs);
+               if (sw_idx == 0)
+                       ri->vctrl->ops->cache_read(ri->ndev, admin,
+                                                  VCAP_SEL_COUNTER,
+                                                  ri->counter_id, 0);
+               ent_idx += ri->keyset_sw_regs;
+               act_idx += ri->actionset_sw_regs;
+       }
+       return 0;
+}
+
 /* Write VCAP cache content to the VCAP HW instance */
 static int vcap_write_rule(struct vcap_rule_internal *ri)
 {
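The byte reordering in vcap_copy_from_w32be() above is the subtle part of the decode path: the cache stores a field as 32-bit big-endian words, with the trailing partial word packed against the end of the field. A worked example for a 48-bit field, derived from the index arithmetic in the code:

/* size = 6, so tail_bytes = (((6 + 4) >> 2) << 2) - 6 = 2:
 *
 *   src: s0 s1 s2 s3 s4 s5   (stream order)
 *   dst: s2 s3 s4 s5 s0 s1   (client order)
 *
 * The source is walked back to front (ridx) while the destination
 * index is byte-swapped within each 32-bit word; indexes that would
 * land past the end of the field are pulled back by tail_bytes.
 * vcap_apply_width() then clears any bits beyond the field width.
 */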
@@ -1183,6 +1915,82 @@ void vcap_free_rule(struct vcap_rule *rule)
 }
 EXPORT_SYMBOL_GPL(vcap_free_rule);
 
+struct vcap_rule *vcap_get_rule(struct vcap_control *vctrl, u32 id)
+{
+       struct vcap_rule_internal *elem;
+       struct vcap_rule_internal *ri;
+       int err;
+
+       ri = NULL;
+
+       err = vcap_api_check(vctrl);
+       if (err)
+               return ERR_PTR(err);
+       elem = vcap_lookup_rule(vctrl, id);
+       if (!elem)
+               return NULL;
+       mutex_lock(&elem->admin->lock);
+       ri = vcap_dup_rule(elem);
+       if (IS_ERR(ri))
+               goto unlock;
+       err = vcap_read_rule(ri);
+       if (err) {
+               ri = ERR_PTR(err);
+               goto unlock;
+       }
+       err = vcap_decode_keyset(ri);
+       if (err) {
+               ri = ERR_PTR(err);
+               goto unlock;
+       }
+       err = vcap_decode_actionset(ri);
+       if (err) {
+               ri = ERR_PTR(err);
+               goto unlock;
+       }
+
+unlock:
+       mutex_unlock(&elem->admin->lock);
+       return (struct vcap_rule *)ri;
+}
+EXPORT_SYMBOL_GPL(vcap_get_rule);
+
+/* Update existing rule */
+int vcap_mod_rule(struct vcap_rule *rule)
+{
+       struct vcap_rule_internal *ri = to_intrule(rule);
+       struct vcap_counter ctr;
+       int err;
+
+       err = vcap_api_check(ri->vctrl);
+       if (err)
+               return err;
+
+       if (!vcap_lookup_rule(ri->vctrl, ri->data.id))
+               return -ENOENT;
+
+       mutex_lock(&ri->admin->lock);
+       /* Encode the bitstreams to the VCAP cache */
+       vcap_erase_cache(ri);
+       err = vcap_encode_rule(ri);
+       if (err)
+               goto out;
+
+       err = vcap_write_rule(ri);
+       if (err)
+               goto out;
+
+       memset(&ctr, 0, sizeof(ctr));
+       err = vcap_write_counter(ri, &ctr);
+       if (err)
+               goto out;
+
+out:
+       mutex_unlock(&ri->admin->lock);
+       return err;
+}
+EXPORT_SYMBOL_GPL(vcap_mod_rule);
+
 /* Return the alignment offset for a new rule address */
 static int vcap_valid_rule_move(struct vcap_rule_internal *el, int offset)
 {
@@ -1389,7 +2197,7 @@ static void vcap_copy_from_client_keyfield(struct vcap_rule *rule,
                vcap_copy_to_w32be(field->data.u128.value, data->u128.value, size);
                vcap_copy_to_w32be(field->data.u128.mask,  data->u128.mask, size);
                break;
-       };
+       }
 }
 
 /* Check if the keyfield is already in the rule */
@@ -1530,6 +2338,22 @@ int vcap_rule_add_key_u128(struct vcap_rule *rule, enum vcap_key_field key,
 }
 EXPORT_SYMBOL_GPL(vcap_rule_add_key_u128);
 
+int vcap_rule_get_key_u32(struct vcap_rule *rule, enum vcap_key_field key,
+                         u32 *value, u32 *mask)
+{
+       struct vcap_client_keyfield *ckf;
+
+       ckf = vcap_find_keyfield(rule, key);
+       if (!ckf)
+               return -ENOENT;
+
+       *value = ckf->data.u32.value;
+       *mask = ckf->data.u32.mask;
+
+       return 0;
+}
+EXPORT_SYMBOL_GPL(vcap_rule_get_key_u32);
+
 /* Find a client action field in a rule */
 static struct vcap_client_actionfield *
 vcap_find_actionfield(struct vcap_rule *rule, enum vcap_action_field act)
@@ -1579,7 +2403,7 @@ static void vcap_copy_from_client_actionfield(struct vcap_rule *rule,
        case VCAP_FIELD_U128:
                vcap_copy_to_w32be(field->data.u128.value, data->u128.value, size);
                break;
-       };
+       }
 }
 
 /* Check if the actionfield is already in the rule */
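The intended calling pattern for vcap_get_rule()/vcap_mod_rule() is the one used by the lan966x PTP traps above: fetch a private copy of the rule, adjust it, write it back, and always free the copy. A condensed sketch (the rule id and the key being tweaked are placeholders):

static int example_retarget_rule(struct vcap_control *vctrl, u32 rule_id)
{
	struct vcap_rule *vrule;
	u32 value, mask;
	int err;

	vrule = vcap_get_rule(vctrl, rule_id);
	if (!vrule)
		return -ENOENT;		/* no rule with this id */
	if (IS_ERR(vrule))
		return PTR_ERR(vrule);

	err = vcap_rule_get_key_u32(vrule, VCAP_KF_IF_IGR_PORT_MASK,
				    &value, &mask);
	if (err)
		goto out;

	err = vcap_rule_mod_key_u32(vrule, VCAP_KF_IF_IGR_PORT_MASK,
				    value, mask & ~BIT(0));
	if (err)
		goto out;

	err = vcap_mod_rule(vrule);	/* re-encode and write back */
out:
	vcap_free_rule(vrule);		/* always free the local copy */
	return err;
}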
index 93a0fcb..0319866 100644 (file)
@@ -170,6 +170,10 @@ int vcap_add_rule(struct vcap_rule *rule);
 int vcap_del_rule(struct vcap_control *vctrl, struct net_device *ndev, u32 id);
 /* Make a full copy of an existing rule with a new rule id */
 struct vcap_rule *vcap_copy_rule(struct vcap_rule *rule);
+/* Get rule from a VCAP instance */
+struct vcap_rule *vcap_get_rule(struct vcap_control *vctrl, u32 id);
+/* Update existing rule */
+int vcap_mod_rule(struct vcap_rule *rule);
 
 /* Update the keyset for the rule */
 int vcap_set_rule_set_keyset(struct vcap_rule *rule,
@@ -254,4 +258,8 @@ int vcap_rule_mod_action_u32(struct vcap_rule *rule,
                             enum vcap_action_field action,
                             u32 value);
 
+/* Get a 32 bit key field value and mask from the rule */
+int vcap_rule_get_key_u32(struct vcap_rule *rule, enum vcap_key_field key,
+                         u32 *value, u32 *mask);
+
 #endif /* __VCAP_API_CLIENT__ */
index 3b8d165..895bfff 100644 (file)
@@ -18,355 +18,15 @@ struct vcap_port_debugfs_info {
        struct net_device *ndev;
 };
 
-static bool vcap_bitarray_zero(int width, u8 *value)
-{
-       int bytes = DIV_ROUND_UP(width, BITS_PER_BYTE);
-       u8 total = 0, bmask = 0xff;
-       int rwidth = width;
-       int idx;
-
-       for (idx = 0; idx < bytes; ++idx, rwidth -= BITS_PER_BYTE) {
-               if (rwidth && rwidth < BITS_PER_BYTE)
-                       bmask = (1 << rwidth) - 1;
-               total += value[idx] & bmask;
-       }
-       return total == 0;
-}
-
-static bool vcap_get_bit(u32 *stream, struct vcap_stream_iter *itr)
-{
-       u32 mask = BIT(itr->reg_bitpos);
-       u32 *p = &stream[itr->reg_idx];
-
-       return !!(*p & mask);
-}
-
-static void vcap_decode_field(u32 *stream, struct vcap_stream_iter *itr,
-                             int width, u8 *value)
-{
-       int idx;
-
-       /* Loop over the field value bits and get the field bits and
-        * set them in the output value byte array
-        */
-       for (idx = 0; idx < width; idx++) {
-               u8 bidx = idx & 0x7;
-
-               /* Decode one field value bit */
-               if (vcap_get_bit(stream, itr))
-                       *value |= 1 << bidx;
-               vcap_iter_next(itr);
-               if (bidx == 7)
-                       value++;
-       }
-}
-
-/* Verify that the typegroup bits have the correct values */
-static int vcap_verify_typegroups(u32 *stream, int sw_width,
-                                 const struct vcap_typegroup *tgt, bool mask,
-                                 int sw_max)
-{
-       struct vcap_stream_iter iter;
-       int sw_cnt, idx;
-
-       vcap_iter_set(&iter, sw_width, tgt, 0);
-       sw_cnt = 0;
-       while (iter.tg->width) {
-               u32 value = 0;
-               u32 tg_value = iter.tg->value;
-
-               if (mask)
-                       tg_value = (1 << iter.tg->width) - 1;
-               /* Set position to current typegroup bit */
-               iter.offset = iter.tg->offset;
-               vcap_iter_update(&iter);
-               for (idx = 0; idx < iter.tg->width; idx++) {
-                       /* Decode one typegroup bit */
-                       if (vcap_get_bit(stream, &iter))
-                               value |= 1 << idx;
-                       iter.offset++;
-                       vcap_iter_update(&iter);
-               }
-               if (value != tg_value)
-                       return -EINVAL;
-               iter.tg++; /* next typegroup */
-               sw_cnt++;
-               /* Stop checking more typegroups */
-               if (sw_max && sw_cnt >= sw_max)
-                       break;
-       }
-       return 0;
-}
-
-/* Find the subword width of the key typegroup that matches the stream data */
-static int vcap_find_keystream_typegroup_sw(struct vcap_control *vctrl,
-                                           enum vcap_type vt, u32 *stream,
-                                           bool mask, int sw_max)
-{
-       const struct vcap_typegroup **tgt;
-       int sw_idx, res;
-
-       tgt = vctrl->vcaps[vt].keyfield_set_typegroups;
-       /* Try the longest subword match first */
-       for (sw_idx = vctrl->vcaps[vt].sw_count; sw_idx >= 0; sw_idx--) {
-               if (!tgt[sw_idx])
-                       continue;
-
-               res = vcap_verify_typegroups(stream, vctrl->vcaps[vt].sw_width,
-                                            tgt[sw_idx], mask, sw_max);
-               if (res == 0)
-                       return sw_idx;
-       }
-       return -EINVAL;
-}
-
-/* Find the subword width of the action typegroup that matches the stream data
- */
-static int vcap_find_actionstream_typegroup_sw(struct vcap_control *vctrl,
-                                              enum vcap_type vt, u32 *stream,
-                                              int sw_max)
-{
-       const struct vcap_typegroup **tgt;
-       int sw_idx, res;
-
-       tgt = vctrl->vcaps[vt].actionfield_set_typegroups;
-       /* Try the longest subword match first */
-       for (sw_idx = vctrl->vcaps[vt].sw_count; sw_idx >= 0; sw_idx--) {
-               if (!tgt[sw_idx])
-                       continue;
-               res = vcap_verify_typegroups(stream, vctrl->vcaps[vt].act_width,
-                                            tgt[sw_idx], false, sw_max);
-               if (res == 0)
-                       return sw_idx;
-       }
-       return -EINVAL;
-}
-
-/* Verify that the type id in the stream matches the type id of the keyset */
-static bool vcap_verify_keystream_keyset(struct vcap_control *vctrl,
-                                        enum vcap_type vt,
-                                        u32 *keystream,
-                                        u32 *mskstream,
-                                        enum vcap_keyfield_set keyset)
-{
-       const struct vcap_info *vcap = &vctrl->vcaps[vt];
-       const struct vcap_field *typefld;
-       const struct vcap_typegroup *tgt;
-       const struct vcap_field *fields;
-       struct vcap_stream_iter iter;
-       const struct vcap_set *info;
-       u32 value = 0;
-       u32 mask = 0;
-
-       if (vcap_keyfield_count(vctrl, vt, keyset) == 0)
-               return false;
-
-       info = vcap_keyfieldset(vctrl, vt, keyset);
-       /* Check that the keyset is valid */
-       if (!info)
-               return false;
-
-       /* a type_id of value -1 means that there is no type field */
-       if (info->type_id == (u8)-1)
-               return true;
-
-       /* Get a valid typegroup for the specific keyset */
-       tgt = vcap_keyfield_typegroup(vctrl, vt, keyset);
-       if (!tgt)
-               return false;
-
-       fields = vcap_keyfields(vctrl, vt, keyset);
-       if (!fields)
-               return false;
-
-       typefld = &fields[VCAP_KF_TYPE];
-       vcap_iter_init(&iter, vcap->sw_width, tgt, typefld->offset);
-       vcap_decode_field(mskstream, &iter, typefld->width, (u8 *)&mask);
-       /* no type info if there are no mask bits */
-       if (vcap_bitarray_zero(typefld->width, (u8 *)&mask))
-               return false;
-
-       /* Get the value of the type field in the stream and compare to the
-        * one define in the vcap keyset
-        */
-       vcap_iter_init(&iter, vcap->sw_width, tgt, typefld->offset);
-       vcap_decode_field(keystream, &iter, typefld->width, (u8 *)&value);
-
-       return (value & mask) == (info->type_id & mask);
-}
-
-/* Verify that the typegroup information, subword count, keyset and type id
- * are in sync and correct, return the list of matching keysets
- */
-static int
-vcap_find_keystream_keysets(struct vcap_control *vctrl,
-                           enum vcap_type vt,
-                           u32 *keystream,
-                           u32 *mskstream,
-                           bool mask, int sw_max,
-                           struct vcap_keyset_list *kslist)
-{
-       const struct vcap_set *keyfield_set;
-       int sw_count, idx;
-
-       sw_count = vcap_find_keystream_typegroup_sw(vctrl, vt, keystream, mask,
-                                                   sw_max);
-       if (sw_count < 0)
-               return sw_count;
-
-       keyfield_set = vctrl->vcaps[vt].keyfield_set;
-       for (idx = 0; idx < vctrl->vcaps[vt].keyfield_set_size; ++idx) {
-               if (keyfield_set[idx].sw_per_item != sw_count)
-                       continue;
-
-               if (vcap_verify_keystream_keyset(vctrl, vt, keystream,
-                                                mskstream, idx))
-                       vcap_keyset_list_add(kslist, idx);
-       }
-       if (kslist->cnt > 0)
-               return 0;
-       return -EINVAL;
-}
-
-/* Read key data from a VCAP address and discover if there is a rule keyset
- * here
- */
-static bool
-vcap_verify_actionstream_actionset(struct vcap_control *vctrl,
-                                  enum vcap_type vt,
-                                  u32 *actionstream,
-                                  enum vcap_actionfield_set actionset)
-{
-       const struct vcap_typegroup *tgt;
-       const struct vcap_field *fields;
-       const struct vcap_set *info;
-
-       if (vcap_actionfield_count(vctrl, vt, actionset) == 0)
-               return false;
-
-       info = vcap_actionfieldset(vctrl, vt, actionset);
-       /* Check that the actionset is valid */
-       if (!info)
-               return false;
-
-       /* a type_id of value -1 means that there is no type field */
-       if (info->type_id == (u8)-1)
-               return true;
-
-       /* Get a valid typegroup for the specific actionset */
-       tgt = vcap_actionfield_typegroup(vctrl, vt, actionset);
-       if (!tgt)
-               return false;
-
-       fields = vcap_actionfields(vctrl, vt, actionset);
-       if (!fields)
-               return false;
-
-       /* Later this will be expanded with a check of the type id */
-       return true;
-}
-
-/* Verify that the typegroup information, subword count, actionset and type id
- * are in sync and correct, return the actionset
- */
-static enum vcap_actionfield_set
-vcap_find_actionstream_actionset(struct vcap_control *vctrl,
-                                enum vcap_type vt,
-                                u32 *stream,
-                                int sw_max)
-{
-       const struct vcap_set *actionfield_set;
-       int sw_count, idx;
-       bool res;
-
-       sw_count = vcap_find_actionstream_typegroup_sw(vctrl, vt, stream,
-                                                      sw_max);
-       if (sw_count < 0)
-               return sw_count;
-
-       actionfield_set = vctrl->vcaps[vt].actionfield_set;
-       for (idx = 0; idx < vctrl->vcaps[vt].actionfield_set_size; ++idx) {
-               if (actionfield_set[idx].sw_per_item != sw_count)
-                       continue;
-
-               res = vcap_verify_actionstream_actionset(vctrl, vt,
-                                                        stream, idx);
-               if (res)
-                       return idx;
-       }
-       return -EINVAL;
-}
-
-/* Read key data from a VCAP address and discover if there are any rule keysets
- * here
- */
-static int vcap_addr_keysets(struct vcap_control *vctrl,
-                            struct net_device *ndev,
-                            struct vcap_admin *admin,
-                            int addr,
-                            struct vcap_keyset_list *kslist)
-{
-       enum vcap_type vt = admin->vtype;
-       int keyset_sw_regs, idx;
-       u32 key = 0, mask = 0;
-
-       /* Read the cache at the specified address */
-       keyset_sw_regs = DIV_ROUND_UP(vctrl->vcaps[vt].sw_width, 32);
-       vctrl->ops->update(ndev, admin, VCAP_CMD_READ, VCAP_SEL_ALL, addr);
-       vctrl->ops->cache_read(ndev, admin, VCAP_SEL_ENTRY, 0,
-                              keyset_sw_regs);
-       /* Skip uninitialized key/mask entries */
-       for (idx = 0; idx < keyset_sw_regs; ++idx) {
-               key |= ~admin->cache.keystream[idx];
-               mask |= admin->cache.maskstream[idx];
-       }
-       if (key == 0 && mask == 0)
-               return -EINVAL;
-       /* Decode and locate the keysets */
-       return vcap_find_keystream_keysets(vctrl, vt, admin->cache.keystream,
-                                          admin->cache.maskstream, false, 0,
-                                          kslist);
-}
-
-static int vcap_read_rule(struct vcap_rule_internal *ri)
-{
-       struct vcap_admin *admin = ri->admin;
-       int sw_idx, ent_idx = 0, act_idx = 0;
-       u32 addr = ri->addr;
-
-       if (!ri->size || !ri->keyset_sw_regs || !ri->actionset_sw_regs) {
-               pr_err("%s:%d: rule is empty\n", __func__, __LINE__);
-               return -EINVAL;
-       }
-       vcap_erase_cache(ri);
-       /* Use the values in the streams to read the VCAP cache */
-       for (sw_idx = 0; sw_idx < ri->size; sw_idx++, addr++) {
-               ri->vctrl->ops->update(ri->ndev, admin, VCAP_CMD_READ,
-                                      VCAP_SEL_ALL, addr);
-               ri->vctrl->ops->cache_read(ri->ndev, admin,
-                                          VCAP_SEL_ENTRY, ent_idx,
-                                          ri->keyset_sw_regs);
-               ri->vctrl->ops->cache_read(ri->ndev, admin,
-                                          VCAP_SEL_ACTION, act_idx,
-                                          ri->actionset_sw_regs);
-               if (sw_idx == 0)
-                       ri->vctrl->ops->cache_read(ri->ndev, admin,
-                                                  VCAP_SEL_COUNTER,
-                                                  ri->counter_id, 0);
-               ent_idx += ri->keyset_sw_regs;
-               act_idx += ri->actionset_sw_regs;
-       }
-       return 0;
-}
-
 /* Dump the keyfields value and mask values */
 static void vcap_debugfs_show_rule_keyfield(struct vcap_control *vctrl,
                                            struct vcap_output_print *out,
                                            enum vcap_key_field key,
                                            const struct vcap_field *keyfield,
-                                           u8 *value, u8 *mask)
+                                           struct vcap_client_keyfield_data *data)
 {
        bool hex = false;
+       u8 *value, *mask;
        int idx, bytes;
 
        out->prf(out->dst, "    %s: W%d: ", vcap_keyfield_name(vctrl, key),
@@ -374,40 +34,62 @@ static void vcap_debugfs_show_rule_keyfield(struct vcap_control *vctrl,
 
        switch (keyfield[key].type) {
        case VCAP_FIELD_BIT:
-               out->prf(out->dst, "%d/%d", value[0], mask[0]);
+               out->prf(out->dst, "%d/%d", data->u1.value, data->u1.mask);
                break;
        case VCAP_FIELD_U32:
+               value = (u8 *)(&data->u32.value);
+               mask = (u8 *)(&data->u32.mask);
+
                if (key == VCAP_KF_L3_IP4_SIP || key == VCAP_KF_L3_IP4_DIP) {
-                       out->prf(out->dst, "%pI4h/%pI4h", value, mask);
+                       out->prf(out->dst, "%pI4h/%pI4h", &data->u32.value,
+                                &data->u32.mask);
                } else if (key == VCAP_KF_ETYPE ||
                           key == VCAP_KF_IF_IGR_PORT_MASK) {
                        hex = true;
                } else {
                        u32 fmsk = (1 << keyfield[key].width) - 1;
-                       u32 val = *(u32 *)value;
-                       u32 msk = *(u32 *)mask;
 
-                       out->prf(out->dst, "%u/%u", val & fmsk, msk & fmsk);
+                       out->prf(out->dst, "%u/%u", data->u32.value & fmsk,
+                                data->u32.mask & fmsk);
                }
                break;
        case VCAP_FIELD_U48:
+               value = data->u48.value;
+               mask = data->u48.mask;
                if (key == VCAP_KF_L2_SMAC || key == VCAP_KF_L2_DMAC)
-                       out->prf(out->dst, "%pMR/%pMR", value, mask);
+                       out->prf(out->dst, "%pMR/%pMR", data->u48.value,
+                                data->u48.mask);
                else
                        hex = true;
                break;
        case VCAP_FIELD_U56:
+               value = data->u56.value;
+               mask = data->u56.mask;
+               hex = true;
+               break;
        case VCAP_FIELD_U64:
+               value = data->u64.value;
+               mask = data->u64.mask;
+               hex = true;
+               break;
        case VCAP_FIELD_U72:
+               value = data->u72.value;
+               mask = data->u72.mask;
+               hex = true;
+               break;
        case VCAP_FIELD_U112:
+               value = data->u112.value;
+               mask = data->u112.mask;
                hex = true;
                break;
        case VCAP_FIELD_U128:
                if (key == VCAP_KF_L3_IP6_SIP || key == VCAP_KF_L3_IP6_DIP) {
                        u8 nvalue[16], nmask[16];
 
-                       vcap_netbytes_copy(nvalue, value, sizeof(nvalue));
-                       vcap_netbytes_copy(nmask, mask, sizeof(nmask));
+                       vcap_netbytes_copy(nvalue, data->u128.value,
+                                          sizeof(nvalue));
+                       vcap_netbytes_copy(nmask, data->u128.mask,
+                                          sizeof(nmask));
                        out->prf(out->dst, "%pI6/%pI6", nvalue, nmask);
                } else {
                        hex = true;
@@ -472,19 +154,15 @@ static int vcap_debugfs_show_rule_keyset(struct vcap_rule_internal *ri,
                                         struct vcap_output_print *out)
 {
        struct vcap_control *vctrl = ri->vctrl;
-       struct vcap_stream_iter kiter, miter;
        struct vcap_admin *admin = ri->admin;
        enum vcap_keyfield_set keysets[10];
        const struct vcap_field *keyfield;
        enum vcap_type vt = admin->vtype;
-       const struct vcap_typegroup *tgt;
+       struct vcap_client_keyfield *ckf;
        struct vcap_keyset_list matches;
-       enum vcap_keyfield_set keyset;
-       int idx, res, keyfield_count;
        u32 *maskstream;
        u32 *keystream;
-       u8 value[16];
-       u8 mask[16];
+       int res;
 
        keystream = admin->cache.keystream;
        maskstream = admin->cache.maskstream;
@@ -498,39 +176,20 @@ static int vcap_debugfs_show_rule_keyset(struct vcap_rule_internal *ri,
                       __func__, __LINE__, res);
                return -EINVAL;
        }
-       keyset = matches.keysets[0];
        out->prf(out->dst, "  keysets:");
-       for (idx = 0; idx < matches.cnt; ++idx)
+       for (int idx = 0; idx < matches.cnt; ++idx)
                out->prf(out->dst, " %s",
                         vcap_keyset_name(vctrl, matches.keysets[idx]));
        out->prf(out->dst, "\n");
        out->prf(out->dst, "  keyset_sw: %d\n", ri->keyset_sw);
        out->prf(out->dst, "  keyset_sw_regs: %d\n", ri->keyset_sw_regs);
-       keyfield_count = vcap_keyfield_count(vctrl, vt, keyset);
-       keyfield = vcap_keyfields(vctrl, vt, keyset);
-       tgt = vcap_keyfield_typegroup(vctrl, vt, keyset);
-       /* Start decoding the streams */
-       for (idx = 0; idx < keyfield_count; ++idx) {
-               if (keyfield[idx].width <= 0)
-                       continue;
-               /* First get the mask */
-               memset(mask, 0, DIV_ROUND_UP(keyfield[idx].width, 8));
-               vcap_iter_init(&miter, vctrl->vcaps[vt].sw_width, tgt,
-                              keyfield[idx].offset);
-               vcap_decode_field(maskstream, &miter, keyfield[idx].width,
-                                 mask);
-               /* Skip if no mask bits are set */
-               if (vcap_bitarray_zero(keyfield[idx].width, mask))
-                       continue;
-               /* Get the key */
-               memset(value, 0, DIV_ROUND_UP(keyfield[idx].width, 8));
-               vcap_iter_init(&kiter, vctrl->vcaps[vt].sw_width, tgt,
-                              keyfield[idx].offset);
-               vcap_decode_field(keystream, &kiter, keyfield[idx].width,
-                                 value);
-               vcap_debugfs_show_rule_keyfield(vctrl, out, idx, keyfield,
-                                               value, mask);
+
+       list_for_each_entry(ckf, &ri->data.keyfields, ctrl.list) {
+               keyfield = vcap_keyfields(vctrl, admin->vtype, ri->data.keyset);
+               vcap_debugfs_show_rule_keyfield(vctrl, out, ckf->ctrl.key,
+                                               keyfield, &ckf->data);
        }
+
        return 0;
 }
 
@@ -540,48 +199,21 @@ static int vcap_debugfs_show_rule_actionset(struct vcap_rule_internal *ri,
        struct vcap_control *vctrl = ri->vctrl;
        struct vcap_admin *admin = ri->admin;
        const struct vcap_field *actionfield;
-       enum vcap_actionfield_set actionset;
-       enum vcap_type vt = admin->vtype;
-       const struct vcap_typegroup *tgt;
-       struct vcap_stream_iter iter;
-       int idx, res, actfield_count;
-       u32 *actstream;
-       u8 value[16];
-       bool no_bits;
-
-       actstream = admin->cache.actionstream;
-       res = vcap_find_actionstream_actionset(vctrl, vt, actstream, 0);
-       if (res < 0) {
-               pr_err("%s:%d: could not find valid actionset: %d\n",
-                      __func__, __LINE__, res);
-               return -EINVAL;
-       }
-       actionset = res;
+       struct vcap_client_actionfield *caf;
+
        out->prf(out->dst, "  actionset: %s\n",
                 vcap_actionset_name(vctrl, ri->data.actionset));
        out->prf(out->dst, "  actionset_sw: %d\n", ri->actionset_sw);
        out->prf(out->dst, "  actionset_sw_regs: %d\n", ri->actionset_sw_regs);
-       actfield_count = vcap_actionfield_count(vctrl, vt, actionset);
-       actionfield = vcap_actionfields(vctrl, vt, actionset);
-       tgt = vcap_actionfield_typegroup(vctrl, vt, actionset);
-       /* Start decoding the stream */
-       for (idx = 0; idx < actfield_count; ++idx) {
-               if (actionfield[idx].width <= 0)
-                       continue;
-               /* Get the action */
-               memset(value, 0, DIV_ROUND_UP(actionfield[idx].width, 8));
-               vcap_iter_init(&iter, vctrl->vcaps[vt].act_width, tgt,
-                              actionfield[idx].offset);
-               vcap_decode_field(actstream, &iter, actionfield[idx].width,
-                                 value);
-               /* Skip if no bits are set */
-               no_bits = vcap_bitarray_zero(actionfield[idx].width, value);
-               if (no_bits)
-                       continue;
-               /* Later the action id will also be checked */
-               vcap_debugfs_show_rule_actionfield(vctrl, out, idx, actionfield,
-                                                  value);
+
+       list_for_each_entry(caf, &ri->data.actionfields, ctrl.list) {
+               actionfield = vcap_actionfields(vctrl, admin->vtype,
+                                               ri->data.actionset);
+               vcap_debugfs_show_rule_actionfield(vctrl, out, caf->ctrl.action,
+                                                  actionfield,
+                                                  &caf->data.u1.value);
        }
+
        return 0;
 }
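
A note on the &caf->data.u1.value argument above: data is a union and u1 is its first member, so the address of u1.value is the address of the whole union; the callee then reads whichever member matches the field type. A minimal standalone sketch (stand-in types, not the real vcap structs):

    #include <assert.h>

    union actionfield_data {                /* stand-in, not the real struct */
            struct { unsigned char value; } u1;
            struct { unsigned int value, mask; } u32;
            unsigned char u48[6];
    };

    int main(void)
    {
            union actionfield_data d;

            /* guaranteed by C: the first member shares the union's address */
            assert((void *)&d == (void *)&d.u1.value);
            return 0;
    }
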
 
@@ -632,32 +264,22 @@ static int vcap_show_admin(struct vcap_control *vctrl,
                           struct vcap_admin *admin,
                           struct vcap_output_print *out)
 {
-       struct vcap_rule_internal *elem, *ri;
+       struct vcap_rule_internal *elem;
+       struct vcap_rule *vrule;
        int ret = 0;
 
        vcap_show_admin_info(vctrl, admin, out);
-       mutex_lock(&admin->lock);
        list_for_each_entry(elem, &admin->rules, list) {
-               ri = vcap_dup_rule(elem);
-               if (IS_ERR(ri)) {
-                       ret = PTR_ERR(ri);
-                       goto err_unlock;
+               vrule = vcap_get_rule(vctrl, elem->data.id);
+               if (IS_ERR_OR_NULL(vrule)) {
+                       /* PTR_ERR(NULL) is 0, so map a NULL rule to an error */
+                       ret = vrule ? PTR_ERR(vrule) : -ENOENT;
+                       break;
                }
-               /* Read data from VCAP */
-               ret = vcap_read_rule(ri);
-               if (ret)
-                       goto err_free_rule;
+
                out->prf(out->dst, "\n");
-               vcap_show_admin_rule(vctrl, admin, out, ri);
-               vcap_free_rule((struct vcap_rule *)ri);
+               vcap_show_admin_rule(vctrl, admin, out, to_intrule(vrule));
+               vcap_free_rule(vrule);
        }
-       mutex_unlock(&admin->lock);
-       return 0;
-
-err_free_rule:
-       vcap_free_rule((struct vcap_rule *)ri);
-err_unlock:
-       mutex_unlock(&admin->lock);
        return ret;
 }
 
index 9ac1b1d..4fd21da 100644 (file)
@@ -96,4 +96,18 @@ const char *vcap_actionset_name(struct vcap_control *vctrl,
 const char *vcap_actionfield_name(struct vcap_control *vctrl,
                                  enum vcap_action_field action);
 
+/* Read key data from a VCAP address and discover if there are any rule keysets
+ * here
+ */
+int vcap_addr_keysets(struct vcap_control *vctrl, struct net_device *ndev,
+                     struct vcap_admin *admin, int addr,
+                     struct vcap_keyset_list *kslist);
+
+/* Verify that the typegroup information, subword count, keyset and type id
+ * are in sync and correct, return the list of matching keysets
+ */
+int vcap_find_keystream_keysets(struct vcap_control *vctrl, enum vcap_type vt,
+                               u32 *keystream, u32 *mskstream, bool mask,
+                               int sw_max, struct vcap_keyset_list *kslist);
+
 #endif /* __VCAP_API_PRIVATE__ */
index ad1277a..2f6a048 100644 (file)
@@ -1363,10 +1363,11 @@ static void mana_poll_rx_cq(struct mana_cq *cq)
                xdp_do_flush();
 }
 
-static void mana_cq_handler(void *context, struct gdma_queue *gdma_queue)
+static int mana_cq_handler(void *context, struct gdma_queue *gdma_queue)
 {
        struct mana_cq *cq = context;
        u8 arm_bit;
+       int w;
 
        WARN_ON_ONCE(cq->gdma_cq != gdma_queue);
 
@@ -1375,26 +1376,31 @@ static void mana_cq_handler(void *context, struct gdma_queue *gdma_queue)
        else
                mana_poll_tx_cq(cq);
 
-       if (cq->work_done < cq->budget &&
-           napi_complete_done(&cq->napi, cq->work_done)) {
+       w = cq->work_done;
+
+       if (w < cq->budget &&
+           napi_complete_done(&cq->napi, w)) {
                arm_bit = SET_ARM_BIT;
        } else {
                arm_bit = 0;
        }
 
        mana_gd_ring_cq(gdma_queue, arm_bit);
+
+       return w;
 }
 
 static int mana_poll(struct napi_struct *napi, int budget)
 {
        struct mana_cq *cq = container_of(napi, struct mana_cq, napi);
+       int w;
 
        cq->work_done = 0;
        cq->budget = budget;
 
-       mana_cq_handler(cq, cq->gdma_cq);
+       w = mana_cq_handler(cq, cq->gdma_cq);
 
-       return min(cq->work_done, budget);
+       return min(w, budget);
 }
 
 static void mana_schedule_napi(void *context, struct gdma_queue *gdma_queue)
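
The mana change above snapshots cq->work_done into a local before napi_complete_done(): once NAPI is marked complete the poll can be rescheduled concurrently and cq->work_done may change underneath, so the return value must come from the snapshot. A minimal sketch of the safe pattern (hypothetical driver, example_* helpers are assumed, not MANA code):

    static int example_poll(struct napi_struct *napi, int budget)
    {
            struct example_queue *q = container_of(napi, struct example_queue, napi);
            int work = example_process_ring(q, budget);     /* assumed helper */

            /* complete with the snapshot, then never re-read shared state */
            if (work < budget && napi_complete_done(napi, work))
                    example_rearm_irq(q);                   /* assumed helper */

            return min(work, budget);
    }
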
index 9063e2e..8073d7a 100644 (file)
@@ -552,8 +552,7 @@ myri10ge_validate_firmware(struct myri10ge_priv *mgp,
        }
 
        /* save firmware version for ethtool */
-       strncpy(mgp->fw_version, hdr->version, sizeof(mgp->fw_version));
-       mgp->fw_version[sizeof(mgp->fw_version) - 1] = '\0';
+       strscpy(mgp->fw_version, hdr->version, sizeof(mgp->fw_version));
 
        sscanf(mgp->fw_version, "%d.%d.%d", &mgp->fw_ver_major,
               &mgp->fw_ver_minor, &mgp->fw_ver_tiny);
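
strscpy() always NUL-terminates (and returns -E2BIG on truncation), so the manual termination line is no longer needed. A small sketch of the difference, using a hypothetical 8-byte buffer and version string:

    char dst[8];

    /* strncpy() leaves dst unterminated when src fills the buffer */
    strncpy(dst, "1.4.57-long", sizeof(dst));
    dst[sizeof(dst) - 1] = '\0';                    /* manual fix-up needed */

    /* strscpy() truncates to "1.4.57-" and terminates in one step */
    if (strscpy(dst, "1.4.57-long", sizeof(dst)) < 0)
            pr_debug("version string truncated\n");
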
index 4247bca..aa8aba4 100644 (file)
@@ -503,7 +503,7 @@ nfp_ccm_mbox_msg_prepare(struct nfp_net *nn, struct sk_buff *skb,
        max_len = max(max_reply_size, round_up(skb->len, 4));
        if (max_len > mbox_max) {
                nn_dp_warn(&nn->dp,
-                          "message too big for tha mailbox: %u/%u vs %u\n",
+                          "message too big for the mailbox: %u/%u vs %u\n",
                           skb->len, max_reply_size, mbox_max);
                return -EMSGSIZE;
        }
index 3728870..4632268 100644 (file)
@@ -302,6 +302,11 @@ static int nfp_net_xfrm_add_state(struct xfrm_state *x)
                return -EINVAL;
        }
 
+       if (x->xso.type != XFRM_DEV_OFFLOAD_CRYPTO) {
+               nn_err(nn, "Unsupported xfrm offload type\n");
+               return -EINVAL;
+       }
+
        cfg->spi = ntohl(x->id.spi);
 
        /* Hash/Authentication */
index 2b427d8..ccacb6a 100644 (file)
@@ -282,7 +282,7 @@ netdev_tx_t nfp_nfdk_tx(struct sk_buff *skb, struct net_device *netdev)
        dma_len = skb_headlen(skb);
        if (skb_is_gso(skb))
                type = NFDK_DESC_TX_TYPE_TSO;
-       else if (!nr_frags && dma_len < NFDK_TX_MAX_DATA_PER_HEAD)
+       else if (!nr_frags && dma_len <= NFDK_TX_MAX_DATA_PER_HEAD)
                type = NFDK_DESC_TX_TYPE_SIMPLE;
        else
                type = NFDK_DESC_TX_TYPE_GATHER;
@@ -927,7 +927,7 @@ nfp_nfdk_tx_xdp_buf(struct nfp_net_dp *dp, struct nfp_net_rx_ring *rx_ring,
        dma_len = pkt_len;
        dma_addr = rxbuf->dma_addr + dma_off;
 
-       if (dma_len < NFDK_TX_MAX_DATA_PER_HEAD)
+       if (dma_len <= NFDK_TX_MAX_DATA_PER_HEAD)
                type = NFDK_DESC_TX_TYPE_SIMPLE;
        else
                type = NFDK_DESC_TX_TYPE_GATHER;
@@ -1325,7 +1325,7 @@ nfp_nfdk_ctrl_tx_one(struct nfp_net *nn, struct nfp_net_r_vector *r_vec,
        txbuf = &tx_ring->ktxbufs[wr_idx];
 
        dma_len = skb_headlen(skb);
-       if (dma_len < NFDK_TX_MAX_DATA_PER_HEAD)
+       if (dma_len <= NFDK_TX_MAX_DATA_PER_HEAD)
                type = NFDK_DESC_TX_TYPE_SIMPLE;
        else
                type = NFDK_DESC_TX_TYPE_GATHER;
index 6c83e47..da33f09 100644 (file)
@@ -88,6 +88,9 @@
 #define NFP_NET_FL_BATCH       16      /* Add freelist in this Batch size */
 #define NFP_NET_XDP_MAX_COMPLETE 2048  /* XDP bufs to reclaim in NAPI poll */
 
+/* MC definitions */
+#define NFP_NET_CFG_MAC_MC_MAX 1024    /* Maximum number of MC addresses per port */
+
 /* Offload definitions */
 #define NFP_NET_N_VXLAN_PORTS  (NFP_NET_CFG_VXLAN_SZ / sizeof(__be16))
 
@@ -476,6 +479,7 @@ struct nfp_stat_pair {
  * @rx_dma_off:                Offset at which DMA packets (for XDP headroom)
  * @rx_offset:         Offset in the RX buffers where packet data starts
  * @ctrl:              Local copy of the control register/word.
+ * @ctrl_w1:           Local copy of the control register/word1.
  * @fl_bufsz:          Currently configured size of the freelist buffers
  * @xdp_prog:          Installed XDP program
  * @tx_rings:          Array of pre-allocated TX ring structures
@@ -508,6 +512,7 @@ struct nfp_net_dp {
        u32 rx_dma_off;
 
        u32 ctrl;
+       u32 ctrl_w1;
        u32 fl_bufsz;
 
        struct bpf_prog *xdp_prog;
index 682a919..2314cf5 100644 (file)
@@ -1007,6 +1007,7 @@ static int nfp_net_set_config_and_enable(struct nfp_net *nn)
                new_ctrl |= NFP_NET_CFG_CTRL_RINGCFG;
 
        nn_writel(nn, NFP_NET_CFG_CTRL, new_ctrl);
+       nn_writel(nn, NFP_NET_CFG_CTRL_WORD1, nn->dp.ctrl_w1);
        err = nfp_net_reconfig(nn, update);
        if (err) {
                nfp_net_clear_config_and_disable(nn);
@@ -1333,18 +1334,59 @@ err_unlock:
        return err;
 }
 
+static int nfp_net_mc_cfg(struct net_device *netdev, const unsigned char *addr, const u32 cmd)
+{
+       struct nfp_net *nn = netdev_priv(netdev);
+       int ret;
+
+       ret = nfp_net_mbox_lock(nn, NFP_NET_CFG_MULTICAST_SZ);
+       if (ret)
+               return ret;
+
+       nn_writel(nn, nn->tlv_caps.mbox_off + NFP_NET_CFG_MULTICAST_MAC_HI,
+                 get_unaligned_be32(addr));
+       nn_writew(nn, nn->tlv_caps.mbox_off + NFP_NET_CFG_MULTICAST_MAC_LO,
+                 get_unaligned_be16(addr + 4));
+
+       return nfp_net_mbox_reconfig_and_unlock(nn, cmd);
+}
+
+static int nfp_net_mc_sync(struct net_device *netdev, const unsigned char *addr)
+{
+       struct nfp_net *nn = netdev_priv(netdev);
+
+       if (netdev_mc_count(netdev) > NFP_NET_CFG_MAC_MC_MAX) {
+               nn_err(nn, "Requested number of MC addresses (%d) exceeds maximum (%d).\n",
+                      netdev_mc_count(netdev), NFP_NET_CFG_MAC_MC_MAX);
+               return -EINVAL;
+       }
+
+       return nfp_net_mc_cfg(netdev, addr, NFP_NET_CFG_MBOX_CMD_MULTICAST_ADD);
+}
+
+static int nfp_net_mc_unsync(struct net_device *netdev, const unsigned char *addr)
+{
+       return nfp_net_mc_cfg(netdev, addr, NFP_NET_CFG_MBOX_CMD_MULTICAST_DEL);
+}
+
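For the mailbox writes above, the 6-byte multicast MAC is split big-endian: the first four bytes go to MAC_HI and the last two to MAC_LO. A sketch with a hypothetical address (not taken from the patch):

    const u8 addr[ETH_ALEN] = { 0x01, 0x00, 0x5e, 0x00, 0x00, 0xfb };
    u32 hi = get_unaligned_be32(addr);      /* 0x01005e00 -> MULTICAST_MAC_HI */
    u16 lo = get_unaligned_be16(addr + 4);  /* 0x00fb     -> MULTICAST_MAC_LO */
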
 static void nfp_net_set_rx_mode(struct net_device *netdev)
 {
        struct nfp_net *nn = netdev_priv(netdev);
-       u32 new_ctrl;
+       u32 new_ctrl, new_ctrl_w1;
 
        new_ctrl = nn->dp.ctrl;
+       new_ctrl_w1 = nn->dp.ctrl_w1;
 
        if (!netdev_mc_empty(netdev) || netdev->flags & IFF_ALLMULTI)
                new_ctrl |= nn->cap & NFP_NET_CFG_CTRL_L2MC;
        else
                new_ctrl &= ~NFP_NET_CFG_CTRL_L2MC;
 
+       if (netdev->flags & IFF_ALLMULTI)
+               new_ctrl_w1 &= ~NFP_NET_CFG_CTRL_MCAST_FILTER;
+       else
+               new_ctrl_w1 |= nn->cap_w1 & NFP_NET_CFG_CTRL_MCAST_FILTER;
+
        if (netdev->flags & IFF_PROMISC) {
                if (nn->cap & NFP_NET_CFG_CTRL_PROMISC)
                        new_ctrl |= NFP_NET_CFG_CTRL_PROMISC;
@@ -1354,13 +1396,21 @@ static void nfp_net_set_rx_mode(struct net_device *netdev)
                new_ctrl &= ~NFP_NET_CFG_CTRL_PROMISC;
        }
 
-       if (new_ctrl == nn->dp.ctrl)
+       if ((nn->cap_w1 & NFP_NET_CFG_CTRL_MCAST_FILTER) &&
+           __dev_mc_sync(netdev, nfp_net_mc_sync, nfp_net_mc_unsync))
+               netdev_err(netdev, "Sync mc address failed\n");
+
+       if (new_ctrl == nn->dp.ctrl && new_ctrl_w1 == nn->dp.ctrl_w1)
                return;
 
-       nn_writel(nn, NFP_NET_CFG_CTRL, new_ctrl);
+       if (new_ctrl != nn->dp.ctrl)
+               nn_writel(nn, NFP_NET_CFG_CTRL, new_ctrl);
+       if (new_ctrl_w1 != nn->dp.ctrl_w1)
+               nn_writel(nn, NFP_NET_CFG_CTRL_WORD1, new_ctrl_w1);
        nfp_net_reconfig_post(nn, NFP_NET_CFG_UPDATE_GEN);
 
        nn->dp.ctrl = new_ctrl;
+       nn->dp.ctrl_w1 = new_ctrl_w1;
 }
 
 static void nfp_net_rss_init_itbl(struct nfp_net *nn)
@@ -2092,7 +2142,7 @@ void nfp_net_info(struct nfp_net *nn)
                nn->fw_ver.extend, nn->fw_ver.class,
                nn->fw_ver.major, nn->fw_ver.minor,
                nn->max_mtu);
-       nn_info(nn, "CAP: %#x %s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s\n",
+       nn_info(nn, "CAP: %#x %s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s\n",
                nn->cap,
                nn->cap & NFP_NET_CFG_CTRL_PROMISC  ? "PROMISC "  : "",
                nn->cap & NFP_NET_CFG_CTRL_L2BC     ? "L2BCFILT " : "",
@@ -2120,6 +2170,7 @@ void nfp_net_info(struct nfp_net *nn)
                nn->cap & NFP_NET_CFG_CTRL_CSUM_COMPLETE ?
                                                      "RXCSUM_COMPLETE " : "",
                nn->cap & NFP_NET_CFG_CTRL_LIVE_ADDR ? "LIVE_ADDR " : "",
+               nn->cap_w1 & NFP_NET_CFG_CTRL_MCAST_FILTER ? "MULTICAST_FILTER " : "",
                nfp_app_extra_cap(nn->app, nn));
 }
 
@@ -2548,6 +2599,9 @@ int nfp_net_init(struct nfp_net *nn)
        if (nn->cap & NFP_NET_CFG_CTRL_TXRWB)
                nn->dp.ctrl |= NFP_NET_CFG_CTRL_TXRWB;
 
+       if (nn->cap_w1 & NFP_NET_CFG_CTRL_MCAST_FILTER)
+               nn->dp.ctrl_w1 |= NFP_NET_CFG_CTRL_MCAST_FILTER;
+
        /* Stash the re-configuration queue away.  First odd queue in TX Bar */
        nn->qcp_cfg = nn->tx_bar + NFP_QCP_QUEUE_ADDR_SZ;
 
@@ -2555,6 +2609,7 @@ int nfp_net_init(struct nfp_net *nn)
        nn_writel(nn, NFP_NET_CFG_CTRL, 0);
        nn_writeq(nn, NFP_NET_CFG_TXRS_ENABLE, 0);
        nn_writeq(nn, NFP_NET_CFG_RXRS_ENABLE, 0);
+       nn_writel(nn, NFP_NET_CFG_CTRL_WORD1, 0);
        err = nfp_net_reconfig(nn, NFP_NET_CFG_UPDATE_RING |
                                   NFP_NET_CFG_UPDATE_GEN);
        if (err)
index cc11b3d..5112430 100644 (file)
 #define NFP_NET_CFG_CTRL_WORD1         0x0098
 #define   NFP_NET_CFG_CTRL_PKT_TYPE      (0x1 << 0) /* Pkttype offload */
 #define   NFP_NET_CFG_CTRL_IPSEC         (0x1 << 1) /* IPsec offload */
+#define   NFP_NET_CFG_CTRL_MCAST_FILTER          (0x1 << 2) /* Multicast Filter */
 
 #define NFP_NET_CFG_CAP_WORD1          0x00a4
 
 #define NFP_NET_CFG_MBOX_CMD_PCI_DSCP_PRIOMAP_SET      5
 #define NFP_NET_CFG_MBOX_CMD_TLV_CMSG                  6
 
+#define NFP_NET_CFG_MBOX_CMD_MULTICAST_ADD             8
+#define NFP_NET_CFG_MBOX_CMD_MULTICAST_DEL             9
+
 /* VLAN filtering using general use mailbox
  * %NFP_NET_CFG_VLAN_FILTER:           Base address of VLAN filter mailbox
  * %NFP_NET_CFG_VLAN_FILTER_VID:       VLAN ID to filter
 #define  NFP_NET_CFG_VLAN_FILTER_PROTO  (NFP_NET_CFG_VLAN_FILTER + 2)
 #define NFP_NET_CFG_VLAN_FILTER_SZ      0x0004
 
+/* Multicast filtering using general use mailbox
+ * %NFP_NET_CFG_MULTICAST:             Base address of Multicast filter mailbox
+ * %NFP_NET_CFG_MULTICAST_MAC_HI:      High 32-bits of Multicast MAC address
+ * %NFP_NET_CFG_MULTICAST_MAC_LO:      Low 16-bits of Multicast MAC address
+ * %NFP_NET_CFG_MULTICAST_SZ:          Size of the Multicast filter mailbox in bytes
+ */
+#define NFP_NET_CFG_MULTICAST          NFP_NET_CFG_MBOX_SIMPLE_VAL
+#define NFP_NET_CFG_MULTICAST_MAC_HI   NFP_NET_CFG_MULTICAST
+#define NFP_NET_CFG_MULTICAST_MAC_LO   (NFP_NET_CFG_MULTICAST + 6)
+#define NFP_NET_CFG_MULTICAST_SZ       0x0006
+
 /* TLV capabilities
  * %NFP_NET_CFG_TLV_TYPE:      Offset of type within the TLV
  * %NFP_NET_CFG_TLV_TYPE_REQUIRED: Driver must be able to parse the TLV
index ed274f0..e5116a8 100644 (file)
@@ -200,7 +200,7 @@ static void qed_ll2b_complete_rx_packet(void *cxt,
        dma_unmap_single(&cdev->pdev->dev, buffer->phys_addr,
                         cdev->ll2->rx_size, DMA_FROM_DEVICE);
 
-       skb = build_skb(buffer->data, 0);
+       skb = slab_build_skb(buffer->data);
        if (!skb) {
                DP_INFO(cdev, "Failed to build SKB\n");
                kfree(buffer->data);
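
slab_build_skb(), new this cycle, is for heads allocated with kmalloc(): plain build_skb() assumes page-frag-backed memory and would release the head down the wrong path on free. A sketch under the assumption the rx buffer comes from the slab allocator:

    void *data = kmalloc(rx_size, GFP_ATOMIC);
    struct sk_buff *skb;

    skb = slab_build_skb(data);     /* not build_skb(data, 0) for kmalloc'ed heads */
    if (!skb)
            kfree(data);
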
index 1ee491f..c1436e1 100644 (file)
@@ -711,7 +711,7 @@ static int qlcnic_validate_ring_count(struct qlcnic_adapter *adapter,
                }
        }
 
-        if (tx_ring != 0) {
+       if (tx_ring != 0) {
                if (tx_ring > adapter->max_tx_rings) {
                        netdev_err(adapter->netdev,
                                   "Invalid ring count, Tx ring count %d should not be greater than max %d driver Tx rings.\n",
index ec15788..a9dcc98 100644 (file)
@@ -5283,6 +5283,8 @@ static int rtl_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
        dev->hw_features |= NETIF_F_RXALL;
        dev->hw_features |= NETIF_F_RXFCS;
 
+       netdev_sw_irq_coalesce_default_on(dev);
+
        /* configure chip for default features */
        rtl8169_set_features(dev, dev->features);
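
netdev_sw_irq_coalesce_default_on() turns on conservative software interrupt coalescing defaults before the netdev is registered; users can still override them via sysfs. Roughly what the helper boils down to (a sketch, not the authoritative definition):

    static inline void sw_irq_coalesce_default_on(struct net_device *dev)
    {
            WARN_ON(dev->reg_state == NETREG_REGISTERED);

            dev->gro_flush_timeout = 20000;         /* ns */
            dev->napi_defer_hard_irqs = 1;
    }
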
 
index 6bc9233..33f723a 100644 (file)
@@ -841,7 +841,7 @@ static bool ravb_rx_gbeth(struct net_device *ndev, int *quota, int q)
                                napi_gro_receive(&priv->napi[q],
                                                 priv->rx_1st_skb);
                                stats->rx_packets++;
-                               stats->rx_bytes += priv->rx_1st_skb->len;
+                               stats->rx_bytes += pkt_len;
                                break;
                        }
                }
index c2224e4..cc30524 100644 (file)
@@ -1164,7 +1164,7 @@ static ssize_t mcdi_logging_show(struct device *dev,
        struct efx_nic *efx = dev_get_drvdata(dev);
        struct efx_mcdi_iface *mcdi = efx_mcdi(efx);
 
-       return scnprintf(buf, PAGE_SIZE, "%d\n", mcdi->logging_enabled);
+       return sysfs_emit(buf, "%d\n", mcdi->logging_enabled);
 }
 
 static ssize_t mcdi_logging_store(struct device *dev,
index 1fd396b..e4b294b 100644 (file)
@@ -1178,7 +1178,7 @@ static ssize_t mcdi_logging_show(struct device *dev,
        struct efx_nic *efx = dev_get_drvdata(dev);
        struct efx_mcdi_iface *mcdi = efx_mcdi(efx);
 
-       return scnprintf(buf, PAGE_SIZE, "%d\n", mcdi->logging_enabled);
+       return sysfs_emit(buf, "%d\n", mcdi->logging_enabled);
 }
 
 static ssize_t mcdi_logging_store(struct device *dev,
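
sysfs_emit() is the preferred formatter for sysfs show() callbacks: it knows buf is a full PAGE_SIZE sysfs page and warns on misuse, unlike a bare scnprintf(). A sketch with a hypothetical attribute and private struct:

    static ssize_t enabled_show(struct device *dev,
                                struct device_attribute *attr, char *buf)
    {
            struct foo_priv *p = dev_get_drvdata(dev);      /* assumed type */

            /* caps output at PAGE_SIZE, checks buf alignment */
            return sysfs_emit(buf, "%d\n", p->enabled);
    }
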
index 31ff351..f77511f 100644 (file)
@@ -235,6 +235,15 @@ config DWMAC_INTEL_PLAT
          the stmmac device driver. This driver is used for the Intel Keem Bay
          SoC.
 
+config DWMAC_TEGRA
+       tristate "NVIDIA Tegra MGBE support"
+       depends on ARCH_TEGRA || COMPILE_TEST
+       help
+         This selects the Multi-GigaBit Ethernet (MGBE) controller found on
+         NVIDIA Tegra SoC devices. This driver provides the glue layer on
+         top of the stmmac driver required for these NVIDIA Tegra SoCs.
+
 config DWMAC_VISCONTI
        tristate "Toshiba Visconti DWMAC support"
        default ARCH_VISCONTI
index d4e12e9..057e4ba 100644 (file)
@@ -31,6 +31,7 @@ obj-$(CONFIG_DWMAC_DWC_QOS_ETH)       += dwmac-dwc-qos-eth.o
 obj-$(CONFIG_DWMAC_INTEL_PLAT) += dwmac-intel-plat.o
 obj-$(CONFIG_DWMAC_GENERIC)    += dwmac-generic.o
 obj-$(CONFIG_DWMAC_IMX8)       += dwmac-imx.o
+obj-$(CONFIG_DWMAC_TEGRA)      += dwmac-tegra.o
 obj-$(CONFIG_DWMAC_VISCONTI)   += dwmac-visconti.o
 stmmac-platform-objs:= stmmac_platform.o
 dwmac-altr-socfpga-objs := altr_tse_pcs.o dwmac-socfpga.o
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-tegra.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-tegra.c
new file mode 100644 (file)
index 0000000..bdf990c
--- /dev/null
@@ -0,0 +1,388 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include <linux/platform_device.h>
+#include <linux/of_device.h>
+#include <linux/module.h>
+#include <linux/stmmac.h>
+#include <linux/clk.h>
+
+#include "stmmac_platform.h"
+
+static const char *const mgbe_clks[] = {
+       "rx-pcs", "tx", "tx-pcs", "mac-divider", "mac", "mgbe", "ptp-ref", "mac"
+};
+
+struct tegra_mgbe {
+       struct device *dev;
+
+       struct clk_bulk_data *clks;
+
+       struct reset_control *rst_mac;
+       struct reset_control *rst_pcs;
+
+       void __iomem *hv;
+       void __iomem *regs;
+       void __iomem *xpcs;
+
+       struct mii_bus *mii;
+};
+
+#define XPCS_WRAP_UPHY_RX_CONTROL 0x801c
+#define XPCS_WRAP_UPHY_RX_CONTROL_RX_SW_OVRD BIT(31)
+#define XPCS_WRAP_UPHY_RX_CONTROL_RX_PCS_PHY_RDY BIT(10)
+#define XPCS_WRAP_UPHY_RX_CONTROL_RX_CDR_RESET BIT(9)
+#define XPCS_WRAP_UPHY_RX_CONTROL_RX_CAL_EN BIT(8)
+#define XPCS_WRAP_UPHY_RX_CONTROL_RX_SLEEP (BIT(7) | BIT(6))
+#define XPCS_WRAP_UPHY_RX_CONTROL_AUX_RX_IDDQ BIT(5)
+#define XPCS_WRAP_UPHY_RX_CONTROL_RX_IDDQ BIT(4)
+#define XPCS_WRAP_UPHY_RX_CONTROL_RX_DATA_EN BIT(0)
+#define XPCS_WRAP_UPHY_HW_INIT_CTRL 0x8020
+#define XPCS_WRAP_UPHY_HW_INIT_CTRL_TX_EN BIT(0)
+#define XPCS_WRAP_UPHY_HW_INIT_CTRL_RX_EN BIT(2)
+#define XPCS_WRAP_UPHY_STATUS 0x8044
+#define XPCS_WRAP_UPHY_STATUS_TX_P_UP BIT(0)
+#define XPCS_WRAP_IRQ_STATUS 0x8050
+#define XPCS_WRAP_IRQ_STATUS_PCS_LINK_STS BIT(6)
+
+#define XPCS_REG_ADDR_SHIFT 10
+#define XPCS_REG_ADDR_MASK 0x1fff
+#define XPCS_ADDR 0x3fc
+
+#define MGBE_WRAP_COMMON_INTR_ENABLE   0x8704
+#define MAC_SBD_INTR                   BIT(2)
+#define MGBE_WRAP_AXI_ASID0_CTRL       0x8400
+#define MGBE_SID                       0x6
+
+static int __maybe_unused tegra_mgbe_suspend(struct device *dev)
+{
+       struct tegra_mgbe *mgbe = get_stmmac_bsp_priv(dev);
+       int err;
+
+       err = stmmac_suspend(dev);
+       if (err)
+               return err;
+
+       clk_bulk_disable_unprepare(ARRAY_SIZE(mgbe_clks), mgbe->clks);
+
+       return reset_control_assert(mgbe->rst_mac);
+}
+
+static int __maybe_unused tegra_mgbe_resume(struct device *dev)
+{
+       struct tegra_mgbe *mgbe = get_stmmac_bsp_priv(dev);
+       u32 value;
+       int err;
+
+       err = clk_bulk_prepare_enable(ARRAY_SIZE(mgbe_clks), mgbe->clks);
+       if (err < 0)
+               return err;
+
+       err = reset_control_deassert(mgbe->rst_mac);
+       if (err < 0)
+               return err;
+
+       /* Enable common interrupt at wrapper level */
+       writel(MAC_SBD_INTR, mgbe->regs + MGBE_WRAP_COMMON_INTR_ENABLE);
+
+       /* Program SID */
+       writel(MGBE_SID, mgbe->hv + MGBE_WRAP_AXI_ASID0_CTRL);
+
+       value = readl(mgbe->xpcs + XPCS_WRAP_UPHY_STATUS);
+       if ((value & XPCS_WRAP_UPHY_STATUS_TX_P_UP) == 0) {
+               value = readl(mgbe->xpcs + XPCS_WRAP_UPHY_HW_INIT_CTRL);
+               value |= XPCS_WRAP_UPHY_HW_INIT_CTRL_TX_EN;
+               writel(value, mgbe->xpcs + XPCS_WRAP_UPHY_HW_INIT_CTRL);
+       }
+
+       err = readl_poll_timeout(mgbe->xpcs + XPCS_WRAP_UPHY_HW_INIT_CTRL, value,
+                                (value & XPCS_WRAP_UPHY_HW_INIT_CTRL_TX_EN) == 0,
+                                500, 500 * 2000);
+       if (err < 0) {
+               dev_err(mgbe->dev, "timeout waiting for TX lane to become enabled\n");
+               clk_bulk_disable_unprepare(ARRAY_SIZE(mgbe_clks), mgbe->clks);
+               return err;
+       }
+
+       err = stmmac_resume(dev);
+       if (err < 0)
+               clk_bulk_disable_unprepare(ARRAY_SIZE(mgbe_clks), mgbe->clks);
+
+       return err;
+}
+
+static int mgbe_uphy_lane_bringup_serdes_up(struct net_device *ndev, void *mgbe_data)
+{
+       struct tegra_mgbe *mgbe = (struct tegra_mgbe *)mgbe_data;
+       u32 value;
+       int err;
+
+       value = readl(mgbe->xpcs + XPCS_WRAP_UPHY_RX_CONTROL);
+       value |= XPCS_WRAP_UPHY_RX_CONTROL_RX_SW_OVRD;
+       writel(value, mgbe->xpcs + XPCS_WRAP_UPHY_RX_CONTROL);
+
+       value = readl(mgbe->xpcs + XPCS_WRAP_UPHY_RX_CONTROL);
+       value &= ~XPCS_WRAP_UPHY_RX_CONTROL_RX_IDDQ;
+       writel(value, mgbe->xpcs + XPCS_WRAP_UPHY_RX_CONTROL);
+
+       value = readl(mgbe->xpcs + XPCS_WRAP_UPHY_RX_CONTROL);
+       value &= ~XPCS_WRAP_UPHY_RX_CONTROL_AUX_RX_IDDQ;
+       writel(value, mgbe->xpcs + XPCS_WRAP_UPHY_RX_CONTROL);
+
+       value = readl(mgbe->xpcs + XPCS_WRAP_UPHY_RX_CONTROL);
+       value &= ~XPCS_WRAP_UPHY_RX_CONTROL_RX_SLEEP;
+       writel(value, mgbe->xpcs + XPCS_WRAP_UPHY_RX_CONTROL);
+
+       value = readl(mgbe->xpcs + XPCS_WRAP_UPHY_RX_CONTROL);
+       value |= XPCS_WRAP_UPHY_RX_CONTROL_RX_CAL_EN;
+       writel(value, mgbe->xpcs + XPCS_WRAP_UPHY_RX_CONTROL);
+
+       err = readl_poll_timeout(mgbe->xpcs + XPCS_WRAP_UPHY_RX_CONTROL, value,
+                                (value & XPCS_WRAP_UPHY_RX_CONTROL_RX_CAL_EN) == 0,
+                                1000, 1000 * 2000);
+       if (err < 0) {
+               dev_err(mgbe->dev, "timeout waiting for RX calibration to complete\n");
+               return err;
+       }
+
+       value = readl(mgbe->xpcs + XPCS_WRAP_UPHY_RX_CONTROL);
+       value |= XPCS_WRAP_UPHY_RX_CONTROL_RX_DATA_EN;
+       writel(value, mgbe->xpcs + XPCS_WRAP_UPHY_RX_CONTROL);
+
+       value = readl(mgbe->xpcs + XPCS_WRAP_UPHY_RX_CONTROL);
+       value |= XPCS_WRAP_UPHY_RX_CONTROL_RX_CDR_RESET;
+       writel(value, mgbe->xpcs + XPCS_WRAP_UPHY_RX_CONTROL);
+
+       value = readl(mgbe->xpcs + XPCS_WRAP_UPHY_RX_CONTROL);
+       value &= ~XPCS_WRAP_UPHY_RX_CONTROL_RX_CDR_RESET;
+       writel(value, mgbe->xpcs + XPCS_WRAP_UPHY_RX_CONTROL);
+
+       value = readl(mgbe->xpcs + XPCS_WRAP_UPHY_RX_CONTROL);
+       value |= XPCS_WRAP_UPHY_RX_CONTROL_RX_PCS_PHY_RDY;
+       writel(value, mgbe->xpcs + XPCS_WRAP_UPHY_RX_CONTROL);
+
+       err = readl_poll_timeout(mgbe->xpcs + XPCS_WRAP_IRQ_STATUS, value,
+                                value & XPCS_WRAP_IRQ_STATUS_PCS_LINK_STS,
+                                500, 500 * 2000);
+       if (err < 0) {
+               dev_err(mgbe->dev, "timeout waiting for link to become ready\n");
+               return err;
+       }
+
+       /* clear status */
+       writel(value, mgbe->xpcs + XPCS_WRAP_IRQ_STATUS);
+
+       return 0;
+}
+
+static void mgbe_uphy_lane_bringup_serdes_down(struct net_device *ndev, void *mgbe_data)
+{
+       struct tegra_mgbe *mgbe = (struct tegra_mgbe *)mgbe_data;
+       u32 value;
+
+       value = readl(mgbe->xpcs + XPCS_WRAP_UPHY_RX_CONTROL);
+       value |= XPCS_WRAP_UPHY_RX_CONTROL_RX_SW_OVRD;
+       writel(value, mgbe->xpcs + XPCS_WRAP_UPHY_RX_CONTROL);
+
+       value = readl(mgbe->xpcs + XPCS_WRAP_UPHY_RX_CONTROL);
+       value &= ~XPCS_WRAP_UPHY_RX_CONTROL_RX_DATA_EN;
+       writel(value, mgbe->xpcs + XPCS_WRAP_UPHY_RX_CONTROL);
+
+       value = readl(mgbe->xpcs + XPCS_WRAP_UPHY_RX_CONTROL);
+       value |= XPCS_WRAP_UPHY_RX_CONTROL_RX_SLEEP;
+       writel(value, mgbe->xpcs + XPCS_WRAP_UPHY_RX_CONTROL);
+
+       value = readl(mgbe->xpcs + XPCS_WRAP_UPHY_RX_CONTROL);
+       value |= XPCS_WRAP_UPHY_RX_CONTROL_AUX_RX_IDDQ;
+       writel(value, mgbe->xpcs + XPCS_WRAP_UPHY_RX_CONTROL);
+
+       value = readl(mgbe->xpcs + XPCS_WRAP_UPHY_RX_CONTROL);
+       value |= XPCS_WRAP_UPHY_RX_CONTROL_RX_IDDQ;
+       writel(value, mgbe->xpcs + XPCS_WRAP_UPHY_RX_CONTROL);
+}
+
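The serdes bring-up/down paths above repeat the same 32-bit read-modify-write on XPCS_WRAP_UPHY_RX_CONTROL; a hypothetical helper (not part of the patch) could factor it as:

    static void mgbe_rx_control_rmw(struct tegra_mgbe *mgbe, u32 clear, u32 set)
    {
            u32 value = readl(mgbe->xpcs + XPCS_WRAP_UPHY_RX_CONTROL);

            value &= ~clear;
            value |= set;
            writel(value, mgbe->xpcs + XPCS_WRAP_UPHY_RX_CONTROL);
    }

    /* e.g. mgbe_rx_control_rmw(mgbe, XPCS_WRAP_UPHY_RX_CONTROL_RX_SLEEP, 0); */
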
+static int tegra_mgbe_probe(struct platform_device *pdev)
+{
+       struct plat_stmmacenet_data *plat;
+       struct stmmac_resources res;
+       struct tegra_mgbe *mgbe;
+       int irq, err, i;
+       u32 value;
+
+       mgbe = devm_kzalloc(&pdev->dev, sizeof(*mgbe), GFP_KERNEL);
+       if (!mgbe)
+               return -ENOMEM;
+
+       mgbe->dev = &pdev->dev;
+
+       memset(&res, 0, sizeof(res));
+
+       irq = platform_get_irq(pdev, 0);
+       if (irq < 0)
+               return irq;
+
+       mgbe->hv = devm_platform_ioremap_resource_byname(pdev, "hypervisor");
+       if (IS_ERR(mgbe->hv))
+               return PTR_ERR(mgbe->hv);
+
+       mgbe->regs = devm_platform_ioremap_resource_byname(pdev, "mac");
+       if (IS_ERR(mgbe->regs))
+               return PTR_ERR(mgbe->regs);
+
+       mgbe->xpcs = devm_platform_ioremap_resource_byname(pdev, "xpcs");
+       if (IS_ERR(mgbe->xpcs))
+               return PTR_ERR(mgbe->xpcs);
+
+       res.addr = mgbe->regs;
+       res.irq = irq;
+
+       /* allocate one clk_bulk_data per clock, not a single element */
+       mgbe->clks = devm_kcalloc(&pdev->dev, ARRAY_SIZE(mgbe_clks),
+                                 sizeof(*mgbe->clks), GFP_KERNEL);
+       if (!mgbe->clks)
+               return -ENOMEM;
+
+       for (i = 0; i < ARRAY_SIZE(mgbe_clks); i++)
+               mgbe->clks[i].id = mgbe_clks[i];
+
+       err = devm_clk_bulk_get(mgbe->dev, ARRAY_SIZE(mgbe_clks), mgbe->clks);
+       if (err < 0)
+               return err;
+
+       err = clk_bulk_prepare_enable(ARRAY_SIZE(mgbe_clks), mgbe->clks);
+       if (err < 0)
+               return err;
+
+       /* Perform MAC reset */
+       mgbe->rst_mac = devm_reset_control_get(&pdev->dev, "mac");
+       if (IS_ERR(mgbe->rst_mac)) {
+               err = PTR_ERR(mgbe->rst_mac);
+               goto disable_clks;
+       }
+
+       err = reset_control_assert(mgbe->rst_mac);
+       if (err < 0)
+               goto disable_clks;
+
+       usleep_range(2000, 4000);
+
+       err = reset_control_deassert(mgbe->rst_mac);
+       if (err < 0)
+               goto disable_clks;
+
+       /* Perform PCS reset */
+       mgbe->rst_pcs = devm_reset_control_get(&pdev->dev, "pcs");
+       if (IS_ERR(mgbe->rst_pcs)) {
+               err = PTR_ERR(mgbe->rst_pcs);
+               goto disable_clks;
+       }
+
+       err = reset_control_assert(mgbe->rst_pcs);
+       if (err < 0)
+               goto disable_clks;
+
+       usleep_range(2000, 4000);
+
+       err = reset_control_deassert(mgbe->rst_pcs);
+       if (err < 0)
+               goto disable_clks;
+
+       plat = stmmac_probe_config_dt(pdev, res.mac);
+       if (IS_ERR(plat)) {
+               err = PTR_ERR(plat);
+               goto disable_clks;
+       }
+
+       plat->has_xgmac = 1;
+       plat->tso_en = 1;
+       plat->pmt = 1;
+       plat->bsp_priv = mgbe;
+
+       if (!plat->mdio_node)
+               plat->mdio_node = of_get_child_by_name(pdev->dev.of_node, "mdio");
+
+       if (!plat->mdio_bus_data) {
+               plat->mdio_bus_data = devm_kzalloc(&pdev->dev, sizeof(*plat->mdio_bus_data),
+                                                  GFP_KERNEL);
+               if (!plat->mdio_bus_data) {
+                       err = -ENOMEM;
+                       goto remove;
+               }
+       }
+
+       plat->mdio_bus_data->needs_reset = true;
+
+       value = readl(mgbe->xpcs + XPCS_WRAP_UPHY_STATUS);
+       if ((value & XPCS_WRAP_UPHY_STATUS_TX_P_UP) == 0) {
+               value = readl(mgbe->xpcs + XPCS_WRAP_UPHY_HW_INIT_CTRL);
+               value |= XPCS_WRAP_UPHY_HW_INIT_CTRL_TX_EN;
+               writel(value, mgbe->xpcs + XPCS_WRAP_UPHY_HW_INIT_CTRL);
+       }
+
+       err = readl_poll_timeout(mgbe->xpcs + XPCS_WRAP_UPHY_HW_INIT_CTRL, value,
+                                (value & XPCS_WRAP_UPHY_HW_INIT_CTRL_TX_EN) == 0,
+                                500, 500 * 2000);
+       if (err < 0) {
+               dev_err(mgbe->dev, "timeout waiting for TX lane to become enabled\n");
+               goto remove;
+       }
+
+       plat->serdes_powerup = mgbe_uphy_lane_bringup_serdes_up;
+       plat->serdes_powerdown = mgbe_uphy_lane_bringup_serdes_down;
+
+       /* Tx FIFO Size - 128KB */
+       plat->tx_fifo_size = 131072;
+       /* Rx FIFO Size - 192KB */
+       plat->rx_fifo_size = 196608;
+
+       /* Enable common interrupt at wrapper level */
+       writel(MAC_SBD_INTR, mgbe->regs + MGBE_WRAP_COMMON_INTR_ENABLE);
+
+       /* Program SID */
+       writel(MGBE_SID, mgbe->hv + MGBE_WRAP_AXI_ASID0_CTRL);
+
+       plat->serdes_up_after_phy_linkup = 1;
+
+       err = stmmac_dvr_probe(&pdev->dev, plat, &res);
+       if (err < 0)
+               goto remove;
+
+       return 0;
+
+remove:
+       stmmac_remove_config_dt(pdev, plat);
+disable_clks:
+       clk_bulk_disable_unprepare(ARRAY_SIZE(mgbe_clks), mgbe->clks);
+
+       return err;
+}
+
+static int tegra_mgbe_remove(struct platform_device *pdev)
+{
+       struct tegra_mgbe *mgbe = get_stmmac_bsp_priv(&pdev->dev);
+
+       clk_bulk_disable_unprepare(ARRAY_SIZE(mgbe_clks), mgbe->clks);
+
+       stmmac_pltfr_remove(pdev);
+
+       return 0;
+}
+
+static const struct of_device_id tegra_mgbe_match[] = {
+       { .compatible = "nvidia,tegra234-mgbe", },
+       { }
+};
+MODULE_DEVICE_TABLE(of, tegra_mgbe_match);
+
+static SIMPLE_DEV_PM_OPS(tegra_mgbe_pm_ops, tegra_mgbe_suspend, tegra_mgbe_resume);
+
+static struct platform_driver tegra_mgbe_driver = {
+       .probe = tegra_mgbe_probe,
+       .remove = tegra_mgbe_remove,
+       .driver = {
+               .name = "tegra-mgbe",
+               .pm             = &tegra_mgbe_pm_ops,
+               .of_match_table = tegra_mgbe_match,
+       },
+};
+module_platform_driver(tegra_mgbe_driver);
+
+MODULE_AUTHOR("Thierry Reding <treding@nvidia.com>");
+MODULE_DESCRIPTION("NVIDIA Tegra MGBE driver");
+MODULE_LICENSE("GPL");
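
The PM callbacks above are tagged __maybe_unused because SIMPLE_DEV_PM_OPS() expands to an empty ops table when CONFIG_PM_SLEEP=n, leaving nothing that references them. A minimal sketch of the pairing (foo_* names are placeholders):

    static int __maybe_unused foo_suspend(struct device *dev) { return 0; }
    static int __maybe_unused foo_resume(struct device *dev)  { return 0; }

    /* with CONFIG_PM_SLEEP=n the attribute silences -Wunused-function */
    static SIMPLE_DEV_PM_OPS(foo_pm_ops, foo_suspend, foo_resume);
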
index 18c7ca2..f36590d 100644 (file)
@@ -988,6 +988,9 @@ static void stmmac_mac_link_up(struct phylink_config *config,
        struct stmmac_priv *priv = netdev_priv(to_net_dev(config->dev));
        u32 old_ctrl, ctrl;
 
+       if (priv->plat->serdes_up_after_phy_linkup && priv->plat->serdes_powerup)
+               priv->plat->serdes_powerup(priv->dev, priv->plat->bsp_priv);
+
        old_ctrl = readl(priv->ioaddr + MAC_CTRL_REG);
        ctrl = old_ctrl & ~priv->hw->link.speed_mask;
 
@@ -3809,7 +3812,7 @@ static int __stmmac_open(struct net_device *dev,
 
        stmmac_reset_queues_param(priv);
 
-       if (priv->plat->serdes_powerup) {
+       if (!priv->plat->serdes_up_after_phy_linkup && priv->plat->serdes_powerup) {
                ret = priv->plat->serdes_powerup(dev, priv->plat->bsp_priv);
                if (ret < 0) {
                        netdev_err(priv->dev, "%s: Serdes powerup failed\n",
@@ -7518,7 +7521,7 @@ int stmmac_resume(struct device *dev)
                        stmmac_mdio_reset(priv->mii);
        }
 
-       if (priv->plat->serdes_powerup) {
+       if (!priv->plat->serdes_up_after_phy_linkup && priv->plat->serdes_powerup) {
                ret = priv->plat->serdes_powerup(ndev,
                                                 priv->plat->bsp_priv);
 
index 50f6b4a..eb6d9cd 100644 (file)
@@ -108,10 +108,10 @@ static struct stmmac_axi *stmmac_axi_setup(struct platform_device *pdev)
 
        axi->axi_lpi_en = of_property_read_bool(np, "snps,lpi_en");
        axi->axi_xit_frm = of_property_read_bool(np, "snps,xit_frm");
-       axi->axi_kbbe = of_property_read_bool(np, "snps,axi_kbbe");
-       axi->axi_fb = of_property_read_bool(np, "snps,axi_fb");
-       axi->axi_mb = of_property_read_bool(np, "snps,axi_mb");
-       axi->axi_rb =  of_property_read_bool(np, "snps,axi_rb");
+       axi->axi_kbbe = of_property_read_bool(np, "snps,kbbe");
+       axi->axi_fb = of_property_read_bool(np, "snps,fb");
+       axi->axi_mb = of_property_read_bool(np, "snps,mb");
+       axi->axi_rb =  of_property_read_bool(np, "snps,rb");
 
        if (of_property_read_u32(np, "snps,wr_osr_lmt", &axi->axi_wr_osr_lmt))
                axi->axi_wr_osr_lmt = 1;
index 773e415..2cfb18c 100644 (file)
@@ -926,6 +926,9 @@ static int tc_setup_taprio(struct stmmac_priv *priv,
        int i, ret = 0;
        u64 ctr;
 
+       if (qopt->base_time < 0)
+               return -ERANGE;
+
        if (!priv->dma_cap.estsel)
                return -EOPNOTSUPP;
 
index 51c37e9..9decb0c 100644 (file)
                         NETIF_MSG_IFUP | NETIF_MSG_PROBE | NETIF_MSG_IFDOWN | \
                         NETIF_MSG_RX_ERR | NETIF_MSG_TX_ERR)
 
-static int am65_cpsw_nuss_init_tx_chns(struct am65_cpsw_common *common);
-static int am65_cpsw_nuss_init_rx_chns(struct am65_cpsw_common *common);
-static void am65_cpsw_nuss_free_tx_chns(struct am65_cpsw_common *common);
-static void am65_cpsw_nuss_free_rx_chns(struct am65_cpsw_common *common);
-
 static void am65_cpsw_port_set_sl_mac(struct am65_cpsw_port *slave,
                                      const u8 *dev_addr)
 {
@@ -379,20 +374,6 @@ static int am65_cpsw_nuss_common_open(struct am65_cpsw_common *common)
        if (common->usage_count)
                return 0;
 
-       /* init tx/rx channels */
-       ret = am65_cpsw_nuss_init_tx_chns(common);
-       if (ret) {
-               dev_err(common->dev, "init_tx_chns failed\n");
-               return ret;
-       }
-
-       ret = am65_cpsw_nuss_init_rx_chns(common);
-       if (ret) {
-               dev_err(common->dev, "init_rx_chns failed\n");
-               am65_cpsw_nuss_free_tx_chns(common);
-               return ret;
-       }
-
        /* Control register */
        writel(AM65_CPSW_CTL_P0_ENABLE | AM65_CPSW_CTL_P0_TX_CRC_REMOVE |
               AM65_CPSW_CTL_VLAN_AWARE | AM65_CPSW_CTL_P0_RX_PAD,
@@ -421,7 +402,6 @@ static int am65_cpsw_nuss_common_open(struct am65_cpsw_common *common)
        /* disable priority elevation */
        writel(0, common->cpsw_base + AM65_CPSW_REG_PTYPE);
 
-       cpsw_ale_control_set(common->ale, 0, ALE_CLEAR, 1);
        cpsw_ale_start(common->ale);
 
        /* limit to one RX flow only */
@@ -453,8 +433,7 @@ static int am65_cpsw_nuss_common_open(struct am65_cpsw_common *common)
                                                  GFP_KERNEL);
                if (!skb) {
                        dev_err(common->dev, "cannot allocate skb\n");
-                       ret = -ENOMEM;
-                       goto err;
+                       return -ENOMEM;
                }
 
                ret = am65_cpsw_nuss_rx_push(common, skb);
@@ -463,7 +442,7 @@ static int am65_cpsw_nuss_common_open(struct am65_cpsw_common *common)
                                "cannot submit skb to channel rx, error %d\n",
                                ret);
                        kfree_skb(skb);
-                       goto err;
+                       return ret;
                }
                kmemleak_not_leak(skb);
        }
@@ -472,7 +451,7 @@ static int am65_cpsw_nuss_common_open(struct am65_cpsw_common *common)
        for (i = 0; i < common->tx_ch_num; i++) {
                ret = k3_udma_glue_enable_tx_chn(common->tx_chns[i].tx_chn);
                if (ret)
-                       goto err;
+                       return ret;
                napi_enable(&common->tx_chns[i].napi_tx);
        }
 
@@ -484,12 +463,6 @@ static int am65_cpsw_nuss_common_open(struct am65_cpsw_common *common)
 
        dev_dbg(common->dev, "cpsw_nuss started\n");
        return 0;
-
-err:
-       am65_cpsw_nuss_free_tx_chns(common);
-       am65_cpsw_nuss_free_rx_chns(common);
-
-       return ret;
 }
 
 static void am65_cpsw_nuss_tx_cleanup(void *data, dma_addr_t desc_dma);
@@ -543,9 +516,6 @@ static int am65_cpsw_nuss_common_stop(struct am65_cpsw_common *common)
        writel(0, common->cpsw_base + AM65_CPSW_REG_CTL);
        writel(0, common->cpsw_base + AM65_CPSW_REG_STAT_PORT_EN);
 
-       am65_cpsw_nuss_free_tx_chns(common);
-       am65_cpsw_nuss_free_rx_chns(common);
-
        dev_dbg(common->dev, "cpsw_nuss stopped\n");
        return 0;
 }
@@ -587,7 +557,6 @@ static int am65_cpsw_nuss_ndo_slave_open(struct net_device *ndev)
        struct am65_cpsw_port *port = am65_ndev_to_port(ndev);
        int ret, i;
        u32 reg;
-       int tmo;
 
        ret = pm_runtime_resume_and_get(common->dev);
        if (ret < 0)
@@ -595,31 +564,30 @@ static int am65_cpsw_nuss_ndo_slave_open(struct net_device *ndev)
 
        /* Idle MAC port */
        cpsw_sl_ctl_set(port->slave.mac_sl, CPSW_SL_CTL_CMD_IDLE);
-
-       tmo = cpsw_sl_wait_for_idle(port->slave.mac_sl, 100);
-       dev_info(common->dev, "down msc_sl %08x tmo %d\n",
-                cpsw_sl_reg_read(port->slave.mac_sl, CPSW_SL_MACSTATUS), tmo);
-
+       cpsw_sl_wait_for_idle(port->slave.mac_sl, 100);
        cpsw_sl_ctl_reset(port->slave.mac_sl);
 
        /* soft reset MAC */
        cpsw_sl_reg_write(port->slave.mac_sl, CPSW_SL_SOFT_RESET, 1);
        mdelay(1);
        reg = cpsw_sl_reg_read(port->slave.mac_sl, CPSW_SL_SOFT_RESET);
-       if (reg)
-               dev_info(common->dev, "mac reset not yet done\n");
+       if (reg) {
+               dev_err(common->dev, "soft RESET didn't complete\n");
+               ret = -ETIMEDOUT;
+               goto runtime_put;
+       }
 
        /* Notify the stack of the actual queue counts. */
        ret = netif_set_real_num_tx_queues(ndev, common->tx_ch_num);
        if (ret) {
                dev_err(common->dev, "cannot set real number of tx queues\n");
-               return ret;
+               goto runtime_put;
        }
 
        ret = netif_set_real_num_rx_queues(ndev, AM65_CPSW_MAX_RX_QUEUES);
        if (ret) {
                dev_err(common->dev, "cannot set real number of rx queues\n");
-               return ret;
+               goto runtime_put;
        }
 
        for (i = 0; i < common->tx_ch_num; i++)
@@ -627,7 +595,7 @@ static int am65_cpsw_nuss_ndo_slave_open(struct net_device *ndev)
 
        ret = am65_cpsw_nuss_common_open(common);
        if (ret)
-               return ret;
+               goto runtime_put;
 
        common->usage_count++;
 
@@ -655,6 +623,10 @@ static int am65_cpsw_nuss_ndo_slave_open(struct net_device *ndev)
 error_cleanup:
        am65_cpsw_nuss_ndo_slave_stop(ndev);
        return ret;
+
+runtime_put:
+       pm_runtime_put(common->dev);
+       return ret;
 }
 
 static void am65_cpsw_nuss_rx_cleanup(void *data, dma_addr_t desc_dma)
@@ -1495,7 +1467,7 @@ static void am65_cpsw_nuss_mac_link_up(struct phylink_config *config, struct phy
 
        if (speed == SPEED_1000)
                mac_control |= CPSW_SL_CTL_GIG;
-       if (speed == SPEED_10 && interface == PHY_INTERFACE_MODE_RGMII)
+       if (speed == SPEED_10 && phy_interface_mode_is_rgmii(interface))
                /* Can be used with in band mode only */
                mac_control |= CPSW_SL_CTL_EXT_EN;
        if (speed == SPEED_100 && interface == PHY_INTERFACE_MODE_RMII)
@@ -1539,9 +1511,9 @@ static void am65_cpsw_nuss_slave_disable_unused(struct am65_cpsw_port *port)
        cpsw_sl_ctl_reset(port->slave.mac_sl);
 }
 
-static void am65_cpsw_nuss_free_tx_chns(struct am65_cpsw_common *common)
+static void am65_cpsw_nuss_free_tx_chns(void *data)
 {
-       struct device *dev = common->dev;
+       struct am65_cpsw_common *common = data;
        int i;
 
        for (i = 0; i < common->tx_ch_num; i++) {
@@ -1553,11 +1525,7 @@ static void am65_cpsw_nuss_free_tx_chns(struct am65_cpsw_common *common)
                if (!IS_ERR_OR_NULL(tx_chn->tx_chn))
                        k3_udma_glue_release_tx_chn(tx_chn->tx_chn);
 
-               /* Don't clear tx_chn memory as we need to preserve
-                * data between suspend/resume
-                */
-               if (!(tx_chn->irq < 0))
-                       devm_free_irq(dev, tx_chn->irq, tx_chn);
+               memset(tx_chn, 0, sizeof(*tx_chn));
        }
 }
 
@@ -1566,10 +1534,12 @@ void am65_cpsw_nuss_remove_tx_chns(struct am65_cpsw_common *common)
        struct device *dev = common->dev;
        int i;
 
+       devm_remove_action(dev, am65_cpsw_nuss_free_tx_chns, common);
+
        for (i = 0; i < common->tx_ch_num; i++) {
                struct am65_cpsw_tx_chn *tx_chn = &common->tx_chns[i];
 
-               if (!(tx_chn->irq < 0))
+               if (tx_chn->irq)
                        devm_free_irq(dev, tx_chn->irq, tx_chn);
 
                netif_napi_del(&tx_chn->napi_tx);
@@ -1584,6 +1554,32 @@ void am65_cpsw_nuss_remove_tx_chns(struct am65_cpsw_common *common)
        }
 }
 
+static int am65_cpsw_nuss_ndev_add_tx_napi(struct am65_cpsw_common *common)
+{
+       struct device *dev = common->dev;
+       int i, ret = 0;
+
+       for (i = 0; i < common->tx_ch_num; i++) {
+               struct am65_cpsw_tx_chn *tx_chn = &common->tx_chns[i];
+
+               netif_napi_add_tx(common->dma_ndev, &tx_chn->napi_tx,
+                                 am65_cpsw_nuss_tx_poll);
+
+               ret = devm_request_irq(dev, tx_chn->irq,
+                                      am65_cpsw_nuss_tx_irq,
+                                      IRQF_TRIGGER_HIGH,
+                                      tx_chn->tx_chn_name, tx_chn);
+               if (ret) {
+                       dev_err(dev, "failure requesting tx%u irq %u, %d\n",
+                               tx_chn->id, tx_chn->irq, ret);
+                       goto err;
+               }
+       }
+
+err:
+       return ret;
+}
+
 static int am65_cpsw_nuss_init_tx_chns(struct am65_cpsw_common *common)
 {
        u32  max_desc_num = ALIGN(AM65_CPSW_MAX_TX_DESC, MAX_SKB_FRAGS);
@@ -1639,7 +1635,7 @@ static int am65_cpsw_nuss_init_tx_chns(struct am65_cpsw_common *common)
                }
 
                tx_chn->irq = k3_udma_glue_tx_get_irq(tx_chn->tx_chn);
-               if (tx_chn->irq < 0) {
+               if (tx_chn->irq <= 0) {
                        dev_err(dev, "Failed to get tx dma irq %d\n",
                                tx_chn->irq);
                        goto err;
@@ -1648,41 +1644,59 @@ static int am65_cpsw_nuss_init_tx_chns(struct am65_cpsw_common *common)
                snprintf(tx_chn->tx_chn_name,
                         sizeof(tx_chn->tx_chn_name), "%s-tx%d",
                         dev_name(dev), tx_chn->id);
-
-               ret = devm_request_irq(dev, tx_chn->irq,
-                                      am65_cpsw_nuss_tx_irq,
-                                      IRQF_TRIGGER_HIGH,
-                                      tx_chn->tx_chn_name, tx_chn);
-               if (ret) {
-                       dev_err(dev, "failure requesting tx%u irq %u, %d\n",
-                               tx_chn->id, tx_chn->irq, ret);
-                       tx_chn->irq = -EINVAL;
-                       goto err;
-               }
        }
 
-       return 0;
+       ret = am65_cpsw_nuss_ndev_add_tx_napi(common);
+       if (ret) {
+               dev_err(dev, "Failed to add tx NAPI %d\n", ret);
+               goto err;
+       }
 
 err:
-       am65_cpsw_nuss_free_tx_chns(common);
+       i = devm_add_action(dev, am65_cpsw_nuss_free_tx_chns, common);
+       if (i) {
+               dev_err(dev, "Failed to add free_tx_chns action %d\n", i);
+               return i;
+       }
 
        return ret;
 }
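
The tx/rx channel teardown is being converted to devm actions: a release callback registered with devm_add_action() runs automatically on driver detach, and devm_remove_action() (used in the remove paths above) drops it when cleanup happens explicitly. A generic sketch of the pattern, with a hypothetical foo resource:

    static void foo_release(void *data)
    {
            struct foo *f = data;

            foo_teardown(f);                        /* assumed helper */
    }

    static int foo_init(struct device *dev, struct foo *f)
    {
            int err = foo_setup(f);                 /* assumed helper */

            if (err)
                    return err;

            /* foo_release() now runs automatically when dev detaches */
            return devm_add_action_or_reset(dev, foo_release, f);
    }
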
 
-static void am65_cpsw_nuss_free_rx_chns(struct am65_cpsw_common *common)
+static void am65_cpsw_nuss_free_rx_chns(void *data)
+{
+       struct am65_cpsw_common *common = data;
+       struct am65_cpsw_rx_chn *rx_chn;
+
+       rx_chn = &common->rx_chns;
+
+       if (!IS_ERR_OR_NULL(rx_chn->desc_pool))
+               k3_cppi_desc_pool_destroy(rx_chn->desc_pool);
+
+       if (!IS_ERR_OR_NULL(rx_chn->rx_chn))
+               k3_udma_glue_release_rx_chn(rx_chn->rx_chn);
+}
+
+static void am65_cpsw_nuss_remove_rx_chns(void *data)
 {
+       struct am65_cpsw_common *common = data;
        struct am65_cpsw_rx_chn *rx_chn;
+       struct device *dev = common->dev;
 
        rx_chn = &common->rx_chns;
+       devm_remove_action(dev, am65_cpsw_nuss_free_rx_chns, common);
 
        if (!(rx_chn->irq < 0))
-               devm_free_irq(common->dev, rx_chn->irq, common);
+               devm_free_irq(dev, rx_chn->irq, common);
+
+       netif_napi_del(&common->napi_rx);
 
        if (!IS_ERR_OR_NULL(rx_chn->desc_pool))
                k3_cppi_desc_pool_destroy(rx_chn->desc_pool);
 
        if (!IS_ERR_OR_NULL(rx_chn->rx_chn))
                k3_udma_glue_release_rx_chn(rx_chn->rx_chn);
+
+       common->rx_flow_id_base = -1;
 }
 
 static int am65_cpsw_nuss_init_rx_chns(struct am65_cpsw_common *common)
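
The hunks above move channel teardown under devres: devm_add_action() registers the free routine so it also runs on a later probe failure or driver detach, and the remove paths call devm_remove_action() first so an explicit teardown (e.g. for suspend) does not leave a stale action behind. A minimal userspace sketch of that pattern; the mock_* names are invented stand-ins for the devres core, not kernel API:

#include <stdio.h>

struct action { void (*fn)(void *); void *data; };

struct mock_dev {
	struct action actions[8];
	int count;
};

static int mock_devm_add_action(struct mock_dev *dev,
				void (*fn)(void *), void *data)
{
	if (dev->count >= 8)
		return -1;		/* the kernel returns -ENOMEM here */
	dev->actions[dev->count].fn = fn;
	dev->actions[dev->count].data = data;
	dev->count++;
	return 0;
}

static void mock_dev_release(struct mock_dev *dev)
{
	/* devres runs release actions in reverse (LIFO) order */
	while (dev->count) {
		struct action *a = &dev->actions[--dev->count];
		a->fn(a->data);
	}
}

static void free_tx_chns(void *data)
{
	printf("releasing tx channels of %s\n", (const char *)data);
}

int main(void)
{
	struct mock_dev dev = { .count = 0 };

	mock_devm_add_action(&dev, free_tx_chns, "am65-cpsw");
	mock_dev_release(&dev);	/* cleanup fires without an explicit call */
	return 0;
}
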
@@ -1700,7 +1714,7 @@ static int am65_cpsw_nuss_init_rx_chns(struct am65_cpsw_common *common)
 
        rx_cfg.swdata_size = AM65_CPSW_NAV_SW_DATA_SIZE;
        rx_cfg.flow_id_num = AM65_CPSW_MAX_RX_FLOWS;
-       rx_cfg.flow_id_base = -1;
+       rx_cfg.flow_id_base = common->rx_flow_id_base;
 
        /* init all flows */
        rx_chn->dev = dev;
@@ -1772,20 +1786,24 @@ static int am65_cpsw_nuss_init_rx_chns(struct am65_cpsw_common *common)
                }
        }
 
+       netif_napi_add(common->dma_ndev, &common->napi_rx,
+                      am65_cpsw_nuss_rx_poll);
+
        ret = devm_request_irq(dev, rx_chn->irq,
                               am65_cpsw_nuss_rx_irq,
                               IRQF_TRIGGER_HIGH, dev_name(dev), common);
        if (ret) {
                dev_err(dev, "failure requesting rx irq %u, %d\n",
                        rx_chn->irq, ret);
-               rx_chn->irq = -EINVAL;
                goto err;
        }
 
-       return 0;
-
 err:
-       am65_cpsw_nuss_free_rx_chns(common);
+       i = devm_add_action(dev, am65_cpsw_nuss_free_rx_chns, common);
+       if (i) {
+               dev_err(dev, "Failed to add free_rx_chns action %d\n", i);
+               return i;
+       }
 
        return ret;
 }
@@ -2105,26 +2123,9 @@ static int am65_cpsw_nuss_init_ndevs(struct am65_cpsw_common *common)
                        return ret;
        }
 
-       netif_napi_add(common->dma_ndev, &common->napi_rx,
-                      am65_cpsw_nuss_rx_poll);
-
        return ret;
 }
 
-static int am65_cpsw_nuss_ndev_add_tx_napi(struct am65_cpsw_common *common)
-{
-       int i;
-
-       for (i = 0; i < common->tx_ch_num; i++) {
-               struct am65_cpsw_tx_chn *tx_chn = &common->tx_chns[i];
-
-               netif_napi_add_tx(common->dma_ndev, &tx_chn->napi_tx,
-                                 am65_cpsw_nuss_tx_poll);
-       }
-
-       return 0;
-}
-
 static void am65_cpsw_nuss_cleanup_ndev(struct am65_cpsw_common *common)
 {
        struct am65_cpsw_port *port;
@@ -2587,7 +2588,11 @@ static int am65_cpsw_nuss_register_ndevs(struct am65_cpsw_common *common)
        struct am65_cpsw_port *port;
        int ret = 0, i;
 
-       ret = am65_cpsw_nuss_ndev_add_tx_napi(common);
+       /* init tx channels */
+       ret = am65_cpsw_nuss_init_tx_chns(common);
+       if (ret)
+               return ret;
+       ret = am65_cpsw_nuss_init_rx_chns(common);
        if (ret)
                return ret;
 
@@ -2634,10 +2639,8 @@ int am65_cpsw_nuss_update_tx_chns(struct am65_cpsw_common *common, int num_tx)
 
        common->tx_ch_num = num_tx;
        ret = am65_cpsw_nuss_init_tx_chns(common);
-       if (ret)
-               return ret;
 
-       return am65_cpsw_nuss_ndev_add_tx_napi(common);
+       return ret;
 }
 
 struct am65_cpsw_soc_pdata {
@@ -2745,6 +2748,7 @@ static int am65_cpsw_nuss_probe(struct platform_device *pdev)
        if (common->port_num < 1 || common->port_num > AM65_CPSW_MAX_PORTS)
                return -ENOENT;
 
+       common->rx_flow_id_base = -1;
        init_completion(&common->tdown_complete);
        common->tx_ch_num = 1;
        common->pf_p0_rx_ptype_rrobin = false;
@@ -2878,10 +2882,10 @@ static int am65_cpsw_nuss_remove(struct platform_device *pdev)
 static int am65_cpsw_nuss_suspend(struct device *dev)
 {
        struct am65_cpsw_common *common = dev_get_drvdata(dev);
+       struct am65_cpsw_host *host_p = am65_common_get_host(common);
        struct am65_cpsw_port *port;
        struct net_device *ndev;
        int i, ret;
-       struct am65_cpsw_host *host_p = am65_common_get_host(common);
 
        cpsw_ale_dump(common->ale, common->ale_context);
        host_p->vid_context = readl(host_p->port_base + AM65_CPSW_PORT_VLAN_REG_OFFSET);
@@ -2907,6 +2911,9 @@ static int am65_cpsw_nuss_suspend(struct device *dev)
 
        am65_cpts_suspend(common->cpts);
 
+       am65_cpsw_nuss_remove_rx_chns(common);
+       am65_cpsw_nuss_remove_tx_chns(common);
+
        return 0;
 }
 
@@ -2918,6 +2925,17 @@ static int am65_cpsw_nuss_resume(struct device *dev)
        int i, ret;
        struct am65_cpsw_host *host_p = am65_common_get_host(common);
 
+       ret = am65_cpsw_nuss_init_tx_chns(common);
+       if (ret)
+               return ret;
+       ret = am65_cpsw_nuss_init_rx_chns(common);
+       if (ret)
+               return ret;
+
+       /* If RX IRQ was disabled before suspend, keep it disabled */
+       if (common->rx_irq_disabled)
+               disable_irq(common->rx_chns.irq);
+
        am65_cpts_resume(common->cpts);
 
        for (i = 0; i < common->port_num; i++) {
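
The resume path rebuilds the tx/rx channels from scratch, which is why init_rx_chns() now takes flow_id_base from common->rx_flow_id_base (probe seeds it with -1, i.e. "let the DMA layer choose a range") rather than hard-coding -1, so a re-init path can request a previously assigned base. A toy model of that -1-means-allocate convention; the mock allocator and names are illustrative only:

#include <stdio.h>

struct mock_common {
	int rx_flow_id_base;	/* -1 until the "hardware" assigns one */
};

/* Stand-in for the DMA glue layer handing out a flow ID range */
static int mock_hw_alloc_flow_base(int requested)
{
	static int next_free = 32;

	return requested >= 0 ? requested : next_free++;
}

static void init_rx(struct mock_common *c)
{
	/* Pass the stored base back in; only the first init sends -1 */
	c->rx_flow_id_base = mock_hw_alloc_flow_base(c->rx_flow_id_base);
	printf("rx flow base: %d\n", c->rx_flow_id_base);
}

int main(void)
{
	struct mock_common c = { .rx_flow_id_base = -1 };

	init_rx(&c);	/* first init: allocator picks 32 */
	init_rx(&c);	/* re-init: requests the same base again */
	return 0;
}
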
index 450b16a..e1a569b 100644 (file)
@@ -885,7 +885,7 @@ static int ca8210_spi_transfer(
 
        dev_dbg(&spi->dev, "%s called\n", __func__);
 
-       cas_ctl = kmalloc(sizeof(*cas_ctl), GFP_ATOMIC);
+       cas_ctl = kzalloc(sizeof(*cas_ctl), GFP_ATOMIC);
        if (!cas_ctl)
                return -ENOMEM;
 
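
The kzalloc() switch guarantees the transfer-control structure starts fully zeroed, so any field the setup path leaves untouched cannot hand stale heap contents to the SPI layer. The userspace analogue is malloc() versus calloc():

#include <stdio.h>
#include <stdlib.h>

struct xfer_ctl {
	int len;
	int flags;	/* only sometimes set by callers */
};

int main(void)
{
	/* malloc(): contents indeterminate, like kmalloc() */
	struct xfer_ctl *a = malloc(sizeof(*a));
	if (!a)
		return 1;
	a->len = 8;	/* a->flags stays garbage until written */

	/* calloc(): zeroed, like kzalloc() */
	struct xfer_ctl *b = calloc(1, sizeof(*b));
	if (!b)
		return 1;
	printf("b->flags = %d\n", b->flags);	/* always 0 */

	free(a);
	free(b);
	return 0;
}
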
index c69b87d..edc769d 100644 (file)
@@ -970,7 +970,7 @@ static int cc2520_hw_init(struct cc2520_private *priv)
 
                if (timeout-- <= 0) {
                        dev_err(&priv->spi->dev, "oscillator start failed!\n");
-                       return ret;
+                       return -ETIMEDOUT;
                }
                udelay(1);
        } while (!(status & CC2520_STATUS_XOSC32M_STABLE));
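
The fix is needed because ret still held 0 from the preceding successful status read, so the timeout branch reported success and callers proceeded with an unstable oscillator; a dedicated -ETIMEDOUT makes the failure observable. A generic sketch of the bounded-poll idiom with a mock register read:

#include <errno.h>
#include <stdio.h>

#define XOSC_STABLE 0x01

/* Mock register read: pretend the oscillator never stabilizes */
static int read_status(unsigned char *status)
{
	*status = 0;
	return 0;
}

static int wait_for_osc(void)
{
	unsigned char status;
	int timeout = 100;
	int ret;

	do {
		ret = read_status(&status);
		if (ret)
			return ret;		/* bus error */
		if (timeout-- <= 0)
			return -ETIMEDOUT;	/* distinct from ret == 0 */
	} while (!(status & XOSC_STABLE));

	return 0;
}

int main(void)
{
	printf("wait_for_osc() = %d\n", wait_for_osc());
	return 0;
}
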
index 48255fc..8cdcaaf 100644 (file)
@@ -2,7 +2,7 @@
 #
 # Makefile for the Qualcomm IPA driver.
 
-IPA_VERSIONS           :=      3.1 3.5.1 4.2 4.5 4.9 4.11
+IPA_VERSIONS           :=      3.1 3.5.1 4.2 4.5 4.7 4.9 4.11
 
 obj-$(CONFIG_QCOM_IPA) +=      ipa.o
 
diff --git a/drivers/net/ipa/data/ipa_data-v4.7.c b/drivers/net/ipa/data/ipa_data-v4.7.c
new file mode 100644 (file)
index 0000000..7552c40
--- /dev/null
@@ -0,0 +1,405 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/* Copyright (C) 2022 Linaro Ltd. */
+
+#include <linux/log2.h>
+
+#include "../gsi.h"
+#include "../ipa_data.h"
+#include "../ipa_endpoint.h"
+#include "../ipa_mem.h"
+
+/** enum ipa_resource_type - IPA resource types for an SoC having IPA v4.7 */
+enum ipa_resource_type {
+       /* Source resource types; first must have value 0 */
+       IPA_RESOURCE_TYPE_SRC_PKT_CONTEXTS              = 0,
+       IPA_RESOURCE_TYPE_SRC_DESCRIPTOR_LISTS,
+       IPA_RESOURCE_TYPE_SRC_DESCRIPTOR_BUFF,
+       IPA_RESOURCE_TYPE_SRC_HPS_DMARS,
+       IPA_RESOURCE_TYPE_SRC_ACK_ENTRIES,
+
+       /* Destination resource types; first must have value 0 */
+       IPA_RESOURCE_TYPE_DST_DATA_SECTORS              = 0,
+       IPA_RESOURCE_TYPE_DST_DPS_DMARS,
+};
+
+/* Resource groups used for an SoC having IPA v4.7 */
+enum ipa_rsrc_group_id {
+       /* Source resource group identifiers */
+       IPA_RSRC_GROUP_SRC_UL_DL                        = 0,
+       IPA_RSRC_GROUP_SRC_UC_RX_Q,
+       IPA_RSRC_GROUP_SRC_COUNT,       /* Last in set; not a source group */
+
+       /* Destination resource group identifiers */
+       IPA_RSRC_GROUP_DST_UL_DL_DPL                    = 0,
+       IPA_RSRC_GROUP_DST_UNUSED_1,
+       IPA_RSRC_GROUP_DST_COUNT,       /* Last; not a destination group */
+};
+
+/* QSB configuration data for an SoC having IPA v4.7 */
+static const struct ipa_qsb_data ipa_qsb_data[] = {
+       [IPA_QSB_MASTER_DDR] = {
+               .max_writes             = 8,
+               .max_reads              = 0,    /* no limit (hardware max) */
+               .max_reads_beats        = 120,
+       },
+};
+
+/* Endpoint configuration data for an SoC having IPA v4.7 */
+static const struct ipa_gsi_endpoint_data ipa_gsi_endpoint_data[] = {
+       [IPA_ENDPOINT_AP_COMMAND_TX] = {
+               .ee_id          = GSI_EE_AP,
+               .channel_id     = 5,
+               .endpoint_id    = 7,
+               .toward_ipa     = true,
+               .channel = {
+                       .tre_count      = 256,
+                       .event_count    = 256,
+                       .tlv_count      = 20,
+               },
+               .endpoint = {
+                       .config = {
+                               .resource_group = IPA_RSRC_GROUP_SRC_UL_DL,
+                               .dma_mode       = true,
+                               .dma_endpoint   = IPA_ENDPOINT_AP_LAN_RX,
+                               .tx = {
+                                       .seq_type = IPA_SEQ_DMA,
+                               },
+                       },
+               },
+       },
+       [IPA_ENDPOINT_AP_LAN_RX] = {
+               .ee_id          = GSI_EE_AP,
+               .channel_id     = 14,
+               .endpoint_id    = 9,
+               .toward_ipa     = false,
+               .channel = {
+                       .tre_count      = 256,
+                       .event_count    = 256,
+                       .tlv_count      = 9,
+               },
+               .endpoint = {
+                       .config = {
+                               .resource_group = IPA_RSRC_GROUP_DST_UL_DL_DPL,
+                               .aggregation    = true,
+                               .status_enable  = true,
+                               .rx = {
+                                       .buffer_size    = 8192,
+                                       .pad_align      = ilog2(sizeof(u32)),
+                                       .aggr_time_limit = 500,
+                               },
+                       },
+               },
+       },
+       [IPA_ENDPOINT_AP_MODEM_TX] = {
+               .ee_id          = GSI_EE_AP,
+               .channel_id     = 2,
+               .endpoint_id    = 2,
+               .toward_ipa     = true,
+               .channel = {
+                       .tre_count      = 512,
+                       .event_count    = 512,
+                       .tlv_count      = 16,
+               },
+               .endpoint = {
+                       .filter_support = true,
+                       .config = {
+                               .resource_group = IPA_RSRC_GROUP_SRC_UL_DL,
+                               .qmap           = true,
+                               .status_enable  = true,
+                               .tx = {
+                                       .seq_type = IPA_SEQ_2_PASS_SKIP_LAST_UC,
+                                       .status_endpoint =
+                                               IPA_ENDPOINT_MODEM_AP_RX,
+                               },
+                       },
+               },
+       },
+       [IPA_ENDPOINT_AP_MODEM_RX] = {
+               .ee_id          = GSI_EE_AP,
+               .channel_id     = 7,
+               .endpoint_id    = 16,
+               .toward_ipa     = false,
+               .channel = {
+                       .tre_count      = 256,
+                       .event_count    = 256,
+                       .tlv_count      = 9,
+               },
+               .endpoint = {
+                       .config = {
+                               .resource_group = IPA_RSRC_GROUP_DST_UL_DL_DPL,
+                               .qmap           = true,
+                               .aggregation    = true,
+                               .rx = {
+                                       .buffer_size    = 8192,
+                                       .aggr_time_limit = 500,
+                                       .aggr_close_eof = true,
+                               },
+                       },
+               },
+       },
+       [IPA_ENDPOINT_MODEM_AP_TX] = {
+               .ee_id          = GSI_EE_MODEM,
+               .channel_id     = 0,
+               .endpoint_id    = 5,
+               .toward_ipa     = true,
+               .endpoint = {
+                       .filter_support = true,
+               },
+       },
+       [IPA_ENDPOINT_MODEM_AP_RX] = {
+               .ee_id          = GSI_EE_MODEM,
+               .channel_id     = 7,
+               .endpoint_id    = 14,
+               .toward_ipa     = false,
+       },
+       [IPA_ENDPOINT_MODEM_DL_NLO_TX] = {
+               .ee_id          = GSI_EE_MODEM,
+               .channel_id     = 2,
+               .endpoint_id    = 8,
+               .toward_ipa     = true,
+               .endpoint = {
+                       .filter_support = true,
+               },
+       },
+};
+
+/* Source resource configuration data for an SoC having IPA v4.7 */
+static const struct ipa_resource ipa_resource_src[] = {
+       [IPA_RESOURCE_TYPE_SRC_PKT_CONTEXTS] = {
+               .limits[IPA_RSRC_GROUP_SRC_UL_DL] = {
+                       .min = 8,       .max = 8,
+               },
+       },
+       [IPA_RESOURCE_TYPE_SRC_DESCRIPTOR_LISTS] = {
+               .limits[IPA_RSRC_GROUP_SRC_UL_DL] = {
+                       .min = 8,       .max = 8,
+               },
+       },
+       [IPA_RESOURCE_TYPE_SRC_DESCRIPTOR_BUFF] = {
+               .limits[IPA_RSRC_GROUP_SRC_UL_DL] = {
+                       .min = 18,      .max = 18,
+               },
+       },
+       [IPA_RESOURCE_TYPE_SRC_HPS_DMARS] = {
+               .limits[IPA_RSRC_GROUP_SRC_UL_DL] = {
+                       .min = 2,       .max = 2,
+               },
+       },
+       [IPA_RESOURCE_TYPE_SRC_ACK_ENTRIES] = {
+               .limits[IPA_RSRC_GROUP_SRC_UL_DL] = {
+                       .min = 15,      .max = 15,
+               },
+       },
+};
+
+/* Destination resource configuration data for an SoC having IPA v4.7 */
+static const struct ipa_resource ipa_resource_dst[] = {
+       [IPA_RESOURCE_TYPE_DST_DATA_SECTORS] = {
+               .limits[IPA_RSRC_GROUP_DST_UL_DL_DPL] = {
+                       .min = 7,       .max = 7,
+               },
+       },
+       [IPA_RESOURCE_TYPE_DST_DPS_DMARS] = {
+               .limits[IPA_RSRC_GROUP_DST_UL_DL_DPL] = {
+                       .min = 2,       .max = 2,
+               },
+       },
+};
+
+/* Resource configuration data for an SoC having IPA v4.7 */
+static const struct ipa_resource_data ipa_resource_data = {
+       .rsrc_group_dst_count   = IPA_RSRC_GROUP_DST_COUNT,
+       .rsrc_group_src_count   = IPA_RSRC_GROUP_SRC_COUNT,
+       .resource_src_count     = ARRAY_SIZE(ipa_resource_src),
+       .resource_src           = ipa_resource_src,
+       .resource_dst_count     = ARRAY_SIZE(ipa_resource_dst),
+       .resource_dst           = ipa_resource_dst,
+};
+
+/* IPA-resident memory region data for an SoC having IPA v4.7 */
+static const struct ipa_mem ipa_mem_local_data[] = {
+       {
+               .id             = IPA_MEM_UC_SHARED,
+               .offset         = 0x0000,
+               .size           = 0x0080,
+               .canary_count   = 0,
+       },
+       {
+               .id             = IPA_MEM_UC_INFO,
+               .offset         = 0x0080,
+               .size           = 0x0200,
+               .canary_count   = 0,
+       },
+       {
+               .id             = IPA_MEM_V4_FILTER_HASHED,
+               .offset         = 0x0288,
+               .size           = 0x0078,
+               .canary_count   = 2,
+       },
+       {
+               .id             = IPA_MEM_V4_FILTER,
+               .offset         = 0x0308,
+               .size           = 0x0078,
+               .canary_count   = 2,
+       },
+       {
+               .id             = IPA_MEM_V6_FILTER_HASHED,
+               .offset         = 0x0388,
+               .size           = 0x0078,
+               .canary_count   = 2,
+       },
+       {
+               .id             = IPA_MEM_V6_FILTER,
+               .offset         = 0x0408,
+               .size           = 0x0078,
+               .canary_count   = 2,
+       },
+       {
+               .id             = IPA_MEM_V4_ROUTE_HASHED,
+               .offset         = 0x0488,
+               .size           = 0x0078,
+               .canary_count   = 2,
+       },
+       {
+               .id             = IPA_MEM_V4_ROUTE,
+               .offset         = 0x0508,
+               .size           = 0x0078,
+               .canary_count   = 2,
+       },
+       {
+               .id             = IPA_MEM_V6_ROUTE_HASHED,
+               .offset         = 0x0588,
+               .size           = 0x0078,
+               .canary_count   = 2,
+       },
+       {
+               .id             = IPA_MEM_V6_ROUTE,
+               .offset         = 0x0608,
+               .size           = 0x0078,
+               .canary_count   = 2,
+       },
+       {
+               .id             = IPA_MEM_MODEM_HEADER,
+               .offset         = 0x0688,
+               .size           = 0x0240,
+               .canary_count   = 2,
+       },
+       {
+               .id             = IPA_MEM_AP_HEADER,
+               .offset         = 0x08c8,
+               .size           = 0x0200,
+               .canary_count   = 0,
+       },
+       {
+               .id             = IPA_MEM_MODEM_PROC_CTX,
+               .offset         = 0x0ad0,
+               .size           = 0x0200,
+               .canary_count   = 2,
+       },
+       {
+               .id             = IPA_MEM_AP_PROC_CTX,
+               .offset         = 0x0cd0,
+               .size           = 0x0200,
+               .canary_count   = 0,
+       },
+       {
+               .id             = IPA_MEM_NAT_TABLE,
+               .offset         = 0x0ee0,
+               .size           = 0x0d00,
+               .canary_count   = 4,
+       },
+       {
+               .id             = IPA_MEM_PDN_CONFIG,
+               .offset         = 0x1be8,
+               .size           = 0x0050,
+               .canary_count   = 0,
+       },
+       {
+               .id             = IPA_MEM_STATS_QUOTA_MODEM,
+               .offset         = 0x1c40,
+               .size           = 0x0030,
+               .canary_count   = 4,
+       },
+       {
+               .id             = IPA_MEM_STATS_QUOTA_AP,
+               .offset         = 0x1c70,
+               .size           = 0x0048,
+               .canary_count   = 0,
+       },
+       {
+               .id             = IPA_MEM_STATS_TETHERING,
+               .offset         = 0x1cb8,
+               .size           = 0x0238,
+               .canary_count   = 0,
+       },
+       {
+               .id             = IPA_MEM_STATS_DROP,
+               .offset         = 0x1ef0,
+               .size           = 0x0020,
+               .canary_count   = 0,
+       },
+       {
+               .id             = IPA_MEM_MODEM,
+               .offset         = 0x1f18,
+               .size           = 0x100c,
+               .canary_count   = 2,
+       },
+       {
+               .id             = IPA_MEM_END_MARKER,
+               .offset         = 0x3000,
+               .size           = 0x0000,
+               .canary_count   = 1,
+       },
+};
+
+/* Memory configuration data for an SoC having IPA v4.7 */
+static const struct ipa_mem_data ipa_mem_data = {
+       .local_count    = ARRAY_SIZE(ipa_mem_local_data),
+       .local          = ipa_mem_local_data,
+       .imem_addr      = 0x146a9000,
+       .imem_size      = 0x00002000,
+       .smem_id        = 497,
+       .smem_size      = 0x00009000,
+};
+
+/* Interconnect rates are in 1000 byte/second units */
+static const struct ipa_interconnect_data ipa_interconnect_data[] = {
+       {
+               .name                   = "memory",
+               .peak_bandwidth         = 600000,       /* 600 MBps */
+               .average_bandwidth      = 150000,       /* 150 MBps */
+       },
+       /* Average rate is unused for the next two interconnects */
+       {
+               .name                   = "imem",
+               .peak_bandwidth         = 450000,       /* 450 MBps */
+               .average_bandwidth      = 75000,        /* 75 MBps (unused?) */
+       },
+       {
+               .name                   = "config",
+               .peak_bandwidth         = 171400,       /* 171.4 MBps */
+               .average_bandwidth      = 0,            /* unused */
+       },
+};
+
+/* Clock and interconnect configuration data for an SoC having IPA v4.7 */
+static const struct ipa_power_data ipa_power_data = {
+       /* XXX Downstream code says 150 MHz (DT SVS2), 60 MHz (code) */
+       .core_clock_rate        = 100 * 1000 * 1000,    /* Hz (150?  60?) */
+       .interconnect_count     = ARRAY_SIZE(ipa_interconnect_data),
+       .interconnect_data      = ipa_interconnect_data,
+};
+
+/* Configuration data for an SoC having IPA v4.7 */
+const struct ipa_data ipa_data_v4_7 = {
+       .version                = IPA_VERSION_4_7,
+       .qsb_count              = ARRAY_SIZE(ipa_qsb_data),
+       .qsb_data               = ipa_qsb_data,
+       .modem_route_count      = 8,
+       .endpoint_count         = ARRAY_SIZE(ipa_gsi_endpoint_data),
+       .endpoint_data          = ipa_gsi_endpoint_data,
+       .resource_data          = &ipa_resource_data,
+       .mem_data               = &ipa_mem_data,
+       .power_data             = &ipa_power_data,
+};
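
Like the other per-version IPA data files, everything here is static const tables indexed by enum values through designated initializers, with counts derived via ARRAY_SIZE() so a table and its advertised length cannot drift apart. A stripped-down sketch of the pattern (invented names, not the driver's real structures):

#include <stdio.h>

#define ARRAY_SIZE(a) (sizeof(a) / sizeof((a)[0]))

enum endpoint_id {
	EP_COMMAND_TX,
	EP_LAN_RX,
	EP_MODEM_TX,
};

struct endpoint_data {
	unsigned int channel_id;
	unsigned int tlv_count;
};

/* Designated initializers key entries to the enum, not to order */
static const struct endpoint_data endpoint_data[] = {
	[EP_COMMAND_TX]	= { .channel_id = 5,  .tlv_count = 20 },
	[EP_LAN_RX]	= { .channel_id = 14, .tlv_count = 9 },
	[EP_MODEM_TX]	= { .channel_id = 2,  .tlv_count = 16 },
};

struct soc_data {
	unsigned int endpoint_count;
	const struct endpoint_data *endpoints;
};

static const struct soc_data soc_data = {
	.endpoint_count	= ARRAY_SIZE(endpoint_data),
	.endpoints	= endpoint_data,
};

int main(void)
{
	for (unsigned int i = 0; i < soc_data.endpoint_count; i++)
		printf("endpoint %u -> channel %u\n",
		       i, soc_data.endpoints[i].channel_id);
	return 0;
}
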
index 412edbf..818e641 100644 (file)
@@ -246,6 +246,7 @@ extern const struct ipa_data ipa_data_v3_1;
 extern const struct ipa_data ipa_data_v3_5_1;
 extern const struct ipa_data ipa_data_v4_2;
 extern const struct ipa_data ipa_data_v4_5;
+extern const struct ipa_data ipa_data_v4_7;
 extern const struct ipa_data ipa_data_v4_9;
 extern const struct ipa_data ipa_data_v4_11;
 
index 8f20825..4fb92f7 100644 (file)
@@ -663,6 +663,10 @@ static const struct of_device_id ipa_match[] = {
                .data           = &ipa_data_v4_5,
        },
        {
+               .compatible     = "qcom,sm6350-ipa",
+               .data           = &ipa_data_v4_7,
+       },
+       {
                .compatible     = "qcom,sm8350-ipa",
                .data           = &ipa_data_v4_9,
        },
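
The match table is what lets a single probe routine serve every supported SoC: the DT compatible string selects an entry whose .data points at the per-version configuration. A simplified userspace model of that string-to-data dispatch; the table walk below only approximates what the OF core does:

#include <stdio.h>
#include <string.h>

struct ipa_data { unsigned int version; };

static const struct ipa_data data_v4_7 = { .version = 47 };
static const struct ipa_data data_v4_9 = { .version = 49 };

struct of_match {
	const char *compatible;
	const void *data;
};

static const struct of_match match[] = {
	{ .compatible = "qcom,sm6350-ipa", .data = &data_v4_7 },
	{ .compatible = "qcom,sm8350-ipa", .data = &data_v4_9 },
	{ NULL, NULL }	/* sentinel */
};

static const void *match_data(const char *compat)
{
	for (const struct of_match *m = match; m->compatible; m++)
		if (!strcmp(m->compatible, compat))
			return m->data;
	return NULL;
}

int main(void)
{
	const struct ipa_data *d = match_data("qcom,sm6350-ipa");

	if (d)
		printf("probing IPA v%u.%u\n",
		       d->version / 10, d->version % 10);
	return 0;
}
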
index 22f0677..ddd5291 100644 (file)
@@ -86,6 +86,8 @@ static const struct ipa_regs *ipa_regs(enum ipa_version version)
                return &ipa_regs_v4_2;
        case IPA_VERSION_4_5:
                return &ipa_regs_v4_5;
+       case IPA_VERSION_4_7:
+               return &ipa_regs_v4_7;
        case IPA_VERSION_4_9:
                return &ipa_regs_v4_9;
        case IPA_VERSION_4_11:
index 7bf70f7..ff64b19 100644 (file)
@@ -658,6 +658,7 @@ extern const struct ipa_regs ipa_regs_v3_1;
 extern const struct ipa_regs ipa_regs_v3_5_1;
 extern const struct ipa_regs ipa_regs_v4_2;
 extern const struct ipa_regs ipa_regs_v4_5;
+extern const struct ipa_regs ipa_regs_v4_7;
 extern const struct ipa_regs ipa_regs_v4_9;
 extern const struct ipa_regs ipa_regs_v4_11;
 
index 5cbc15a..14bd2f9 100644 (file)
@@ -46,7 +46,7 @@ version_show(struct device *dev, struct device_attribute *attr, char *buf)
 {
        struct ipa *ipa = dev_get_drvdata(dev);
 
-       return scnprintf(buf, PAGE_SIZE, "%s\n", ipa_version_string(ipa));
+       return sysfs_emit(buf, "%s\n", ipa_version_string(ipa));
 }
 
 static DEVICE_ATTR_RO(version);
@@ -70,7 +70,7 @@ static ssize_t rx_offload_show(struct device *dev,
 {
        struct ipa *ipa = dev_get_drvdata(dev);
 
-       return scnprintf(buf, PAGE_SIZE, "%s\n", ipa_offload_string(ipa));
+       return sysfs_emit(buf, "%s\n", ipa_offload_string(ipa));
 }
 
 static DEVICE_ATTR_RO(rx_offload);
@@ -80,7 +80,7 @@ static ssize_t tx_offload_show(struct device *dev,
 {
        struct ipa *ipa = dev_get_drvdata(dev);
 
-       return scnprintf(buf, PAGE_SIZE, "%s\n", ipa_offload_string(ipa));
+       return sysfs_emit(buf, "%s\n", ipa_offload_string(ipa));
 }
 
 static DEVICE_ATTR_RO(tx_offload);
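
sysfs_emit() is preferred over scnprintf(buf, PAGE_SIZE, ...) in show() callbacks because it encodes the sysfs contract itself: the buffer is one full page, and the helper complains if handed a pointer that is not page-aligned, catching callers that wrongly offset into buf. A rough userspace approximation (the alignment check and PAGE_SIZE bound are mocked up; this is not the kernel implementation):

#include <assert.h>
#include <stdarg.h>
#include <stdint.h>
#include <stdio.h>

#define PAGE_SIZE 4096

/* Userspace stand-in for the kernel's sysfs_emit() */
static int mock_sysfs_emit(char *buf, const char *fmt, ...)
{
	va_list args;
	int len;

	/* sysfs show() buffers are one full, page-aligned page */
	assert(buf && ((uintptr_t)buf & (PAGE_SIZE - 1)) == 0);

	va_start(args, fmt);
	len = vsnprintf(buf, PAGE_SIZE, fmt, args);
	va_end(args);

	/* like scnprintf(), report bytes written, not bytes wanted */
	return len < PAGE_SIZE ? len : PAGE_SIZE - 1;
}

int main(void)
{
	static char page[PAGE_SIZE] __attribute__((aligned(PAGE_SIZE)));

	int n = mock_sysfs_emit(page, "%s\n", "4.7");
	printf("%d bytes: %s", n, page);
	return 0;
}
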
index 7889c31..d158214 100644 (file)
@@ -48,6 +48,7 @@ static inline bool ipa_version_supported(enum ipa_version version)
        case IPA_VERSION_3_5_1:
        case IPA_VERSION_4_2:
        case IPA_VERSION_4_5:
+       case IPA_VERSION_4_7:
        case IPA_VERSION_4_9:
        case IPA_VERSION_4_11:
        case IPA_VERSION_5_0:
diff --git a/drivers/net/ipa/reg/ipa_reg-v4.7.c b/drivers/net/ipa/reg/ipa_reg-v4.7.c
new file mode 100644 (file)
index 0000000..21f8a58
--- /dev/null
@@ -0,0 +1,507 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/* Copyright (C) 2022 Linaro Ltd. */
+
+#include <linux/types.h>
+
+#include "../ipa.h"
+#include "../ipa_reg.h"
+
+static const u32 ipa_reg_comp_cfg_fmask[] = {
+       [RAM_ARB_PRI_CLIENT_SAMP_FIX_DIS]               = BIT(0),
+       [GSI_SNOC_BYPASS_DIS]                           = BIT(1),
+       [GEN_QMB_0_SNOC_BYPASS_DIS]                     = BIT(2),
+       [GEN_QMB_1_SNOC_BYPASS_DIS]                     = BIT(3),
+                                               /* Bit 4 reserved */
+       [IPA_QMB_SELECT_CONS_EN]                        = BIT(5),
+       [IPA_QMB_SELECT_PROD_EN]                        = BIT(6),
+       [GSI_MULTI_INORDER_RD_DIS]                      = BIT(7),
+       [GSI_MULTI_INORDER_WR_DIS]                      = BIT(8),
+       [GEN_QMB_0_MULTI_INORDER_RD_DIS]                = BIT(9),
+       [GEN_QMB_1_MULTI_INORDER_RD_DIS]                = BIT(10),
+       [GEN_QMB_0_MULTI_INORDER_WR_DIS]                = BIT(11),
+       [GEN_QMB_1_MULTI_INORDER_WR_DIS]                = BIT(12),
+       [GEN_QMB_0_SNOC_CNOC_LOOP_PROT_DIS]             = BIT(13),
+       [GSI_SNOC_CNOC_LOOP_PROT_DISABLE]               = BIT(14),
+       [GSI_MULTI_AXI_MASTERS_DIS]                     = BIT(15),
+       [IPA_QMB_SELECT_GLOBAL_EN]                      = BIT(16),
+       [ATOMIC_FETCHER_ARB_LOCK_DIS]                   = GENMASK(20, 17),
+       [FULL_FLUSH_WAIT_RS_CLOSURE_EN]                 = BIT(21),
+                                               /* Bits 22-31 reserved */
+};
+
+IPA_REG_FIELDS(COMP_CFG, comp_cfg, 0x0000003c);
+
+static const u32 ipa_reg_clkon_cfg_fmask[] = {
+       [CLKON_RX]                                      = BIT(0),
+       [CLKON_PROC]                                    = BIT(1),
+       [TX_WRAPPER]                                    = BIT(2),
+       [CLKON_MISC]                                    = BIT(3),
+       [RAM_ARB]                                       = BIT(4),
+       [FTCH_HPS]                                      = BIT(5),
+       [FTCH_DPS]                                      = BIT(6),
+       [CLKON_HPS]                                     = BIT(7),
+       [CLKON_DPS]                                     = BIT(8),
+       [RX_HPS_CMDQS]                                  = BIT(9),
+       [HPS_DPS_CMDQS]                                 = BIT(10),
+       [DPS_TX_CMDQS]                                  = BIT(11),
+       [RSRC_MNGR]                                     = BIT(12),
+       [CTX_HANDLER]                                   = BIT(13),
+       [ACK_MNGR]                                      = BIT(14),
+       [D_DCPH]                                        = BIT(15),
+       [H_DCPH]                                        = BIT(16),
+       [CLKON_DCMP]                                    = BIT(17),
+       [NTF_TX_CMDQS]                                  = BIT(18),
+       [CLKON_TX_0]                                    = BIT(19),
+       [CLKON_TX_1]                                    = BIT(20),
+       [CLKON_FNR]                                     = BIT(21),
+       [QSB2AXI_CMDQ_L]                                = BIT(22),
+       [AGGR_WRAPPER]                                  = BIT(23),
+       [RAM_SLAVEWAY]                                  = BIT(24),
+       [CLKON_QMB]                                     = BIT(25),
+       [WEIGHT_ARB]                                    = BIT(26),
+       [GSI_IF]                                        = BIT(27),
+       [CLKON_GLOBAL]                                  = BIT(28),
+       [GLOBAL_2X_CLK]                                 = BIT(29),
+       [DPL_FIFO]                                      = BIT(30),
+       [DRBIP]                                         = BIT(31),
+};
+
+IPA_REG_FIELDS(CLKON_CFG, clkon_cfg, 0x00000044);
+
+static const u32 ipa_reg_route_fmask[] = {
+       [ROUTE_DIS]                                     = BIT(0),
+       [ROUTE_DEF_PIPE]                                = GENMASK(5, 1),
+       [ROUTE_DEF_HDR_TABLE]                           = BIT(6),
+       [ROUTE_DEF_HDR_OFST]                            = GENMASK(16, 7),
+       [ROUTE_FRAG_DEF_PIPE]                           = GENMASK(21, 17),
+                                               /* Bits 22-23 reserved */
+       [ROUTE_DEF_RETAIN_HDR]                          = BIT(24),
+                                               /* Bits 25-31 reserved */
+};
+
+IPA_REG_FIELDS(ROUTE, route, 0x00000048);
+
+static const u32 ipa_reg_shared_mem_size_fmask[] = {
+       [MEM_SIZE]                                      = GENMASK(15, 0),
+       [MEM_BADDR]                                     = GENMASK(31, 16),
+};
+
+IPA_REG_FIELDS(SHARED_MEM_SIZE, shared_mem_size, 0x00000054);
+
+static const u32 ipa_reg_qsb_max_writes_fmask[] = {
+       [GEN_QMB_0_MAX_WRITES]                          = GENMASK(3, 0),
+       [GEN_QMB_1_MAX_WRITES]                          = GENMASK(7, 4),
+                                               /* Bits 8-31 reserved */
+};
+
+IPA_REG_FIELDS(QSB_MAX_WRITES, qsb_max_writes, 0x00000074);
+
+static const u32 ipa_reg_qsb_max_reads_fmask[] = {
+       [GEN_QMB_0_MAX_READS]                           = GENMASK(3, 0),
+       [GEN_QMB_1_MAX_READS]                           = GENMASK(7, 4),
+                                               /* Bits 8-15 reserved */
+       [GEN_QMB_0_MAX_READS_BEATS]                     = GENMASK(23, 16),
+       [GEN_QMB_1_MAX_READS_BEATS]                     = GENMASK(31, 24),
+};
+
+IPA_REG_FIELDS(QSB_MAX_READS, qsb_max_reads, 0x00000078);
+
+static const u32 ipa_reg_filt_rout_hash_en_fmask[] = {
+       [IPV6_ROUTER_HASH]                              = BIT(0),
+                                               /* Bits 1-3 reserved */
+       [IPV6_FILTER_HASH]                              = BIT(4),
+                                               /* Bits 5-7 reserved */
+       [IPV4_ROUTER_HASH]                              = BIT(8),
+                                               /* Bits 9-11 reserved */
+       [IPV4_FILTER_HASH]                              = BIT(12),
+                                               /* Bits 13-31 reserved */
+};
+
+IPA_REG_FIELDS(FILT_ROUT_HASH_EN, filt_rout_hash_en, 0x0000148);
+
+static const u32 ipa_reg_filt_rout_hash_flush_fmask[] = {
+       [IPV6_ROUTER_HASH]                              = BIT(0),
+                                               /* Bits 1-3 reserved */
+       [IPV6_FILTER_HASH]                              = BIT(4),
+                                               /* Bits 5-7 reserved */
+       [IPV4_ROUTER_HASH]                              = BIT(8),
+                                               /* Bits 9-11 reserved */
+       [IPV4_FILTER_HASH]                              = BIT(12),
+                                               /* Bits 13-31 reserved */
+};
+
+IPA_REG_FIELDS(FILT_ROUT_HASH_FLUSH, filt_rout_hash_flush, 0x000014c);
+
+/* Valid bits defined by ipa->available */
+IPA_REG_STRIDE(STATE_AGGR_ACTIVE, state_aggr_active, 0x000000b4, 0x0004);
+
+static const u32 ipa_reg_local_pkt_proc_cntxt_fmask[] = {
+       [IPA_BASE_ADDR]                                 = GENMASK(17, 0),
+                                               /* Bits 18-31 reserved */
+};
+
+/* Offset must be a multiple of 8 */
+IPA_REG_FIELDS(LOCAL_PKT_PROC_CNTXT, local_pkt_proc_cntxt, 0x000001e8);
+
+/* Valid bits defined by ipa->available */
+IPA_REG_STRIDE(AGGR_FORCE_CLOSE, aggr_force_close, 0x000001ec, 0x0004);
+
+static const u32 ipa_reg_ipa_tx_cfg_fmask[] = {
+                                               /* Bits 0-1 reserved */
+       [PREFETCH_ALMOST_EMPTY_SIZE_TX0]                = GENMASK(5, 2),
+       [DMAW_SCND_OUTSD_PRED_THRESHOLD]                = GENMASK(9, 6),
+       [DMAW_SCND_OUTSD_PRED_EN]                       = BIT(10),
+       [DMAW_MAX_BEATS_256_DIS]                        = BIT(11),
+       [PA_MASK_EN]                                    = BIT(12),
+       [PREFETCH_ALMOST_EMPTY_SIZE_TX1]                = GENMASK(16, 13),
+       [DUAL_TX_ENABLE]                                = BIT(17),
+       [SSPND_PA_NO_START_STATE]                       = BIT(18),
+                                               /* Bits 19-31 reserved */
+};
+
+IPA_REG_FIELDS(IPA_TX_CFG, ipa_tx_cfg, 0x000001fc);
+
+static const u32 ipa_reg_flavor_0_fmask[] = {
+       [MAX_PIPES]                                     = GENMASK(3, 0),
+                                               /* Bits 4-7 reserved */
+       [MAX_CONS_PIPES]                                = GENMASK(12, 8),
+                                               /* Bits 13-15 reserved */
+       [MAX_PROD_PIPES]                                = GENMASK(20, 16),
+                                               /* Bits 21-23 reserved */
+       [PROD_LOWEST]                                   = GENMASK(27, 24),
+                                               /* Bits 28-31 reserved */
+};
+
+IPA_REG_FIELDS(FLAVOR_0, flavor_0, 0x00000210);
+
+static const u32 ipa_reg_idle_indication_cfg_fmask[] = {
+       [ENTER_IDLE_DEBOUNCE_THRESH]                    = GENMASK(15, 0),
+       [CONST_NON_IDLE_ENABLE]                         = BIT(16),
+                                               /* Bits 17-31 reserved */
+};
+
+IPA_REG_FIELDS(IDLE_INDICATION_CFG, idle_indication_cfg, 0x00000240);
+
+static const u32 ipa_reg_qtime_timestamp_cfg_fmask[] = {
+       [DPL_TIMESTAMP_LSB]                             = GENMASK(4, 0),
+                                               /* Bits 5-6 reserved */
+       [DPL_TIMESTAMP_SEL]                             = BIT(7),
+       [TAG_TIMESTAMP_LSB]                             = GENMASK(12, 8),
+                                               /* Bits 13-15 reserved */
+       [NAT_TIMESTAMP_LSB]                             = GENMASK(20, 16),
+                                               /* Bits 21-31 reserved */
+};
+
+IPA_REG_FIELDS(QTIME_TIMESTAMP_CFG, qtime_timestamp_cfg, 0x0000024c);
+
+static const u32 ipa_reg_timers_xo_clk_div_cfg_fmask[] = {
+       [DIV_VALUE]                                     = GENMASK(8, 0),
+                                               /* Bits 9-30 reserved */
+       [DIV_ENABLE]                                    = BIT(31),
+};
+
+IPA_REG_FIELDS(TIMERS_XO_CLK_DIV_CFG, timers_xo_clk_div_cfg, 0x00000250);
+
+static const u32 ipa_reg_timers_pulse_gran_cfg_fmask[] = {
+       [PULSE_GRAN_0]                                  = GENMASK(2, 0),
+       [PULSE_GRAN_1]                                  = GENMASK(5, 3),
+       [PULSE_GRAN_2]                                  = GENMASK(8, 6),
+};
+
+IPA_REG_FIELDS(TIMERS_PULSE_GRAN_CFG, timers_pulse_gran_cfg, 0x00000254);
+
+static const u32 ipa_reg_src_rsrc_grp_01_rsrc_type_fmask[] = {
+       [X_MIN_LIM]                                     = GENMASK(5, 0),
+                                               /* Bits 6-7 reserved */
+       [X_MAX_LIM]                                     = GENMASK(13, 8),
+                                               /* Bits 14-15 reserved */
+       [Y_MIN_LIM]                                     = GENMASK(21, 16),
+                                               /* Bits 22-23 reserved */
+       [Y_MAX_LIM]                                     = GENMASK(29, 24),
+                                               /* Bits 30-31 reserved */
+};
+
+IPA_REG_STRIDE_FIELDS(SRC_RSRC_GRP_01_RSRC_TYPE, src_rsrc_grp_01_rsrc_type,
+                     0x00000400, 0x0020);
+
+static const u32 ipa_reg_src_rsrc_grp_23_rsrc_type_fmask[] = {
+       [X_MIN_LIM]                                     = GENMASK(5, 0),
+                                               /* Bits 6-7 reserved */
+       [X_MAX_LIM]                                     = GENMASK(13, 8),
+                                               /* Bits 14-15 reserved */
+       [Y_MIN_LIM]                                     = GENMASK(21, 16),
+                                               /* Bits 22-23 reserved */
+       [Y_MAX_LIM]                                     = GENMASK(29, 24),
+                                               /* Bits 30-31 reserved */
+};
+
+IPA_REG_STRIDE_FIELDS(SRC_RSRC_GRP_23_RSRC_TYPE, src_rsrc_grp_23_rsrc_type,
+                     0x00000404, 0x0020);
+
+static const u32 ipa_reg_dst_rsrc_grp_01_rsrc_type_fmask[] = {
+       [X_MIN_LIM]                                     = GENMASK(5, 0),
+                                               /* Bits 6-7 reserved */
+       [X_MAX_LIM]                                     = GENMASK(13, 8),
+                                               /* Bits 14-15 reserved */
+       [Y_MIN_LIM]                                     = GENMASK(21, 16),
+                                               /* Bits 22-23 reserved */
+       [Y_MAX_LIM]                                     = GENMASK(29, 24),
+                                               /* Bits 30-31 reserved */
+};
+
+IPA_REG_STRIDE_FIELDS(DST_RSRC_GRP_01_RSRC_TYPE, dst_rsrc_grp_01_rsrc_type,
+                     0x00000500, 0x0020);
+
+static const u32 ipa_reg_dst_rsrc_grp_23_rsrc_type_fmask[] = {
+       [X_MIN_LIM]                                     = GENMASK(5, 0),
+                                               /* Bits 6-7 reserved */
+       [X_MAX_LIM]                                     = GENMASK(13, 8),
+                                               /* Bits 14-15 reserved */
+       [Y_MIN_LIM]                                     = GENMASK(21, 16),
+                                               /* Bits 22-23 reserved */
+       [Y_MAX_LIM]                                     = GENMASK(29, 24),
+                                               /* Bits 30-31 reserved */
+};
+
+IPA_REG_STRIDE_FIELDS(DST_RSRC_GRP_23_RSRC_TYPE, dst_rsrc_grp_23_rsrc_type,
+                     0x00000504, 0x0020);
+
+static const u32 ipa_reg_endp_init_cfg_fmask[] = {
+       [FRAG_OFFLOAD_EN]                               = BIT(0),
+       [CS_OFFLOAD_EN]                                 = GENMASK(2, 1),
+       [CS_METADATA_HDR_OFFSET]                        = GENMASK(6, 3),
+                                               /* Bit 7 reserved */
+       [CS_GEN_QMB_MASTER_SEL]                         = BIT(8),
+                                               /* Bits 9-31 reserved */
+};
+
+IPA_REG_STRIDE_FIELDS(ENDP_INIT_CFG, endp_init_cfg, 0x00000808, 0x0070);
+
+static const u32 ipa_reg_endp_init_nat_fmask[] = {
+       [NAT_EN]                                        = GENMASK(1, 0),
+                                               /* Bits 2-31 reserved */
+};
+
+IPA_REG_STRIDE_FIELDS(ENDP_INIT_NAT, endp_init_nat, 0x0000080c, 0x0070);
+
+static const u32 ipa_reg_endp_init_hdr_fmask[] = {
+       [HDR_LEN]                                       = GENMASK(5, 0),
+       [HDR_OFST_METADATA_VALID]                       = BIT(6),
+       [HDR_OFST_METADATA]                             = GENMASK(12, 7),
+       [HDR_ADDITIONAL_CONST_LEN]                      = GENMASK(18, 13),
+       [HDR_OFST_PKT_SIZE_VALID]                       = BIT(19),
+       [HDR_OFST_PKT_SIZE]                             = GENMASK(25, 20),
+       [HDR_A5_MUX]                                    = BIT(26),
+       [HDR_LEN_INC_DEAGG_HDR]                         = BIT(27),
+       [HDR_LEN_MSB]                                   = GENMASK(29, 28),
+       [HDR_OFST_METADATA_MSB]                         = GENMASK(31, 30),
+};
+
+IPA_REG_STRIDE_FIELDS(ENDP_INIT_HDR, endp_init_hdr, 0x00000810, 0x0070);
+
+static const u32 ipa_reg_endp_init_hdr_ext_fmask[] = {
+       [HDR_ENDIANNESS]                                = BIT(0),
+       [HDR_TOTAL_LEN_OR_PAD_VALID]                    = BIT(1),
+       [HDR_TOTAL_LEN_OR_PAD]                          = BIT(2),
+       [HDR_PAYLOAD_LEN_INC_PADDING]                   = BIT(3),
+       [HDR_TOTAL_LEN_OR_PAD_OFFSET]                   = GENMASK(9, 4),
+       [HDR_PAD_TO_ALIGNMENT]                          = GENMASK(13, 10),
+                                               /* Bits 14-15 reserved */
+       [HDR_TOTAL_LEN_OR_PAD_OFFSET_MSB]               = GENMASK(17, 16),
+       [HDR_OFST_PKT_SIZE_MSB]                         = GENMASK(19, 18),
+       [HDR_ADDITIONAL_CONST_LEN_MSB]                  = GENMASK(21, 20),
+                                               /* Bits 22-31 reserved */
+};
+
+IPA_REG_STRIDE_FIELDS(ENDP_INIT_HDR_EXT, endp_init_hdr_ext, 0x00000814, 0x0070);
+
+IPA_REG_STRIDE(ENDP_INIT_HDR_METADATA_MASK, endp_init_hdr_metadata_mask,
+              0x00000818, 0x0070);
+
+static const u32 ipa_reg_endp_init_mode_fmask[] = {
+       [ENDP_MODE]                                     = GENMASK(2, 0),
+       [DCPH_ENABLE]                                   = BIT(3),
+       [DEST_PIPE_INDEX]                               = GENMASK(8, 4),
+                                               /* Bits 9-11 reserved */
+       [BYTE_THRESHOLD]                                = GENMASK(27, 12),
+       [PIPE_REPLICATION_EN]                           = BIT(28),
+       [PAD_EN]                                        = BIT(29),
+                                               /* Bits 30-31 reserved */
+};
+
+IPA_REG_STRIDE_FIELDS(ENDP_INIT_MODE, endp_init_mode, 0x00000820, 0x0070);
+
+static const u32 ipa_reg_endp_init_aggr_fmask[] = {
+       [AGGR_EN]                                       = GENMASK(1, 0),
+       [AGGR_TYPE]                                     = GENMASK(4, 2),
+       [BYTE_LIMIT]                                    = GENMASK(10, 5),
+                                               /* Bit 11 reserved */
+       [TIME_LIMIT]                                    = GENMASK(16, 12),
+       [PKT_LIMIT]                                     = GENMASK(22, 17),
+       [SW_EOF_ACTIVE]                                 = BIT(23),
+       [FORCE_CLOSE]                                   = BIT(24),
+                                               /* Bit 25 reserved */
+       [HARD_BYTE_LIMIT_EN]                            = BIT(26),
+       [AGGR_GRAN_SEL]                                 = BIT(27),
+                                               /* Bits 28-31 reserved */
+};
+
+IPA_REG_STRIDE_FIELDS(ENDP_INIT_AGGR, endp_init_aggr, 0x00000824, 0x0070);
+
+static const u32 ipa_reg_endp_init_hol_block_en_fmask[] = {
+       [HOL_BLOCK_EN]                                  = BIT(0),
+                                               /* Bits 1-31 reserved */
+};
+
+IPA_REG_STRIDE_FIELDS(ENDP_INIT_HOL_BLOCK_EN, endp_init_hol_block_en,
+                     0x0000082c, 0x0070);
+
+static const u32 ipa_reg_endp_init_hol_block_timer_fmask[] = {
+       [TIMER_LIMIT]                                   = GENMASK(4, 0),
+                                               /* Bits 5-7 reserved */
+       [TIMER_GRAN_SEL]                                = BIT(8),
+                                               /* Bits 9-31 reserved */
+};
+
+IPA_REG_STRIDE_FIELDS(ENDP_INIT_HOL_BLOCK_TIMER, endp_init_hol_block_timer,
+                     0x00000830, 0x0070);
+
+static const u32 ipa_reg_endp_init_deaggr_fmask[] = {
+       [DEAGGR_HDR_LEN]                                = GENMASK(5, 0),
+       [SYSPIPE_ERR_DETECTION]                         = BIT(6),
+       [PACKET_OFFSET_VALID]                           = BIT(7),
+       [PACKET_OFFSET_LOCATION]                        = GENMASK(13, 8),
+       [IGNORE_MIN_PKT_ERR]                            = BIT(14),
+                                               /* Bit 15 reserved */
+       [MAX_PACKET_LEN]                                = GENMASK(31, 16),
+};
+
+IPA_REG_STRIDE_FIELDS(ENDP_INIT_DEAGGR, endp_init_deaggr, 0x00000834, 0x0070);
+
+static const u32 ipa_reg_endp_init_rsrc_grp_fmask[] = {
+       [ENDP_RSRC_GRP]                                 = BIT(0),
+                                               /* Bits 1-31 reserved */
+};
+
+IPA_REG_STRIDE_FIELDS(ENDP_INIT_RSRC_GRP, endp_init_rsrc_grp,
+                     0x00000838, 0x0070);
+
+static const u32 ipa_reg_endp_init_seq_fmask[] = {
+       [SEQ_TYPE]                                      = GENMASK(7, 0),
+                                               /* Bits 8-31 reserved */
+};
+
+IPA_REG_STRIDE_FIELDS(ENDP_INIT_SEQ, endp_init_seq, 0x0000083c, 0x0070);
+
+static const u32 ipa_reg_endp_status_fmask[] = {
+       [STATUS_EN]                                     = BIT(0),
+       [STATUS_ENDP]                                   = GENMASK(5, 1),
+                                               /* Bits 6-8 reserved */
+       [STATUS_PKT_SUPPRESS]                           = BIT(9),
+                                               /* Bits 10-31 reserved */
+};
+
+IPA_REG_STRIDE_FIELDS(ENDP_STATUS, endp_status, 0x00000840, 0x0070);
+
+static const u32 ipa_reg_endp_filter_router_hsh_cfg_fmask[] = {
+       [FILTER_HASH_MSK_SRC_ID]                        = BIT(0),
+       [FILTER_HASH_MSK_SRC_IP]                        = BIT(1),
+       [FILTER_HASH_MSK_DST_IP]                        = BIT(2),
+       [FILTER_HASH_MSK_SRC_PORT]                      = BIT(3),
+       [FILTER_HASH_MSK_DST_PORT]                      = BIT(4),
+       [FILTER_HASH_MSK_PROTOCOL]                      = BIT(5),
+       [FILTER_HASH_MSK_METADATA]                      = BIT(6),
+       [FILTER_HASH_MSK_ALL]                           = GENMASK(6, 0),
+                                               /* Bits 7-15 reserved */
+       [ROUTER_HASH_MSK_SRC_ID]                        = BIT(16),
+       [ROUTER_HASH_MSK_SRC_IP]                        = BIT(17),
+       [ROUTER_HASH_MSK_DST_IP]                        = BIT(18),
+       [ROUTER_HASH_MSK_SRC_PORT]                      = BIT(19),
+       [ROUTER_HASH_MSK_DST_PORT]                      = BIT(20),
+       [ROUTER_HASH_MSK_PROTOCOL]                      = BIT(21),
+       [ROUTER_HASH_MSK_METADATA]                      = BIT(22),
+       [ROUTER_HASH_MSK_ALL]                           = GENMASK(22, 16),
+                                               /* Bits 23-31 reserved */
+};
+
+IPA_REG_STRIDE_FIELDS(ENDP_FILTER_ROUTER_HSH_CFG, endp_filter_router_hsh_cfg,
+                     0x0000085c, 0x0070);
+
+/* Valid bits defined by enum ipa_irq_id; only used for GSI_EE_AP */
+IPA_REG(IPA_IRQ_STTS, ipa_irq_stts, 0x00003008 + 0x1000 * GSI_EE_AP);
+
+/* Valid bits defined by enum ipa_irq_id; only used for GSI_EE_AP */
+IPA_REG(IPA_IRQ_EN, ipa_irq_en, 0x0000300c + 0x1000 * GSI_EE_AP);
+
+/* Valid bits defined by enum ipa_irq_id; only used for GSI_EE_AP */
+IPA_REG(IPA_IRQ_CLR, ipa_irq_clr, 0x00003010 + 0x1000 * GSI_EE_AP);
+
+static const u32 ipa_reg_ipa_irq_uc_fmask[] = {
+       [UC_INTR]                                       = BIT(0),
+                                               /* Bits 1-31 reserved */
+};
+
+IPA_REG_FIELDS(IPA_IRQ_UC, ipa_irq_uc, 0x0000301c + 0x1000 * GSI_EE_AP);
+
+/* Valid bits defined by ipa->available */
+IPA_REG_STRIDE(IRQ_SUSPEND_INFO, irq_suspend_info,
+              0x00003030 + 0x1000 * GSI_EE_AP, 0x0004);
+
+/* Valid bits defined by ipa->available */
+IPA_REG_STRIDE(IRQ_SUSPEND_EN, irq_suspend_en,
+              0x00003034 + 0x1000 * GSI_EE_AP, 0x0004);
+
+/* Valid bits defined by ipa->available */
+IPA_REG_STRIDE(IRQ_SUSPEND_CLR, irq_suspend_clr,
+              0x00003038 + 0x1000 * GSI_EE_AP, 0x0004);
+
+static const struct ipa_reg *ipa_reg_array[] = {
+       [COMP_CFG]                      = &ipa_reg_comp_cfg,
+       [CLKON_CFG]                     = &ipa_reg_clkon_cfg,
+       [ROUTE]                         = &ipa_reg_route,
+       [SHARED_MEM_SIZE]               = &ipa_reg_shared_mem_size,
+       [QSB_MAX_WRITES]                = &ipa_reg_qsb_max_writes,
+       [QSB_MAX_READS]                 = &ipa_reg_qsb_max_reads,
+       [FILT_ROUT_HASH_EN]             = &ipa_reg_filt_rout_hash_en,
+       [FILT_ROUT_HASH_FLUSH]          = &ipa_reg_filt_rout_hash_flush,
+       [STATE_AGGR_ACTIVE]             = &ipa_reg_state_aggr_active,
+       [LOCAL_PKT_PROC_CNTXT]          = &ipa_reg_local_pkt_proc_cntxt,
+       [AGGR_FORCE_CLOSE]              = &ipa_reg_aggr_force_close,
+       [IPA_TX_CFG]                    = &ipa_reg_ipa_tx_cfg,
+       [FLAVOR_0]                      = &ipa_reg_flavor_0,
+       [IDLE_INDICATION_CFG]           = &ipa_reg_idle_indication_cfg,
+       [QTIME_TIMESTAMP_CFG]           = &ipa_reg_qtime_timestamp_cfg,
+       [TIMERS_XO_CLK_DIV_CFG]         = &ipa_reg_timers_xo_clk_div_cfg,
+       [TIMERS_PULSE_GRAN_CFG]         = &ipa_reg_timers_pulse_gran_cfg,
+       [SRC_RSRC_GRP_01_RSRC_TYPE]     = &ipa_reg_src_rsrc_grp_01_rsrc_type,
+       [SRC_RSRC_GRP_23_RSRC_TYPE]     = &ipa_reg_src_rsrc_grp_23_rsrc_type,
+       [DST_RSRC_GRP_01_RSRC_TYPE]     = &ipa_reg_dst_rsrc_grp_01_rsrc_type,
+       [DST_RSRC_GRP_23_RSRC_TYPE]     = &ipa_reg_dst_rsrc_grp_23_rsrc_type,
+       [ENDP_INIT_CFG]                 = &ipa_reg_endp_init_cfg,
+       [ENDP_INIT_NAT]                 = &ipa_reg_endp_init_nat,
+       [ENDP_INIT_HDR]                 = &ipa_reg_endp_init_hdr,
+       [ENDP_INIT_HDR_EXT]             = &ipa_reg_endp_init_hdr_ext,
+       [ENDP_INIT_HDR_METADATA_MASK]   = &ipa_reg_endp_init_hdr_metadata_mask,
+       [ENDP_INIT_MODE]                = &ipa_reg_endp_init_mode,
+       [ENDP_INIT_AGGR]                = &ipa_reg_endp_init_aggr,
+       [ENDP_INIT_HOL_BLOCK_EN]        = &ipa_reg_endp_init_hol_block_en,
+       [ENDP_INIT_HOL_BLOCK_TIMER]     = &ipa_reg_endp_init_hol_block_timer,
+       [ENDP_INIT_DEAGGR]              = &ipa_reg_endp_init_deaggr,
+       [ENDP_INIT_RSRC_GRP]            = &ipa_reg_endp_init_rsrc_grp,
+       [ENDP_INIT_SEQ]                 = &ipa_reg_endp_init_seq,
+       [ENDP_STATUS]                   = &ipa_reg_endp_status,
+       [ENDP_FILTER_ROUTER_HSH_CFG]    = &ipa_reg_endp_filter_router_hsh_cfg,
+       [IPA_IRQ_STTS]                  = &ipa_reg_ipa_irq_stts,
+       [IPA_IRQ_EN]                    = &ipa_reg_ipa_irq_en,
+       [IPA_IRQ_CLR]                   = &ipa_reg_ipa_irq_clr,
+       [IPA_IRQ_UC]                    = &ipa_reg_ipa_irq_uc,
+       [IRQ_SUSPEND_INFO]              = &ipa_reg_irq_suspend_info,
+       [IRQ_SUSPEND_EN]                = &ipa_reg_irq_suspend_en,
+       [IRQ_SUSPEND_CLR]               = &ipa_reg_irq_suspend_clr,
+};
+
+const struct ipa_regs ipa_regs_v4_7 = {
+       .reg_count      = ARRAY_SIZE(ipa_reg_array),
+       .reg            = ipa_reg_array,
+};
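
Each fmask[] table above describes a register layout with BIT()/GENMASK() so field positions never appear as bare shifts; the driver then packs and unpacks values through FIELD_PREP()/FIELD_GET()-style helpers. A self-contained sketch of those macros applied to the SHARED_MEM_SIZE split above (the real kernel macros add compile-time checks omitted here):

#include <stdio.h>

typedef unsigned int u32;

#define BIT(n)		(1U << (n))
#define GENMASK(h, l)	(((~0U) >> (31 - (h))) & ((~0U) << (l)))

/* Simplified FIELD_PREP/FIELD_GET: scale by the mask's lowest set bit */
#define FIELD_PREP(mask, val)	(((val) * ((mask) & -(mask))) & (mask))
#define FIELD_GET(mask, reg)	(((reg) & (mask)) / ((mask) & -(mask)))

/* MEM_SIZE/MEM_BADDR split of the SHARED_MEM_SIZE register */
#define MEM_SIZE	GENMASK(15, 0)
#define MEM_BADDR	GENMASK(31, 16)

int main(void)
{
	u32 reg = FIELD_PREP(MEM_SIZE, 0x3000) | FIELD_PREP(MEM_BADDR, 0x10);

	printf("reg = 0x%08x\n", reg);
	printf("size = 0x%x, baddr = 0x%x\n",
	       FIELD_GET(MEM_SIZE, reg), FIELD_GET(MEM_BADDR, reg));
	return 0;
}
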
index d73b9d5..937f5b1 100644 (file)
@@ -3698,6 +3698,7 @@ static const struct nla_policy macsec_rtnl_policy[IFLA_MACSEC_MAX + 1] = {
        [IFLA_MACSEC_SCB] = { .type = NLA_U8 },
        [IFLA_MACSEC_REPLAY_PROTECT] = { .type = NLA_U8 },
        [IFLA_MACSEC_VALIDATION] = { .type = NLA_U8 },
+       [IFLA_MACSEC_OFFLOAD] = { .type = NLA_U8 },
 };
 
 static void macsec_free_netdev(struct net_device *dev)
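
The new policy entry matters because netlink validates attributes against this table; without it, IFLA_MACSEC_OFFLOAD would be accepted without the one-byte length check an NLA_U8 entry enforces. A toy model of table-driven attribute validation (hypothetical types, not the netlink API):

#include <stdio.h>

enum { ATTR_VALIDATION, ATTR_OFFLOAD, ATTR_MAX };

enum attr_type { TYPE_UNSPEC, TYPE_U8 };

struct policy { enum attr_type type; };

static const struct policy policy[ATTR_MAX] = {
	[ATTR_VALIDATION] = { .type = TYPE_U8 },
	[ATTR_OFFLOAD]	  = { .type = TYPE_U8 },	/* the new entry */
};

struct attr { int id; unsigned int len; };

static int validate(const struct attr *a)
{
	if (a->id < 0 || a->id >= ATTR_MAX)
		return -1;
	/* a TYPE_U8 attribute must carry exactly one byte of payload */
	if (policy[a->id].type == TYPE_U8 && a->len != 1)
		return -1;
	return 0;
}

int main(void)
{
	struct attr good = { .id = ATTR_OFFLOAD, .len = 1 };
	struct attr bad  = { .id = ATTR_OFFLOAD, .len = 4 };

	printf("good: %d, bad: %d\n", validate(&good), validate(&bad));
	return 0;
}
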
index eb344f6..b782c35 100644 (file)
@@ -98,6 +98,7 @@ int fwnode_mdiobus_phy_device_register(struct mii_bus *mdio,
         */
        rc = phy_device_register(phy);
        if (rc) {
+               device_set_node(&phy->mdio.dev, NULL);
                fwnode_handle_put(child);
                return rc;
        }
@@ -153,7 +154,8 @@ int fwnode_mdiobus_register_phy(struct mii_bus *bus,
                /* All data is now stored in the phy struct, so register it */
                rc = phy_device_register(phy);
                if (rc) {
-                       fwnode_handle_put(phy->mdio.dev.fwnode);
+                       phy->mdio.dev.fwnode = NULL;
+                       fwnode_handle_put(child);
                        goto clean_phy;
                }
        } else if (is_of_node(child)) {
index 796e9c7..510822d 100644 (file)
@@ -68,8 +68,9 @@ static int of_mdiobus_register_device(struct mii_bus *mdio,
        /* All data is now stored in the mdiodev struct; register it. */
        rc = mdio_device_register(mdiodev);
        if (rc) {
+               device_set_node(&mdiodev->dev, NULL);
+               fwnode_handle_put(fwnode);
                mdio_device_free(mdiodev);
-               of_node_put(child);
                return rc;
        }
 
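
These mdio/fwnode hunks all repair the same imbalance: when registration fails, the reference taken while attaching the fwnode to the device must be dropped exactly once, and the device's pointer cleared first so a later release callback cannot put it a second time. A minimal refcount model of that rule (mock handle, invented names):

#include <stdio.h>

struct handle { int refs; };

static struct handle *handle_get(struct handle *h)
{
	if (h)
		h->refs++;
	return h;
}

static void handle_put(struct handle *h)
{
	if (h && --h->refs == 0)
		printf("handle freed\n");
}

struct dev { struct handle *node; };

/* Mock registration that always fails */
static int register_dev(struct dev *d)
{
	return -1;
}

int main(void)
{
	struct handle fw = { .refs = 1 };
	struct dev d = { .node = handle_get(&fw) };

	if (register_dev(&d)) {
		d.node = NULL;		/* detach first: no double put later */
		handle_put(&fw);	/* balance the get taken above */
	}

	handle_put(&fw);		/* original creator reference */
	return 0;
}
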
index 386336a..b93baf5 100644 (file)
@@ -149,6 +149,11 @@ static int nsim_ipsec_add_sa(struct xfrm_state *xs)
                return -EINVAL;
        }
 
+       if (xs->xso.type != XFRM_DEV_OFFLOAD_CRYPTO) {
+               netdev_err(dev, "Unsupported ipsec offload type\n");
+               return -EINVAL;
+       }
+
        /* find the first unused index */
        ret = nsim_ipsec_find_empty_idx(ipsec);
        if (ret < 0) {
index af00cf4..1327290 100644 (file)
@@ -47,7 +47,6 @@ config LED_TRIGGER_PHY
 
 config FIXED_PHY
        tristate "MDIO Bus/PHY emulation with fixed speed/link PHYs"
-       depends on PHYLIB
        select SWPHY
        help
          Adds the platform "fixed" MDIO Bus to cover the boards that use
@@ -112,7 +111,6 @@ config BROADCOM_PHY
 
 config BCM54140_PHY
        tristate "Broadcom BCM54140 PHY"
-       depends on PHYLIB
        depends on HWMON || HWMON=n
        select BCM_NET_PHYLIB
        help
@@ -137,7 +135,6 @@ config BCM7XXX_PHY
 
 config BCM84881_PHY
        tristate "Broadcom BCM84881 PHY"
-       depends on PHYLIB
        help
          Support the Broadcom BCM84881 PHY.
 
index 250742f..044828d 100644 (file)
@@ -21,6 +21,7 @@
 #include <linux/slab.h>
 #include <linux/string.h>
 #include <linux/unistd.h>
+#include <linux/property.h>
 
 void mdio_device_free(struct mdio_device *mdiodev)
 {
@@ -30,6 +31,7 @@ EXPORT_SYMBOL(mdio_device_free);
 
 static void mdio_device_release(struct device *dev)
 {
+       fwnode_handle_put(dev->fwnode);
        kfree(to_mdio_device(dev));
 }
 
index 27c0f16..147d7a5 100644 (file)
@@ -9,6 +9,7 @@
 #include <linux/module.h>
 #include <linux/bitfield.h>
 #include <linux/hwmon.h>
+#include <linux/mutex.h>
 #include <linux/phy.h>
 #include <linux/polynomial.h>
 #include <linux/netdevice.h>
                                 VSPEC1_SGMII_CTRL_ANRS)
 
 /* Temperature sensor */
-#define VPSPEC1_TEMP_STA       0x0E
-#define VPSPEC1_TEMP_STA_DATA  GENMASK(9, 0)
+#define VSPEC1_TEMP_STA        0x0E
+#define VSPEC1_TEMP_STA_DATA   GENMASK(9, 0)
+
+/* Mailbox */
+#define VSPEC1_MBOX_DATA       0x5
+#define VSPEC1_MBOX_ADDRLO     0x6
+#define VSPEC1_MBOX_CMD                0x7
+#define VSPEC1_MBOX_CMD_ADDRHI GENMASK(7, 0)
+#define VSPEC1_MBOX_CMD_RD     (0 << 8)
+#define VSPEC1_MBOX_CMD_READY  BIT(15)
 
 /* WoL */
 #define VPSPEC2_WOL_CTL                0x0E06
 #define VPSPEC2_WOL_AD45       0x0E0A
 #define WOL_EN                 BIT(0)
 
+/* Internal registers, access via mbox */
+#define REG_GPIO0_OUT          0xd3ce00
+
 struct gpy_priv {
+       /* serialize mailbox accesses */
+       struct mutex mbox_lock;
+
        u8 fw_major;
        u8 fw_minor;
 };
@@ -140,14 +155,14 @@ static int gpy_hwmon_read(struct device *dev,
        struct phy_device *phydev = dev_get_drvdata(dev);
        int ret;
 
-       ret = phy_read_mmd(phydev, MDIO_MMD_VEND1, VPSPEC1_TEMP_STA);
+       ret = phy_read_mmd(phydev, MDIO_MMD_VEND1, VSPEC1_TEMP_STA);
        if (ret < 0)
                return ret;
        if (!ret)
                return -ENODATA;
 
        *value = polynomial_calc(&poly_N_to_temp,
-                                FIELD_GET(VPSPEC1_TEMP_STA_DATA, ret));
+                                FIELD_GET(VSPEC1_TEMP_STA_DATA, ret));
 
        return 0;
 }
@@ -198,6 +213,45 @@ static int gpy_hwmon_register(struct phy_device *phydev)
 }
 #endif
 
+static int gpy_mbox_read(struct phy_device *phydev, u32 addr)
+{
+       struct gpy_priv *priv = phydev->priv;
+       int val, ret;
+       u16 cmd;
+
+       mutex_lock(&priv->mbox_lock);
+
+       ret = phy_write_mmd(phydev, MDIO_MMD_VEND1, VSPEC1_MBOX_ADDRLO,
+                           addr);
+       if (ret)
+               goto out;
+
+       cmd = VSPEC1_MBOX_CMD_RD;
+       cmd |= FIELD_PREP(VSPEC1_MBOX_CMD_ADDRHI, addr >> 16);
+
+       ret = phy_write_mmd(phydev, MDIO_MMD_VEND1, VSPEC1_MBOX_CMD, cmd);
+       if (ret)
+               goto out;
+
+       /* The mbox read is used in the interrupt workaround. It was observed
+        * that a read might take up to 2.5ms. This is also the time for which
+        * the interrupt line is stuck low. To be on the safe side, poll the
+        * ready bit for 10ms.
+        */
+       ret = phy_read_mmd_poll_timeout(phydev, MDIO_MMD_VEND1,
+                                       VSPEC1_MBOX_CMD, val,
+                                       (val & VSPEC1_MBOX_CMD_READY),
+                                       500, 10000, false);
+       if (ret)
+               goto out;
+
+       ret = phy_read_mmd(phydev, MDIO_MMD_VEND1, VSPEC1_MBOX_DATA);
+
+out:
+       mutex_unlock(&priv->mbox_lock);
+       return ret;
+}
+
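The mailbox splits the 24-bit internal address across two registers: the low 16 bits go into VSPEC1_MBOX_ADDRLO (the 16-bit MMD write truncates the u32 argument), and bits 23:16 land in the low byte of the command word alongside the read opcode. A worked example for the one address used here, REG_GPIO0_OUT, as standalone C reconstructed from the defines above, with FIELD_PREP replaced by an explicit shift:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
            uint32_t addr = 0xd3ce00;            /* REG_GPIO0_OUT */
            uint16_t addrlo = addr & 0xffff;     /* -> VSPEC1_MBOX_ADDRLO */
            uint16_t cmd = (addr >> 16) & 0xff;  /* VSPEC1_MBOX_CMD_ADDRHI */

            /* VSPEC1_MBOX_CMD_RD is (0 << 8), so a read leaves the opcode
             * bits clear; the PHY sets bit 15 (READY) when data is valid.
             */
            printf("ADDRLO=0x%04x CMD=0x%04x\n", addrlo, cmd);
            /* prints: ADDRLO=0xce00 CMD=0x00d3 */
            return 0;
    }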
 static int gpy_config_init(struct phy_device *phydev)
 {
        int ret;
@@ -212,6 +266,13 @@ static int gpy_config_init(struct phy_device *phydev)
        return ret < 0 ? ret : 0;
 }
 
+static bool gpy_has_broken_mdint(struct phy_device *phydev)
+{
+       /* At least these PHYs are known to have broken interrupt handling */
+       return phydev->drv->phy_id == PHY_ID_GPY215B ||
+              phydev->drv->phy_id == PHY_ID_GPY215C;
+}
+
 static int gpy_probe(struct phy_device *phydev)
 {
        struct device *dev = &phydev->mdio.dev;
@@ -229,6 +290,7 @@ static int gpy_probe(struct phy_device *phydev)
        if (!priv)
                return -ENOMEM;
        phydev->priv = priv;
+       mutex_init(&priv->mbox_lock);
 
        fw_version = phy_read(phydev, PHY_FWV);
        if (fw_version < 0)
@@ -574,6 +636,29 @@ static irqreturn_t gpy_handle_interrupt(struct phy_device *phydev)
        if (!(reg & PHY_IMASK_MASK))
                return IRQ_NONE;
 
+       /* The PHY might leave the interrupt line asserted even after PHY_ISTAT
+        * is read. To avoid interrupt storms, delay the interrupt handling as
+        * long as the PHY drives the interrupt line. An internal bus read will
+        * stall as long as the interrupt line is asserted, thus just read a
+        * random register here.
+        * Because we cannot access the internal bus at all while the interrupt
+        * is driven by the PHY, there is no way to make the interrupt line
+        * unstuck (e.g. by changing the pinmux to GPIO input) during that time
+        * frame. Therefore, polling is the best we can do and won't do any more
+        * harm.
+        * It was observed that this bug happens on link state and link speed
+        * changes on a GPY215B and GPY215C independently of the firmware
+        * version (so this list may not be exhaustive).
+        */
+       if (gpy_has_broken_mdint(phydev) &&
+           (reg & (PHY_IMASK_LSTC | PHY_IMASK_LSPC))) {
+               reg = gpy_mbox_read(phydev, REG_GPIO0_OUT);
+               if (reg < 0) {
+                       phy_error(phydev);
+                       return IRQ_NONE;
+               }
+       }
+
        phy_trigger_machine(phydev);
 
        return IRQ_HANDLED;
index 39fd181..83b99d9 100644 (file)
@@ -2642,10 +2642,46 @@ static void sfp_cleanup(void *data)
        kfree(sfp);
 }
 
+static int sfp_i2c_get(struct sfp *sfp)
+{
+       struct acpi_handle *acpi_handle;
+       struct fwnode_handle *h;
+       struct i2c_adapter *i2c;
+       struct device_node *np;
+       int err;
+
+       h = fwnode_find_reference(dev_fwnode(sfp->dev), "i2c-bus", 0);
+       if (IS_ERR(h)) {
+               dev_err(sfp->dev, "missing 'i2c-bus' property\n");
+               return -ENODEV;
+       }
+
+       if (is_acpi_device_node(h)) {
+               acpi_handle = ACPI_HANDLE_FWNODE(h);
+               i2c = i2c_acpi_find_adapter_by_handle(acpi_handle);
+       } else if ((np = to_of_node(h)) != NULL) {
+               i2c = of_find_i2c_adapter_by_node(np);
+       } else {
+               err = -EINVAL;
+               goto put;
+       }
+
+       if (!i2c) {
+               err = -EPROBE_DEFER;
+               goto put;
+       }
+
+       err = sfp_i2c_configure(sfp, i2c);
+       if (err)
+               i2c_put_adapter(i2c);
+put:
+       fwnode_handle_put(h);
+       return err;
+}
+
 static int sfp_probe(struct platform_device *pdev)
 {
        const struct sff_data *sff;
-       struct i2c_adapter *i2c;
        char *sfp_irq_name;
        struct sfp *sfp;
        int err, i;
@@ -2663,51 +2699,20 @@ static int sfp_probe(struct platform_device *pdev)
        sff = sfp->type = &sfp_data;
 
        if (pdev->dev.of_node) {
-               struct device_node *node = pdev->dev.of_node;
                const struct of_device_id *id;
-               struct device_node *np;
 
-               id = of_match_node(sfp_of_match, node);
+               id = of_match_node(sfp_of_match, pdev->dev.of_node);
                if (WARN_ON(!id))
                        return -EINVAL;
 
                sff = sfp->type = id->data;
-
-               np = of_parse_phandle(node, "i2c-bus", 0);
-               if (!np) {
-                       dev_err(sfp->dev, "missing 'i2c-bus' property\n");
-                       return -ENODEV;
-               }
-
-               i2c = of_find_i2c_adapter_by_node(np);
-               of_node_put(np);
-       } else if (has_acpi_companion(&pdev->dev)) {
-               struct acpi_device *adev = ACPI_COMPANION(&pdev->dev);
-               struct fwnode_handle *fw = acpi_fwnode_handle(adev);
-               struct fwnode_reference_args args;
-               struct acpi_handle *acpi_handle;
-               int ret;
-
-               ret = acpi_node_get_property_reference(fw, "i2c-bus", 0, &args);
-               if (ret || !is_acpi_device_node(args.fwnode)) {
-                       dev_err(&pdev->dev, "missing 'i2c-bus' property\n");
-                       return -ENODEV;
-               }
-
-               acpi_handle = ACPI_HANDLE_FWNODE(args.fwnode);
-               i2c = i2c_acpi_find_adapter_by_handle(acpi_handle);
-       } else {
+       } else if (!has_acpi_companion(&pdev->dev)) {
                return -EINVAL;
        }
 
-       if (!i2c)
-               return -EPROBE_DEFER;
-
-       err = sfp_i2c_configure(sfp, i2c);
-       if (err < 0) {
-               i2c_put_adapter(i2c);
+       err = sfp_i2c_get(sfp);
+       if (err)
                return err;
-       }
 
        for (i = 0; i < GPIO_MAX; i++)
                if (sff->gpios & BIT(i)) {
index c8791e9..40ce8ab 100644 (file)
@@ -450,12 +450,12 @@ plip_bh_timeout_error(struct net_device *dev, struct net_local *nl,
        }
        rcv->state = PLIP_PK_DONE;
        if (rcv->skb) {
-               kfree_skb(rcv->skb);
+               dev_kfree_skb_irq(rcv->skb);
                rcv->skb = NULL;
        }
        snd->state = PLIP_PK_DONE;
        if (snd->skb) {
-               dev_kfree_skb(snd->skb);
+               dev_consume_skb_irq(snd->skb);
                snd->skb = NULL;
        }
        spin_unlock_irq(&nl->lock);
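Both replacements matter because this code runs under spin_lock_irq(), where plain dev_kfree_skb() is not safe; the _irq variants defer the real free to softirq context. They differ only in accounting: dev_kfree_skb_irq() fires the kfree_skb (drop) tracepoint, dev_consume_skb_irq() the consume_skb one. An illustrative helper summing up the rule of thumb:

    static void free_skb_in_irq_context(struct sk_buff *skb, bool dropped)
    {
            /* both variants queue the skb for freeing in softirq context
             * instead of running skb destructors with interrupts off
             */
            if (dropped)
                    dev_kfree_skb_irq(skb);    /* error path: a real drop */
            else
                    dev_consume_skb_irq(skb);  /* normal completion */
    }

Here the receive side was aborted by the timeout (a drop), while the transmit skb had been handled (a consume).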
index 9e75ed3..a2be199 100644 (file)
@@ -957,6 +957,10 @@ static int set_offload(struct tap_queue *q, unsigned long arg)
                        if (arg & TUN_F_TSO6)
                                feature_mask |= NETIF_F_TSO6;
                }
+
+               /* TODO: for now, USO4 and USO6 are only supported together */
+               if ((arg & (TUN_F_USO4 | TUN_F_USO6)) == (TUN_F_USO4 | TUN_F_USO6))
+                       features |= NETIF_F_GSO_UDP_L4;
        }
 
        /* tun/tap driver inverts the usage for TSO offloads, where
@@ -967,7 +971,8 @@ static int set_offload(struct tap_queue *q, unsigned long arg)
         * When user space turns off TSO, we turn off GSO/LRO so that
         * user-space will not receive TSO frames.
         */
-       if (feature_mask & (NETIF_F_TSO | NETIF_F_TSO6))
+       if (feature_mask & (NETIF_F_TSO | NETIF_F_TSO6) ||
+           (feature_mask & (TUN_F_USO4 | TUN_F_USO6)) == (TUN_F_USO4 | TUN_F_USO6))
                features |= RX_OFFLOADS;
        else
                features &= ~RX_OFFLOADS;
@@ -1091,7 +1096,8 @@ static long tap_ioctl(struct file *file, unsigned int cmd,
        case TUNSETOFFLOAD:
                /* let the user check for future flags */
                if (arg & ~(TUN_F_CSUM | TUN_F_TSO4 | TUN_F_TSO6 |
-                           TUN_F_TSO_ECN | TUN_F_UFO))
+                           TUN_F_TSO_ECN | TUN_F_UFO |
+                           TUN_F_USO4 | TUN_F_USO6))
                        return -EINVAL;
 
                rtnl_lock();
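TUNSETOFFLOAD passes TUN_F_* bits, and the two new USO bits are only honoured as a pair because the kernel feature they map to, NETIF_F_GSO_UDP_L4, is a single flag covering both IPv4 and IPv6 UDP segmentation. A standalone sketch of the pair check; the flag values are copied from the uapi header as I understand it at this point, so treat them as assumptions:

    #include <stdio.h>

    #define TUN_F_USO4 0x20  /* assumed uapi value */
    #define TUN_F_USO6 0x40  /* assumed uapi value */

    static int wants_udp_gso(unsigned long arg)
    {
            /* both bits must be set; one alone is ignored */
            return (arg & (TUN_F_USO4 | TUN_F_USO6)) ==
                   (TUN_F_USO4 | TUN_F_USO6);
    }

    int main(void)
    {
            printf("%d %d %d\n",
                   wants_udp_gso(TUN_F_USO4),
                   wants_udp_gso(TUN_F_USO6),
                   wants_udp_gso(TUN_F_USO4 | TUN_F_USO6));
            /* prints: 0 0 1 */
            return 0;
    }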
index d10606f..fcd43d6 100644 (file)
@@ -1044,6 +1044,7 @@ static int team_port_enter(struct team *team, struct team_port *port)
                        goto err_port_enter;
                }
        }
+       port->dev->priv_flags |= IFF_NO_ADDRCONF;
 
        return 0;
 
@@ -1057,6 +1058,7 @@ static void team_port_leave(struct team *team, struct team_port *port)
 {
        if (team->ops.port_leave)
                team->ops.port_leave(team, port);
+       port->dev->priv_flags &= ~IFF_NO_ADDRCONF;
        dev_put(team->dev);
 }
 
index 4ed7f5b..9904847 100644 (file)
@@ -914,6 +914,7 @@ static int tbnet_open(struct net_device *dev)
                                eof_mask, tbnet_start_poll, net);
        if (!ring) {
                netdev_err(dev, "failed to allocate Rx ring\n");
+               tb_xdomain_release_out_hopid(xd, hopid);
                tb_ring_free(net->tx_ring.ring);
                net->tx_ring.ring = NULL;
                return -ENOMEM;
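The fix adds the previously missing release of the output HopID on the Rx-ring allocation failure path: everything acquired before the failing step must be unwound, in reverse order of acquisition. A compilable toy version of the idiom, with generic resource names standing in for the driver's:

    #include <stdlib.h>

    struct res { int id; };

    static struct res *acquire(int id)
    {
            struct res *r = malloc(sizeof(*r));

            if (r)
                    r->id = id;
            return r;
    }

    static void release(struct res *r) { free(r); }

    /* Acquire three resources; on failure, release the ones already held
     * in reverse order. The tb_xdomain_release_out_hopid() above was
     * exactly such a forgotten unwind step.
     */
    static int open_all(void)
    {
            struct res *a, *b, *c;

            a = acquire(1);
            if (!a)
                    return -1;
            b = acquire(2);
            if (!b)
                    goto err_a;
            c = acquire(3);
            if (!c)
                    goto err_b;
            release(c); release(b); release(a);
            return 0;
    err_b:
            release(b);
    err_a:
            release(a);
            return -1;
    }

    int main(void) { return open_all() ? 1 : 0; }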
index b4baa20..a7d17c6 100644 (file)
@@ -185,7 +185,7 @@ struct tun_struct {
        struct net_device       *dev;
        netdev_features_t       set_features;
 #define TUN_USER_FEATURES (NETIF_F_HW_CSUM|NETIF_F_TSO_ECN|NETIF_F_TSO| \
-                         NETIF_F_TSO6)
+                         NETIF_F_TSO6 | NETIF_F_GSO_UDP_L4)
 
        int                     align;
        int                     vnet_hdr_sz;
@@ -2878,6 +2878,12 @@ static int set_offload(struct tun_struct *tun, unsigned long arg)
                }
 
                arg &= ~TUN_F_UFO;
+
+               /* TODO: for now, USO4 and USO6 are only supported together */
+               if (arg & TUN_F_USO4 && arg & TUN_F_USO6) {
+                       features |= NETIF_F_GSO_UDP_L4;
+                       arg &= ~(TUN_F_USO4 | TUN_F_USO6);
+               }
        }
 
        /* This gives the user a way to test for new features in future by
index 0fe3773..743cbf5 100644 (file)
@@ -1350,6 +1350,20 @@ static const struct driver_info ax88772b_info = {
        .data = FLAG_EEPROM_MAC,
 };
 
+static const struct driver_info lxausb_t1l_info = {
+       .description = "Linux Automation GmbH USB 10Base-T1L",
+       .bind = ax88772_bind,
+       .unbind = ax88772_unbind,
+       .status = asix_status,
+       .reset = ax88772_reset,
+       .stop = ax88772_stop,
+       .flags = FLAG_ETHER | FLAG_FRAMING_AX | FLAG_LINK_INTR |
+                FLAG_MULTI_PACKET,
+       .rx_fixup = asix_rx_fixup_common,
+       .tx_fixup = asix_tx_fixup,
+       .data = FLAG_EEPROM_MAC,
+};
+
 static const struct driver_info ax88178_info = {
        .description = "ASIX AX88178 USB 2.0 Ethernet",
        .bind = ax88178_bind,
@@ -1538,6 +1552,10 @@ static const struct usb_device_id        products [] = {
         */
        USB_DEVICE(0x066b, 0x20f9),
        .driver_info = (unsigned long) &hg20f9_info,
+}, {
+       // Linux Automation GmbH USB 10Base-T1L
+       USB_DEVICE(0x33f7, 0x0004),
+       .driver_info = (unsigned long) &lxausb_t1l_info,
 },
        { },            // END
 };
index 19eee06..7723b2a 100644 (file)
@@ -60,13 +60,17 @@ static const unsigned long guest_offloads[] = {
        VIRTIO_NET_F_GUEST_TSO6,
        VIRTIO_NET_F_GUEST_ECN,
        VIRTIO_NET_F_GUEST_UFO,
-       VIRTIO_NET_F_GUEST_CSUM
+       VIRTIO_NET_F_GUEST_CSUM,
+       VIRTIO_NET_F_GUEST_USO4,
+       VIRTIO_NET_F_GUEST_USO6
 };
 
 #define GUEST_OFFLOAD_GRO_HW_MASK ((1ULL << VIRTIO_NET_F_GUEST_TSO4) | \
                                (1ULL << VIRTIO_NET_F_GUEST_TSO6) | \
                                (1ULL << VIRTIO_NET_F_GUEST_ECN)  | \
-                               (1ULL << VIRTIO_NET_F_GUEST_UFO))
+                               (1ULL << VIRTIO_NET_F_GUEST_UFO)  | \
+                               (1ULL << VIRTIO_NET_F_GUEST_USO4) | \
+                               (1ULL << VIRTIO_NET_F_GUEST_USO6))
 
 struct virtnet_stat_desc {
        char desc[ETH_GSTRING_LEN];
@@ -3085,7 +3089,9 @@ static int virtnet_xdp_set(struct net_device *dev, struct bpf_prog *prog,
                virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_TSO6) ||
                virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_ECN) ||
                virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_UFO) ||
-               virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_CSUM))) {
+               virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_CSUM) ||
+               virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_USO4) ||
+               virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_USO6))) {
                NL_SET_ERR_MSG_MOD(extack, "Can't set XDP while host is implementing GRO_HW/CSUM, disable GRO_HW/CSUM first");
                return -EOPNOTSUPP;
        }
@@ -3690,7 +3696,9 @@ static bool virtnet_check_guest_gso(const struct virtnet_info *vi)
        return virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_TSO4) ||
                virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_TSO6) ||
                virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_ECN) ||
-               virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_UFO);
+               virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_UFO) ||
+               (virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_USO4) &&
+               virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_USO6));
 }
 
 static void virtnet_set_big_packets(struct virtnet_info *vi, const int mtu)
@@ -3759,6 +3767,8 @@ static int virtnet_probe(struct virtio_device *vdev)
                        dev->hw_features |= NETIF_F_TSO6;
                if (virtio_has_feature(vdev, VIRTIO_NET_F_HOST_ECN))
                        dev->hw_features |= NETIF_F_TSO_ECN;
+               if (virtio_has_feature(vdev, VIRTIO_NET_F_HOST_USO))
+                       dev->hw_features |= NETIF_F_GSO_UDP_L4;
 
                dev->features |= NETIF_F_GSO_ROBUST;
 
@@ -4036,6 +4046,7 @@ static struct virtio_device_id id_table[] = {
        VIRTIO_NET_F_HOST_TSO4, VIRTIO_NET_F_HOST_UFO, VIRTIO_NET_F_HOST_TSO6, \
        VIRTIO_NET_F_HOST_ECN, VIRTIO_NET_F_GUEST_TSO4, VIRTIO_NET_F_GUEST_TSO6, \
        VIRTIO_NET_F_GUEST_ECN, VIRTIO_NET_F_GUEST_UFO, \
+       VIRTIO_NET_F_HOST_USO, VIRTIO_NET_F_GUEST_USO4, VIRTIO_NET_F_GUEST_USO6, \
        VIRTIO_NET_F_MRG_RXBUF, VIRTIO_NET_F_STATUS, VIRTIO_NET_F_CTRL_VQ, \
        VIRTIO_NET_F_CTRL_RX, VIRTIO_NET_F_CTRL_VLAN, \
        VIRTIO_NET_F_GUEST_ANNOUNCE, VIRTIO_NET_F_MQ, \
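Each direction is negotiated differently: host USO is a single feature bit (VIRTIO_NET_F_HOST_USO, mapped straight to NETIF_F_GSO_UDP_L4 in hw_features above), while the guest side has two bits, and virtnet_check_guest_gso() only counts the pair because the kernel feature backing them covers IPv4 and IPv6 at once. A standalone sketch of that pairing rule, with virtio_has_feature() reduced to an array lookup:

    #include <stdbool.h>
    #include <stdio.h>

    enum { F_GUEST_USO4, F_GUEST_USO6, F_MAX };

    /* Both bits are needed: the kernel feature covers v4+v6 at once. */
    static bool guest_uso_capable(const bool *feats)
    {
            return feats[F_GUEST_USO4] && feats[F_GUEST_USO6];
    }

    int main(void)
    {
            bool only4[F_MAX] = { [F_GUEST_USO4] = true };
            bool both[F_MAX]  = { [F_GUEST_USO4] = true,
                                  [F_GUEST_USO6] = true };

            printf("%d %d\n", guest_uso_capable(only4),
                   guest_uso_capable(both));
            /* prints: 0 1 */
            return 0;
    }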
index d3e7b27..6f1e560 100644 (file)
@@ -75,8 +75,14 @@ vmxnet3_enable_all_intrs(struct vmxnet3_adapter *adapter)
 
        for (i = 0; i < adapter->intr.num_intrs; i++)
                vmxnet3_enable_intr(adapter, i);
-       adapter->shared->devRead.intrConf.intrCtrl &=
+       if (!VMXNET3_VERSION_GE_6(adapter) ||
+           !adapter->queuesExtEnabled) {
+               adapter->shared->devRead.intrConf.intrCtrl &=
+                                       cpu_to_le32(~VMXNET3_IC_DISABLE_ALL);
+       } else {
+               adapter->shared->devReadExt.intrConfExt.intrCtrl &=
                                        cpu_to_le32(~VMXNET3_IC_DISABLE_ALL);
+       }
 }
 
 
@@ -85,8 +91,14 @@ vmxnet3_disable_all_intrs(struct vmxnet3_adapter *adapter)
 {
        int i;
 
-       adapter->shared->devRead.intrConf.intrCtrl |=
+       if (!VMXNET3_VERSION_GE_6(adapter) ||
+           !adapter->queuesExtEnabled) {
+               adapter->shared->devRead.intrConf.intrCtrl |=
+                                       cpu_to_le32(VMXNET3_IC_DISABLE_ALL);
+       } else {
+               adapter->shared->devReadExt.intrConfExt.intrCtrl |=
                                        cpu_to_le32(VMXNET3_IC_DISABLE_ALL);
+       }
        for (i = 0; i < adapter->intr.num_intrs; i++)
                vmxnet3_disable_intr(adapter, i);
 }
@@ -1396,6 +1408,7 @@ vmxnet3_rq_rx_complete(struct vmxnet3_rx_queue *rq,
        };
        u32 num_pkts = 0;
        bool skip_page_frags = false;
+       bool encap_lro = false;
        struct Vmxnet3_RxCompDesc *rcd;
        struct vmxnet3_rx_ctx *ctx = &rq->rx_ctx;
        u16 segCnt = 0, mss = 0;
@@ -1556,13 +1569,18 @@ vmxnet3_rq_rx_complete(struct vmxnet3_rx_queue *rq,
                        if (VMXNET3_VERSION_GE_2(adapter) &&
                            rcd->type == VMXNET3_CDTYPE_RXCOMP_LRO) {
                                struct Vmxnet3_RxCompDescExt *rcdlro;
+                               union Vmxnet3_GenericDesc *gdesc;
+
                                rcdlro = (struct Vmxnet3_RxCompDescExt *)rcd;
+                               gdesc = (union Vmxnet3_GenericDesc *)rcd;
 
                                segCnt = rcdlro->segCnt;
                                WARN_ON_ONCE(segCnt == 0);
                                mss = rcdlro->mss;
                                if (unlikely(segCnt <= 1))
                                        segCnt = 0;
+                               encap_lro = (le32_to_cpu(gdesc->dword[0]) &
+                                       (1UL << VMXNET3_RCD_HDR_INNER_SHIFT));
                        } else {
                                segCnt = 0;
                        }
@@ -1630,7 +1648,7 @@ vmxnet3_rq_rx_complete(struct vmxnet3_rx_queue *rq,
                        vmxnet3_rx_csum(adapter, skb,
                                        (union Vmxnet3_GenericDesc *)rcd);
                        skb->protocol = eth_type_trans(skb, adapter->netdev);
-                       if (!rcd->tcp ||
+                       if ((!rcd->tcp && !encap_lro) ||
                            !(adapter->netdev->features & NETIF_F_LRO))
                                goto not_lro;
 
@@ -1639,7 +1657,7 @@ vmxnet3_rq_rx_complete(struct vmxnet3_rx_queue *rq,
                                        SKB_GSO_TCPV4 : SKB_GSO_TCPV6;
                                skb_shinfo(skb)->gso_size = mss;
                                skb_shinfo(skb)->gso_segs = segCnt;
-                       } else if (segCnt != 0 || skb->len > mtu) {
+                       } else if ((segCnt != 0 || skb->len > mtu) && !encap_lro) {
                                u32 hlen;
 
                                hlen = vmxnet3_get_hdr_len(adapter, skb,
@@ -1668,6 +1686,7 @@ not_lro:
                                napi_gro_receive(&rq->napi, skb);
 
                        ctx->skb = NULL;
+                       encap_lro = false;
                        num_pkts++;
                }
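The encapsulated-LRO flag is read straight out of the completion descriptor: dword[0] is a little-endian 32-bit word, so it is byte-swapped first and then tested against the inner-packet bit. A standalone worked example of that test; the shift value is a placeholder, since the real VMXNET3_RCD_HDR_INNER_SHIFT constant is not shown in this hunk:

    #include <endian.h>
    #include <stdint.h>
    #include <stdio.h>

    #define INNER_SHIFT 13  /* placeholder for VMXNET3_RCD_HDR_INNER_SHIFT */

    int main(void)
    {
            /* descriptor word as it arrives from the device (LE) */
            uint32_t raw = htole32(1UL << INNER_SHIFT);
            int encap_lro = !!(le32toh(raw) & (1UL << INNER_SHIFT));

            printf("encap_lro=%d\n", encap_lro);  /* prints: encap_lro=1 */
            return 0;
    }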
 
index 341c17f..9600212 100644 (file)
@@ -5174,7 +5174,7 @@ il_mac_reset_tsf(struct ieee80211_hw *hw, struct ieee80211_vif *vif)
        memset(&il->current_ht_config, 0, sizeof(struct il_ht_config));
 
        /* new association get rid of ibss beacon skb */
-       dev_kfree_skb(il->beacon_skb);
+       dev_consume_skb_irq(il->beacon_skb);
        il->beacon_skb = NULL;
        il->timestamp = 0;
 
@@ -5293,7 +5293,7 @@ il_beacon_update(struct ieee80211_hw *hw, struct ieee80211_vif *vif)
        }
 
        spin_lock_irqsave(&il->lock, flags);
-       dev_kfree_skb(il->beacon_skb);
+       dev_consume_skb_irq(il->beacon_skb);
        il->beacon_skb = skb;
 
        timestamp = ((struct ieee80211_mgmt *)skb->data)->u.beacon.timestamp;
index 8f5220c..ae97530 100644 (file)
@@ -869,6 +869,7 @@ static int lbs_init_adapter(struct lbs_private *priv)
        ret = kfifo_alloc(&priv->event_fifo, sizeof(u32) * 16, GFP_KERNEL);
        if (ret) {
                pr_err("Out of memory allocating event FIFO buffer\n");
+               lbs_free_cmd_buffer(priv);
                goto out;
        }
 
index fad5fe1..5d58218 100644 (file)
@@ -643,7 +643,7 @@ mt76_dma_wed_setup(struct mt76_dev *dev, struct mt76_queue *q)
                        q->wed_regs = wed->txfree_ring.reg_base;
                break;
        case MT76_WED_Q_RX:
-               ret = mtk_wed_device_rx_ring_setup(wed, ring, q->regs);
+               ret = mtk_wed_device_rx_ring_setup(wed, ring, q->regs, false);
                if (!ret)
                        q->wed_regs = wed->rx_ring[ring].reg_base;
                break;
index 2c4f403..97e7ff7 100644 (file)
@@ -1122,7 +1122,7 @@ static void rtl8188fu_phy_iqcalibrate(struct rtl8xxxu_priv *priv,
 
        if (t == 0) {
                val32 = rtl8xxxu_read32(priv, REG_FPGA0_XA_HSSI_PARM1);
-               priv->pi_enabled = val32 & FPGA0_HSSI_PARM1_PI;
+               priv->pi_enabled = u32_get_bits(val32, FPGA0_HSSI_PARM1_PI);
        }
 
        /* save RF path */
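These hunks replace a raw "val32 & MASK" with u32_get_bits(). The difference matters when the result lands in a narrow field: the raw AND leaves the bit in place (e.g. 0x200 for a bit-9 mask), and assigning that to a one-bit bitfield silently truncates it to 0, whereas u32_get_bits() shifts the field down to bit 0 first. Assuming pi_enabled and cck_agc_report_type are such narrow fields (their declarations are not visible here), a standalone demonstration:

    #include <stdint.h>
    #include <stdio.h>

    struct priv { unsigned int flag:1; };  /* stand-in for a :1 field */

    int main(void)
    {
            uint32_t val32 = 1u << 9;  /* register with a bit-9 flag set */
            struct priv a, b;

            a.flag = val32 & (1u << 9);  /* 0x200 truncated to 0! */
            b.flag = (val32 >> 9) & 1u;  /* what u32_get_bits() does */

            printf("and=%u shifted=%u\n",
                   (unsigned)a.flag, (unsigned)b.flag);
            /* prints: and=0 shifted=1 */
            return 0;
    }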
index 3ed4354..c8b82c5 100644 (file)
@@ -4208,10 +4208,12 @@ static int rtl8xxxu_init_device(struct ieee80211_hw *hw)
                 * should be equal or CCK RSSI report may be incorrect
                 */
                val32 = rtl8xxxu_read32(priv, REG_FPGA0_XA_HSSI_PARM2);
-               priv->cck_agc_report_type = val32 & FPGA0_HSSI_PARM2_CCK_HIGH_PWR;
+               priv->cck_agc_report_type =
+                       u32_get_bits(val32, FPGA0_HSSI_PARM2_CCK_HIGH_PWR);
 
                val32 = rtl8xxxu_read32(priv, REG_FPGA0_XB_HSSI_PARM2);
-               if (priv->cck_agc_report_type != (bool)(val32 & FPGA0_HSSI_PARM2_CCK_HIGH_PWR)) {
+               if (priv->cck_agc_report_type !=
+                   u32_get_bits(val32, FPGA0_HSSI_PARM2_CCK_HIGH_PWR)) {
                        if (priv->cck_agc_report_type)
                                val32 |= FPGA0_HSSI_PARM2_CCK_HIGH_PWR;
                        else
@@ -5274,7 +5276,7 @@ static void rtl8xxxu_queue_rx_urb(struct rtl8xxxu_priv *priv,
                pending = priv->rx_urb_pending_count;
        } else {
                skb = (struct sk_buff *)rx_urb->urb.context;
-               dev_kfree_skb(skb);
+               dev_kfree_skb_irq(skb);
                usb_free_urb(&rx_urb->urb);
        }
 
index 58c2ab3..de61c9c 100644 (file)
@@ -68,8 +68,10 @@ static void _rtl88ee_return_beacon_queue_skb(struct ieee80211_hw *hw)
        struct rtl_priv *rtlpriv = rtl_priv(hw);
        struct rtl_pci *rtlpci = rtl_pcidev(rtl_pcipriv(hw));
        struct rtl8192_tx_ring *ring = &rtlpci->tx_ring[BEACON_QUEUE];
+       struct sk_buff_head free_list;
        unsigned long flags;
 
+       skb_queue_head_init(&free_list);
        spin_lock_irqsave(&rtlpriv->locks.irq_th_lock, flags);
        while (skb_queue_len(&ring->queue)) {
                struct rtl_tx_desc *entry = &ring->desc[ring->idx];
@@ -79,10 +81,12 @@ static void _rtl88ee_return_beacon_queue_skb(struct ieee80211_hw *hw)
                                 rtlpriv->cfg->ops->get_desc(hw, (u8 *)entry,
                                                true, HW_DESC_TXBUFF_ADDR),
                                 skb->len, DMA_TO_DEVICE);
-               kfree_skb(skb);
+               __skb_queue_tail(&free_list, skb);
                ring->idx = (ring->idx + 1) % ring->entries;
        }
        spin_unlock_irqrestore(&rtlpriv->locks.irq_th_lock, flags);
+
+       __skb_queue_purge(&free_list);
 }
 
 static void _rtl88ee_disable_bcn_sub_func(struct ieee80211_hw *hw)
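The same transformation is applied to three rtlwifi variants in this series: instead of calling kfree_skb() with the irq-safe ring lock held (and hardirqs off), each skb is moved onto a local on-stack list under the lock, and the whole list is purged after the lock is dropped. That shrinks the critical section and keeps skb destruction out of irqs-disabled context. The pattern in isolation, as a kernel-style sketch with ring_empty()/ring_pop() as illustrative placeholders:

    struct sk_buff_head free_list;
    unsigned long flags;

    skb_queue_head_init(&free_list);

    spin_lock_irqsave(&lock, flags);
    while (!ring_empty()) {
            struct sk_buff *skb = ring_pop();

            /* the __ variants need no locking on a local list */
            __skb_queue_tail(&free_list, skb);
    }
    spin_unlock_irqrestore(&lock, flags);

    /* free everything with interrupts enabled again */
    __skb_queue_purge(&free_list);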
index 189cc64..0ba3bbe 100644 (file)
@@ -30,8 +30,10 @@ static void _rtl8723be_return_beacon_queue_skb(struct ieee80211_hw *hw)
        struct rtl_priv *rtlpriv = rtl_priv(hw);
        struct rtl_pci *rtlpci = rtl_pcidev(rtl_pcipriv(hw));
        struct rtl8192_tx_ring *ring = &rtlpci->tx_ring[BEACON_QUEUE];
+       struct sk_buff_head free_list;
        unsigned long flags;
 
+       skb_queue_head_init(&free_list);
        spin_lock_irqsave(&rtlpriv->locks.irq_th_lock, flags);
        while (skb_queue_len(&ring->queue)) {
                struct rtl_tx_desc *entry = &ring->desc[ring->idx];
@@ -41,10 +43,12 @@ static void _rtl8723be_return_beacon_queue_skb(struct ieee80211_hw *hw)
                                 rtlpriv->cfg->ops->get_desc(hw, (u8 *)entry,
                                                true, HW_DESC_TXBUFF_ADDR),
                                 skb->len, DMA_TO_DEVICE);
-               kfree_skb(skb);
+               __skb_queue_tail(&free_list, skb);
                ring->idx = (ring->idx + 1) % ring->entries;
        }
        spin_unlock_irqrestore(&rtlpriv->locks.irq_th_lock, flags);
+
+       __skb_queue_purge(&free_list);
 }
 
 static void _rtl8723be_set_bcn_ctrl_reg(struct ieee80211_hw *hw,
index 7e0f62d..a7e3250 100644 (file)
@@ -26,8 +26,10 @@ static void _rtl8821ae_return_beacon_queue_skb(struct ieee80211_hw *hw)
        struct rtl_priv *rtlpriv = rtl_priv(hw);
        struct rtl_pci *rtlpci = rtl_pcidev(rtl_pcipriv(hw));
        struct rtl8192_tx_ring *ring = &rtlpci->tx_ring[BEACON_QUEUE];
+       struct sk_buff_head free_list;
        unsigned long flags;
 
+       skb_queue_head_init(&free_list);
        spin_lock_irqsave(&rtlpriv->locks.irq_th_lock, flags);
        while (skb_queue_len(&ring->queue)) {
                struct rtl_tx_desc *entry = &ring->desc[ring->idx];
@@ -37,10 +39,12 @@ static void _rtl8821ae_return_beacon_queue_skb(struct ieee80211_hw *hw)
                                 rtlpriv->cfg->ops->get_desc(hw, (u8 *)entry,
                                                true, HW_DESC_TXBUFF_ADDR),
                                 skb->len, DMA_TO_DEVICE);
-               kfree_skb(skb);
+               __skb_queue_tail(&free_list, skb);
                ring->idx = (ring->idx + 1) % ring->entries;
        }
        spin_unlock_irqrestore(&rtlpriv->locks.irq_th_lock, flags);
+
+       __skb_queue_purge(&free_list);
 }
 
 static void _rtl8821ae_set_bcn_ctrl_reg(struct ieee80211_hw *hw,
index a29321e..5323ead 100644 (file)
@@ -1598,18 +1598,6 @@ static bool _rtl8812ae_get_integer_from_string(const char *str, u8 *pint)
        return true;
 }
 
-static bool _rtl8812ae_eq_n_byte(const char *str1, const char *str2, u32 num)
-{
-       if (num == 0)
-               return false;
-       while (num > 0) {
-               num--;
-               if (str1[num] != str2[num])
-                       return false;
-       }
-       return true;
-}
-
 static s8 _rtl8812ae_phy_get_chnl_idx_of_txpwr_lmt(struct ieee80211_hw *hw,
                                              u8 band, u8 channel)
 {
@@ -1659,42 +1647,42 @@ static void _rtl8812ae_phy_set_txpower_limit(struct ieee80211_hw *hw,
        power_limit = power_limit > MAX_POWER_INDEX ?
                      MAX_POWER_INDEX : power_limit;
 
-       if (_rtl8812ae_eq_n_byte(pregulation, "FCC", 3))
+       if (strcmp(pregulation, "FCC") == 0)
                regulation = 0;
-       else if (_rtl8812ae_eq_n_byte(pregulation, "MKK", 3))
+       else if (strcmp(pregulation, "MKK") == 0)
                regulation = 1;
-       else if (_rtl8812ae_eq_n_byte(pregulation, "ETSI", 4))
+       else if (strcmp(pregulation, "ETSI") == 0)
                regulation = 2;
-       else if (_rtl8812ae_eq_n_byte(pregulation, "WW13", 4))
+       else if (strcmp(pregulation, "WW13") == 0)
                regulation = 3;
 
-       if (_rtl8812ae_eq_n_byte(prate_section, "CCK", 3))
+       if (strcmp(prate_section, "CCK") == 0)
                rate_section = 0;
-       else if (_rtl8812ae_eq_n_byte(prate_section, "OFDM", 4))
+       else if (strcmp(prate_section, "OFDM") == 0)
                rate_section = 1;
-       else if (_rtl8812ae_eq_n_byte(prate_section, "HT", 2) &&
-                _rtl8812ae_eq_n_byte(prf_path, "1T", 2))
+       else if (strcmp(prate_section, "HT") == 0 &&
+                strcmp(prf_path, "1T") == 0)
                rate_section = 2;
-       else if (_rtl8812ae_eq_n_byte(prate_section, "HT", 2) &&
-                _rtl8812ae_eq_n_byte(prf_path, "2T", 2))
+       else if (strcmp(prate_section, "HT") == 0 &&
+                strcmp(prf_path, "2T") == 0)
                rate_section = 3;
-       else if (_rtl8812ae_eq_n_byte(prate_section, "VHT", 3) &&
-                _rtl8812ae_eq_n_byte(prf_path, "1T", 2))
+       else if (strcmp(prate_section, "VHT") == 0 &&
+                strcmp(prf_path, "1T") == 0)
                rate_section = 4;
-       else if (_rtl8812ae_eq_n_byte(prate_section, "VHT", 3) &&
-                _rtl8812ae_eq_n_byte(prf_path, "2T", 2))
+       else if (strcmp(prate_section, "VHT") == 0 &&
+                strcmp(prf_path, "2T") == 0)
                rate_section = 5;
 
-       if (_rtl8812ae_eq_n_byte(pbandwidth, "20M", 3))
+       if (strcmp(pbandwidth, "20M") == 0)
                bandwidth = 0;
-       else if (_rtl8812ae_eq_n_byte(pbandwidth, "40M", 3))
+       else if (strcmp(pbandwidth, "40M") == 0)
                bandwidth = 1;
-       else if (_rtl8812ae_eq_n_byte(pbandwidth, "80M", 3))
+       else if (strcmp(pbandwidth, "80M") == 0)
                bandwidth = 2;
-       else if (_rtl8812ae_eq_n_byte(pbandwidth, "160M", 4))
+       else if (strcmp(pbandwidth, "160M") == 0)
                bandwidth = 3;
 
-       if (_rtl8812ae_eq_n_byte(pband, "2.4G", 4)) {
+       if (strcmp(pband, "2.4G") == 0) {
                ret = _rtl8812ae_phy_get_chnl_idx_of_txpwr_lmt(hw,
                                                               BAND_ON_2_4G,
                                                               channel);
@@ -1718,7 +1706,7 @@ static void _rtl8812ae_phy_set_txpower_limit(struct ieee80211_hw *hw,
                        regulation, bandwidth, rate_section, channel_index,
                        rtlphy->txpwr_limit_2_4g[regulation][bandwidth]
                                [rate_section][channel_index][RF90_PATH_A]);
-       } else if (_rtl8812ae_eq_n_byte(pband, "5G", 2)) {
+       } else if (strcmp(pband, "5G") == 0) {
                ret = _rtl8812ae_phy_get_chnl_idx_of_txpwr_lmt(hw,
                                                               BAND_ON_5G,
                                                               channel);
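Swapping _rtl8812ae_eq_n_byte() for strcmp() is not purely cosmetic: the old helper compared exactly num bytes, so it behaved like a prefix match when the input kept going, while strcmp() demands an exact NUL-terminated match. A standalone example of the behavioural edge, with the removed helper reproduced for comparison:

    #include <stdbool.h>
    #include <stdio.h>
    #include <string.h>

    /* the removed helper, reproduced for comparison */
    static bool eq_n_byte(const char *s1, const char *s2, unsigned num)
    {
            if (num == 0)
                    return false;
            while (num > 0) {
                    num--;
                    if (s1[num] != s2[num])
                            return false;
            }
            return true;
    }

    int main(void)
    {
            const char *reg = "FCCX";  /* hypothetical malformed token */

            printf("eq_n_byte: %d\n", eq_n_byte(reg, "FCC", 3));
            /* 1: the prefix matches */
            printf("strcmp:    %d\n", strcmp(reg, "FCC") == 0);
            /* 0: exact match required */
            return 0;
    }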
index 931aff8..adc9617 100644 (file)
@@ -689,7 +689,9 @@ rtw89_core_tx_update_data_info(struct rtw89_dev *rtwdev,
                               struct rtw89_core_tx_request *tx_req)
 {
        struct ieee80211_vif *vif = tx_req->vif;
+       struct ieee80211_sta *sta = tx_req->sta;
        struct rtw89_vif *rtwvif = (struct rtw89_vif *)vif->drv_priv;
+       struct rtw89_sta *rtwsta = sta_to_rtwsta_safe(sta);
        struct rtw89_phy_rate_pattern *rate_pattern = &rtwvif->rate_pattern;
        const struct rtw89_chan *chan = rtw89_chan_get(rtwdev, RTW89_SUB_ENTITY_0);
        struct rtw89_tx_desc_info *desc_info = &tx_req->desc_info;
@@ -707,6 +709,7 @@ rtw89_core_tx_update_data_info(struct rtw89_dev *rtwdev,
        desc_info->qsel = qsel;
        desc_info->mac_id = rtw89_core_tx_get_mac_id(rtwdev, tx_req);
        desc_info->port = desc_info->hiq ? rtwvif->port : 0;
+       desc_info->er_cap = rtwsta ? rtwsta->er_cap : false;
 
        /* enable wd_info for AMPDU */
        desc_info->en_wd_info = true;
@@ -1006,7 +1009,9 @@ static __le32 rtw89_build_txwd_info0(struct rtw89_tx_desc_info *desc_info)
 static __le32 rtw89_build_txwd_info0_v1(struct rtw89_tx_desc_info *desc_info)
 {
        u32 dword = FIELD_PREP(RTW89_TXWD_INFO0_DISDATAFB, desc_info->dis_data_fb) |
-                   FIELD_PREP(RTW89_TXWD_INFO0_MULTIPORT_ID, desc_info->port);
+                   FIELD_PREP(RTW89_TXWD_INFO0_MULTIPORT_ID, desc_info->port) |
+                   FIELD_PREP(RTW89_TXWD_INFO0_DATA_ER, desc_info->er_cap) |
+                   FIELD_PREP(RTW89_TXWD_INFO0_DATA_BW_ER, 0);
 
        return cpu_to_le32(dword);
 }
@@ -2585,6 +2590,12 @@ int rtw89_core_sta_assoc(struct rtw89_dev *rtwdev,
        rtw89_mac_bf_monitor_calc(rtwdev, sta, false);
 
        if (vif->type == NL80211_IFTYPE_STATION && !sta->tdls) {
+               struct ieee80211_bss_conf *bss_conf = &vif->bss_conf;
+
+               if (bss_conf->he_support &&
+                   !(bss_conf->he_oper.params & IEEE80211_HE_OPERATION_ER_SU_DISABLE))
+                       rtwsta->er_cap = true;
+
                rtw89_btc_ntfy_role_info(rtwdev, rtwvif, rtwsta,
                                         BTC_ROLE_MSTS_STA_CONN_END);
                rtw89_core_get_no_ul_ofdma_htc(rtwdev, &rtwsta->htc_template);
index 2badb96..800ede1 100644 (file)
@@ -816,6 +816,7 @@ struct rtw89_tx_desc_info {
 #define RTW89_MGMT_HW_SEQ_MODE 1
        bool hiq;
        u8 port;
+       bool er_cap;
 };
 
 struct rtw89_core_tx_request {
@@ -2194,6 +2195,7 @@ struct rtw89_sec_cam_entry {
 struct rtw89_sta {
        u8 mac_id;
        bool disassoc;
+       bool er_cap;
        struct rtw89_dev *rtwdev;
        struct rtw89_vif *rtwvif;
        struct rtw89_ra_info ra;
index de1f237..65b6bd4 100644 (file)
@@ -91,6 +91,7 @@ static int rtw89_fw_hdr_parser(struct rtw89_dev *rtwdev, const u8 *fw, u32 len,
        const u8 *fwdynhdr;
        const u8 *bin;
        u32 base_hdr_len;
+       u32 mssc_len = 0;
        u32 i;
 
        if (!info)
@@ -120,6 +121,14 @@ static int rtw89_fw_hdr_parser(struct rtw89_dev *rtwdev, const u8 *fw, u32 len,
        fw += RTW89_FW_HDR_SIZE;
        section_info = info->section_info;
        for (i = 0; i < info->section_num; i++) {
+               section_info->type = GET_FWSECTION_HDR_SECTIONTYPE(fw);
+               if (section_info->type == FWDL_SECURITY_SECTION_TYPE) {
+                       section_info->mssc = GET_FWSECTION_HDR_MSSC(fw);
+                       mssc_len += section_info->mssc * FWDL_SECURITY_SIGLEN;
+               } else {
+                       section_info->mssc = 0;
+               }
+
                section_info->len = GET_FWSECTION_HDR_SEC_SIZE(fw);
                if (GET_FWSECTION_HDR_CHECKSUM(fw))
                        section_info->len += FWDL_SECTION_CHKSUM_LEN;
@@ -132,7 +141,7 @@ static int rtw89_fw_hdr_parser(struct rtw89_dev *rtwdev, const u8 *fw, u32 len,
                section_info++;
        }
 
-       if (fw_end != bin) {
+       if (fw_end != bin + mssc_len) {
                rtw89_err(rtwdev, "[ERR]fw bin size\n");
                return -EINVAL;
        }
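The size check now accounts for MSS-certificate blobs: a security-type section advertises mssc certificates in its header, and each contributes FWDL_SECURITY_SIGLEN (512) bytes that sit in the image after the section payloads, so the expected end of the parsed sections is bin + mssc_len rather than bin. A small worked example of the arithmetic, with the section layout assumed for illustration:

    #include <stdint.h>
    #include <stdio.h>

    #define FWDL_SECURITY_SIGLEN 512

    int main(void)
    {
            /* hypothetical image: one security section carrying 2 certs;
             * after walking the headers, `bin` points just past the last
             * section payload and the certificates follow it
             */
            uint32_t mssc = 2;
            uint32_t mssc_len = mssc * FWDL_SECURITY_SIGLEN;

            /* the image is well formed only if fw_end == bin + mssc_len */
            printf("expected trailer: %u bytes\n", mssc_len);  /* 1024 */
            return 0;
    }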
index 4d2f9ea..4326e0e 100644 (file)
@@ -171,6 +171,8 @@ struct rtw89_fw_hdr_section_info {
        const u8 *addr;
        u32 len;
        u32 dladdr;
+       u32 mssc;
+       u8 type;
 };
 
 struct rtw89_fw_bin_info {
@@ -480,14 +482,21 @@ static inline void RTW89_SET_EDCA_PARAM(void *cmd, u32 val)
 #define FW_EDCA_PARAM_CWMIN_MSK GENMASK(11, 8)
 #define FW_EDCA_PARAM_AIFS_MSK GENMASK(7, 0)
 
+#define FWDL_SECURITY_SECTION_TYPE 9
+#define FWDL_SECURITY_SIGLEN 512
+
+#define GET_FWSECTION_HDR_DL_ADDR(fwhdr)       \
+       le32_get_bits(*((const __le32 *)(fwhdr)), GENMASK(31, 0))
+#define GET_FWSECTION_HDR_SECTIONTYPE(fwhdr)   \
+       le32_get_bits(*((const __le32 *)(fwhdr) + 1), GENMASK(27, 24))
 #define GET_FWSECTION_HDR_SEC_SIZE(fwhdr)      \
        le32_get_bits(*((const __le32 *)(fwhdr) + 1), GENMASK(23, 0))
 #define GET_FWSECTION_HDR_CHECKSUM(fwhdr)      \
        le32_get_bits(*((const __le32 *)(fwhdr) + 1), BIT(28))
 #define GET_FWSECTION_HDR_REDL(fwhdr)  \
        le32_get_bits(*((const __le32 *)(fwhdr) + 1), BIT(29))
-#define GET_FWSECTION_HDR_DL_ADDR(fwhdr)       \
-       le32_get_bits(*((const __le32 *)(fwhdr)), GENMASK(31, 0))
+#define GET_FWSECTION_HDR_MSSC(fwhdr)  \
+       le32_get_bits(*((const __le32 *)(fwhdr) + 2), GENMASK(31, 0))
 
 #define GET_FW_HDR_MAJOR_VERSION(fwhdr)        \
        le32_get_bits(*((const __le32 *)(fwhdr) + 1), GENMASK(7, 0))
index 017710c..5dc617a 100644 (file)
@@ -367,6 +367,7 @@ static void rtw89_phy_ra_sta_update(struct rtw89_dev *rtwdev,
        }
 
        ra->bw_cap = bw_mode;
+       ra->er_cap = rtwsta->er_cap;
        ra->mode_ctrl = mode;
        ra->macid = rtwsta->mac_id;
        ra->stbc_cap = stbc_en;
index 5324e64..ec5b8d5 100644 (file)
 #define RR_MOD_IQK GENMASK(19, 4)
 #define RR_MOD_DPK GENMASK(19, 5)
 #define RR_MOD_MASK GENMASK(19, 16)
+#define RR_MOD_DCK GENMASK(14, 10)
 #define RR_MOD_RGM GENMASK(13, 4)
 #define RR_MOD_V_DOWN 0x0
 #define RR_MOD_V_STANDBY 0x1
 #define RR_MOD_NBW GENMASK(15, 14)
 #define RR_MOD_M_RXG GENMASK(13, 4)
 #define RR_MOD_M_RXBB GENMASK(9, 5)
+#define RR_MOD_LO_SEL BIT(1)
 #define RR_MODOPT 0x01
 #define RR_MODOPT_M_TXPWR GENMASK(5, 0)
 #define RR_WLSEL 0x02
 #define RR_LUTWA_M2 GENMASK(4, 0)
 #define RR_LUTWD1 0x3e
 #define RR_LUTWD0 0x3f
+#define RR_LUTWD0_MB GENMASK(11, 6)
 #define RR_LUTWD0_LB GENMASK(5, 0)
 #define RR_TM 0x42
 #define RR_TM_TRI BIT(19)
 #define RR_TXRSV_GAPK BIT(19)
 #define RR_BIAS 0x5e
 #define RR_BIAS_GAPK BIT(19)
+#define RR_TXAC 0x5f
+#define RR_TXAC_IQG GENMASK(3, 0)
 #define RR_BIASA 0x60
 #define RR_BIASA_TXG GENMASK(15, 12)
 #define RR_BIASA_TXA GENMASK(19, 16)
 #define RR_XALNA2_SW2 GENMASK(9, 8)
 #define RR_XALNA2_SW GENMASK(1, 0)
 #define RR_DCK 0x92
+#define RR_DCK_S1 GENMASK(19, 16)
+#define RR_DCK_TIA GENMASK(15, 9)
 #define RR_DCK_DONE GENMASK(7, 5)
 #define RR_DCK_FINE BIT(1)
 #define RR_DCK_LV BIT(0)
 #define RR_DCK1 0x93
+#define RR_DCK1_S1 GENMASK(19, 16)
+#define RR_DCK1_TIA GENMASK(15, 9)
 #define RR_DCK1_DONE BIT(5)
 #define RR_DCK1_CLR GENMASK(3, 0)
 #define RR_DCK1_SEL BIT(3)
 #define RR_LUTDBG 0xdf
 #define RR_LUTDBG_TIA BIT(12)
 #define RR_LUTDBG_LOK BIT(2)
+#define RR_LUTPLL 0xec
+#define RR_CAL_RW BIT(19)
 #define RR_LUTWE2 0xee
 #define RR_LUTWE2_RTXBW BIT(2)
 #define RR_LUTWE 0xef
 #define RR_LUTWE_LOK BIT(2)
 #define RR_RFC 0xf0
+#define RR_WCAL BIT(16)
 #define RR_RFC_CKEN BIT(1)
 
 #define R_UPD_P0 0x0000
index 60cd676..b0ea23d 100644 (file)
@@ -59,6 +59,9 @@ static const u32 dpk_par_regs[RTW89_DPK_RF_PATH][4] = {
        {0x81a8, 0x81c4, 0x81c8, 0x81e8},
 };
 
+static const u8 _dck_addr_bs[RF_PATH_NUM_8852C] = {0x0, 0x10};
+static const u8 _dck_addr[RF_PATH_NUM_8852C] = {0xc, 0x1c};
+
 static u8 _kpath(struct rtw89_dev *rtwdev, enum rtw89_phy_idx phy_idx)
 {
        rtw89_debug(rtwdev, RTW89_DBG_RFK, "[RFK]dbcc_en: %x,  PHY%d\n",
@@ -337,7 +340,7 @@ static void _dack_reload_by_path(struct rtw89_dev *rtwdev,
                (dack->dadck_d[path][index] << 14);
        addr = 0xc210 + offset;
        rtw89_phy_write32(rtwdev, addr, val32);
-       rtw89_phy_write32_set(rtwdev, addr, BIT(1));
+       rtw89_phy_write32_set(rtwdev, addr, BIT(0));
 }
 
 static void _dack_reload(struct rtw89_dev *rtwdev, enum rtw89_rf_path path)
@@ -1536,6 +1539,155 @@ static void _iqk(struct rtw89_dev *rtwdev, enum rtw89_phy_idx phy_idx, bool forc
        }
 }
 
+static void _rx_dck_value_rewrite(struct rtw89_dev *rtwdev, u8 path, u8 addr,
+                                 u8 val_i, u8 val_q)
+{
+       u32 ofst_val;
+
+       rtw89_debug(rtwdev, RTW89_DBG_RFK,
+                   "[RX_DCK] rewrite val_i = 0x%x, val_q = 0x%x\n", val_i, val_q);
+
+       /* val_i and val_q are 7 bits, and target is 6 bits. */
+       ofst_val = u32_encode_bits(val_q >> 1, RR_LUTWD0_MB) |
+                  u32_encode_bits(val_i >> 1, RR_LUTWD0_LB);
+
+       rtw89_write_rf(rtwdev, path, RR_LUTPLL, RR_CAL_RW, 0x1);
+       rtw89_write_rf(rtwdev, path, RR_RFC, RR_WCAL, 0x1);
+       rtw89_write_rf(rtwdev, path, RR_DCK, RR_DCK_FINE, 0x1);
+       rtw89_write_rf(rtwdev, path, RR_LUTWA, MASKBYTE0, addr);
+       rtw89_write_rf(rtwdev, path, RR_LUTWD0, RFREG_MASK, ofst_val);
+       rtw89_write_rf(rtwdev, path, RR_LUTWD0, RFREG_MASK, ofst_val);
+       rtw89_write_rf(rtwdev, path, RR_DCK, RR_DCK_FINE, 0x0);
+       rtw89_write_rf(rtwdev, path, RR_RFC, RR_WCAL, 0x0);
+       rtw89_write_rf(rtwdev, path, RR_LUTPLL, RR_CAL_RW, 0x0);
+
+       rtw89_debug(rtwdev, RTW89_DBG_RFK, "[RX_DCK] Final val_i = 0x%x, val_q = 0x%x\n",
+                   u32_get_bits(ofst_val, RR_LUTWD0_LB) << 1,
+                   u32_get_bits(ofst_val, RR_LUTWD0_MB) << 1);
+}
+
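As the comment in the function says, the rewrite quantizes a 7-bit measured offset into the 6-bit LUT field by dropping the LSB, and the final debug print shifts the stored value back up, so the reported value is always even. A worked example of that round-trip:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
            uint8_t val_i = 0x55;            /* 7-bit measurement */
            uint8_t stored = val_i >> 1;     /* 6-bit LUT field: 0x2a */
            uint8_t reported = stored << 1;  /* debug print: 0x54, LSB lost */

            printf("in=0x%02x stored=0x%02x out=0x%02x\n",
                   val_i, stored, reported);
            return 0;
    }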
+static bool _rx_dck_rek_check(struct rtw89_dev *rtwdev, u8 path)
+{
+       u8 i_even_bs, q_even_bs;
+       u8 i_odd_bs, q_odd_bs;
+       u8 i_even, q_even;
+       u8 i_odd, q_odd;
+       const u8 th = 10;
+       u8 i;
+
+       for (i = 0; i < RF_PATH_NUM_8852C; i++) {
+               rtw89_write_rf(rtwdev, path, RR_MOD, RR_MOD_DCK, _dck_addr_bs[i]);
+               i_even_bs = rtw89_read_rf(rtwdev, path, RR_DCK, RR_DCK_TIA);
+               q_even_bs = rtw89_read_rf(rtwdev, path, RR_DCK1, RR_DCK1_TIA);
+               rtw89_debug(rtwdev, RTW89_DBG_RFK,
+                           "[RX_DCK] Gain[0x%x] i_even_bs/ q_even_bs = 0x%x/ 0x%x\n",
+                           _dck_addr_bs[i], i_even_bs, q_even_bs);
+
+               rtw89_write_rf(rtwdev, path, RR_MOD, RR_MOD_DCK, _dck_addr[i]);
+               i_even = rtw89_read_rf(rtwdev, path, RR_DCK, RR_DCK_TIA);
+               q_even = rtw89_read_rf(rtwdev, path, RR_DCK1, RR_DCK1_TIA);
+               rtw89_debug(rtwdev, RTW89_DBG_RFK,
+                           "[RX_DCK] Gain[0x%x] i_even/ q_even = 0x%x/ 0x%x\n",
+                           _dck_addr[i], i_even, q_even);
+
+               if (abs(i_even_bs - i_even) > th || abs(q_even_bs - q_even) > th)
+                       return true;
+
+               rtw89_write_rf(rtwdev, path, RR_MOD, RR_MOD_DCK, _dck_addr_bs[i] + 1);
+               i_odd_bs = rtw89_read_rf(rtwdev, path, RR_DCK, RR_DCK_TIA);
+               q_odd_bs = rtw89_read_rf(rtwdev, path, RR_DCK1, RR_DCK1_TIA);
+               rtw89_debug(rtwdev, RTW89_DBG_RFK,
+                           "[RX_DCK] Gain[0x%x] i_odd_bs/ q_odd_bs = 0x%x/ 0x%x\n",
+                           _dck_addr_bs[i] + 1, i_odd_bs, q_odd_bs);
+
+               rtw89_write_rf(rtwdev, path, RR_MOD, RR_MOD_DCK, _dck_addr[i] + 1);
+               i_odd = rtw89_read_rf(rtwdev, path, RR_DCK, RR_DCK_TIA);
+               q_odd = rtw89_read_rf(rtwdev, path, RR_DCK1, RR_DCK1_TIA);
+               rtw89_debug(rtwdev, RTW89_DBG_RFK,
+                           "[RX_DCK] Gain[0x%x] i_odd/ q_odd = 0x%x/ 0x%x\n",
+                           _dck_addr[i] + 1, i_odd, q_odd);
+
+               if (abs(i_odd_bs - i_odd) > th || abs(q_odd_bs - q_odd) > th)
+                       return true;
+       }
+
+       return false;
+}
+
+static void _rx_dck_fix_if_need(struct rtw89_dev *rtwdev, u8 path, u8 addr,
+                               u8 val_i_bs, u8 val_q_bs, u8 val_i, u8 val_q)
+{
+       const u8 th = 10;
+
+       if ((abs(val_i_bs - val_i) <= th) && (abs(val_q_bs - val_q) <= th)) {
+               rtw89_debug(rtwdev, RTW89_DBG_RFK, "[RX_DCK] offset check PASS!!\n");
+               return;
+       }
+
+       if (abs(val_i_bs - val_i) > th) {
+               rtw89_debug(rtwdev, RTW89_DBG_RFK,
+                           "[RX_DCK] val_i over TH (0x%x / 0x%x)\n", val_i_bs, val_i);
+               val_i = val_i_bs;
+       }
+
+       if (abs(val_q_bs - val_q) > th) {
+               rtw89_debug(rtwdev, RTW89_DBG_RFK,
+                           "[RX_DCK] val_q over TH (0x%x / 0x%x)\n", val_q_bs, val_q);
+               val_q = val_q_bs;
+       }
+
+       _rx_dck_value_rewrite(rtwdev, path, addr, val_i, val_q);
+}
+
+static void _rx_dck_recover(struct rtw89_dev *rtwdev, u8 path)
+{
+       u8 i_even_bs, q_even_bs;
+       u8 i_odd_bs, q_odd_bs;
+       u8 i_even, q_even;
+       u8 i_odd, q_odd;
+       u8 i;
+
+       rtw89_debug(rtwdev, RTW89_DBG_RFK, "[RX_DCK] ===> recovery\n");
+
+       for (i = 0; i < RF_PATH_NUM_8852C; i++) {
+               rtw89_write_rf(rtwdev, path, RR_MOD, RR_MOD_DCK, _dck_addr_bs[i]);
+               i_even_bs = rtw89_read_rf(rtwdev, path, RR_DCK, RR_DCK_TIA);
+               q_even_bs = rtw89_read_rf(rtwdev, path, RR_DCK1, RR_DCK1_TIA);
+
+               rtw89_write_rf(rtwdev, path, RR_MOD, RR_MOD_DCK, _dck_addr_bs[i] + 1);
+               i_odd_bs = rtw89_read_rf(rtwdev, path, RR_DCK, RR_DCK_TIA);
+               q_odd_bs = rtw89_read_rf(rtwdev, path, RR_DCK1, RR_DCK1_TIA);
+
+               rtw89_debug(rtwdev, RTW89_DBG_RFK,
+                           "[RX_DCK] Gain[0x%x] i_even_bs/ q_even_bs = 0x%x/ 0x%x\n",
+                           _dck_addr_bs[i], i_even_bs, q_even_bs);
+
+               rtw89_write_rf(rtwdev, path, RR_MOD, RR_MOD_DCK, _dck_addr[i]);
+               i_even = rtw89_read_rf(rtwdev, path, RR_DCK, RR_DCK_TIA);
+               q_even = rtw89_read_rf(rtwdev, path, RR_DCK1, RR_DCK1_TIA);
+
+               rtw89_debug(rtwdev, RTW89_DBG_RFK,
+                           "[RX_DCK] Gain[0x%x] i_even/ q_even = 0x%x/ 0x%x\n",
+                           _dck_addr[i], i_even, q_even);
+               _rx_dck_fix_if_need(rtwdev, path, _dck_addr[i],
+                                   i_even_bs, q_even_bs, i_even, q_even);
+
+               rtw89_debug(rtwdev, RTW89_DBG_RFK,
+                           "[RX_DCK] Gain[0x%x] i_odd_bs/ q_odd_bs = 0x%x/ 0x%x\n",
+                           _dck_addr_bs[i] + 1, i_odd_bs, q_odd_bs);
+
+               rtw89_write_rf(rtwdev, path, RR_MOD, RR_MOD_DCK, _dck_addr[i] + 1);
+               i_odd = rtw89_read_rf(rtwdev, path, RR_DCK, RR_DCK_TIA);
+               q_odd = rtw89_read_rf(rtwdev, path, RR_DCK1, RR_DCK1_TIA);
+
+               rtw89_debug(rtwdev, RTW89_DBG_RFK,
+                           "[RX_DCK] Gain[0x%x] i_odd/ q_odd = 0x%x/ 0x%x\n",
+                           _dck_addr[i] + 1, i_odd, q_odd);
+               _rx_dck_fix_if_need(rtwdev, path, _dck_addr[i] + 1,
+                                   i_odd_bs, q_odd_bs, i_odd, q_odd);
+       }
+}
+
 static void _rx_dck_toggle(struct rtw89_dev *rtwdev, u8 path)
 {
        int ret;
@@ -1573,6 +1725,37 @@ static void _set_rx_dck(struct rtw89_dev *rtwdev, enum rtw89_phy_idx phy, u8 pat
        }
 }
 
+static
+u8 _rx_dck_channel_calc(struct rtw89_dev *rtwdev, const struct rtw89_chan *chan)
+{
+       u8 target_ch = 0;
+
+       if (chan->band_type == RTW89_BAND_5G) {
+               if (chan->channel >= 36 && chan->channel <= 64) {
+                       target_ch = 100;
+               } else if (chan->channel >= 100 && chan->channel <= 144) {
+                       target_ch = chan->channel + 32;
+                       if (target_ch > 144)
+                               target_ch = chan->channel + 33;
+               } else if (chan->channel >= 149 && chan->channel <= 177) {
+                       target_ch = chan->channel - 33;
+               }
+       } else if (chan->band_type == RTW89_BAND_6G) {
+               if (chan->channel >= 1 && chan->channel <= 125)
+                       target_ch = chan->channel + 32;
+               else
+                       target_ch = chan->channel - 32;
+       } else {
+               target_ch = chan->channel;
+       }
+
+       rtw89_debug(rtwdev, RTW89_DBG_RFK,
+                   "[RX_DCK] cur_ch / target_ch = %d / %d\n",
+                   chan->channel, target_ch);
+
+       return target_ch;
+}
+
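The calibration deliberately runs on a channel away from the operating one, and the mapping keeps the target inside a valid channel plan. A userspace transcription of the 5 GHz branch with a few worked cases as assertions; the rationale comment on the +33 step is my reading, since 148 is not a valid 5 GHz channel:

    #include <assert.h>
    #include <stdio.h>

    static int target_5g(int ch)
    {
            int t = 0;

            if (ch >= 36 && ch <= 64) {
                    t = 100;
            } else if (ch >= 100 && ch <= 144) {
                    t = ch + 32;
                    if (t > 144)
                            t = ch + 33;  /* the 149+ raster is offset by one */
            } else if (ch >= 149 && ch <= 177) {
                    t = ch - 33;
            }
            return t;
    }

    int main(void)
    {
            assert(target_5g(36)  == 100);
            assert(target_5g(100) == 132);
            assert(target_5g(116) == 149);  /* 116+32=148 is not a channel */
            assert(target_5g(149) == 116);
            printf("ok\n");
            return 0;
    }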
 #define RTW8852C_RF_REL_VERSION 34
 #define RTW8852C_DPK_VER 0x10
 #define RTW8852C_DPK_TH_AVG_NUM 4
@@ -1872,12 +2055,11 @@ static void _dpk_rf_setting(struct rtw89_dev *rtwdev, u8 gain,
                               0x50101 | BIT(rtwdev->dbcc_en));
                rtw89_write_rf(rtwdev, path, RR_MOD_V1, RR_MOD_MASK, RF_DPK);
 
-               if (dpk->bp[path][kidx].band == RTW89_BAND_6G && dpk->bp[path][kidx].ch >= 161) {
+               if (dpk->bp[path][kidx].band == RTW89_BAND_6G && dpk->bp[path][kidx].ch >= 161)
                        rtw89_write_rf(rtwdev, path, RR_IQGEN, RR_IQGEN_BIAS, 0x8);
-                       rtw89_write_rf(rtwdev, path, RR_LOGEN, RR_LOGEN_RPT, 0xd);
-               } else {
-                       rtw89_write_rf(rtwdev, path, RR_LOGEN, RR_LOGEN_RPT, 0xd);
-               }
+
+               rtw89_write_rf(rtwdev, path, RR_LOGEN, RR_LOGEN_RPT, 0xd);
+               rtw89_write_rf(rtwdev, path, RR_TXAC, RR_TXAC_IQG, 0x8);
 
                rtw89_write_rf(rtwdev, path, RR_RXA2, RR_RXA2_ATT, 0x0);
                rtw89_write_rf(rtwdev, path, RR_TXIQK, RR_TXIQK_ATT2, 0x3);
@@ -3875,11 +4057,14 @@ void rtw8852c_iqk(struct rtw89_dev *rtwdev, enum rtw89_phy_idx phy_idx)
 
 #define RXDCK_VER_8852C 0xe
 
-void rtw8852c_rx_dck(struct rtw89_dev *rtwdev, enum rtw89_phy_idx phy, bool is_afe)
+static void _rx_dck(struct rtw89_dev *rtwdev, enum rtw89_phy_idx phy,
+                   bool is_afe, u8 retry_limit)
 {
        struct rtw89_rx_dck_info *rx_dck = &rtwdev->rx_dck;
        u8 path, kpath;
        u32 rf_reg5;
+       bool is_fail;
+       u8 rek_cnt;
 
        kpath = _kpath(rtwdev, phy);
        rtw89_debug(rtwdev, RTW89_DBG_RFK,
@@ -3896,7 +4081,27 @@ void rtw8852c_rx_dck(struct rtw89_dev *rtwdev, enum rtw89_phy_idx phy, bool is_a
                                               B_P0_TSSI_TRK_EN, 0x1);
                rtw89_write_rf(rtwdev, path, RR_RSV1, RR_RSV1_RST, 0x0);
                rtw89_write_rf(rtwdev, path, RR_MOD, RR_MOD_MASK, RR_MOD_V_RX);
-               _set_rx_dck(rtwdev, phy, path, is_afe);
+               rtw89_write_rf(rtwdev, path, RR_MOD, RR_MOD_LO_SEL, rtwdev->dbcc_en);
+
+               for (rek_cnt = 0; rek_cnt < retry_limit; rek_cnt++) {
+                       _set_rx_dck(rtwdev, phy, path, is_afe);
+
+                       /* To save the I/O of a final _rx_dck_rek_check(), the
+                        * last try is always treated as a failure and goes
+                        * straight to the recovery procedure.
+                        */
+                       if (rek_cnt == retry_limit - 1) {
+                               _rx_dck_recover(rtwdev, path);
+                               break;
+                       }
+
+                       is_fail = _rx_dck_rek_check(rtwdev, path);
+                       if (!is_fail)
+                               break;
+               }
+
+               rtw89_debug(rtwdev, RTW89_DBG_RFK, "[RX_DCK] rek_cnt[%d]=%d\n",
+                           path, rek_cnt);
+
                rx_dck->thermal[path] = ewma_thermal_read(&rtwdev->phystat.avg_thermal[path]);
                rtw89_write_rf(rtwdev, path, RR_RSV1, RFREG_MASK, rf_reg5);
 
@@ -3906,15 +4111,31 @@ void rtw8852c_rx_dck(struct rtw89_dev *rtwdev, enum rtw89_phy_idx phy, bool is_a
        }
 }
 
-#define RTW8852C_RX_DCK_TH 8
+void rtw8852c_rx_dck(struct rtw89_dev *rtwdev, enum rtw89_phy_idx phy, bool is_afe)
+{
+       _rx_dck(rtwdev, phy, is_afe, 1);
+}
+
+#define RTW8852C_RX_DCK_TH 12
 
 void rtw8852c_rx_dck_track(struct rtw89_dev *rtwdev)
 {
+       const struct rtw89_chan *chan = rtw89_chan_get(rtwdev, RTW89_SUB_ENTITY_0);
        struct rtw89_rx_dck_info *rx_dck = &rtwdev->rx_dck;
+       enum rtw89_phy_idx phy_idx = RTW89_PHY_0;
+       u8 phy_map = rtw89_btc_phymap(rtwdev, phy_idx, 0);
+       u8 dck_channel;
        u8 cur_thermal;
+       u32 tx_en;
        int delta;
        int path;
 
+       if (chan->band_type == RTW89_BAND_2G)
+               return;
+
+       if (rtwdev->scanning)
+               return;
+
        for (path = 0; path < RF_PATH_NUM_8852C; path++) {
                cur_thermal =
                        ewma_thermal_read(&rtwdev->phystat.avg_thermal[path]);
@@ -3924,11 +4145,28 @@ void rtw8852c_rx_dck_track(struct rtw89_dev *rtwdev)
                            "[RX_DCK] path=%d current thermal=0x%x delta=0x%x\n",
                            path, cur_thermal, delta);
 
-               if (delta >= RTW8852C_RX_DCK_TH) {
-                       rtw8852c_rx_dck(rtwdev, RTW89_PHY_0, false);
-                       return;
-               }
+               if (delta >= RTW8852C_RX_DCK_TH)
+                       goto trigger_rx_dck;
+       }
+
+       return;
+
+trigger_rx_dck:
+       rtw89_btc_ntfy_wl_rfk(rtwdev, phy_map, BTC_WRFKT_RXDCK, BTC_WRFK_START);
+       rtw89_chip_stop_sch_tx(rtwdev, phy_idx, &tx_en, RTW89_SCH_TX_SEL_ALL);
+
+       for (path = 0; path < RF_PATH_NUM_8852C; path++) {
+               dck_channel = _rx_dck_channel_calc(rtwdev, chan);
+               _ctrl_ch(rtwdev, RTW89_PHY_0, dck_channel, chan->band_type);
        }
+
+       _rx_dck(rtwdev, RTW89_PHY_0, false, 20);
+
+       for (path = 0; path < RF_PATH_NUM_8852C; path++)
+               _ctrl_ch(rtwdev, RTW89_PHY_0, chan->channel, chan->band_type);
+
+       rtw89_chip_resume_sch_tx(rtwdev, phy_idx, tx_en);
+       rtw89_btc_ntfy_wl_rfk(rtwdev, phy_map, BTC_WRFKT_RXDCK, BTC_WRFK_STOP);
 }
 
 void rtw8852c_dpk(struct rtw89_dev *rtwdev, enum rtw89_phy_idx phy_idx)
index 9d4c6b6..98eb960 100644 (file)
@@ -75,7 +75,9 @@
 #define RTW89_TXWD_INFO0_DATA_BW GENMASK(29, 28)
 #define RTW89_TXWD_INFO0_GI_LTF GENMASK(27, 25)
 #define RTW89_TXWD_INFO0_DATA_RATE GENMASK(24, 16)
+#define RTW89_TXWD_INFO0_DATA_ER BIT(15)
 #define RTW89_TXWD_INFO0_DISDATAFB BIT(10)
+#define RTW89_TXWD_INFO0_DATA_BW_ER BIT(8)
 #define RTW89_TXWD_INFO0_MULTIPORT_ID GENMASK(6, 4)
 
 /* TX WD INFO DWORD 1 */
index 8a3d868..45ac937 100644 (file)
@@ -160,6 +160,7 @@ int rsi_coex_attach(struct rsi_common *common)
                               rsi_coex_scheduler_thread,
                               "Coex-Tx-Thread")) {
                rsi_dbg(ERR_ZONE, "%s: Unable to init tx thrd\n", __func__);
+               kfree(coex_cb);
                return -EINVAL;
        }
        return 0;
index 9c7a9a2..fc928b2 100644 (file)
@@ -332,6 +332,7 @@ struct iosm_mux *ipc_mux_init(struct ipc_mux_config *mux_cfg,
                        if (!ipc_mux->ul_adb.pp_qlt[i]) {
                                for (j = i - 1; j >= 0; j--)
                                        kfree(ipc_mux->ul_adb.pp_qlt[j]);
+                               kfree(ipc_mux);
                                return NULL;
                        }
                }
index 1545cbe..3dbfc8a 100644 (file)
@@ -386,7 +386,7 @@ int xenvif_dealloc_kthread(void *data);
 irqreturn_t xenvif_ctrl_irq_fn(int irq, void *data);
 
 bool xenvif_have_rx_work(struct xenvif_queue *queue, bool test_kthread);
-void xenvif_rx_queue_tail(struct xenvif_queue *queue, struct sk_buff *skb);
+bool xenvif_rx_queue_tail(struct xenvif_queue *queue, struct sk_buff *skb);
 
 void xenvif_carrier_on(struct xenvif *vif);
 
index 650fa18..f3f2c07 100644 (file)
@@ -254,14 +254,16 @@ xenvif_start_xmit(struct sk_buff *skb, struct net_device *dev)
        if (vif->hash.alg == XEN_NETIF_CTRL_HASH_ALGORITHM_NONE)
                skb_clear_hash(skb);
 
-       xenvif_rx_queue_tail(queue, skb);
+       if (!xenvif_rx_queue_tail(queue, skb))
+               goto drop;
+
        xenvif_kick_thread(queue);
 
        return NETDEV_TX_OK;
 
  drop:
        vif->dev->stats.tx_dropped++;
-       dev_kfree_skb(skb);
+       dev_kfree_skb_any(skb);
        return NETDEV_TX_OK;
 }
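Two changes interact here: xenvif_rx_queue_tail() now reports whether the skb was actually queued, and the drop path frees with dev_kfree_skb_any(), which is safe no matter what context ->ndo_start_xmit() was reached from. A paraphrase of what the _any variant does, as a sketch rather than the exact implementation:

    /* dev_kfree_skb_any() picks the right free for the current context */
    static void dev_kfree_skb_any_sketch(struct sk_buff *skb)
    {
            if (in_hardirq() || irqs_disabled())
                    dev_kfree_skb_irq(skb);  /* defer free to softirq */
            else
                    dev_kfree_skb(skb);      /* free immediately */
    }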
 
index 3d2081b..bf627af 100644 (file)
@@ -332,10 +332,13 @@ static int xenvif_count_requests(struct xenvif_queue *queue,
 
 
 struct xenvif_tx_cb {
-       u16 pending_idx;
+       u16 copy_pending_idx[XEN_NETBK_LEGACY_SLOTS_MAX + 1];
+       u8 copy_count;
 };
 
 #define XENVIF_TX_CB(skb) ((struct xenvif_tx_cb *)(skb)->cb)
+#define copy_pending_idx(skb, i) (XENVIF_TX_CB(skb)->copy_pending_idx[i])
+#define copy_count(skb) (XENVIF_TX_CB(skb)->copy_count)
 
 static inline void xenvif_tx_create_map_op(struct xenvif_queue *queue,
                                           u16 pending_idx,
@@ -370,31 +373,93 @@ static inline struct sk_buff *xenvif_alloc_skb(unsigned int size)
        return skb;
 }
 
-static struct gnttab_map_grant_ref *xenvif_get_requests(struct xenvif_queue *queue,
-                                                       struct sk_buff *skb,
-                                                       struct xen_netif_tx_request *txp,
-                                                       struct gnttab_map_grant_ref *gop,
-                                                       unsigned int frag_overflow,
-                                                       struct sk_buff *nskb)
+static void xenvif_get_requests(struct xenvif_queue *queue,
+                               struct sk_buff *skb,
+                               struct xen_netif_tx_request *first,
+                               struct xen_netif_tx_request *txfrags,
+                               unsigned *copy_ops,
+                               unsigned *map_ops,
+                               unsigned int frag_overflow,
+                               struct sk_buff *nskb,
+                               unsigned int extra_count,
+                               unsigned int data_len)
 {
        struct skb_shared_info *shinfo = skb_shinfo(skb);
        skb_frag_t *frags = shinfo->frags;
-       u16 pending_idx = XENVIF_TX_CB(skb)->pending_idx;
-       int start;
+       u16 pending_idx;
        pending_ring_idx_t index;
        unsigned int nr_slots;
+       struct gnttab_copy *cop = queue->tx_copy_ops + *copy_ops;
+       struct gnttab_map_grant_ref *gop = queue->tx_map_ops + *map_ops;
+       struct xen_netif_tx_request *txp = first;
+
+       nr_slots = shinfo->nr_frags + 1;
+
+       copy_count(skb) = 0;
+
+       /* Create copy ops for exactly data_len bytes into the skb head. */
+       __skb_put(skb, data_len);
+       while (data_len > 0) {
+               int amount = data_len > txp->size ? txp->size : data_len;
+
+               cop->source.u.ref = txp->gref;
+               cop->source.domid = queue->vif->domid;
+               cop->source.offset = txp->offset;
+
+               cop->dest.domid = DOMID_SELF;
+               cop->dest.offset = (offset_in_page(skb->data +
+                                                  skb_headlen(skb) -
+                                                  data_len)) & ~XEN_PAGE_MASK;
+               cop->dest.u.gmfn = virt_to_gfn(skb->data + skb_headlen(skb)
+                                              - data_len);
+
+               cop->len = amount;
+               cop->flags = GNTCOPY_source_gref;
 
-       nr_slots = shinfo->nr_frags;
+               index = pending_index(queue->pending_cons);
+               pending_idx = queue->pending_ring[index];
+               callback_param(queue, pending_idx).ctx = NULL;
+               copy_pending_idx(skb, copy_count(skb)) = pending_idx;
+               copy_count(skb)++;
+
+               cop++;
+               data_len -= amount;
 
-       /* Skip first skb fragment if it is on same page as header fragment. */
-       start = (frag_get_pending_idx(&shinfo->frags[0]) == pending_idx);
+               if (amount == txp->size) {
+                       /* The copy op covered the full tx_request */
+
+                       memcpy(&queue->pending_tx_info[pending_idx].req,
+                              txp, sizeof(*txp));
+                       queue->pending_tx_info[pending_idx].extra_count =
+                               (txp == first) ? extra_count : 0;
+
+                       if (txp == first)
+                               txp = txfrags;
+                       else
+                               txp++;
+                       queue->pending_cons++;
+                       nr_slots--;
+               } else {
+                       /* The copy op partially covered the tx_request.
+                        * The remainder will be mapped.
+                        */
+                       txp->offset += amount;
+                       txp->size -= amount;
+               }
+       }
 
-       for (shinfo->nr_frags = start; shinfo->nr_frags < nr_slots;
-            shinfo->nr_frags++, txp++, gop++) {
+       for (shinfo->nr_frags = 0; shinfo->nr_frags < nr_slots;
+            shinfo->nr_frags++, gop++) {
                index = pending_index(queue->pending_cons++);
                pending_idx = queue->pending_ring[index];
-               xenvif_tx_create_map_op(queue, pending_idx, txp, 0, gop);
+               xenvif_tx_create_map_op(queue, pending_idx, txp,
+                                       txp == first ? extra_count : 0, gop);
                frag_set_pending_idx(&frags[shinfo->nr_frags], pending_idx);
+
+               if (txp == first)
+                       txp = txfrags;
+               else
+                       txp++;
        }
 
        if (frag_overflow) {
@@ -415,7 +480,8 @@ static struct gnttab_map_grant_ref *xenvif_get_requests(struct xenvif_queue *que
                skb_shinfo(skb)->frag_list = nskb;
        }
 
-       return gop;
+       (*copy_ops) = cop - queue->tx_copy_ops;
+       (*map_ops) = gop - queue->tx_map_ops;
 }
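
The reworked scheme grant-copies exactly data_len bytes into the linear skb head and grant-maps whatever is left of each tx request as frags. The pivot is whether a copy op consumed a request completely; a sketch of that decision, mirroring the hunk above (the helper name is hypothetical):

    /* Returns true if the request was fully consumed by the copy. */
    static bool consume_by_copy(struct xen_netif_tx_request *txp,
                                unsigned int amount)
    {
            if (amount == txp->size)
                    return true;    /* advance to the next request */
            txp->offset += amount;  /* partial copy: shrink the request */
            txp->size -= amount;
            return false;           /* the remainder will be grant-mapped */
    }
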
 
 static inline void xenvif_grant_handle_set(struct xenvif_queue *queue,
@@ -451,7 +517,7 @@ static int xenvif_tx_check_gop(struct xenvif_queue *queue,
                               struct gnttab_copy **gopp_copy)
 {
        struct gnttab_map_grant_ref *gop_map = *gopp_map;
-       u16 pending_idx = XENVIF_TX_CB(skb)->pending_idx;
+       u16 pending_idx;
        /* This always points to the shinfo of the skb being checked, which
         * could be either the first or the one on the frag_list
         */
@@ -462,24 +528,37 @@ static int xenvif_tx_check_gop(struct xenvif_queue *queue,
        struct skb_shared_info *first_shinfo = NULL;
        int nr_frags = shinfo->nr_frags;
        const bool sharedslot = nr_frags &&
-                               frag_get_pending_idx(&shinfo->frags[0]) == pending_idx;
-       int i, err;
+                               frag_get_pending_idx(&shinfo->frags[0]) ==
+                                   copy_pending_idx(skb, copy_count(skb) - 1);
+       int i, err = 0;
 
-       /* Check status of header. */
-       err = (*gopp_copy)->status;
-       if (unlikely(err)) {
-               if (net_ratelimit())
-                       netdev_dbg(queue->vif->dev,
-                                  "Grant copy of header failed! status: %d pending_idx: %u ref: %u\n",
-                                  (*gopp_copy)->status,
-                                  pending_idx,
-                                  (*gopp_copy)->source.u.ref);
-               /* The first frag might still have this slot mapped */
-               if (!sharedslot)
-                       xenvif_idx_release(queue, pending_idx,
-                                          XEN_NETIF_RSP_ERROR);
+       for (i = 0; i < copy_count(skb); i++) {
+               int newerr;
+
+               /* Check status of header. */
+               pending_idx = copy_pending_idx(skb, i);
+
+               newerr = (*gopp_copy)->status;
+               if (likely(!newerr)) {
+                       /* The first frag might still have this slot mapped */
+                       if (i < copy_count(skb) - 1 || !sharedslot)
+                               xenvif_idx_release(queue, pending_idx,
+                                                  XEN_NETIF_RSP_OKAY);
+               } else {
+                       err = newerr;
+                       if (net_ratelimit())
+                               netdev_dbg(queue->vif->dev,
+                                          "Grant copy of header failed! status: %d pending_idx: %u ref: %u\n",
+                                          (*gopp_copy)->status,
+                                          pending_idx,
+                                          (*gopp_copy)->source.u.ref);
+                       /* The first frag might still have this slot mapped */
+                       if (i < copy_count(skb) - 1 || !sharedslot)
+                               xenvif_idx_release(queue, pending_idx,
+                                                  XEN_NETIF_RSP_ERROR);
+               }
+               (*gopp_copy)++;
        }
-       (*gopp_copy)++;
 
 check_frags:
        for (i = 0; i < nr_frags; i++, gop_map++) {
@@ -526,14 +605,6 @@ check_frags:
                if (err)
                        continue;
 
-               /* First error: if the header haven't shared a slot with the
-                * first frag, release it as well.
-                */
-               if (!sharedslot)
-                       xenvif_idx_release(queue,
-                                          XENVIF_TX_CB(skb)->pending_idx,
-                                          XEN_NETIF_RSP_OKAY);
-
                /* Invalidate preceding fragments of this skb. */
                for (j = 0; j < i; j++) {
                        pending_idx = frag_get_pending_idx(&shinfo->frags[j]);
@@ -803,7 +874,6 @@ static void xenvif_tx_build_gops(struct xenvif_queue *queue,
                                     unsigned *copy_ops,
                                     unsigned *map_ops)
 {
-       struct gnttab_map_grant_ref *gop = queue->tx_map_ops;
        struct sk_buff *skb, *nskb;
        int ret;
        unsigned int frag_overflow;
@@ -885,8 +955,12 @@ static void xenvif_tx_build_gops(struct xenvif_queue *queue,
                        continue;
                }
 
+               data_len = (txreq.size > XEN_NETBACK_TX_COPY_LEN) ?
+                       XEN_NETBACK_TX_COPY_LEN : txreq.size;
+
                ret = xenvif_count_requests(queue, &txreq, extra_count,
                                            txfrags, work_to_do);
+
                if (unlikely(ret < 0))
                        break;
 
@@ -912,9 +986,8 @@ static void xenvif_tx_build_gops(struct xenvif_queue *queue,
                index = pending_index(queue->pending_cons);
                pending_idx = queue->pending_ring[index];
 
-               data_len = (txreq.size > XEN_NETBACK_TX_COPY_LEN &&
-                           ret < XEN_NETBK_LEGACY_SLOTS_MAX) ?
-                       XEN_NETBACK_TX_COPY_LEN : txreq.size;
+               if (ret >= XEN_NETBK_LEGACY_SLOTS_MAX - 1 && data_len < txreq.size)
+                       data_len = txreq.size;
 
                skb = xenvif_alloc_skb(data_len);
                if (unlikely(skb == NULL)) {
@@ -925,8 +998,6 @@ static void xenvif_tx_build_gops(struct xenvif_queue *queue,
                }
 
                skb_shinfo(skb)->nr_frags = ret;
-               if (data_len < txreq.size)
-                       skb_shinfo(skb)->nr_frags++;
                /* At this point shinfo->nr_frags is in fact the number of
                 * slots, which can be as large as XEN_NETBK_LEGACY_SLOTS_MAX.
                 */
@@ -988,54 +1059,19 @@ static void xenvif_tx_build_gops(struct xenvif_queue *queue,
                                             type);
                }
 
-               XENVIF_TX_CB(skb)->pending_idx = pending_idx;
-
-               __skb_put(skb, data_len);
-               queue->tx_copy_ops[*copy_ops].source.u.ref = txreq.gref;
-               queue->tx_copy_ops[*copy_ops].source.domid = queue->vif->domid;
-               queue->tx_copy_ops[*copy_ops].source.offset = txreq.offset;
-
-               queue->tx_copy_ops[*copy_ops].dest.u.gmfn =
-                       virt_to_gfn(skb->data);
-               queue->tx_copy_ops[*copy_ops].dest.domid = DOMID_SELF;
-               queue->tx_copy_ops[*copy_ops].dest.offset =
-                       offset_in_page(skb->data) & ~XEN_PAGE_MASK;
-
-               queue->tx_copy_ops[*copy_ops].len = data_len;
-               queue->tx_copy_ops[*copy_ops].flags = GNTCOPY_source_gref;
-
-               (*copy_ops)++;
-
-               if (data_len < txreq.size) {
-                       frag_set_pending_idx(&skb_shinfo(skb)->frags[0],
-                                            pending_idx);
-                       xenvif_tx_create_map_op(queue, pending_idx, &txreq,
-                                               extra_count, gop);
-                       gop++;
-               } else {
-                       frag_set_pending_idx(&skb_shinfo(skb)->frags[0],
-                                            INVALID_PENDING_IDX);
-                       memcpy(&queue->pending_tx_info[pending_idx].req,
-                              &txreq, sizeof(txreq));
-                       queue->pending_tx_info[pending_idx].extra_count =
-                               extra_count;
-               }
-
-               queue->pending_cons++;
-
-               gop = xenvif_get_requests(queue, skb, txfrags, gop,
-                                         frag_overflow, nskb);
+               xenvif_get_requests(queue, skb, &txreq, txfrags, copy_ops,
+                                   map_ops, frag_overflow, nskb, extra_count,
+                                   data_len);
 
                __skb_queue_tail(&queue->tx_queue, skb);
 
                queue->tx.req_cons = idx;
 
-               if (((gop-queue->tx_map_ops) >= ARRAY_SIZE(queue->tx_map_ops)) ||
+               if ((*map_ops >= ARRAY_SIZE(queue->tx_map_ops)) ||
                    (*copy_ops >= ARRAY_SIZE(queue->tx_copy_ops)))
                        break;
        }
 
-       (*map_ops) = gop - queue->tx_map_ops;
        return;
 }
 
@@ -1114,9 +1150,8 @@ static int xenvif_tx_submit(struct xenvif_queue *queue)
        while ((skb = __skb_dequeue(&queue->tx_queue)) != NULL) {
                struct xen_netif_tx_request *txp;
                u16 pending_idx;
-               unsigned data_len;
 
-               pending_idx = XENVIF_TX_CB(skb)->pending_idx;
+               pending_idx = copy_pending_idx(skb, 0);
                txp = &queue->pending_tx_info[pending_idx].req;
 
                /* Check the remap error code. */
@@ -1135,18 +1170,6 @@ static int xenvif_tx_submit(struct xenvif_queue *queue)
                        continue;
                }
 
-               data_len = skb->len;
-               callback_param(queue, pending_idx).ctx = NULL;
-               if (data_len < txp->size) {
-                       /* Append the packet payload as a fragment. */
-                       txp->offset += data_len;
-                       txp->size -= data_len;
-               } else {
-                       /* Schedule a response immediately. */
-                       xenvif_idx_release(queue, pending_idx,
-                                          XEN_NETIF_RSP_OKAY);
-               }
-
                if (txp->flags & XEN_NETTXF_csum_blank)
                        skb->ip_summed = CHECKSUM_PARTIAL;
                else if (txp->flags & XEN_NETTXF_data_validated)
@@ -1332,7 +1355,7 @@ static inline void xenvif_tx_dealloc_action(struct xenvif_queue *queue)
 /* Called after netfront has transmitted */
 int xenvif_tx_action(struct xenvif_queue *queue, int budget)
 {
-       unsigned nr_mops, nr_cops = 0;
+       unsigned nr_mops = 0, nr_cops = 0;
        int work_done, ret;
 
        if (unlikely(!tx_work_todo(queue)))
index 9327621..0ba754e 100644 (file)
@@ -82,9 +82,10 @@ static bool xenvif_rx_ring_slots_available(struct xenvif_queue *queue)
        return false;
 }
 
-void xenvif_rx_queue_tail(struct xenvif_queue *queue, struct sk_buff *skb)
+bool xenvif_rx_queue_tail(struct xenvif_queue *queue, struct sk_buff *skb)
 {
        unsigned long flags;
+       bool ret = true;
 
        spin_lock_irqsave(&queue->rx_queue.lock, flags);
 
@@ -92,8 +93,7 @@ void xenvif_rx_queue_tail(struct xenvif_queue *queue, struct sk_buff *skb)
                struct net_device *dev = queue->vif->dev;
 
                netif_tx_stop_queue(netdev_get_tx_queue(dev, queue->id));
-               kfree_skb(skb);
-               queue->vif->dev->stats.rx_dropped++;
+               ret = false;
        } else {
                if (skb_queue_empty(&queue->rx_queue))
                        xenvif_update_needed_slots(queue, skb);
@@ -104,6 +104,8 @@ void xenvif_rx_queue_tail(struct xenvif_queue *queue, struct sk_buff *skb)
        }
 
        spin_unlock_irqrestore(&queue->rx_queue.lock, flags);
+
+       return ret;
 }
 
 static struct sk_buff *xenvif_rx_dequeue(struct xenvif_queue *queue)
index ef4e53b..14aec41 100644 (file)
@@ -1862,6 +1862,12 @@ static int netfront_resume(struct xenbus_device *dev)
        netif_tx_unlock_bh(info->netdev);
 
        xennet_disconnect_backend(info);
+
+       rtnl_lock();
+       if (info->queues)
+               xennet_destroy_queues(info);
+       rtnl_unlock();
+
        return 0;
 }
 
index da55ce4..69e3339 100644 (file)
@@ -4304,7 +4304,7 @@ static void nvme_ns_remove(struct nvme_ns *ns)
        mutex_unlock(&ns->ctrl->subsys->lock);
 
        /* guarantee not available in head->list */
-       synchronize_rcu();
+       synchronize_srcu(&ns->head->srcu);
 
        if (!nvme_ns_head_multipath(ns->head))
                nvme_cdev_del(&ns->cdev, &ns->cdev_device);
index 93e2138..7e025b8 100644 (file)
@@ -174,11 +174,14 @@ void nvme_mpath_revalidate_paths(struct nvme_ns *ns)
        struct nvme_ns_head *head = ns->head;
        sector_t capacity = get_capacity(head->disk);
        int node;
+       int srcu_idx;
 
+       srcu_idx = srcu_read_lock(&head->srcu);
        list_for_each_entry_rcu(ns, &head->list, siblings) {
                if (capacity != get_capacity(ns->disk))
                        clear_bit(NVME_NS_READY, &ns->flags);
        }
+       srcu_read_unlock(&head->srcu, srcu_idx);
 
        for_each_node(node)
                rcu_assign_pointer(head->current_path[node], NULL);
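
The two nvme hunks pair up: revalidation walks head->list inside an SRCU read-side section, and removal now waits with synchronize_srcu() on the same srcu_struct, since synchronize_rcu() would not wait for SRCU readers. A minimal sketch of the pairing (demo_srcu is illustrative):

    #include <linux/srcu.h>

    DEFINE_SRCU(demo_srcu);

    static void reader(void)
    {
            int idx = srcu_read_lock(&demo_srcu);
            /* ... walk the protected list ... */
            srcu_read_unlock(&demo_srcu, idx);
    }

    static void remover(void)
    {
            /* unlink the entry, then wait for all demo_srcu readers */
            synchronize_srcu(&demo_srcu);
            /* now it is safe to free the entry */
    }
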
index f433551..488ad7d 100644 (file)
@@ -797,6 +797,8 @@ static blk_status_t nvme_setup_prp_simple(struct nvme_dev *dev,
        cmnd->dptr.prp1 = cpu_to_le64(iod->first_dma);
        if (bv->bv_len > first_prp_len)
                cmnd->dptr.prp2 = cpu_to_le64(iod->first_dma + first_prp_len);
+       else
+               cmnd->dptr.prp2 = 0;
        return BLK_STS_OK;
 }
 
index 52ecd66..047a837 100644 (file)
@@ -436,9 +436,14 @@ static void __intel_gpio_set_direction(void __iomem *padcfg0, bool input)
        writel(value, padcfg0);
 }
 
+static int __intel_gpio_get_gpio_mode(u32 value)
+{
+       return (value & PADCFG0_PMODE_MASK) >> PADCFG0_PMODE_SHIFT;
+}
+
 static int intel_gpio_get_gpio_mode(void __iomem *padcfg0)
 {
-       return (readl(padcfg0) & PADCFG0_PMODE_MASK) >> PADCFG0_PMODE_SHIFT;
+       return __intel_gpio_get_gpio_mode(readl(padcfg0));
 }
 
 static void intel_gpio_set_gpio_mode(void __iomem *padcfg0)
@@ -1674,6 +1679,7 @@ EXPORT_SYMBOL_GPL(intel_pinctrl_get_soc_data);
 static bool intel_pinctrl_should_save(struct intel_pinctrl *pctrl, unsigned int pin)
 {
        const struct pin_desc *pd = pin_desc_get(pctrl->pctldev, pin);
+       u32 value;
 
        if (!pd || !intel_pad_usable(pctrl, pin))
                return false;
@@ -1688,6 +1694,25 @@ static bool intel_pinctrl_should_save(struct intel_pinctrl *pctrl, unsigned int
            gpiochip_line_is_irq(&pctrl->chip, intel_pin_to_gpio(pctrl, pin)))
                return true;
 
+       /*
+        * The firmware on some systems may configure GPIO pins to be
+        * an interrupt source in so-called "direct IRQ" mode. In such
+        * cases the GPIO controller driver has no idea whether those pins
+        * are in use. At the same time, there is a known bug in the
+        * firmware: it doesn't restore the pin settings correctly after
+        * suspend, i.e. for an unknown reason the Rx value becomes
+        * inverted.
+        *
+        * Hence, let's save and restore the pins that are configured
+        * as GPIOs in the input mode with GPIROUTIOXAPIC bit set.
+        *
+        * See https://bugzilla.kernel.org/show_bug.cgi?id=214749.
+        */
+       value = readl(intel_get_padcfg(pctrl, pin, PADCFG0));
+       if ((value & PADCFG0_GPIROUTIOXAPIC) && (value & PADCFG0_GPIOTXDIS) &&
+           (__intel_gpio_get_gpio_mode(value) == PADCFG0_PMODE_GPIO))
+               return true;
+
        return false;
 }
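
Distilled from the hunk above, the new save condition reduces to three tests on the PADCFG0 value (register and bit names as in the driver; the helper name is hypothetical):

    static bool pin_is_direct_irq_gpio_input(u32 padcfg0)
    {
            return (padcfg0 & PADCFG0_GPIROUTIOXAPIC) &&   /* routed to IOxAPIC */
                   (padcfg0 & PADCFG0_GPIOTXDIS) &&        /* TX disabled: input */
                   __intel_gpio_get_gpio_mode(padcfg0) == PADCFG0_PMODE_GPIO;
    }
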
 
index 65d3129..27f0a54 100644 (file)
@@ -303,12 +303,15 @@ static struct irq_chip mtk_eint_irq_chip = {
 
 static unsigned int mtk_eint_hw_init(struct mtk_eint *eint)
 {
-       void __iomem *reg = eint->base + eint->regs->dom_en;
+       void __iomem *dom_en = eint->base + eint->regs->dom_en;
+       void __iomem *mask_set = eint->base + eint->regs->mask_set;
        unsigned int i;
 
        for (i = 0; i < eint->hw->ap_num; i += 32) {
-               writel(0xffffffff, reg);
-               reg += 4;
+               writel(0xffffffff, dom_en);
+               writel(0xffffffff, mask_set);
+               dom_en += 4;
+               mask_set += 4;
        }
 
        return 0;
index 67bec7e..414ee6b 100644 (file)
@@ -727,7 +727,7 @@ static int pcs_allocate_pin_table(struct pcs_device *pcs)
 
        mux_bytes = pcs->width / BITS_PER_BYTE;
 
-       if (pcs->bits_per_mux) {
+       if (pcs->bits_per_mux && pcs->fmask) {
                pcs->bits_per_pin = fls(pcs->fmask);
                nr_pins = (pcs->size * BITS_PER_BYTE) / pcs->bits_per_pin;
        } else {
index ef4ae97..439d282 100644 (file)
@@ -739,8 +739,14 @@ static void amd_pmc_s2idle_prepare(void)
 static void amd_pmc_s2idle_check(void)
 {
        struct amd_pmc_dev *pdev = &pmc;
+       struct smu_metrics table;
        int rc;
 
+       /* CZN: Ensure at least 10 ms have passed before the next s0i3 entry attempt */
+       if (pdev->cpu_id == AMD_CPU_ID_CZN && !get_metrics_table(pdev, &table) &&
+           table.s0i3_last_entry_status)
+               usleep_range(10000, 20000);
+
        /* Dump the IdleMask before we add to the STB */
        amd_pmc_idlemask_read(pdev, pdev->dev, NULL);
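
usleep_range() is the right primitive for a wait of this length: it sleeps (unlike udelay()), and the min/max pair lets the timer subsystem coalesce wakeups. A sketch of the retry shape used above (the predicate is hypothetical):

    if (previous_s0i3_attempt_failed())
            usleep_range(10000, 20000);     /* 10-20 ms back-off */
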
 
index 9dc9358..c6ded3f 100644 (file)
@@ -758,7 +758,6 @@ static void qeth_l2_br2dev_worker(struct work_struct *work)
        struct list_head *iter;
        int err = 0;
 
-       kfree(br2dev_event_work);
        QETH_CARD_TEXT_(card, 4, "b2dw%04lx", event);
        QETH_CARD_TEXT_(card, 4, "ma%012llx", ether_addr_to_u64(addr));
 
@@ -815,6 +814,7 @@ unlock:
        dev_put(brdev);
        dev_put(lsyncdev);
        dev_put(dstdev);
+       kfree(br2dev_event_work);
 }
 
 static int qeth_l2_br2dev_queue_work(struct net_device *brdev,
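
The qeth change is a use-after-free fix: the worker freed its own work item first and then kept dereferencing data reached through it. The general rule, as a sketch (struct demo_work is illustrative):

    struct demo_work {
            struct work_struct work;
            unsigned long payload;
    };

    static void demo_worker(struct work_struct *work)
    {
            struct demo_work *dw = container_of(work, struct demo_work, work);

            /* ... use dw->payload and friends ... */
            kfree(dw);      /* free only after the last access through dw */
    }
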
index 4d59d92..127fac1 100644 (file)
@@ -1037,6 +1037,7 @@ char *usb_cache_string(struct usb_device *udev, int index)
        }
        return smallbuf;
 }
+EXPORT_SYMBOL_GPL(usb_cache_string);
 
 /*
  * usb_get_device_descriptor - (re)reads the device descriptor (usbcore)
index 82538da..0eac7d4 100644 (file)
@@ -47,7 +47,6 @@ extern int usb_get_device_descriptor(struct usb_device *dev,
 extern int usb_set_isoch_delay(struct usb_device *dev);
 extern int usb_get_bos_descriptor(struct usb_device *dev);
 extern void usb_release_bos_descriptor(struct usb_device *dev);
-extern char *usb_cache_string(struct usb_device *udev, int index);
 extern int usb_set_configuration(struct usb_device *dev, int configuration);
 extern int usb_choose_configuration(struct usb_device *udev);
 extern int usb_generic_driver_probe(struct usb_device *udev);
index 4981baf..b523720 100644 (file)
@@ -406,7 +406,7 @@ void afs_put_server(struct afs_net *net, struct afs_server *server,
        if (!server)
                return;
 
-       a = atomic_inc_return(&server->active);
+       a = atomic_read(&server->active);
        zero = __refcount_dec_and_test(&server->ref, &r);
        trace_afs_server(debug_id, r - 1, a, reason);
        if (unlikely(zero))
index 451d8a0..bce2492 100644 (file)
@@ -605,6 +605,14 @@ again:
                        set_bit(FSCACHE_COOKIE_DO_PREP_TO_WRITE, &cookie->flags);
                        queue = true;
                }
+               /*
+                * We could race with cookie_lru, which may set the LRU_DISCARD
+                * bit but has yet to run the cookie state machine.  If this
+                * happens and another thread tries to use the cookie, clear
+                * LRU_DISCARD so we don't end up withdrawing the cookie while
+                * it is in use.
+                */
+               if (test_and_clear_bit(FSCACHE_COOKIE_DO_LRU_DISCARD, &cookie->flags))
+                       fscache_see_cookie(cookie, fscache_cookie_see_lru_discard_clear);
                break;
 
        case FSCACHE_COOKIE_STATE_FAILED:
index 3b55e23..9930fa9 100644 (file)
@@ -111,6 +111,13 @@ static void nilfs_dat_commit_free(struct inode *dat,
        kunmap_atomic(kaddr);
 
        nilfs_dat_commit_entry(dat, req);
+
+       if (unlikely(req->pr_desc_bh == NULL || req->pr_bitmap_bh == NULL)) {
+               nilfs_error(dat->i_sb,
+                           "state inconsistency probably due to duplicate use of vblocknr = %llu",
+                           (unsigned long long)req->pr_entry_nr);
+               return;
+       }
        nilfs_palloc_commit_free_entry(dat, req);
 }
 
index 492dce4..cab7cfe 100644 (file)
@@ -222,12 +222,16 @@ extern void tlb_remove_table(struct mmu_gather *tlb, void *table);
 #define tlb_needs_table_invalidate() (true)
 #endif
 
+void tlb_remove_table_sync_one(void);
+
 #else
 
 #ifdef tlb_needs_table_invalidate
 #error tlb_needs_table_invalidate() requires MMU_GATHER_RCU_TABLE_FREE
 #endif
 
+static inline void tlb_remove_table_sync_one(void) { }
+
 #endif /* CONFIG_MMU_GATHER_RCU_TABLE_FREE */
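
This follows the standard pattern for a config-gated kernel API: a real declaration under the config option and an empty static inline stub otherwise, so callers need no #ifdefs of their own. A generic sketch:

    #ifdef CONFIG_FEATURE_X
    void feature_x_sync(void);                   /* defined in feature_x.c */
    #else
    static inline void feature_x_sync(void) { } /* no-op when compiled out */
    #endif
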
 
 
index 6745210..3de24cf 100644 (file)
@@ -775,7 +775,7 @@ enum bpf_reg_type {
        PTR_TO_MEM,              /* reg points to valid memory region */
        PTR_TO_BUF,              /* reg points to a read/write buffer */
        PTR_TO_FUNC,             /* reg points to a bpf program function */
-       PTR_TO_DYNPTR,           /* reg points to a dynptr */
+       CONST_PTR_TO_DYNPTR,     /* reg points to a const struct bpf_dynptr */
        __BPF_REG_TYPE_MAX,
 
        /* Extended reg_types. */
@@ -1909,11 +1909,6 @@ static inline bool bpf_allow_uninit_stack(void)
        return perfmon_capable();
 }
 
-static inline bool bpf_allow_ptr_to_map_access(void)
-{
-       return perfmon_capable();
-}
-
 static inline bool bpf_bypass_spec_v1(void)
 {
        return perfmon_capable();
@@ -2833,7 +2828,7 @@ void bpf_dynptr_init(struct bpf_dynptr_kern *ptr, void *data,
                     enum bpf_dynptr_type type, u32 offset, u32 size);
 void bpf_dynptr_set_null(struct bpf_dynptr_kern *ptr);
 int bpf_dynptr_check_size(u32 size);
-u32 bpf_dynptr_get_size(struct bpf_dynptr_kern *ptr);
+u32 bpf_dynptr_get_size(const struct bpf_dynptr_kern *ptr);
 
 #ifdef CONFIG_BPF_LSM
 void bpf_cgroup_atype_get(u32 attach_btf_id, int cgroup_atype);
index 4bcf76a..1de7ece 100644 (file)
@@ -28,6 +28,7 @@ int bpf_lsm_verify_prog(struct bpf_verifier_log *vlog,
                        const struct bpf_prog *prog);
 
 bool bpf_lsm_is_sleepable_hook(u32 btf_id);
+bool bpf_lsm_is_trusted(const struct bpf_prog *prog);
 
 static inline struct bpf_storage_blob *bpf_inode(
        const struct inode *inode)
@@ -51,6 +52,11 @@ static inline bool bpf_lsm_is_sleepable_hook(u32 btf_id)
        return false;
 }
 
+static inline bool bpf_lsm_is_trusted(const struct bpf_prog *prog)
+{
+       return false;
+}
+
 static inline int bpf_lsm_verify_prog(struct bpf_verifier_log *vlog,
                                      const struct bpf_prog *prog)
 {
index c05aa6e..53d175c 100644 (file)
@@ -273,9 +273,9 @@ struct bpf_id_pair {
        u32 cur;
 };
 
-/* Maximum number of register states that can exist at once */
-#define BPF_ID_MAP_SIZE (MAX_BPF_REG + MAX_BPF_STACK / BPF_REG_SIZE)
 #define MAX_CALL_FRAMES 8
+/* Maximum number of register states that can exist at once */
+#define BPF_ID_MAP_SIZE ((MAX_BPF_REG + MAX_BPF_STACK / BPF_REG_SIZE) * MAX_CALL_FRAMES)
 struct bpf_verifier_state {
        /* call stack tracking */
        struct bpf_func_state *frame[MAX_CALL_FRAMES];
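
For scale: with the usual values MAX_BPF_REG = 11, MAX_BPF_STACK = 512 and BPF_REG_SIZE = 8, the old bound was 11 + 512/8 = 75 entries; scaling by MAX_CALL_FRAMES = 8 raises it to (11 + 64) * 8 = 600, one id-map slot per register state across the whole call stack.
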
@@ -452,6 +452,7 @@ struct bpf_insn_aux_data {
        /* below fields are initialized once */
        unsigned int orig_idx; /* original instruction index */
        bool prune_point;
+       bool jmp_point;
 };
 
 #define MAX_USED_MAPS 64 /* max number of maps accessed by one eBPF program */
@@ -531,7 +532,6 @@ struct bpf_verifier_env {
        bool explore_alu_limits;
        bool allow_ptr_leaks;
        bool allow_uninit_stack;
-       bool allow_ptr_to_map_access;
        bool bpf_capable;
        bool bypass_spec_v1;
        bool bypass_spec_v4;
@@ -615,11 +615,9 @@ int check_func_arg_reg_off(struct bpf_verifier_env *env,
                           enum bpf_arg_type arg_type);
 int check_mem_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
                   u32 regno, u32 mem_size);
-bool is_dynptr_reg_valid_init(struct bpf_verifier_env *env,
-                             struct bpf_reg_state *reg);
-bool is_dynptr_type_expected(struct bpf_verifier_env *env,
-                            struct bpf_reg_state *reg,
-                            enum bpf_arg_type arg_type);
+struct bpf_call_arg_meta;
+int process_dynptr_func(struct bpf_verifier_env *env, int regno,
+                       enum bpf_arg_type arg_type, struct bpf_call_arg_meta *meta);
 
 /* this lives here instead of in bpf.h because it needs to dereference tgt_prog */
 static inline u64 bpf_trampoline_compute_key(const struct bpf_prog *tgt_prog,
@@ -683,7 +681,7 @@ static inline bool bpf_prog_check_recur(const struct bpf_prog *prog)
        }
 }
 
-#define BPF_REG_TRUSTED_MODIFIERS (MEM_ALLOC | MEM_RCU | PTR_TRUSTED)
+#define BPF_REG_TRUSTED_MODIFIERS (MEM_ALLOC | PTR_TRUSTED)
 
 static inline bool bpf_type_has_unsafe_modifiers(u32 type)
 {
index 9ed0007..5f628f3 100644 (file)
@@ -70,6 +70,7 @@
 #define KF_TRUSTED_ARGS (1 << 4) /* kfunc only takes trusted pointer arguments */
 #define KF_SLEEPABLE    (1 << 5) /* kfunc may sleep */
 #define KF_DESTRUCTIVE  (1 << 6) /* kfunc performs destructive actions */
+#define KF_RCU          (1 << 7) /* kfunc only takes rcu pointer arguments */
 
 /*
  * Return the name of the passed struct, if exists, or halt the build if for
@@ -477,8 +478,10 @@ struct btf *bpf_prog_get_target_btf(const struct bpf_prog *prog);
 u32 *btf_kfunc_id_set_contains(const struct btf *btf,
                               enum bpf_prog_type prog_type,
                               u32 kfunc_btf_id);
+u32 *btf_kfunc_is_modify_return(const struct btf *btf, u32 kfunc_btf_id);
 int register_btf_kfunc_id_set(enum bpf_prog_type prog_type,
                              const struct btf_kfunc_id_set *s);
+int register_btf_fmodret_id_set(const struct btf_kfunc_id_set *kset);
 s32 btf_find_dtor_kfunc(struct btf *btf, u32 btf_id);
 int register_btf_id_dtor_kfuncs(const struct btf_id_dtor_kfunc *dtors, u32 add_cnt,
                                struct module *owner);
index 9339771..3a4f7cd 100644 (file)
@@ -266,5 +266,6 @@ MAX_BTF_TRACING_TYPE,
 
 extern u32 btf_tracing_ids[];
 extern u32 bpf_cgroup_btf_id[];
+extern u32 bpf_local_storage_map_btf_id[];
 
 #endif
index 528bd44..2b7d077 100644 (file)
@@ -68,6 +68,7 @@ struct css_task_iter {
        struct list_head                iters_node;     /* css_set->task_iters */
 };
 
+extern struct file_system_type cgroup_fs_type;
 extern struct cgroup_root cgrp_dfl_root;
 extern struct css_set init_css_set;
 
index ef4aea3..65a7877 100644 (file)
@@ -210,6 +210,20 @@ alloc_pages_bulk_array_node(gfp_t gfp, int nid, unsigned long nr_pages, struct p
        return __alloc_pages_bulk(gfp, nid, NULL, nr_pages, NULL, page_array);
 }
 
+static inline void warn_if_node_offline(int this_node, gfp_t gfp_mask)
+{
+       gfp_t warn_gfp = gfp_mask & (__GFP_THISNODE|__GFP_NOWARN);
+
+       if (warn_gfp != (__GFP_THISNODE|__GFP_NOWARN))
+               return;
+
+       if (node_online(this_node))
+               return;
+
+       pr_warn("%pGg allocation from offline node %d\n", &gfp_mask, this_node);
+       dump_stack();
+}
+
 /*
  * Allocate pages, preferring the node given as nid. The node must be valid and
  * online. For more general interface, see alloc_pages_node().
@@ -218,7 +232,7 @@ static inline struct page *
 __alloc_pages_node(int nid, gfp_t gfp_mask, unsigned int order)
 {
        VM_BUG_ON(nid < 0 || nid >= MAX_NUMNODES);
-       VM_WARN_ON((gfp_mask & __GFP_THISNODE) && !node_online(nid));
+       warn_if_node_offline(nid, gfp_mask);
 
        return __alloc_pages(gfp_mask, order, nid, NULL);
 }
@@ -227,7 +241,7 @@ static inline
 struct folio *__folio_alloc_node(gfp_t gfp, unsigned int order, int nid)
 {
        VM_BUG_ON(nid < 0 || nid >= MAX_NUMNODES);
-       VM_WARN_ON((gfp & __GFP_THISNODE) && !node_online(nid));
+       warn_if_node_offline(nid, gfp);
 
        return __folio_alloc(gfp, order, nid, NULL);
 }
index ad937f5..7cce390 100644 (file)
@@ -2,8 +2,6 @@
 #ifndef __LICENSE_H
 #define __LICENSE_H
 
-#include <linux/string.h>
-
 static inline int license_is_gpl_compatible(const char *license)
 {
        return (strcmp(license, "GPL") == 0
index c7a9198..ba6958b 100644 (file)
@@ -50,6 +50,7 @@ enum mlx5_flow_destination_type {
        MLX5_FLOW_DESTINATION_TYPE_PORT,
        MLX5_FLOW_DESTINATION_TYPE_COUNTER,
        MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE_NUM,
+       MLX5_FLOW_DESTINATION_TYPE_RANGE,
 };
 
 enum {
@@ -143,6 +144,10 @@ enum {
        MLX5_FLOW_DEST_VPORT_REFORMAT_ID  = BIT(1),
 };
 
+enum mlx5_flow_dest_range_field {
+       MLX5_FLOW_DEST_RANGE_FIELD_PKT_LEN = 0,
+};
+
 struct mlx5_flow_destination {
        enum mlx5_flow_destination_type type;
        union {
@@ -156,6 +161,13 @@ struct mlx5_flow_destination {
                        struct mlx5_pkt_reformat *pkt_reformat;
                        u8              flags;
                } vport;
+               struct {
+                       struct mlx5_flow_table         *hit_ft;
+                       struct mlx5_flow_table         *miss_ft;
+                       enum mlx5_flow_dest_range_field field;
+                       u32                             min;
+                       u32                             max;
+               } range;
                u32                     sampler_id;
        };
 };
index 5a4e914..152d2d7 100644 (file)
@@ -68,6 +68,7 @@ enum {
        MLX5_SET_HCA_CAP_OP_MOD_ODP                   = 0x2,
        MLX5_SET_HCA_CAP_OP_MOD_ATOMIC                = 0x3,
        MLX5_SET_HCA_CAP_OP_MOD_ROCE                  = 0x4,
+       MLX5_SET_HCA_CAP_OP_MOD_GENERAL_DEVICE2       = 0x20,
        MLX5_SET_HCA_CAP_OP_MODE_PORT_SELECTION       = 0x25,
 };
 
@@ -445,7 +446,10 @@ struct mlx5_ifc_flow_table_prop_layout_bits {
        u8         max_modify_header_actions[0x8];
        u8         max_ft_level[0x8];
 
-       u8         reserved_at_40[0x6];
+       u8         reformat_add_esp_trasport[0x1];
+       u8         reserved_at_41[0x2];
+       u8         reformat_del_esp_trasport[0x1];
+       u8         reserved_at_44[0x2];
        u8         execute_aso[0x1];
        u8         reserved_at_47[0x19];
 
@@ -638,8 +642,10 @@ struct mlx5_ifc_fte_match_set_misc2_bits {
        u8         reserved_at_1a0[0x8];
 
        u8         macsec_syndrome[0x8];
+       u8         ipsec_syndrome[0x8];
+       u8         reserved_at_1b8[0x8];
 
-       u8         reserved_at_1b0[0x50];
+       u8         reserved_at_1c0[0x40];
 };
 
 struct mlx5_ifc_fte_match_set_misc3_bits {
@@ -1875,7 +1881,10 @@ struct mlx5_ifc_cmd_hca_cap_bits {
 };
 
 struct mlx5_ifc_cmd_hca_cap_2_bits {
-       u8         reserved_at_0[0xa0];
+       u8         reserved_at_0[0x80];
+
+       u8         migratable[0x1];
+       u8         reserved_at_81[0x1f];
 
        u8         max_reformat_insert_size[0x8];
        u8         max_reformat_insert_offset[0x8];
@@ -6104,6 +6113,38 @@ struct mlx5_ifc_match_definer_format_32_bits {
        u8         inner_dmac_15_0[0x10];
 };
 
+enum {
+       MLX5_IFC_DEFINER_FORMAT_ID_SELECT = 61,
+};
+
+#define MLX5_IFC_DEFINER_FORMAT_OFFSET_UNUSED 0x0
+#define MLX5_IFC_DEFINER_FORMAT_OFFSET_OUTER_ETH_PKT_LEN 0x48
+#define MLX5_IFC_DEFINER_DW_SELECTORS_NUM 9
+#define MLX5_IFC_DEFINER_BYTE_SELECTORS_NUM 8
+
+struct mlx5_ifc_match_definer_match_mask_bits {
+       u8         reserved_at_1c0[5][0x20];
+       u8         match_dw_8[0x20];
+       u8         match_dw_7[0x20];
+       u8         match_dw_6[0x20];
+       u8         match_dw_5[0x20];
+       u8         match_dw_4[0x20];
+       u8         match_dw_3[0x20];
+       u8         match_dw_2[0x20];
+       u8         match_dw_1[0x20];
+       u8         match_dw_0[0x20];
+
+       u8         match_byte_7[0x8];
+       u8         match_byte_6[0x8];
+       u8         match_byte_5[0x8];
+       u8         match_byte_4[0x8];
+
+       u8         match_byte_3[0x8];
+       u8         match_byte_2[0x8];
+       u8         match_byte_1[0x8];
+       u8         match_byte_0[0x8];
+};
+
 struct mlx5_ifc_match_definer_bits {
        u8         modify_field_select[0x40];
 
@@ -6112,9 +6153,41 @@ struct mlx5_ifc_match_definer_bits {
        u8         reserved_at_80[0x10];
        u8         format_id[0x10];
 
-       u8         reserved_at_a0[0x160];
+       u8         reserved_at_a0[0x60];
+
+       u8         format_select_dw3[0x8];
+       u8         format_select_dw2[0x8];
+       u8         format_select_dw1[0x8];
+       u8         format_select_dw0[0x8];
+
+       u8         format_select_dw7[0x8];
+       u8         format_select_dw6[0x8];
+       u8         format_select_dw5[0x8];
+       u8         format_select_dw4[0x8];
+
+       u8         reserved_at_100[0x18];
+       u8         format_select_dw8[0x8];
+
+       u8         reserved_at_120[0x20];
+
+       u8         format_select_byte3[0x8];
+       u8         format_select_byte2[0x8];
+       u8         format_select_byte1[0x8];
+       u8         format_select_byte0[0x8];
+
+       u8         format_select_byte7[0x8];
+       u8         format_select_byte6[0x8];
+       u8         format_select_byte5[0x8];
+       u8         format_select_byte4[0x8];
+
+       u8         reserved_at_180[0x40];
 
-       u8         match_mask[16][0x20];
+       union {
+               struct {
+                       u8         match_mask[16][0x20];
+               };
+               struct mlx5_ifc_match_definer_match_mask_bits match_mask_format;
+       };
 };
 
 struct mlx5_ifc_general_obj_in_cmd_hdr_bits {
@@ -6384,6 +6457,9 @@ enum mlx5_reformat_ctx_type {
        MLX5_REFORMAT_TYPE_L2_TO_L2_TUNNEL = 0x2,
        MLX5_REFORMAT_TYPE_L3_TUNNEL_TO_L2 = 0x3,
        MLX5_REFORMAT_TYPE_L2_TO_L3_TUNNEL = 0x4,
+       MLX5_REFORMAT_TYPE_ADD_ESP_TRANSPORT_OVER_IPV4 = 0x5,
+       MLX5_REFORMAT_TYPE_DEL_ESP_TRANSPORT = 0x8,
+       MLX5_REFORMAT_TYPE_ADD_ESP_TRANSPORT_OVER_IPV6 = 0xb,
        MLX5_REFORMAT_TYPE_INSERT_HDR = 0xf,
        MLX5_REFORMAT_TYPE_REMOVE_HDR = 0x10,
        MLX5_REFORMAT_TYPE_ADD_MACSEC = 0x11,
@@ -11563,6 +11639,41 @@ enum {
        MLX5_IPSEC_OBJECT_ICV_LEN_16B,
 };
 
+enum {
+       MLX5_IPSEC_ASO_REG_C_0_1 = 0x0,
+       MLX5_IPSEC_ASO_REG_C_2_3 = 0x1,
+       MLX5_IPSEC_ASO_REG_C_4_5 = 0x2,
+       MLX5_IPSEC_ASO_REG_C_6_7 = 0x3,
+};
+
+enum {
+       MLX5_IPSEC_ASO_MODE              = 0x0,
+       MLX5_IPSEC_ASO_REPLAY_PROTECTION = 0x1,
+       MLX5_IPSEC_ASO_INC_SN            = 0x2,
+};
+
+struct mlx5_ifc_ipsec_aso_bits {
+       u8         valid[0x1];
+       u8         reserved_at_201[0x1];
+       u8         mode[0x2];
+       u8         window_sz[0x2];
+       u8         soft_lft_arm[0x1];
+       u8         hard_lft_arm[0x1];
+       u8         remove_flow_enable[0x1];
+       u8         esn_event_arm[0x1];
+       u8         reserved_at_20a[0x16];
+
+       u8         remove_flow_pkt_cnt[0x20];
+
+       u8         remove_flow_soft_lft[0x20];
+
+       u8         reserved_at_260[0x80];
+
+       u8         mode_parameter[0x20];
+
+       u8         replay_protection_window[0x100];
+};
+
 struct mlx5_ifc_ipsec_obj_bits {
        u8         modify_field_select[0x40];
        u8         full_offload[0x1];
@@ -11584,7 +11695,11 @@ struct mlx5_ifc_ipsec_obj_bits {
 
        u8         implicit_iv[0x40];
 
-       u8         reserved_at_100[0x700];
+       u8         reserved_at_100[0x8];
+       u8         ipsec_aso_access_pd[0x18];
+       u8         reserved_at_120[0xe0];
+
+       struct mlx5_ifc_ipsec_aso_bits ipsec_aso;
 };
 
 struct mlx5_ifc_create_ipsec_obj_in_bits {
index aad53cb..7f31432 100644 (file)
@@ -132,4 +132,6 @@ int mlx5_nic_vport_affiliate_multiport(struct mlx5_core_dev *master_mdev,
 int mlx5_nic_vport_unaffiliate_multiport(struct mlx5_core_dev *port_mdev);
 
 u64 mlx5_query_nic_system_image_guid(struct mlx5_core_dev *mdev);
+int mlx5_vport_get_other_func_cap(struct mlx5_core_dev *dev, u16 function_id, void *out,
+                                 u16 opmod);
 #endif /* __MLX5_VPORT_H__ */
index 8bbcccb..974ccca 100644 (file)
@@ -1852,6 +1852,25 @@ static void __maybe_unused show_free_areas(unsigned int flags, nodemask_t *nodem
        __show_free_areas(flags, nodemask, MAX_NR_ZONES - 1);
 }
 
+/*
+ * Parameter block passed down to zap_pte_range in exceptional cases.
+ */
+struct zap_details {
+       struct folio *single_folio;     /* Locked folio to be unmapped */
+       bool even_cows;                 /* Zap COWed private pages too? */
+       zap_flags_t zap_flags;          /* Extra flags for zapping */
+};
+
+/*
+ * Whether to drop the pte markers, for example, the uffd-wp information for
+ * file-backed memory.  This should only be specified when we will completely
+ * drop the page in the mm, either by truncation or unmapping of the vma.  By
+ * default, the flag is not set.
+ */
+#define  ZAP_FLAG_DROP_MARKER        ((__force zap_flags_t) BIT(0))
+/* Set in unmap_vmas() to indicate a final unmap call.  Only used by hugetlb */
+#define  ZAP_FLAG_UNMAP              ((__force zap_flags_t) BIT(1))
+
 #ifdef CONFIG_MMU
 extern bool can_do_mlock(void);
 #else
@@ -1869,6 +1888,8 @@ void zap_vma_ptes(struct vm_area_struct *vma, unsigned long address,
                  unsigned long size);
 void zap_page_range(struct vm_area_struct *vma, unsigned long address,
                    unsigned long size);
+void zap_page_range_single(struct vm_area_struct *vma, unsigned long address,
+                          unsigned long size, struct zap_details *details);
 void unmap_vmas(struct mmu_gather *tlb, struct maple_tree *mt,
                struct vm_area_struct *start_vma, unsigned long start,
                unsigned long end);
@@ -3467,12 +3488,4 @@ madvise_set_anon_name(struct mm_struct *mm, unsigned long start,
 }
 #endif
 
-/*
- * Whether to drop the pte markers, for example, the uffd-wp information for
- * file-backed memory.  This should only be specified when we will completely
- * drop the page in the mm, either by truncation or unmapping of the vma.  By
- * default, the flag is not set.
- */
-#define  ZAP_FLAG_DROP_MARKER        ((__force zap_flags_t) BIT(0))
-
 #endif /* _LINUX_MM_H */
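
zap_page_range_single() is now visible alongside the zap_details block it consumes. A sketch of a caller, assuming a valid vma and a start address within it (the field names come from the struct above):

    struct zap_details details = {
            .even_cows = true,      /* also zap COWed private pages */
    };

    zap_page_range_single(vma, start, PAGE_SIZE, &details);
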
index 9c50bc4..6f79938 100644 (file)
@@ -451,7 +451,7 @@ static inline bool mmc_ready_for_data(u32 status)
 #define MMC_SECURE_TRIM1_ARG           0x80000001
 #define MMC_SECURE_TRIM2_ARG           0x80008000
 #define MMC_SECURE_ARGS                        0x80000000
-#define MMC_TRIM_ARGS                  0x00008001
+#define MMC_TRIM_OR_DISCARD_ARGS       0x00008003
 
 #define mmc_driver_type_mask(n)                (1 << (n))
 
index 5aa35c5..aad12a1 100644 (file)
@@ -78,6 +78,7 @@ struct xdp_buff;
 void synchronize_net(void);
 void netdev_set_default_ethtool_ops(struct net_device *dev,
                                    const struct ethtool_ops *ops);
+void netdev_sw_irq_coalesce_default_on(struct net_device *dev);
 
 /* Backlog congestion levels */
 #define NET_RX_SUCCESS         0       /* keep 'em coming, baby */
@@ -1040,6 +1041,10 @@ struct xfrmdev_ops {
        bool    (*xdo_dev_offload_ok) (struct sk_buff *skb,
                                       struct xfrm_state *x);
        void    (*xdo_dev_state_advance_esn) (struct xfrm_state *x);
+       void    (*xdo_dev_state_update_curlft) (struct xfrm_state *x);
+       int     (*xdo_dev_policy_add) (struct xfrm_policy *x);
+       void    (*xdo_dev_policy_delete) (struct xfrm_policy *x);
+       void    (*xdo_dev_policy_free) (struct xfrm_policy *x);
 };
 #endif
 
@@ -1657,6 +1662,7 @@ struct net_device_ops {
  * @IFF_FAILOVER: device is a failover master device
  * @IFF_FAILOVER_SLAVE: device is lower dev of a failover master device
  * @IFF_L3MDEV_RX_HANDLER: only invoke the rx handler of L3 master device
+ * @IFF_NO_ADDRCONF: prevent ipv6 addrconf
  * @IFF_TX_SKB_NO_LINEAR: device/driver is capable of xmitting frames with
  *     skb_headlen(skb) == 0 (data starts from frag0)
  * @IFF_CHANGE_PROTO_DOWN: device supports setting carrier via IFLA_PROTO_DOWN
@@ -1692,7 +1698,7 @@ enum netdev_priv_flags {
        IFF_FAILOVER                    = 1<<27,
        IFF_FAILOVER_SLAVE              = 1<<28,
        IFF_L3MDEV_RX_HANDLER           = 1<<29,
-       /* was IFF_LIVE_RENAME_OK */
+       IFF_NO_ADDRCONF                 = BIT_ULL(30),
        IFF_TX_SKB_NO_LINEAR            = BIT_ULL(31),
        IFF_CHANGE_PROTO_DOWN           = BIT_ULL(32),
 };
index ada1296..ab934ad 100644 (file)
@@ -515,6 +515,16 @@ ip_set_init_skbinfo(struct ip_set_skbinfo *skbinfo,
        *skbinfo = ext->skbinfo;
 }
 
+static inline void
+nf_inet_addr_mask_inplace(union nf_inet_addr *a1,
+                         const union nf_inet_addr *mask)
+{
+       a1->all[0] &= mask->all[0];
+       a1->all[1] &= mask->all[1];
+       a1->all[2] &= mask->all[2];
+       a1->all[3] &= mask->all[3];
+}
+
 #define IP_SET_INIT_KEXT(skb, opt, set)                        \
        { .bytes = (skb)->len, .packets = 1, .target = true,\
          .timeout = ip_set_adt_opt_timeout(opt, set) }
index a108b60..5f0d7d0 100644 (file)
@@ -165,6 +165,13 @@ static inline pte_t *virt_to_kpte(unsigned long vaddr)
        return pmd_none(*pmd) ? NULL : pte_offset_kernel(pmd, vaddr);
 }
 
+#ifndef pmd_young
+static inline int pmd_young(pmd_t pmd)
+{
+       return 0;
+}
+#endif
+
 #ifndef __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
 extern int ptep_set_access_flags(struct vm_area_struct *vma,
                                 unsigned long address, pte_t *ptep,
@@ -260,6 +267,17 @@ static inline int pmdp_clear_flush_young(struct vm_area_struct *vma,
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 #endif
 
+#ifndef arch_has_hw_nonleaf_pmd_young
+/*
+ * Return whether the accessed bit in non-leaf PMD entries is supported on the
+ * local CPU.
+ */
+static inline bool arch_has_hw_nonleaf_pmd_young(void)
+{
+       return IS_ENABLED(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG);
+}
+#endif
+
 #ifndef arch_has_hw_pte_young
 /*
  * Return whether the accessed bit is supported on the local CPU.
index 68dab3e..5b5357c 100644 (file)
@@ -323,29 +323,36 @@ static inline struct rhash_lock_head __rcu **rht_bucket_insert(
  * When we write to a bucket without unlocking, we use rht_assign_locked().
  */
 
-static inline void rht_lock(struct bucket_table *tbl,
-                           struct rhash_lock_head __rcu **bkt)
+static inline unsigned long rht_lock(struct bucket_table *tbl,
+                                    struct rhash_lock_head __rcu **bkt)
 {
-       local_bh_disable();
+       unsigned long flags;
+
+       local_irq_save(flags);
        bit_spin_lock(0, (unsigned long *)bkt);
        lock_map_acquire(&tbl->dep_map);
+       return flags;
 }
 
-static inline void rht_lock_nested(struct bucket_table *tbl,
-                                  struct rhash_lock_head __rcu **bucket,
-                                  unsigned int subclass)
+static inline unsigned long rht_lock_nested(struct bucket_table *tbl,
+                                       struct rhash_lock_head __rcu **bucket,
+                                       unsigned int subclass)
 {
-       local_bh_disable();
+       unsigned long flags;
+
+       local_irq_save(flags);
        bit_spin_lock(0, (unsigned long *)bucket);
        lock_acquire_exclusive(&tbl->dep_map, subclass, 0, NULL, _THIS_IP_);
+       return flags;
 }
 
 static inline void rht_unlock(struct bucket_table *tbl,
-                             struct rhash_lock_head __rcu **bkt)
+                             struct rhash_lock_head __rcu **bkt,
+                             unsigned long flags)
 {
        lock_map_release(&tbl->dep_map);
        bit_spin_unlock(0, (unsigned long *)bkt);
-       local_bh_enable();
+       local_irq_restore(flags);
 }
 
 static inline struct rhash_head *__rht_ptr(
@@ -393,7 +400,8 @@ static inline void rht_assign_locked(struct rhash_lock_head __rcu **bkt,
 
 static inline void rht_assign_unlock(struct bucket_table *tbl,
                                     struct rhash_lock_head __rcu **bkt,
-                                    struct rhash_head *obj)
+                                    struct rhash_head *obj,
+                                    unsigned long flags)
 {
        if (rht_is_a_nulls(obj))
                obj = NULL;
@@ -401,7 +409,7 @@ static inline void rht_assign_unlock(struct bucket_table *tbl,
        rcu_assign_pointer(*bkt, (void *)obj);
        preempt_enable();
        __release(bitlock);
-       local_bh_enable();
+       local_irq_restore(flags);
 }
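
The bucket lock API now disables hard interrupts rather than just BHs, so the saved flags must travel from lock to unlock. Every caller converts to the same shape, mirrored by the hunks below:

    unsigned long flags;

    flags = rht_lock(tbl, bkt);     /* saves and disables irq state */
    /* ... modify the bucket ... */
    rht_unlock(tbl, bkt, flags);    /* restores the saved state */
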
 
 /**
@@ -706,6 +714,7 @@ static inline void *__rhashtable_insert_fast(
        struct rhash_head __rcu **pprev;
        struct bucket_table *tbl;
        struct rhash_head *head;
+       unsigned long flags;
        unsigned int hash;
        int elasticity;
        void *data;
@@ -720,11 +729,11 @@ static inline void *__rhashtable_insert_fast(
        if (!bkt)
                goto out;
        pprev = NULL;
-       rht_lock(tbl, bkt);
+       flags = rht_lock(tbl, bkt);
 
        if (unlikely(rcu_access_pointer(tbl->future_tbl))) {
 slow_path:
-               rht_unlock(tbl, bkt);
+               rht_unlock(tbl, bkt, flags);
                rcu_read_unlock();
                return rhashtable_insert_slow(ht, key, obj);
        }
@@ -756,9 +765,9 @@ slow_path:
                RCU_INIT_POINTER(list->rhead.next, head);
                if (pprev) {
                        rcu_assign_pointer(*pprev, obj);
-                       rht_unlock(tbl, bkt);
+                       rht_unlock(tbl, bkt, flags);
                } else
-                       rht_assign_unlock(tbl, bkt, obj);
+                       rht_assign_unlock(tbl, bkt, obj, flags);
                data = NULL;
                goto out;
        }
@@ -785,7 +794,7 @@ slow_path:
        }
 
        atomic_inc(&ht->nelems);
-       rht_assign_unlock(tbl, bkt, obj);
+       rht_assign_unlock(tbl, bkt, obj, flags);
 
        if (rht_grow_above_75(ht, tbl))
                schedule_work(&ht->run_work);
@@ -797,7 +806,7 @@ out:
        return data;
 
 out_unlock:
-       rht_unlock(tbl, bkt);
+       rht_unlock(tbl, bkt, flags);
        goto out;
 }
 
@@ -991,6 +1000,7 @@ static inline int __rhashtable_remove_fast_one(
        struct rhash_lock_head __rcu **bkt;
        struct rhash_head __rcu **pprev;
        struct rhash_head *he;
+       unsigned long flags;
        unsigned int hash;
        int err = -ENOENT;
 
@@ -999,7 +1009,7 @@ static inline int __rhashtable_remove_fast_one(
        if (!bkt)
                return -ENOENT;
        pprev = NULL;
-       rht_lock(tbl, bkt);
+       flags = rht_lock(tbl, bkt);
 
        rht_for_each_from(he, rht_ptr(bkt, tbl, hash), tbl, hash) {
                struct rhlist_head *list;
@@ -1043,14 +1053,14 @@ static inline int __rhashtable_remove_fast_one(
 
                if (pprev) {
                        rcu_assign_pointer(*pprev, obj);
-                       rht_unlock(tbl, bkt);
+                       rht_unlock(tbl, bkt, flags);
                } else {
-                       rht_assign_unlock(tbl, bkt, obj);
+                       rht_assign_unlock(tbl, bkt, obj, flags);
                }
                goto unlocked;
        }
 
-       rht_unlock(tbl, bkt);
+       rht_unlock(tbl, bkt, flags);
 unlocked:
        if (err > 0) {
                atomic_dec(&ht->nelems);
@@ -1143,6 +1153,7 @@ static inline int __rhashtable_replace_fast(
        struct rhash_lock_head __rcu **bkt;
        struct rhash_head __rcu **pprev;
        struct rhash_head *he;
+       unsigned long flags;
        unsigned int hash;
        int err = -ENOENT;
 
@@ -1158,7 +1169,7 @@ static inline int __rhashtable_replace_fast(
                return -ENOENT;
 
        pprev = NULL;
-       rht_lock(tbl, bkt);
+       flags = rht_lock(tbl, bkt);
 
        rht_for_each_from(he, rht_ptr(bkt, tbl, hash), tbl, hash) {
                if (he != obj_old) {
@@ -1169,15 +1180,15 @@ static inline int __rhashtable_replace_fast(
                rcu_assign_pointer(obj_new->next, obj_old->next);
                if (pprev) {
                        rcu_assign_pointer(*pprev, obj_new);
-                       rht_unlock(tbl, bkt);
+                       rht_unlock(tbl, bkt, flags);
                } else {
-                       rht_assign_unlock(tbl, bkt, obj_new);
+                       rht_assign_unlock(tbl, bkt, obj_new, flags);
                }
                err = 0;
                goto unlocked;
        }
 
-       rht_unlock(tbl, bkt);
+       rht_unlock(tbl, bkt, flags);
 
 unlocked:
        return err;
index 4e464a2..4c84924 100644 (file)
@@ -1255,6 +1255,7 @@ struct sk_buff *build_skb_around(struct sk_buff *skb,
 void skb_attempt_defer_free(struct sk_buff *skb);
 
 struct sk_buff *napi_build_skb(void *data, unsigned int frag_size);
+struct sk_buff *slab_build_skb(void *data);
 
 /**
  * alloc_skb - allocate a network buffer
index 70d6cb9..84f7874 100644 (file)
@@ -82,6 +82,7 @@ struct sk_psock {
        u32                             apply_bytes;
        u32                             cork_bytes;
        u32                             eval;
+       bool                            redir_ingress; /* undefined if sk_redir is null */
        struct sk_msg                   *cork;
        struct sk_psock_progs           progs;
 #if IS_ENABLED(CONFIG_BPF_STREAM_PARSER)
index 7293259..db637a1 100644 (file)
@@ -159,7 +159,7 @@ struct mtk_wed_ops {
        int (*tx_ring_setup)(struct mtk_wed_device *dev, int ring,
                             void __iomem *regs, bool reset);
        int (*rx_ring_setup)(struct mtk_wed_device *dev, int ring,
-                            void __iomem *regs);
+                            void __iomem *regs, bool reset);
        int (*txfree_ring_setup)(struct mtk_wed_device *dev,
                                 void __iomem *regs);
        int (*msg_update)(struct mtk_wed_device *dev, int cmd_id,
@@ -227,8 +227,8 @@ mtk_wed_get_rx_capa(struct mtk_wed_device *dev)
        (_dev)->ops->irq_get(_dev, _mask)
 #define mtk_wed_device_irq_set_mask(_dev, _mask) \
        (_dev)->ops->irq_set_mask(_dev, _mask)
-#define mtk_wed_device_rx_ring_setup(_dev, _ring, _regs) \
-       (_dev)->ops->rx_ring_setup(_dev, _ring, _regs)
+#define mtk_wed_device_rx_ring_setup(_dev, _ring, _regs, _reset) \
+       (_dev)->ops->rx_ring_setup(_dev, _ring, _regs, _reset)
 #define mtk_wed_device_ppe_check(_dev, _skb, _reason, _hash) \
        (_dev)->ops->ppe_check(_dev, _skb, _reason, _hash)
 #define mtk_wed_device_update_msg(_dev, _id, _msg, _len) \
@@ -248,7 +248,7 @@ static inline bool mtk_wed_device_active(struct mtk_wed_device *dev)
 #define mtk_wed_device_reg_write(_dev, _reg, _val) do {} while (0)
 #define mtk_wed_device_irq_get(_dev, _mask) 0
 #define mtk_wed_device_irq_set_mask(_dev, _mask) do {} while (0)
-#define mtk_wed_device_rx_ring_setup(_dev, _ring, _regs) -ENODEV
+#define mtk_wed_device_rx_ring_setup(_dev, _ring, _regs, _reset) -ENODEV
 #define mtk_wed_device_ppe_check(_dev, _skb, _reason, _hash)  do {} while (0)
 #define mtk_wed_device_update_msg(_dev, _id, _msg, _len) -ENODEV
 #define mtk_wed_device_stop(_dev) do {} while (0)
index fb2e886..83ca2e8 100644 (file)
@@ -271,5 +271,6 @@ struct plat_stmmacenet_data {
        int msi_tx_base_vec;
        bool use_phy_wol;
        bool sph_disable;
+       bool serdes_up_after_phy_linkup;
 };
 #endif
index 9ff1ad4..d2d2f41 100644 (file)
@@ -1829,6 +1829,7 @@ static inline int usb_get_ptm_status(struct usb_device *dev, void *data)
 
 extern int usb_string(struct usb_device *dev, int index,
        char *buf, size_t size);
+extern char *usb_cache_string(struct usb_device *udev, int index);
 
 /* wrappers that also update important state inside usbcore */
 extern int usb_clear_halt(struct usb_device *dev, int pipe);
index a960de6..bdf8de2 100644 (file)
@@ -15,6 +15,7 @@ static inline bool virtio_net_hdr_match_proto(__be16 protocol, __u8 gso_type)
        case VIRTIO_NET_HDR_GSO_TCPV6:
                return protocol == cpu_to_be16(ETH_P_IPV6);
        case VIRTIO_NET_HDR_GSO_UDP:
+       case VIRTIO_NET_HDR_GSO_UDP_L4:
                return protocol == cpu_to_be16(ETH_P_IP) ||
                       protocol == cpu_to_be16(ETH_P_IPV6);
        default:
@@ -31,6 +32,7 @@ static inline int virtio_net_hdr_set_proto(struct sk_buff *skb,
        switch (hdr->gso_type & ~VIRTIO_NET_HDR_GSO_ECN) {
        case VIRTIO_NET_HDR_GSO_TCPV4:
        case VIRTIO_NET_HDR_GSO_UDP:
+       case VIRTIO_NET_HDR_GSO_UDP_L4:
                skb->protocol = cpu_to_be16(ETH_P_IP);
                break;
        case VIRTIO_NET_HDR_GSO_TCPV6:
@@ -69,6 +71,11 @@ static inline int virtio_net_hdr_to_skb(struct sk_buff *skb,
                        ip_proto = IPPROTO_UDP;
                        thlen = sizeof(struct udphdr);
                        break;
+               case VIRTIO_NET_HDR_GSO_UDP_L4:
+                       gso_type = SKB_GSO_UDP_L4;
+                       ip_proto = IPPROTO_UDP;
+                       thlen = sizeof(struct udphdr);
+                       break;
                default:
                        return -EINVAL;
                }
@@ -182,6 +189,8 @@ static inline int virtio_net_hdr_from_skb(const struct sk_buff *skb,
                        hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
                else if (sinfo->gso_type & SKB_GSO_TCPV6)
                        hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV6;
+               else if (sinfo->gso_type & SKB_GSO_UDP_L4)
+                       hdr->gso_type = VIRTIO_NET_HDR_GSO_UDP_L4;
                else
                        return -EINVAL;
                if (sinfo->gso_type & SKB_GSO_TCP_ECN)
index c94ea1a..2a6f443 100644 (file)
@@ -101,11 +101,6 @@ static inline enum flow_action_hw_stats tc_act_hw_stats(u8 hw_stats)
        return hw_stats;
 }
 
-#ifdef CONFIG_NET_CLS_ACT
-
-#define ACT_P_CREATED 1
-#define ACT_P_DELETED 1
-
 typedef void (*tc_action_priv_destructor)(void *priv);
 
 struct tc_action_ops {
@@ -140,6 +135,11 @@ struct tc_action_ops {
                                     struct netlink_ext_ack *extack);
 };
 
+#ifdef CONFIG_NET_CLS_ACT
+
+#define ACT_P_CREATED 1
+#define ACT_P_DELETED 1
+
 struct tc_action_net {
        struct tcf_idrinfo *idrinfo;
        const struct tc_action_ops *ops;
index b69ca69..d5a5ae9 100644 (file)
@@ -66,10 +66,10 @@ int rxrpc_kernel_charge_accept(struct socket *, rxrpc_notify_rx_t,
 void rxrpc_kernel_set_tx_length(struct socket *, struct rxrpc_call *, s64);
 bool rxrpc_kernel_check_life(const struct socket *, const struct rxrpc_call *);
 u32 rxrpc_kernel_get_epoch(struct socket *, struct rxrpc_call *);
-bool rxrpc_kernel_call_is_complete(struct rxrpc_call *);
 void rxrpc_kernel_set_max_life(struct socket *, struct rxrpc_call *,
                               unsigned long);
 
 int rxrpc_sock_set_min_security_level(struct sock *sk, unsigned int val);
+int rxrpc_sock_set_security_keyring(struct sock *, struct key *);
 
 #endif /* _NET_RXRPC_H */
index e004ba0..8d773b0 100644 (file)
@@ -228,6 +228,17 @@ enum {
         */
        HCI_QUIRK_VALID_LE_STATES,
 
+       /* When this quirk is set, erroneous data reporting is ignored.
+        * This is mainly needed because the HCI Read Default Erroneous
+        * Data Reporting command is advertised but not supported; such
+        * controllers often reply with Unknown Command and tend to lock
+        * up randomly, requiring a hard reset.
+        *
+        * This quirk can be set before hci_register_dev is called or
+        * during the hdev->setup vendor callback.
+        */
+       HCI_QUIRK_BROKEN_ERR_DATA_REPORTING,
+
        /*
         * When this quirk is set, then the hci_suspend_notifier is not
         * registered. This is intended for devices which drop completely
@@ -263,6 +274,26 @@ enum {
         * during the hdev->setup vendor callback.
         */
        HCI_QUIRK_BROKEN_ENHANCED_SETUP_SYNC_CONN,
+
+       /*
+        * When this quirk is set, the HCI_OP_LE_SET_EXT_SCAN_ENABLE command is
+        * disabled. This is required for some Broadcom controllers which
+        * erroneously claim to support extended scanning.
+        *
+        * This quirk can be set before hci_register_dev is called or
+        * during the hdev->setup vendor callback.
+        */
+       HCI_QUIRK_BROKEN_EXT_SCAN,
+
+       /*
+        * When this quirk is set, the HCI_OP_GET_MWS_TRANSPORT_CONFIG command is
+        * disabled. This is required for some Broadcom controllers which
+        * erroneously claim to support MWS Transport Layer Configuration.
+        *
+        * This quirk can be set before hci_register_dev is called or
+        * during the hdev->setup vendor callback.
+        */
+       HCI_QUIRK_BROKEN_MWS_TRANSPORT_CONFIG,
 };
 
 /* HCI device flags */
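
A minimal sketch of how a vendor driver flags an affected controller, assuming a btusb-style setup callback (demo_setup and its wiring are invented; the quirk names and the set_bit() pattern on &hdev->quirks follow the comments above):

    #include <net/bluetooth/hci_core.h>

    static int demo_setup(struct hci_dev *hdev)
    {
            /* This controller misbehaves on these optional commands */
            set_bit(HCI_QUIRK_BROKEN_ERR_DATA_REPORTING, &hdev->quirks);
            set_bit(HCI_QUIRK_BROKEN_EXT_SCAN, &hdev->quirks);
            set_bit(HCI_QUIRK_BROKEN_MWS_TRANSPORT_CONFIG, &hdev->quirks);
            return 0;
    }

The driver assigns hdev->setup = demo_setup before calling hci_register_dev(), matching the window the comments above allow.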
@@ -1424,7 +1455,6 @@ struct hci_std_codecs_v2 {
 } __packed;
 
 struct hci_vnd_codec_v2 {
-       __u8    id;
        __le16  cid;
        __le16  vid;
        __u8    transport;
@@ -2580,6 +2610,7 @@ struct hci_ev_le_conn_complete {
 #define LE_EXT_ADV_DIRECT_IND          0x0004
 #define LE_EXT_ADV_SCAN_RSP            0x0008
 #define LE_EXT_ADV_LEGACY_PDU          0x0010
+#define LE_EXT_ADV_EVT_TYPE_MASK       0x007f
 
 #define ADDR_LE_DEV_PUBLIC             0x00
 #define ADDR_LE_DEV_RANDOM             0x01
index c54bc71..7254edf 100644 (file)
@@ -659,6 +659,7 @@ struct hci_dev {
        int (*set_diag)(struct hci_dev *hdev, bool enable);
        int (*set_bdaddr)(struct hci_dev *hdev, const bdaddr_t *bdaddr);
        void (*cmd_timeout)(struct hci_dev *hdev);
+       void (*reset)(struct hci_dev *hdev);
        bool (*wakeup)(struct hci_dev *hdev);
        int (*set_quality_report)(struct hci_dev *hdev, bool enable);
        int (*get_data_path_id)(struct hci_dev *hdev, __u8 *data_path);
@@ -1689,7 +1690,9 @@ void hci_conn_del_sysfs(struct hci_conn *conn);
 
 /* Use ext scanning if set ext scan param and ext scan enable is supported */
 #define use_ext_scan(dev) (((dev)->commands[37] & 0x20) && \
-                          ((dev)->commands[37] & 0x40))
+                          ((dev)->commands[37] & 0x40) && \
+                          !test_bit(HCI_QUIRK_BROKEN_EXT_SCAN, &(dev)->quirks))
+
 /* Use ext create connection if command is supported */
 #define use_ext_conn(dev) ((dev)->commands[37] & 0x80)
 
@@ -1717,6 +1720,9 @@ void hci_conn_del_sysfs(struct hci_conn *conn);
        ((dev)->le_features[3] & HCI_LE_CIS_PERIPHERAL)
 #define bis_capable(dev) ((dev)->le_features[3] & HCI_LE_ISO_BROADCASTER)
 
+#define mws_transport_config_capable(dev) (((dev)->commands[30] & 0x08) && \
+       (!test_bit(HCI_QUIRK_BROKEN_MWS_TRANSPORT_CONFIG, &(dev)->quirks)))
+
 /* ----- HCI protocols ----- */
 #define HCI_PROTO_DEFER             0x01
 
index e1481f9..d09c393 100644 (file)
@@ -260,6 +260,24 @@ struct ieee802154_addr {
        };
 };
 
+/**
+ * struct ieee802154_coord_desc - Coordinator descriptor
+ * @addr: PAN ID and coordinator address
+ * @page: page this coordinator is using
+ * @channel: channel this coordinator is using
+ * @superframe_spec: superframe specification as received
+ * @link_quality: link quality indicator at which the beacon was received
+ * @gts_permit: the coordinator accepts GTS requests
+ */
+struct ieee802154_coord_desc {
+       struct ieee802154_addr addr;
+       u8 page;
+       u8 channel;
+       u16 superframe_spec;
+       u8 link_quality;
+       bool gts_permit;
+};
+
 struct ieee802154_llsec_key_id {
        u8 mode;
        u8 id;
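
An illustrative filler for the new descriptor, e.g. from a beacon-processing path; every parameter and the function itself are invented, only the struct fields come from the hunk above:

    #include <net/ieee802154_netdev.h>       /* struct ieee802154_addr */

    static void demo_fill_coord(struct ieee802154_coord_desc *desc,
                                const struct ieee802154_addr *src,
                                u8 page, u8 channel, u16 sf_spec,
                                u8 lqi, bool gts_permit)
    {
            desc->addr = *src;               /* PAN ID + coordinator address */
            desc->page = page;
            desc->channel = channel;
            desc->superframe_spec = sf_spec; /* as received in the beacon */
            desc->link_quality = lqi;
            desc->gts_permit = gts_permit;
    }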
index 5f6eca5..6a2e4f2 100644 (file)
@@ -621,6 +621,8 @@ enum devlink_param_generic_id {
 #define DEVLINK_INFO_VERSION_GENERIC_FW_ROCE   "fw.roce"
 /* Firmware bundle identifier */
 #define DEVLINK_INFO_VERSION_GENERIC_FW_BUNDLE_ID      "fw.bundle_id"
+/* Bootloader */
+#define DEVLINK_INFO_VERSION_GENERIC_FW_BOOTLOADER     "fw.bootloader"
 
 /**
  * struct devlink_flash_update_params - Flash Update parameters
@@ -1452,6 +1454,45 @@ struct devlink_ops {
                                         const u8 *hw_addr, int hw_addr_len,
                                         struct netlink_ext_ack *extack);
        /**
+        * @port_fn_roce_get: Port function's RoCE get function.
+        *
+        * Query RoCE state of a function managed by the devlink port.
+        * Return -EOPNOTSUPP if port function RoCE handling is not supported.
+        */
+       int (*port_fn_roce_get)(struct devlink_port *devlink_port,
+                               bool *is_enable,
+                               struct netlink_ext_ack *extack);
+       /**
+        * @port_fn_roce_set: Port function's RoCE set function.
+        *
+        * Enable/Disable the RoCE state of a function managed by the devlink
+        * port.
+        * Return -EOPNOTSUPP if port function RoCE handling is not supported.
+        */
+       int (*port_fn_roce_set)(struct devlink_port *devlink_port,
+                               bool enable, struct netlink_ext_ack *extack);
+       /**
+        * @port_fn_migratable_get: Port function's migratable get function.
+        *
+        * Query migratable state of a function managed by the devlink port.
+        * Return -EOPNOTSUPP if port function migratable handling is not
+        * supported.
+        */
+       int (*port_fn_migratable_get)(struct devlink_port *devlink_port,
+                                     bool *is_enable,
+                                     struct netlink_ext_ack *extack);
+       /**
+        * @port_fn_migratable_set: Port function's migratable set function.
+        *
+        * Enable/Disable migratable state of a function managed by the devlink
+        * port.
+        * Return -EOPNOTSUPP if port function migratable handling is not
+        * supported.
+        */
+       int (*port_fn_migratable_set)(struct devlink_port *devlink_port,
+                                     bool enable,
+                                     struct netlink_ext_ack *extack);
+       /**
         * port_new() - Add a new port function of a specified flavor
         * @devlink: Devlink instance
         * @attrs: attributes of the new port
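
A sketch of the driver side, assuming a driver that embeds its devlink port (struct demo_port and its fields are invented; the callback signature is the one documented above):

    #include <net/devlink.h>

    struct demo_port {
            struct devlink_port dl_port;
            bool roce_enabled;
    };

    static int demo_port_fn_roce_get(struct devlink_port *devlink_port,
                                     bool *is_enable,
                                     struct netlink_ext_ack *extack)
    {
            struct demo_port *p = container_of(devlink_port,
                                               struct demo_port, dl_port);

            *is_enable = p->roce_enabled;
            return 0;
    }

A driver without RoCE handling would instead leave the op unset or return -EOPNOTSUPP, as the kernel-doc above specifies.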
index a454cf4..1b7fae4 100644 (file)
@@ -26,6 +26,7 @@ struct macsec_info {
 struct xfrm_md_info {
        u32 if_id;
        int link;
+       struct dst_entry *dst_orig;
 };
 
 struct metadata_dst {
index ff1804a..c6c6110 100644 (file)
@@ -29,6 +29,7 @@
 #include <net/netfilter/nf_conntrack.h>
 #endif
 #include <net/net_namespace.h>         /* Netw namespace */
+#include <linux/sched/isolation.h>
 
 #define IP_VS_HDR_INVERSE      1
 #define IP_VS_HDR_ICMP         2
@@ -42,6 +43,8 @@ static inline struct netns_ipvs *net_ipvs(struct net* net)
 /* Connections' size value needed by ip_vs_ctl.c */
 extern int ip_vs_conn_tab_size;
 
+extern struct mutex __ip_vs_mutex;
+
 struct ip_vs_iphdr {
        int hdr_flags;  /* ipvs flags */
        __u32 off;      /* Where IP or IPv4 header starts */
@@ -351,11 +354,11 @@ struct ip_vs_seq {
 
 /* counters per cpu */
 struct ip_vs_counters {
-       __u64           conns;          /* connections scheduled */
-       __u64           inpkts;         /* incoming packets */
-       __u64           outpkts;        /* outgoing packets */
-       __u64           inbytes;        /* incoming bytes */
-       __u64           outbytes;       /* outgoing bytes */
+       u64_stats_t     conns;          /* connections scheduled */
+       u64_stats_t     inpkts;         /* incoming packets */
+       u64_stats_t     outpkts;        /* outgoing packets */
+       u64_stats_t     inbytes;        /* incoming bytes */
+       u64_stats_t     outbytes;       /* outgoing bytes */
 };
 /* Stats per cpu */
 struct ip_vs_cpu_stats {
@@ -363,9 +366,12 @@ struct ip_vs_cpu_stats {
        struct u64_stats_sync   syncp;
 };
 
+/* Default nice for estimator kthreads */
+#define IPVS_EST_NICE          0
+
 /* IPVS statistics objects */
 struct ip_vs_estimator {
-       struct list_head        list;
+       struct hlist_node       list;
 
        u64                     last_inbytes;
        u64                     last_outbytes;
@@ -378,6 +384,10 @@ struct ip_vs_estimator {
        u64                     outpps;
        u64                     inbps;
        u64                     outbps;
+
+       s32                     ktid:16,        /* kthread ID, -1=temp list */
+                               ktrow:8,        /* row/tick ID for kthread */
+                               ktcid:8;        /* chain ID for kthread tick */
 };
 
 /*
@@ -405,6 +415,76 @@ struct ip_vs_stats {
        struct ip_vs_kstats     kstats0;        /* reset values */
 };
 
+struct ip_vs_stats_rcu {
+       struct ip_vs_stats      s;
+       struct rcu_head         rcu_head;
+};
+
+int ip_vs_stats_init_alloc(struct ip_vs_stats *s);
+struct ip_vs_stats *ip_vs_stats_alloc(void);
+void ip_vs_stats_release(struct ip_vs_stats *stats);
+void ip_vs_stats_free(struct ip_vs_stats *stats);
+
+/* Process estimators in multiple timer ticks (20/50/100, see ktrow) */
+#define IPVS_EST_NTICKS                50
+/* Estimation uses a 2-second period containing ticks (in jiffies) */
+#define IPVS_EST_TICK          ((2 * HZ) / IPVS_EST_NTICKS)
+
+/* Limit of CPU load per kthread (8 for 12.5%), as a ratio of CPU
+ * capacity (1/C). A value of 4 or above ensures kthreads can take on
+ * work without exceeding the CPU capacity under varying conditions.
+ */
+#define IPVS_EST_LOAD_DIVISOR  8
+
+/* Cap kthreads per CPU so their combined load stays below 50% */
+#define IPVS_EST_CPU_KTHREADS  (IPVS_EST_LOAD_DIVISOR / 2)
+
+/* Desired number of chains per timer tick (the chain load factor, in
+ * 100us units); 48 chains = 4.8ms of a 40ms tick (12% CPU usage):
+ * 2 sec * 1000 (ms per sec) * 10 (100us slots per ms) / 8 (12.5%) / 50 ticks
+ */
+#define IPVS_EST_CHAIN_FACTOR  \
+       ALIGN_DOWN(2 * 1000 * 10 / IPVS_EST_LOAD_DIVISOR / IPVS_EST_NTICKS, 8)
+
+/* Compile-time number of chains per tick.
+ * The conditions should match those in cond_resched_rcu().
+ */
+#if defined(CONFIG_DEBUG_ATOMIC_SLEEP) || !defined(CONFIG_PREEMPT_RCU)
+#define IPVS_EST_TICK_CHAINS   IPVS_EST_CHAIN_FACTOR
+#else
+#define IPVS_EST_TICK_CHAINS   1
+#endif
+
+#if IPVS_EST_NTICKS > 127
+#error Too many timer ticks for ktrow
+#endif
+
+/* Multiple chains processed in same tick */
+struct ip_vs_est_tick_data {
+       struct hlist_head       chains[IPVS_EST_TICK_CHAINS];
+       DECLARE_BITMAP(present, IPVS_EST_TICK_CHAINS);
+       DECLARE_BITMAP(full, IPVS_EST_TICK_CHAINS);
+       int                     chain_len[IPVS_EST_TICK_CHAINS];
+};
+
+/* Context for estimation kthread */
+struct ip_vs_est_kt_data {
+       struct netns_ipvs       *ipvs;
+       struct task_struct      *task;          /* task if running */
+       struct ip_vs_est_tick_data __rcu *ticks[IPVS_EST_NTICKS];
+       DECLARE_BITMAP(avail, IPVS_EST_NTICKS); /* tick has space for ests */
+       unsigned long           est_timer;      /* estimation timer (jiffies) */
+       struct ip_vs_stats      *calc_stats;    /* Used for calculation */
+       int                     tick_len[IPVS_EST_NTICKS];      /* est count */
+       int                     id;             /* ktid per netns */
+       int                     chain_max;      /* max ests per tick chain */
+       int                     tick_max;       /* max ests per tick */
+       int                     est_count;      /* attached ests to kthread */
+       int                     est_max_count;  /* max ests per kthread */
+       int                     add_row;        /* row for new ests */
+       int                     est_row;        /* estimated row */
+};
+
 struct dst_entry;
 struct iphdr;
 struct ip_vs_conn;
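
The IPVS_EST_CHAIN_FACTOR arithmetic in the hunk above, worked through step by step (every number is taken from the comment itself):

    2 * 1000 * 10     = 20000  100us slots in the 2 s estimation period
    20000 / 8         = 2500   budget at 1/8 (12.5%) of one CPU
    2500 / 50         = 50     spread across the 50 ticks
    ALIGN_DOWN(50, 8) = 48     chains per tick; 48 * ~100us = 4.8 ms,
                               i.e. 12% of the 40 ms IPVS_EST_TICK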
@@ -688,6 +768,7 @@ struct ip_vs_dest {
        union nf_inet_addr      vaddr;          /* virtual IP address */
        __u32                   vfwmark;        /* firewall mark of service */
 
+       struct rcu_head         rcu_head;
        struct list_head        t_list;         /* in dest_trash */
        unsigned int            in_rs_table:1;  /* we are in rs_table */
 };
@@ -869,7 +950,7 @@ struct netns_ipvs {
        atomic_t                conn_count;      /* connection counter */
 
        /* ip_vs_ctl */
-       struct ip_vs_stats              tot_stats;  /* Statistics & est. */
+       struct ip_vs_stats_rcu  *tot_stats;      /* Statistics & est. */
 
        int                     num_services;    /* no of virtual services */
        int                     num_services6;   /* IPv6 virtual services */
@@ -932,6 +1013,12 @@ struct netns_ipvs {
        int                     sysctl_schedule_icmp;
        int                     sysctl_ignore_tunneled;
        int                     sysctl_run_estimation;
+#ifdef CONFIG_SYSCTL
+       cpumask_var_t           sysctl_est_cpulist;     /* kthread cpumask */
+       int                     est_cpulist_valid;      /* cpulist set */
+       int                     sysctl_est_nice;        /* kthread nice */
+       int                     est_stopped;            /* stop tasks */
+#endif
 
        /* ip_vs_lblc */
        int                     sysctl_lblc_expiration;
@@ -942,9 +1029,17 @@ struct netns_ipvs {
        struct ctl_table_header *lblcr_ctl_header;
        struct ctl_table        *lblcr_ctl_table;
        /* ip_vs_est */
-       struct list_head        est_list;       /* estimator list */
-       spinlock_t              est_lock;
-       struct timer_list       est_timer;      /* Estimation timer */
+       struct delayed_work     est_reload_work; /* Reload kthread tasks */
+       struct mutex            est_mutex;      /* protect kthread tasks */
+       struct hlist_head       est_temp_list;  /* Ests during calc phase */
+       struct ip_vs_est_kt_data **est_kt_arr;  /* Array of kthread data ptrs */
+       unsigned long           est_max_threads;/* Hard limit of kthreads */
+       int                     est_calc_phase; /* Calculation phase */
+       int                     est_chain_max;  /* Calculated chain_max */
+       int                     est_kt_count;   /* Allocated ptrs */
+       int                     est_add_ktid;   /* ktid where to add ests */
+       atomic_t                est_genid;      /* kthreads reload genid */
+       atomic_t                est_genid_done; /* applied genid */
        /* ip_vs_sync */
        spinlock_t              sync_lock;
        struct ipvs_master_sync_state *ms;
@@ -1077,6 +1172,19 @@ static inline int sysctl_run_estimation(struct netns_ipvs *ipvs)
        return ipvs->sysctl_run_estimation;
 }
 
+static inline const struct cpumask *sysctl_est_cpulist(struct netns_ipvs *ipvs)
+{
+       if (ipvs->est_cpulist_valid)
+               return ipvs->sysctl_est_cpulist;
+       else
+               return housekeeping_cpumask(HK_TYPE_KTHREAD);
+}
+
+static inline int sysctl_est_nice(struct netns_ipvs *ipvs)
+{
+       return ipvs->sysctl_est_nice;
+}
+
 #else
 
 static inline int sysctl_sync_threshold(struct netns_ipvs *ipvs)
@@ -1174,6 +1282,16 @@ static inline int sysctl_run_estimation(struct netns_ipvs *ipvs)
        return 1;
 }
 
+static inline const struct cpumask *sysctl_est_cpulist(struct netns_ipvs *ipvs)
+{
+       return housekeeping_cpumask(HK_TYPE_KTHREAD);
+}
+
+static inline int sysctl_est_nice(struct netns_ipvs *ipvs)
+{
+       return IPVS_EST_NICE;
+}
+
 #endif
 
 /* IPVS core functions
@@ -1475,10 +1593,41 @@ int stop_sync_thread(struct netns_ipvs *ipvs, int state);
 void ip_vs_sync_conn(struct netns_ipvs *ipvs, struct ip_vs_conn *cp, int pkts);
 
 /* IPVS rate estimator prototypes (from ip_vs_est.c) */
-void ip_vs_start_estimator(struct netns_ipvs *ipvs, struct ip_vs_stats *stats);
+int ip_vs_start_estimator(struct netns_ipvs *ipvs, struct ip_vs_stats *stats);
 void ip_vs_stop_estimator(struct netns_ipvs *ipvs, struct ip_vs_stats *stats);
 void ip_vs_zero_estimator(struct ip_vs_stats *stats);
 void ip_vs_read_estimator(struct ip_vs_kstats *dst, struct ip_vs_stats *stats);
+void ip_vs_est_reload_start(struct netns_ipvs *ipvs);
+int ip_vs_est_kthread_start(struct netns_ipvs *ipvs,
+                           struct ip_vs_est_kt_data *kd);
+void ip_vs_est_kthread_stop(struct ip_vs_est_kt_data *kd);
+
+static inline void ip_vs_est_stopped_recalc(struct netns_ipvs *ipvs)
+{
+#ifdef CONFIG_SYSCTL
+       /* Stop tasks while the cpulist is empty or estimation is disabled */
+       ipvs->est_stopped = !sysctl_run_estimation(ipvs) ||
+                           (ipvs->est_cpulist_valid &&
+                            cpumask_empty(sysctl_est_cpulist(ipvs)));
+#endif
+}
+
+static inline bool ip_vs_est_stopped(struct netns_ipvs *ipvs)
+{
+#ifdef CONFIG_SYSCTL
+       return ipvs->est_stopped;
+#else
+       return false;
+#endif
+}
+
+static inline int ip_vs_est_max_threads(struct netns_ipvs *ipvs)
+{
+       unsigned int limit = IPVS_EST_CPU_KTHREADS *
+                            cpumask_weight(sysctl_est_cpulist(ipvs));
+
+       return max(1U, limit);
+}
 
 /* Various IPVS packet transmitters (from ip_vs_xmit.c) */
 int ip_vs_null_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
index d383c89..03f3af0 100644 (file)
@@ -500,6 +500,39 @@ static inline int ipv6_has_hopopt_jumbo(const struct sk_buff *skb)
        return jhdr->nexthdr;
 }
 
+/* Return 0 if the HBH header was successfully removed, or if removal
+ * was unnecessary (the packet is not a big TCP one). Return an error
+ * to indicate that the packet should be dropped.
+ */
+static inline int ipv6_hopopt_jumbo_remove(struct sk_buff *skb)
+{
+       const int hophdr_len = sizeof(struct hop_jumbo_hdr);
+       int nexthdr = ipv6_has_hopopt_jumbo(skb);
+       struct ipv6hdr *h6;
+
+       if (!nexthdr)
+               return 0;
+
+       if (skb_cow_head(skb, 0))
+               return -1;
+
+       /* Remove the HBH header.
+        * Layout: [Ethernet header][IPv6 header][HBH][L4 Header]
+        */
+       memmove(skb_mac_header(skb) + hophdr_len, skb_mac_header(skb),
+               skb_network_header(skb) - skb_mac_header(skb) +
+               sizeof(struct ipv6hdr));
+
+       __skb_pull(skb, hophdr_len);
+       skb->network_header += hophdr_len;
+       skb->mac_header += hophdr_len;
+
+       h6 = ipv6_hdr(skb);
+       h6->nexthdr = nexthdr;
+
+       return 0;
+}
+
 static inline bool ipv6_accept_ra(struct inet6_dev *idev)
 {
        /* If forwarding is enabled, RA are not accepted unless the special
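
A hedged call-site sketch: a driver that cannot transmit BIG TCP frames strips the jumbo HBH option before queueing and drops on failure, as the comment above prescribes (demo_xmit and the drop label are invented):

    #include <linux/netdevice.h>
    #include <net/ipv6.h>

    static netdev_tx_t demo_xmit(struct sk_buff *skb, struct net_device *dev)
    {
            if (skb->protocol == htons(ETH_P_IPV6) &&
                ipv6_hopopt_jumbo_remove(skb))
                    goto drop;
            /* ... hand skb to the hardware queue ... */
            return NETDEV_TX_OK;
    drop:
            dev_kfree_skb_any(skb);
            return NETDEV_TX_OK;
    }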
index 28d0687..d80c785 100644 (file)
@@ -522,7 +522,14 @@ enum {
 
 #define GDMA_DRV_CAP_FLAG_1_EQ_SHARING_MULTI_VPORT BIT(0)
 
-#define GDMA_DRV_CAP_FLAGS1 GDMA_DRV_CAP_FLAG_1_EQ_SHARING_MULTI_VPORT
+/* Advertise to the NIC firmware that the NAPI work_done variable race
+ * has been fixed, so the driver can reliably support features like
+ * busy_poll.
+ */
+#define GDMA_DRV_CAP_FLAG_1_NAPI_WKDONE_FIX BIT(2)
+
+#define GDMA_DRV_CAP_FLAGS1 \
+       (GDMA_DRV_CAP_FLAG_1_EQ_SHARING_MULTI_VPORT | \
+        GDMA_DRV_CAP_FLAG_1_NAPI_WKDONE_FIX)
 
 #define GDMA_DRV_CAP_FLAGS2 0
 
index b2b9de7..71d1269 100644 (file)
@@ -71,8 +71,7 @@ static inline int nf_conntrack_confirm(struct sk_buff *skb)
        return ret;
 }
 
-unsigned int nf_confirm(struct sk_buff *skb, unsigned int protoff,
-                       struct nf_conn *ct, enum ip_conntrack_info ctinfo);
+unsigned int nf_confirm(void *priv, struct sk_buff *skb, const struct nf_hook_state *state);
 
 void print_tuple(struct seq_file *s, const struct nf_conntrack_tuple *tuple,
                 const struct nf_conntrack_l4proto *proto);
index e9eb01e..9877f06 100644 (file)
@@ -104,6 +104,10 @@ unsigned int
 nf_nat_inet_fn(void *priv, struct sk_buff *skb,
               const struct nf_hook_state *state);
 
+int nf_ct_nat(struct sk_buff *skb, struct nf_conn *ct,
+             enum ip_conntrack_info ctinfo, int *action,
+             const struct nf_nat_range2 *range, bool commit);
+
 static inline int nf_nat_initialized(const struct nf_conn *ct,
                                     enum nf_nat_manip_type manip)
 {
index e573426..21a4f25 100644 (file)
@@ -2,8 +2,8 @@
 #ifndef __NETNS_XDP_H__
 #define __NETNS_XDP_H__
 
-#include <linux/rculist.h>
 #include <linux/mutex.h>
+#include <linux/types.h>
 
 struct netns_xdp {
        struct mutex            lock;
index f5850b5..b79a89d 100644 (file)
@@ -72,6 +72,8 @@ enum nl802154_commands {
        NL802154_CMD_NEW_SEC_LEVEL,
        NL802154_CMD_DEL_SEC_LEVEL,
 
+       NL802154_CMD_SCAN_EVENT,
+
        /* add new commands above here */
 
        /* used to define NL802154_CMD_MAX below */
@@ -131,6 +133,8 @@ enum nl802154_attrs {
        NL802154_ATTR_PID,
        NL802154_ATTR_NETNS_FD,
 
+       NL802154_ATTR_COORDINATOR,
+
        /* add attributes here, update the policy in nl802154.c */
 
 #ifdef CONFIG_IEEE802154_NL802154_EXPERIMENTAL
@@ -217,6 +221,45 @@ enum nl802154_wpan_phy_capability_attr {
 };
 
 /**
+ * enum nl802154_coord - Netlink attributes for a coord
+ *
+ * @__NL802154_COORD_INVALID: invalid
+ * @NL802154_COORD_PANID: PANID of the coordinator (2 bytes)
+ * @NL802154_COORD_ADDR: coordinator address (2 or 8 bytes)
+ * @NL802154_COORD_CHANNEL: channel number, related to @NL802154_COORD_PAGE (u8)
+ * @NL802154_COORD_PAGE: channel page, related to @NL802154_COORD_CHANNEL (u8)
+ * @NL802154_COORD_PREAMBLE_CODE: Preamble code used when the beacon was received,
+ *     this is PHY dependent and optional (u8)
+ * @NL802154_COORD_MEAN_PRF: Mean PRF used when the beacon was received,
+ *     this is PHY dependent and optional (u8)
+ * @NL802154_COORD_SUPERFRAME_SPEC: superframe specification of the PAN (u16)
+ * @NL802154_COORD_LINK_QUALITY: signal quality of beacon in unspecified units,
+ *     scaled to 0..255 (u8)
+ * @NL802154_COORD_GTS_PERMIT: set to true if GTS is permitted on this PAN
+ * @NL802154_COORD_PAYLOAD_DATA: binary data containing the raw frame
+ *     payload (only present if the beacon or probe response carried data)
+ * @NL802154_COORD_PAD: attribute used for padding for 64-bit alignment
+ * @NL802154_COORD_MAX: highest coordinator attribute
+ */
+enum nl802154_coord {
+       __NL802154_COORD_INVALID,
+       NL802154_COORD_PANID,
+       NL802154_COORD_ADDR,
+       NL802154_COORD_CHANNEL,
+       NL802154_COORD_PAGE,
+       NL802154_COORD_PREAMBLE_CODE,
+       NL802154_COORD_MEAN_PRF,
+       NL802154_COORD_SUPERFRAME_SPEC,
+       NL802154_COORD_LINK_QUALITY,
+       NL802154_COORD_GTS_PERMIT,
+       NL802154_COORD_PAYLOAD_DATA,
+       NL802154_COORD_PAD,
+
+       /* keep last */
+       NL802154_COORD_MAX,
+};
+
+/**
  * enum nl802154_cca_modes - cca modes
  *
  * @__NL802154_CCA_INVALID: cca mode number 0 is reserved
index e4ff391..9233ad3 100644 (file)
@@ -16,9 +16,6 @@
 #define PING_HTABLE_SIZE       64
 #define PING_HTABLE_MASK       (PING_HTABLE_SIZE-1)
 
-#define ping_portaddr_for_each_entry(__sk, node, list) \
-       hlist_nulls_for_each_entry(__sk, node, list, sk_nulls_node)
-
 /*
  * gid_t is either uint or ushort.  We want to pass it to
  * proc_dointvec_minmax(), so it must not be larger than MAX_INT
index 6d207e7..ecea3dc 100644 (file)
@@ -503,10 +503,10 @@ struct sock {
 #if BITS_PER_LONG==32
        seqlock_t               sk_stamp_seq;
 #endif
-       u16                     sk_tsflags;
-       u8                      sk_shutdown;
        atomic_t                sk_tskey;
        atomic_t                sk_zckey;
+       u32                     sk_tsflags;
+       u8                      sk_shutdown;
 
        u8                      sk_clockid;
        u8                      sk_txtime_deadline_mode : 1,
@@ -1899,7 +1899,7 @@ static inline void sock_replace_proto(struct sock *sk, struct proto *proto)
 struct sockcm_cookie {
        u64 transmit_time;
        u32 mark;
-       u16 tsflags;
+       u32 tsflags;
 };
 
 static inline void sockcm_init(struct sockcm_cookie *sockc,
diff --git a/include/net/tc_wrapper.h b/include/net/tc_wrapper.h
new file mode 100644 (file)
index 0000000..ceed2fc
--- /dev/null
@@ -0,0 +1,251 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __NET_TC_WRAPPER_H
+#define __NET_TC_WRAPPER_H
+
+#include <net/pkt_cls.h>
+
+#if IS_ENABLED(CONFIG_RETPOLINE)
+
+#include <linux/cpufeature.h>
+#include <linux/static_key.h>
+#include <linux/indirect_call_wrapper.h>
+
+#define TC_INDIRECT_SCOPE
+
+extern struct static_key_false tc_skip_wrapper;
+
+/* TC Actions */
+#ifdef CONFIG_NET_CLS_ACT
+
+#define TC_INDIRECT_ACTION_DECLARE(fname)                              \
+       INDIRECT_CALLABLE_DECLARE(int fname(struct sk_buff *skb,       \
+                                           const struct tc_action *a, \
+                                           struct tcf_result *res))
+
+TC_INDIRECT_ACTION_DECLARE(tcf_bpf_act);
+TC_INDIRECT_ACTION_DECLARE(tcf_connmark_act);
+TC_INDIRECT_ACTION_DECLARE(tcf_csum_act);
+TC_INDIRECT_ACTION_DECLARE(tcf_ct_act);
+TC_INDIRECT_ACTION_DECLARE(tcf_ctinfo_act);
+TC_INDIRECT_ACTION_DECLARE(tcf_gact_act);
+TC_INDIRECT_ACTION_DECLARE(tcf_gate_act);
+TC_INDIRECT_ACTION_DECLARE(tcf_ife_act);
+TC_INDIRECT_ACTION_DECLARE(tcf_ipt_act);
+TC_INDIRECT_ACTION_DECLARE(tcf_mirred_act);
+TC_INDIRECT_ACTION_DECLARE(tcf_mpls_act);
+TC_INDIRECT_ACTION_DECLARE(tcf_nat_act);
+TC_INDIRECT_ACTION_DECLARE(tcf_pedit_act);
+TC_INDIRECT_ACTION_DECLARE(tcf_police_act);
+TC_INDIRECT_ACTION_DECLARE(tcf_sample_act);
+TC_INDIRECT_ACTION_DECLARE(tcf_simp_act);
+TC_INDIRECT_ACTION_DECLARE(tcf_skbedit_act);
+TC_INDIRECT_ACTION_DECLARE(tcf_skbmod_act);
+TC_INDIRECT_ACTION_DECLARE(tcf_vlan_act);
+TC_INDIRECT_ACTION_DECLARE(tunnel_key_act);
+
+static inline int tc_act(struct sk_buff *skb, const struct tc_action *a,
+                          struct tcf_result *res)
+{
+       if (static_branch_likely(&tc_skip_wrapper))
+               goto skip;
+
+#if IS_BUILTIN(CONFIG_NET_ACT_GACT)
+       if (a->ops->act == tcf_gact_act)
+               return tcf_gact_act(skb, a, res);
+#endif
+#if IS_BUILTIN(CONFIG_NET_ACT_MIRRED)
+       if (a->ops->act == tcf_mirred_act)
+               return tcf_mirred_act(skb, a, res);
+#endif
+#if IS_BUILTIN(CONFIG_NET_ACT_PEDIT)
+       if (a->ops->act == tcf_pedit_act)
+               return tcf_pedit_act(skb, a, res);
+#endif
+#if IS_BUILTIN(CONFIG_NET_ACT_SKBEDIT)
+       if (a->ops->act == tcf_skbedit_act)
+               return tcf_skbedit_act(skb, a, res);
+#endif
+#if IS_BUILTIN(CONFIG_NET_ACT_SKBMOD)
+       if (a->ops->act == tcf_skbmod_act)
+               return tcf_skbmod_act(skb, a, res);
+#endif
+#if IS_BUILTIN(CONFIG_NET_ACT_POLICE)
+       if (a->ops->act == tcf_police_act)
+               return tcf_police_act(skb, a, res);
+#endif
+#if IS_BUILTIN(CONFIG_NET_ACT_BPF)
+       if (a->ops->act == tcf_bpf_act)
+               return tcf_bpf_act(skb, a, res);
+#endif
+#if IS_BUILTIN(CONFIG_NET_ACT_CONNMARK)
+       if (a->ops->act == tcf_connmark_act)
+               return tcf_connmark_act(skb, a, res);
+#endif
+#if IS_BUILTIN(CONFIG_NET_ACT_CSUM)
+       if (a->ops->act == tcf_csum_act)
+               return tcf_csum_act(skb, a, res);
+#endif
+#if IS_BUILTIN(CONFIG_NET_ACT_CT)
+       if (a->ops->act == tcf_ct_act)
+               return tcf_ct_act(skb, a, res);
+#endif
+#if IS_BUILTIN(CONFIG_NET_ACT_CTINFO)
+       if (a->ops->act == tcf_ctinfo_act)
+               return tcf_ctinfo_act(skb, a, res);
+#endif
+#if IS_BUILTIN(CONFIG_NET_ACT_GATE)
+       if (a->ops->act == tcf_gate_act)
+               return tcf_gate_act(skb, a, res);
+#endif
+#if IS_BUILTIN(CONFIG_NET_ACT_MPLS)
+       if (a->ops->act == tcf_mpls_act)
+               return tcf_mpls_act(skb, a, res);
+#endif
+#if IS_BUILTIN(CONFIG_NET_ACT_NAT)
+       if (a->ops->act == tcf_nat_act)
+               return tcf_nat_act(skb, a, res);
+#endif
+#if IS_BUILTIN(CONFIG_NET_ACT_TUNNEL_KEY)
+       if (a->ops->act == tunnel_key_act)
+               return tunnel_key_act(skb, a, res);
+#endif
+#if IS_BUILTIN(CONFIG_NET_ACT_VLAN)
+       if (a->ops->act == tcf_vlan_act)
+               return tcf_vlan_act(skb, a, res);
+#endif
+#if IS_BUILTIN(CONFIG_NET_ACT_IFE)
+       if (a->ops->act == tcf_ife_act)
+               return tcf_ife_act(skb, a, res);
+#endif
+#if IS_BUILTIN(CONFIG_NET_ACT_IPT)
+       if (a->ops->act == tcf_ipt_act)
+               return tcf_ipt_act(skb, a, res);
+#endif
+#if IS_BUILTIN(CONFIG_NET_ACT_SIMP)
+       if (a->ops->act == tcf_simp_act)
+               return tcf_simp_act(skb, a, res);
+#endif
+#if IS_BUILTIN(CONFIG_NET_ACT_SAMPLE)
+       if (a->ops->act == tcf_sample_act)
+               return tcf_sample_act(skb, a, res);
+#endif
+
+skip:
+       return a->ops->act(skb, a, res);
+}
+
+#endif /* CONFIG_NET_CLS_ACT */
+
+/* TC Filters */
+#ifdef CONFIG_NET_CLS
+
+#define TC_INDIRECT_FILTER_DECLARE(fname)                               \
+       INDIRECT_CALLABLE_DECLARE(int fname(struct sk_buff *skb,        \
+                                           const struct tcf_proto *tp, \
+                                           struct tcf_result *res))
+
+TC_INDIRECT_FILTER_DECLARE(basic_classify);
+TC_INDIRECT_FILTER_DECLARE(cls_bpf_classify);
+TC_INDIRECT_FILTER_DECLARE(cls_cgroup_classify);
+TC_INDIRECT_FILTER_DECLARE(fl_classify);
+TC_INDIRECT_FILTER_DECLARE(flow_classify);
+TC_INDIRECT_FILTER_DECLARE(fw_classify);
+TC_INDIRECT_FILTER_DECLARE(mall_classify);
+TC_INDIRECT_FILTER_DECLARE(route4_classify);
+TC_INDIRECT_FILTER_DECLARE(rsvp_classify);
+TC_INDIRECT_FILTER_DECLARE(rsvp6_classify);
+TC_INDIRECT_FILTER_DECLARE(tcindex_classify);
+TC_INDIRECT_FILTER_DECLARE(u32_classify);
+
+static inline int tc_classify(struct sk_buff *skb, const struct tcf_proto *tp,
+                               struct tcf_result *res)
+{
+       if (static_branch_likely(&tc_skip_wrapper))
+               goto skip;
+
+#if IS_BUILTIN(CONFIG_NET_CLS_BPF)
+       if (tp->classify == cls_bpf_classify)
+               return cls_bpf_classify(skb, tp, res);
+#endif
+#if IS_BUILTIN(CONFIG_NET_CLS_U32)
+       if (tp->classify == u32_classify)
+               return u32_classify(skb, tp, res);
+#endif
+#if IS_BUILTIN(CONFIG_NET_CLS_FLOWER)
+       if (tp->classify == fl_classify)
+               return fl_classify(skb, tp, res);
+#endif
+#if IS_BUILTIN(CONFIG_NET_CLS_FW)
+       if (tp->classify == fw_classify)
+               return fw_classify(skb, tp, res);
+#endif
+#if IS_BUILTIN(CONFIG_NET_CLS_MATCHALL)
+       if (tp->classify == mall_classify)
+               return mall_classify(skb, tp, res);
+#endif
+#if IS_BUILTIN(CONFIG_NET_CLS_BASIC)
+       if (tp->classify == basic_classify)
+               return basic_classify(skb, tp, res);
+#endif
+#if IS_BUILTIN(CONFIG_NET_CLS_CGROUP)
+       if (tp->classify == cls_cgroup_classify)
+               return cls_cgroup_classify(skb, tp, res);
+#endif
+#if IS_BUILTIN(CONFIG_NET_CLS_FLOW)
+       if (tp->classify == flow_classify)
+               return flow_classify(skb, tp, res);
+#endif
+#if IS_BUILTIN(CONFIG_NET_CLS_ROUTE4)
+       if (tp->classify == route4_classify)
+               return route4_classify(skb, tp, res);
+#endif
+#if IS_BUILTIN(CONFIG_NET_CLS_RSVP)
+       if (tp->classify == rsvp_classify)
+               return rsvp_classify(skb, tp, res);
+#endif
+#if IS_BUILTIN(CONFIG_NET_CLS_RSVP6)
+       if (tp->classify == rsvp6_classify)
+               return rsvp6_classify(skb, tp, res);
+#endif
+#if IS_BUILTIN(CONFIG_NET_CLS_TCINDEX)
+       if (tp->classify == tcindex_classify)
+               return tcindex_classify(skb, tp, res);
+#endif
+
+skip:
+       return tp->classify(skb, tp, res);
+}
+
+static inline void tc_wrapper_init(void)
+{
+#ifdef CONFIG_X86
+       if (!cpu_feature_enabled(X86_FEATURE_RETPOLINE))
+               static_branch_enable(&tc_skip_wrapper);
+#endif
+}
+
+#endif /* CONFIG_NET_CLS */
+
+#else
+
+#define TC_INDIRECT_SCOPE static
+
+static inline int tc_act(struct sk_buff *skb, const struct tc_action *a,
+                          struct tcf_result *res)
+{
+       return a->ops->act(skb, a, res);
+}
+
+static inline int tc_classify(struct sk_buff *skb, const struct tcf_proto *tp,
+                               struct tcf_result *res)
+{
+       return tp->classify(skb, tp, res);
+}
+
+static inline void tc_wrapper_init(void)
+{
+}
+
+#endif
+
+#endif /* __NET_TC_WRAPPER_H */
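
Usage sketch for the new wrapper, with a made-up action: handlers are declared TC_INDIRECT_SCOPE, which is empty under retpolines (so the direct-call table in tc_act() above can reference the symbol) and static otherwise; built-in actions additionally appear in that table.

    #include <net/tc_wrapper.h>

    TC_INDIRECT_SCOPE int tcf_demo_act(struct sk_buff *skb,
                                       const struct tc_action *a,
                                       struct tcf_result *res)
    {
            return TC_ACT_OK;       /* accept every packet */
    }

tc_wrapper_init() is expected to run once during init so that the skip branch is enabled on x86 CPUs that do not need retpolines.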
index f925377..db9f828 100644 (file)
@@ -2323,8 +2323,8 @@ int tcp_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore);
 void tcp_bpf_clone(const struct sock *sk, struct sock *newsk);
 #endif /* CONFIG_BPF_SYSCALL */
 
-int tcp_bpf_sendmsg_redir(struct sock *sk, struct sk_msg *msg, u32 bytes,
-                         int flags);
+int tcp_bpf_sendmsg_redir(struct sock *sk, bool ingress,
+                         struct sk_msg *msg, u32 bytes, int flags);
 #endif /* CONFIG_NET_SOCK_MSG */
 
 #if !defined(CONFIG_BPF_SYSCALL) || !defined(CONFIG_NET_SOCK_MSG)
index 62c98a9..e7e157a 100644 (file)
@@ -2,6 +2,7 @@
 #ifndef _TSO_H
 #define _TSO_H
 
+#include <linux/skbuff.h>
 #include <net/ip.h>
 
 #define TSO_HEADER_SIZE                256
@@ -16,7 +17,12 @@ struct tso_t {
        u32     tcp_seq;
 };
 
-int tso_count_descs(const struct sk_buff *skb);
+/* Calculate the worst-case buffer count */
+static inline int tso_count_descs(const struct sk_buff *skb)
+{
+       return skb_shinfo(skb)->gso_segs * 2 + skb_shinfo(skb)->nr_frags;
+}
+
 void tso_build_hdr(const struct sk_buff *skb, char *hdr, struct tso_t *tso,
                   int size, bool is_last);
 void tso_build_data(const struct sk_buff *skb, struct tso_t *tso, int size);
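
A small sketch of the usual sizing check now that tso_count_descs() is inline (the ring type and free-descriptor helper are hypothetical):

    #include <net/tso.h>

    struct demo_ring;                               /* hypothetical TX ring */
    int demo_ring_free_descs(struct demo_ring *r);  /* hypothetical helper */

    static bool demo_ring_has_room(struct demo_ring *ring,
                                   const struct sk_buff *skb)
    {
            /* worst case from above: gso_segs * 2 + nr_frags descriptors */
            return demo_ring_free_descs(ring) >= tso_count_descs(skb);
    }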
index e0cc679..3e1f70e 100644 (file)
@@ -129,6 +129,13 @@ struct xfrm_state_walk {
 enum {
        XFRM_DEV_OFFLOAD_IN = 1,
        XFRM_DEV_OFFLOAD_OUT,
+       XFRM_DEV_OFFLOAD_FWD,
+};
+
+enum {
+       XFRM_DEV_OFFLOAD_UNSPECIFIED,
+       XFRM_DEV_OFFLOAD_CRYPTO,
+       XFRM_DEV_OFFLOAD_PACKET,
 };
 
 struct xfrm_dev_offload {
@@ -137,6 +144,7 @@ struct xfrm_dev_offload {
        struct net_device       *real_dev;
        unsigned long           offload_handle;
        u8                      dir : 2;
+       u8                      type : 2;
 };
 
 struct xfrm_mode {
@@ -534,6 +542,8 @@ struct xfrm_policy {
        struct xfrm_tmpl        xfrm_vec[XFRM_MAX_DEPTH];
        struct hlist_node       bydst_inexact_list;
        struct rcu_head         rcu;
+
+       struct xfrm_dev_offload xdo;
 };
 
 static inline struct net *xp_net(const struct xfrm_policy *xp)
@@ -1093,6 +1103,29 @@ xfrm_state_addr_cmp(const struct xfrm_tmpl *tmpl, const struct xfrm_state *x, un
 }
 
 #ifdef CONFIG_XFRM
+static inline struct xfrm_state *xfrm_input_state(struct sk_buff *skb)
+{
+       struct sec_path *sp = skb_sec_path(skb);
+
+       return sp->xvec[sp->len - 1];
+}
+#endif
+
+static inline struct xfrm_offload *xfrm_offload(struct sk_buff *skb)
+{
+#ifdef CONFIG_XFRM
+       struct sec_path *sp = skb_sec_path(skb);
+
+       if (!sp || !sp->olen || sp->len != sp->olen)
+               return NULL;
+
+       return &sp->ovec[sp->olen - 1];
+#else
+       return NULL;
+#endif
+}
+
+#ifdef CONFIG_XFRM
 int __xfrm_policy_check(struct sock *, int dir, struct sk_buff *skb,
                        unsigned short family);
 
@@ -1123,10 +1156,19 @@ static inline int __xfrm_policy_check2(struct sock *sk, int dir,
 {
        struct net *net = dev_net(skb->dev);
        int ndir = dir | (reverse ? XFRM_POLICY_MASK + 1 : 0);
+       struct xfrm_offload *xo = xfrm_offload(skb);
+       struct xfrm_state *x;
 
        if (sk && sk->sk_policy[XFRM_POLICY_IN])
                return __xfrm_policy_check(sk, ndir, skb, family);
 
+       if (xo) {
+               x = xfrm_input_state(skb);
+               if (x->xso.type == XFRM_DEV_OFFLOAD_PACKET)
+                       return (xo->flags & CRYPTO_DONE) &&
+                              (xo->status & CRYPTO_SUCCESS);
+       }
+
        return __xfrm_check_nopolicy(net, skb, dir) ||
               __xfrm_check_dev_nopolicy(skb, dir, family) ||
               __xfrm_policy_check(sk, ndir, skb, family);
@@ -1529,6 +1571,23 @@ struct xfrm_state *xfrm_stateonly_find(struct net *net, u32 mark, u32 if_id,
 struct xfrm_state *xfrm_state_lookup_byspi(struct net *net, __be32 spi,
                                              unsigned short family);
 int xfrm_state_check_expire(struct xfrm_state *x);
+#ifdef CONFIG_XFRM_OFFLOAD
+static inline void xfrm_dev_state_update_curlft(struct xfrm_state *x)
+{
+       struct xfrm_dev_offload *xdo = &x->xso;
+       struct net_device *dev = xdo->dev;
+
+       if (x->xso.type != XFRM_DEV_OFFLOAD_PACKET)
+               return;
+
+       if (dev && dev->xfrmdev_ops &&
+           dev->xfrmdev_ops->xdo_dev_state_update_curlft)
+               dev->xfrmdev_ops->xdo_dev_state_update_curlft(x);
+}
+#else
+static inline void xfrm_dev_state_update_curlft(struct xfrm_state *x) {}
+#endif
 void xfrm_state_insert(struct xfrm_state *x);
 int xfrm_state_add(struct xfrm_state *x);
 int xfrm_state_update(struct xfrm_state *x);
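
Driver-side sketch for the new lifetime hook, assuming a packet-offload capable device; the demo ops and counter readback are invented, only the xdo_dev_state_update_curlft member name and the x->curlft target come from the helper above:

    #include <linux/netdevice.h>
    #include <net/xfrm.h>

    static void demo_xdo_dev_state_update_curlft(struct xfrm_state *x)
    {
            /* Fold hardware byte/packet counters into the state's
             * current lifetime so expiry checks account for traffic
             * the NIC handled without involving the stack.
             */
            x->curlft.bytes   += 0;         /* += hw_read_bytes(x) */
            x->curlft.packets += 0;         /* += hw_read_packets(x) */
    }

    static const struct xfrmdev_ops demo_xfrmdev_ops = {
            .xdo_dev_state_update_curlft = demo_xdo_dev_state_update_curlft,
            /* .xdo_dev_state_add, .xdo_dev_policy_add, ... */
    };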
@@ -1578,6 +1637,8 @@ struct xfrm_state *xfrm_find_acq_byseq(struct net *net, u32 mark, u32 seq);
 int xfrm_state_delete(struct xfrm_state *x);
 int xfrm_state_flush(struct net *net, u8 proto, bool task_valid, bool sync);
 int xfrm_dev_state_flush(struct net *net, struct net_device *dev, bool task_valid);
+int xfrm_dev_policy_flush(struct net *net, struct net_device *dev,
+                         bool task_valid);
 void xfrm_sad_getinfo(struct net *net, struct xfrmk_sadinfo *si);
 void xfrm_spd_getinfo(struct net *net, struct xfrmk_spdinfo *si);
 u32 xfrm_replay_seqhi(struct xfrm_state *x, __be32 net_seq);
@@ -1860,29 +1921,6 @@ static inline void xfrm_states_delete(struct xfrm_state **states, int n)
 }
 #endif
 
-#ifdef CONFIG_XFRM
-static inline struct xfrm_state *xfrm_input_state(struct sk_buff *skb)
-{
-       struct sec_path *sp = skb_sec_path(skb);
-
-       return sp->xvec[sp->len - 1];
-}
-#endif
-
-static inline struct xfrm_offload *xfrm_offload(struct sk_buff *skb)
-{
-#ifdef CONFIG_XFRM
-       struct sec_path *sp = skb_sec_path(skb);
-
-       if (!sp || !sp->olen || sp->len != sp->olen)
-               return NULL;
-
-       return &sp->ovec[sp->olen - 1];
-#else
-       return NULL;
-#endif
-}
-
 void __init xfrm_dev_init(void);
 
 #ifdef CONFIG_XFRM_OFFLOAD
@@ -1892,6 +1930,9 @@ struct sk_buff *validate_xmit_xfrm(struct sk_buff *skb, netdev_features_t featur
 int xfrm_dev_state_add(struct net *net, struct xfrm_state *x,
                       struct xfrm_user_offload *xuo,
                       struct netlink_ext_ack *extack);
+int xfrm_dev_policy_add(struct net *net, struct xfrm_policy *xp,
+                       struct xfrm_user_offload *xuo, u8 dir,
+                       struct netlink_ext_ack *extack);
 bool xfrm_dev_offload_ok(struct sk_buff *skb, struct xfrm_state *x);
 
 static inline void xfrm_dev_state_advance_esn(struct xfrm_state *x)
@@ -1940,6 +1981,28 @@ static inline void xfrm_dev_state_free(struct xfrm_state *x)
                netdev_put(dev, &xso->dev_tracker);
        }
 }
+
+static inline void xfrm_dev_policy_delete(struct xfrm_policy *x)
+{
+       struct xfrm_dev_offload *xdo = &x->xdo;
+       struct net_device *dev = xdo->dev;
+
+       if (dev && dev->xfrmdev_ops && dev->xfrmdev_ops->xdo_dev_policy_delete)
+               dev->xfrmdev_ops->xdo_dev_policy_delete(x);
+}
+
+static inline void xfrm_dev_policy_free(struct xfrm_policy *x)
+{
+       struct xfrm_dev_offload *xdo = &x->xdo;
+       struct net_device *dev = xdo->dev;
+
+       if (dev && dev->xfrmdev_ops) {
+               if (dev->xfrmdev_ops->xdo_dev_policy_free)
+                       dev->xfrmdev_ops->xdo_dev_policy_free(x);
+               xdo->dev = NULL;
+               netdev_put(dev, &xdo->dev_tracker);
+       }
+}
 #else
 static inline void xfrm_dev_resume(struct sk_buff *skb)
 {
@@ -1967,6 +2030,21 @@ static inline void xfrm_dev_state_free(struct xfrm_state *x)
 {
 }
 
+static inline int xfrm_dev_policy_add(struct net *net, struct xfrm_policy *xp,
+                                     struct xfrm_user_offload *xuo, u8 dir,
+                                     struct netlink_ext_ack *extack)
+{
+       return 0;
+}
+
+static inline void xfrm_dev_policy_delete(struct xfrm_policy *x)
+{
+}
+
+static inline void xfrm_dev_policy_free(struct xfrm_policy *x)
+{
+}
+
 static inline bool xfrm_dev_offload_ok(struct sk_buff *skb, struct xfrm_state *x)
 {
        return false;
@@ -2086,4 +2164,21 @@ static inline bool xfrm6_local_dontfrag(const struct sock *sk)
        return false;
 }
 #endif
+
+#if (IS_BUILTIN(CONFIG_XFRM_INTERFACE) && IS_ENABLED(CONFIG_DEBUG_INFO_BTF)) || \
+    (IS_MODULE(CONFIG_XFRM_INTERFACE) && IS_ENABLED(CONFIG_DEBUG_INFO_BTF_MODULES))
+
+extern struct metadata_dst __percpu *xfrm_bpf_md_dst;
+
+int register_xfrm_interface_bpf(void);
+
+#else
+
+static inline int register_xfrm_interface_bpf(void)
+{
+       return 0;
+}
+
+#endif
+
 #endif /* _NET_XFRM_H */
index c078c48..a6190aa 100644 (file)
@@ -66,6 +66,7 @@ enum fscache_cookie_trace {
        fscache_cookie_put_work,
        fscache_cookie_see_active,
        fscache_cookie_see_lru_discard,
+       fscache_cookie_see_lru_discard_clear,
        fscache_cookie_see_lru_do_one,
        fscache_cookie_see_relinquish,
        fscache_cookie_see_withdraw,
@@ -149,6 +150,7 @@ enum fscache_access_trace {
        EM(fscache_cookie_put_work,             "PQ  work ")            \
        EM(fscache_cookie_see_active,           "-   activ")            \
        EM(fscache_cookie_see_lru_discard,      "-   x-lru")            \
+       EM(fscache_cookie_see_lru_discard_clear,"-   lrudc")            \
        EM(fscache_cookie_see_lru_do_one,       "-   lrudo")            \
        EM(fscache_cookie_see_relinquish,       "-   x-rlq")            \
        EM(fscache_cookie_see_withdraw,         "-   x-wth")            \
index b9886d1..049b52e 100644 (file)
 /*
  * Declare tracing information enums and their string mappings for display.
  */
+#define rxrpc_call_poke_traces \
+       EM(rxrpc_call_poke_error,               "Error")        \
+       EM(rxrpc_call_poke_idle,                "Idle")         \
+       EM(rxrpc_call_poke_start,               "Start")        \
+       EM(rxrpc_call_poke_timer,               "Timer")        \
+       E_(rxrpc_call_poke_timer_now,           "Timer-now")
+
 #define rxrpc_skb_traces \
-       EM(rxrpc_skb_ack,                       "ACK") \
-       EM(rxrpc_skb_cleaned,                   "CLN") \
-       EM(rxrpc_skb_cloned_jumbo,              "CLJ") \
-       EM(rxrpc_skb_freed,                     "FRE") \
-       EM(rxrpc_skb_got,                       "GOT") \
-       EM(rxrpc_skb_lost,                      "*L*") \
-       EM(rxrpc_skb_new,                       "NEW") \
-       EM(rxrpc_skb_purged,                    "PUR") \
-       EM(rxrpc_skb_received,                  "RCV") \
-       EM(rxrpc_skb_rotated,                   "ROT") \
-       EM(rxrpc_skb_seen,                      "SEE") \
-       EM(rxrpc_skb_unshared,                  "UNS") \
-       E_(rxrpc_skb_unshared_nomem,            "US0")
+       EM(rxrpc_skb_eaten_by_unshare,          "ETN unshare  ") \
+       EM(rxrpc_skb_eaten_by_unshare_nomem,    "ETN unshar-nm") \
+       EM(rxrpc_skb_get_conn_work,             "GET conn-work") \
+       EM(rxrpc_skb_get_local_work,            "GET locl-work") \
+       EM(rxrpc_skb_get_reject_work,           "GET rej-work ") \
+       EM(rxrpc_skb_get_to_recvmsg,            "GET to-recv  ") \
+       EM(rxrpc_skb_get_to_recvmsg_oos,        "GET to-recv-o") \
+       EM(rxrpc_skb_new_encap_rcv,             "NEW encap-rcv") \
+       EM(rxrpc_skb_new_error_report,          "NEW error-rpt") \
+       EM(rxrpc_skb_new_jumbo_subpacket,       "NEW jumbo-sub") \
+       EM(rxrpc_skb_new_unshared,              "NEW unshared ") \
+       EM(rxrpc_skb_put_conn_work,             "PUT conn-work") \
+       EM(rxrpc_skb_put_error_report,          "PUT error-rep") \
+       EM(rxrpc_skb_put_input,                 "PUT input    ") \
+       EM(rxrpc_skb_put_jumbo_subpacket,       "PUT jumbo-sub") \
+       EM(rxrpc_skb_put_purge,                 "PUT purge    ") \
+       EM(rxrpc_skb_put_rotate,                "PUT rotate   ") \
+       EM(rxrpc_skb_put_unknown,               "PUT unknown  ") \
+       EM(rxrpc_skb_see_conn_work,             "SEE conn-work") \
+       EM(rxrpc_skb_see_recvmsg,               "SEE recvmsg  ") \
+       EM(rxrpc_skb_see_reject,                "SEE reject   ") \
+       EM(rxrpc_skb_see_rotate,                "SEE rotate   ") \
+       E_(rxrpc_skb_see_version,               "SEE version  ")
 
 #define rxrpc_local_traces \
-       EM(rxrpc_local_got,                     "GOT") \
-       EM(rxrpc_local_new,                     "NEW") \
-       EM(rxrpc_local_processing,              "PRO") \
-       EM(rxrpc_local_put,                     "PUT") \
-       EM(rxrpc_local_queued,                  "QUE") \
-       E_(rxrpc_local_tx_ack,                  "TAK")
+       EM(rxrpc_local_free,                    "FREE        ") \
+       EM(rxrpc_local_get_call,                "GET call    ") \
+       EM(rxrpc_local_get_client_conn,         "GET conn-cln") \
+       EM(rxrpc_local_get_for_use,             "GET for-use ") \
+       EM(rxrpc_local_get_peer,                "GET peer    ") \
+       EM(rxrpc_local_get_prealloc_conn,       "GET conn-pre") \
+       EM(rxrpc_local_new,                     "NEW         ") \
+       EM(rxrpc_local_put_bind,                "PUT bind    ") \
+       EM(rxrpc_local_put_call,                "PUT call    ") \
+       EM(rxrpc_local_put_for_use,             "PUT for-use ") \
+       EM(rxrpc_local_put_kill_conn,           "PUT conn-kil") \
+       EM(rxrpc_local_put_peer,                "PUT peer    ") \
+       EM(rxrpc_local_put_prealloc_conn,       "PUT conn-pre") \
+       EM(rxrpc_local_put_release_sock,        "PUT rel-sock") \
+       EM(rxrpc_local_stop,                    "STOP        ") \
+       EM(rxrpc_local_stopped,                 "STOPPED     ") \
+       EM(rxrpc_local_unuse_bind,              "UNU bind    ") \
+       EM(rxrpc_local_unuse_conn_work,         "UNU conn-wrk") \
+       EM(rxrpc_local_unuse_peer_keepalive,    "UNU peer-kpa") \
+       EM(rxrpc_local_unuse_release_sock,      "UNU rel-sock") \
+       EM(rxrpc_local_use_conn_work,           "USE conn-wrk") \
+       EM(rxrpc_local_use_lookup,              "USE lookup  ") \
+       E_(rxrpc_local_use_peer_keepalive,      "USE peer-kpa")
 
 #define rxrpc_peer_traces \
-       EM(rxrpc_peer_got,                      "GOT") \
-       EM(rxrpc_peer_new,                      "NEW") \
-       EM(rxrpc_peer_processing,               "PRO") \
-       E_(rxrpc_peer_put,                      "PUT")
+       EM(rxrpc_peer_free,                     "FREE        ") \
+       EM(rxrpc_peer_get_accept,               "GET accept  ") \
+       EM(rxrpc_peer_get_activate_call,        "GET act-call") \
+       EM(rxrpc_peer_get_bundle,               "GET bundle  ") \
+       EM(rxrpc_peer_get_client_conn,          "GET cln-conn") \
+       EM(rxrpc_peer_get_input,                "GET input   ") \
+       EM(rxrpc_peer_get_input_error,          "GET inpt-err") \
+       EM(rxrpc_peer_get_keepalive,            "GET keepaliv") \
+       EM(rxrpc_peer_get_lookup_client,        "GET look-cln") \
+       EM(rxrpc_peer_get_service_conn,         "GET srv-conn") \
+       EM(rxrpc_peer_new_client,               "NEW client  ") \
+       EM(rxrpc_peer_new_prealloc,             "NEW prealloc") \
+       EM(rxrpc_peer_put_bundle,               "PUT bundle  ") \
+       EM(rxrpc_peer_put_call,                 "PUT call    ") \
+       EM(rxrpc_peer_put_conn,                 "PUT conn    ") \
+       EM(rxrpc_peer_put_discard_tmp,          "PUT disc-tmp") \
+       EM(rxrpc_peer_put_input,                "PUT input   ") \
+       EM(rxrpc_peer_put_input_error,          "PUT inpt-err") \
+       E_(rxrpc_peer_put_keepalive,            "PUT keepaliv")
+
+#define rxrpc_bundle_traces \
+       EM(rxrpc_bundle_free,                   "FREE        ") \
+       EM(rxrpc_bundle_get_client_call,        "GET clt-call") \
+       EM(rxrpc_bundle_get_client_conn,        "GET clt-conn") \
+       EM(rxrpc_bundle_get_service_conn,       "GET svc-conn") \
+       EM(rxrpc_bundle_put_conn,               "PUT conn    ") \
+       EM(rxrpc_bundle_put_discard,            "PUT discard ") \
+       E_(rxrpc_bundle_new,                    "NEW         ")
 
 #define rxrpc_conn_traces \
-       EM(rxrpc_conn_got,                      "GOT") \
-       EM(rxrpc_conn_new_client,               "NWc") \
-       EM(rxrpc_conn_new_service,              "NWs") \
-       EM(rxrpc_conn_put_client,               "PTc") \
-       EM(rxrpc_conn_put_service,              "PTs") \
-       EM(rxrpc_conn_queued,                   "QUE") \
-       EM(rxrpc_conn_reap_service,             "RPs") \
-       E_(rxrpc_conn_seen,                     "SEE")
+       EM(rxrpc_conn_free,                     "FREE        ") \
+       EM(rxrpc_conn_get_activate_call,        "GET act-call") \
+       EM(rxrpc_conn_get_call_input,           "GET inp-call") \
+       EM(rxrpc_conn_get_conn_input,           "GET inp-conn") \
+       EM(rxrpc_conn_get_idle,                 "GET idle    ") \
+       EM(rxrpc_conn_get_poke,                 "GET poke    ") \
+       EM(rxrpc_conn_get_service_conn,         "GET svc-conn") \
+       EM(rxrpc_conn_new_client,               "NEW client  ") \
+       EM(rxrpc_conn_new_service,              "NEW service ") \
+       EM(rxrpc_conn_put_call,                 "PUT call    ") \
+       EM(rxrpc_conn_put_call_input,           "PUT inp-call") \
+       EM(rxrpc_conn_put_conn_input,           "PUT inp-conn") \
+       EM(rxrpc_conn_put_discard,              "PUT discard ") \
+       EM(rxrpc_conn_put_discard_idle,         "PUT disc-idl") \
+       EM(rxrpc_conn_put_local_dead,           "PUT loc-dead") \
+       EM(rxrpc_conn_put_noreuse,              "PUT noreuse ") \
+       EM(rxrpc_conn_put_poke,                 "PUT poke    ") \
+       EM(rxrpc_conn_put_service_reaped,       "PUT svc-reap") \
+       EM(rxrpc_conn_put_unbundle,             "PUT unbundle") \
+       EM(rxrpc_conn_put_unidle,               "PUT unidle  ") \
+       EM(rxrpc_conn_queue_challenge,          "QUE chall   ") \
+       EM(rxrpc_conn_queue_retry_work,         "QUE retry-wk") \
+       EM(rxrpc_conn_queue_rx_work,            "QUE rx-work ") \
+       EM(rxrpc_conn_queue_timer,              "QUE timer   ") \
+       EM(rxrpc_conn_see_new_service_conn,     "SEE new-svc ") \
+       EM(rxrpc_conn_see_reap_service,         "SEE reap-svc") \
+       E_(rxrpc_conn_see_work,                 "SEE work    ")
 
 #define rxrpc_client_traces \
        EM(rxrpc_client_activate_chans,         "Activa") \
        E_(rxrpc_client_to_idle,                "->Idle")
 
 #define rxrpc_call_traces \
-       EM(rxrpc_call_connected,                "CON") \
-       EM(rxrpc_call_error,                    "*E*") \
-       EM(rxrpc_call_got,                      "GOT") \
-       EM(rxrpc_call_got_kernel,               "Gke") \
-       EM(rxrpc_call_got_timer,                "GTM") \
-       EM(rxrpc_call_got_tx,                   "Gtx") \
-       EM(rxrpc_call_got_userid,               "Gus") \
-       EM(rxrpc_call_new_client,               "NWc") \
-       EM(rxrpc_call_new_service,              "NWs") \
-       EM(rxrpc_call_put,                      "PUT") \
-       EM(rxrpc_call_put_kernel,               "Pke") \
-       EM(rxrpc_call_put_noqueue,              "PnQ") \
-       EM(rxrpc_call_put_notimer,              "PnT") \
-       EM(rxrpc_call_put_timer,                "PTM") \
-       EM(rxrpc_call_put_tx,                   "Ptx") \
-       EM(rxrpc_call_put_userid,               "Pus") \
-       EM(rxrpc_call_queued,                   "QUE") \
-       EM(rxrpc_call_queued_ref,               "QUR") \
-       EM(rxrpc_call_release,                  "RLS") \
-       E_(rxrpc_call_seen,                     "SEE")
+       EM(rxrpc_call_get_input,                "GET input   ") \
+       EM(rxrpc_call_get_kernel_service,       "GET krnl-srv") \
+       EM(rxrpc_call_get_notify_socket,        "GET notify  ") \
+       EM(rxrpc_call_get_poke,                 "GET poke    ") \
+       EM(rxrpc_call_get_recvmsg,              "GET recvmsg ") \
+       EM(rxrpc_call_get_release_sock,         "GET rel-sock") \
+       EM(rxrpc_call_get_sendmsg,              "GET sendmsg ") \
+       EM(rxrpc_call_get_userid,               "GET user-id ") \
+       EM(rxrpc_call_new_client,               "NEW client  ") \
+       EM(rxrpc_call_new_prealloc_service,     "NEW prealloc") \
+       EM(rxrpc_call_put_discard_prealloc,     "PUT disc-pre") \
+       EM(rxrpc_call_put_discard_error,        "PUT disc-err") \
+       EM(rxrpc_call_put_input,                "PUT input   ") \
+       EM(rxrpc_call_put_kernel,               "PUT kernel  ") \
+       EM(rxrpc_call_put_poke,                 "PUT poke    ") \
+       EM(rxrpc_call_put_recvmsg,              "PUT recvmsg ") \
+       EM(rxrpc_call_put_release_sock,         "PUT rls-sock") \
+       EM(rxrpc_call_put_release_sock_tba,     "PUT rls-sk-a") \
+       EM(rxrpc_call_put_sendmsg,              "PUT sendmsg ") \
+       EM(rxrpc_call_put_unnotify,             "PUT unnotify") \
+       EM(rxrpc_call_put_userid_exists,        "PUT u-exists") \
+       EM(rxrpc_call_see_accept,               "SEE accept  ") \
+       EM(rxrpc_call_see_activate_client,      "SEE act-clnt") \
+       EM(rxrpc_call_see_connect_failed,       "SEE con-fail") \
+       EM(rxrpc_call_see_connected,            "SEE connect ") \
+       EM(rxrpc_call_see_distribute_error,     "SEE dist-err") \
+       EM(rxrpc_call_see_input,                "SEE input   ") \
+       EM(rxrpc_call_see_release,              "SEE release ") \
+       EM(rxrpc_call_see_userid_exists,        "SEE u-exists") \
+       E_(rxrpc_call_see_zap,                  "SEE zap     ")
 
 #define rxrpc_txqueue_traces \
        EM(rxrpc_txqueue_await_reply,           "AWR") \
        EM(rxrpc_propose_ack_respond_to_ping,   "Rsp2Png") \
        EM(rxrpc_propose_ack_retry_tx,          "RetryTx") \
        EM(rxrpc_propose_ack_rotate_rx,         "RxAck  ") \
+       EM(rxrpc_propose_ack_rx_idle,           "RxIdle ") \
        E_(rxrpc_propose_ack_terminal_ack,      "ClTerm ")
 
 #define rxrpc_congest_modes \
        EM(rxrpc_txbuf_put_rotated,             "PUT ROTATED")  \
        EM(rxrpc_txbuf_put_send_aborted,        "PUT SEND-X ")  \
        EM(rxrpc_txbuf_put_trans,               "PUT TRANS  ")  \
+       EM(rxrpc_txbuf_see_out_of_step,         "OUT-OF-STEP")  \
        EM(rxrpc_txbuf_see_send_more,           "SEE SEND+  ")  \
        E_(rxrpc_txbuf_see_unacked,             "SEE UNACKED")
 
 #define EM(a, b) a,
 #define E_(a, b) a
 
+enum rxrpc_bundle_trace                { rxrpc_bundle_traces } __mode(byte);
+enum rxrpc_call_poke_trace     { rxrpc_call_poke_traces } __mode(byte);
 enum rxrpc_call_trace          { rxrpc_call_traces } __mode(byte);
 enum rxrpc_client_trace                { rxrpc_client_traces } __mode(byte);
 enum rxrpc_congest_change      { rxrpc_congest_changes } __mode(byte);
@@ -316,6 +407,8 @@ enum rxrpc_txqueue_trace    { rxrpc_txqueue_traces } __mode(byte);
 #define EM(a, b) TRACE_DEFINE_ENUM(a);
 #define E_(a, b) TRACE_DEFINE_ENUM(a);
 
+rxrpc_bundle_traces;
+rxrpc_call_poke_traces;
 rxrpc_call_traces;
 rxrpc_client_traces;
 rxrpc_congest_changes;
@@ -345,83 +438,98 @@ rxrpc_txqueue_traces;
 
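For readers new to this header's EM()/E_() idiom: it is an X-macro. Each trace list is written once and expanded twice, first (with EM/E_ as "a,"/"a") into enum members, then (as TRACE_DEFINE_ENUM) to export the values for __print_symbolic(). A minimal standalone sketch of the same trick, with hypothetical names and a plain switch standing in for the tracing machinery:

#include <stdio.h>

/* One list, expanded twice with different EM()/E_() definitions. */
#define my_traces \
	EM(my_trace_get, "GET") \
	E_(my_trace_put, "PUT")

#define EM(a, b) a,
#define E_(a, b) a
enum my_trace { my_traces };		/* my_trace_get, my_trace_put */
#undef EM
#undef E_

#define EM(a, b) case a: return b;
#define E_(a, b) case a: return b;
static const char *my_trace_str(enum my_trace t)
{
	switch (t) { my_traces }	/* value-to-string mapping */
	return "?";
}
#undef EM
#undef E_

int main(void)
{
	printf("%s\n", my_trace_str(my_trace_put));	/* prints PUT */
	return 0;
}
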
 TRACE_EVENT(rxrpc_local,
            TP_PROTO(unsigned int local_debug_id, enum rxrpc_local_trace op,
-                    int usage, const void *where),
+                    int ref, int usage),
 
-           TP_ARGS(local_debug_id, op, usage, where),
+           TP_ARGS(local_debug_id, op, ref, usage),
 
            TP_STRUCT__entry(
                    __field(unsigned int,       local           )
                    __field(int,                op              )
+                   __field(int,                ref             )
                    __field(int,                usage           )
-                   __field(const void *,       where           )
                             ),
 
            TP_fast_assign(
                    __entry->local = local_debug_id;
                    __entry->op = op;
+                   __entry->ref = ref;
                    __entry->usage = usage;
-                   __entry->where = where;
                           ),
 
-           TP_printk("L=%08x %s u=%d sp=%pSR",
+           TP_printk("L=%08x %s r=%d u=%d",
                      __entry->local,
                      __print_symbolic(__entry->op, rxrpc_local_traces),
-                     __entry->usage,
-                     __entry->where)
+                     __entry->ref,
+                     __entry->usage)
            );
 
 TRACE_EVENT(rxrpc_peer,
-           TP_PROTO(unsigned int peer_debug_id, enum rxrpc_peer_trace op,
-                    int usage, const void *where),
+           TP_PROTO(unsigned int peer_debug_id, int ref, enum rxrpc_peer_trace why),
 
-           TP_ARGS(peer_debug_id, op, usage, where),
+           TP_ARGS(peer_debug_id, ref, why),
 
            TP_STRUCT__entry(
                    __field(unsigned int,       peer            )
-                   __field(int,                op              )
-                   __field(int,                usage           )
-                   __field(const void *,       where           )
+                   __field(int,                ref             )
+                   __field(int,                why             )
                             ),
 
            TP_fast_assign(
                    __entry->peer = peer_debug_id;
-                   __entry->op = op;
-                   __entry->usage = usage;
-                   __entry->where = where;
+                   __entry->ref = ref;
+                   __entry->why = why;
                           ),
 
-           TP_printk("P=%08x %s u=%d sp=%pSR",
+           TP_printk("P=%08x %s r=%d",
                      __entry->peer,
-                     __print_symbolic(__entry->op, rxrpc_peer_traces),
-                     __entry->usage,
-                     __entry->where)
+                     __print_symbolic(__entry->why, rxrpc_peer_traces),
+                     __entry->ref)
+           );
+
+TRACE_EVENT(rxrpc_bundle,
+           TP_PROTO(unsigned int bundle_debug_id, int ref, enum rxrpc_bundle_trace why),
+
+           TP_ARGS(bundle_debug_id, ref, why),
+
+           TP_STRUCT__entry(
+                   __field(unsigned int,       bundle          )
+                   __field(int,                ref             )
+                   __field(int,                why             )
+                            ),
+
+           TP_fast_assign(
+                   __entry->bundle = bundle_debug_id;
+                   __entry->ref = ref;
+                   __entry->why = why;
+                          ),
+
+           TP_printk("CB=%08x %s r=%d",
+                     __entry->bundle,
+                     __print_symbolic(__entry->why, rxrpc_bundle_traces),
+                     __entry->ref)
            );
 
 TRACE_EVENT(rxrpc_conn,
-           TP_PROTO(unsigned int conn_debug_id, enum rxrpc_conn_trace op,
-                    int usage, const void *where),
+           TP_PROTO(unsigned int conn_debug_id, int ref, enum rxrpc_conn_trace why),
 
-           TP_ARGS(conn_debug_id, op, usage, where),
+           TP_ARGS(conn_debug_id, ref, why),
 
            TP_STRUCT__entry(
                    __field(unsigned int,       conn            )
-                   __field(int,                op              )
-                   __field(int,                usage           )
-                   __field(const void *,       where           )
+                   __field(int,                ref             )
+                   __field(int,                why             )
                             ),
 
            TP_fast_assign(
                    __entry->conn = conn_debug_id;
-                   __entry->op = op;
-                   __entry->usage = usage;
-                   __entry->where = where;
+                   __entry->ref = ref;
+                   __entry->why = why;
                           ),
 
-           TP_printk("C=%08x %s u=%d sp=%pSR",
+           TP_printk("C=%08x %s r=%d",
                      __entry->conn,
-                     __print_symbolic(__entry->op, rxrpc_conn_traces),
-                     __entry->usage,
-                     __entry->where)
+                     __print_symbolic(__entry->why, rxrpc_conn_traces),
+                     __entry->ref)
            );
 
 TRACE_EVENT(rxrpc_client,
@@ -455,63 +563,57 @@ TRACE_EVENT(rxrpc_client,
            );
 
 TRACE_EVENT(rxrpc_call,
-           TP_PROTO(unsigned int call_debug_id, enum rxrpc_call_trace op,
-                    int usage, const void *where, const void *aux),
+           TP_PROTO(unsigned int call_debug_id, int ref, unsigned long aux,
+                    enum rxrpc_call_trace why),
 
-           TP_ARGS(call_debug_id, op, usage, where, aux),
+           TP_ARGS(call_debug_id, ref, aux, why),
 
            TP_STRUCT__entry(
                    __field(unsigned int,               call            )
-                   __field(int,                        op              )
-                   __field(int,                        usage           )
-                   __field(const void *,               where           )
-                   __field(const void *,               aux             )
+                   __field(int,                        ref             )
+                   __field(int,                        why             )
+                   __field(unsigned long,              aux             )
                             ),
 
            TP_fast_assign(
                    __entry->call = call_debug_id;
-                   __entry->op = op;
-                   __entry->usage = usage;
-                   __entry->where = where;
+                   __entry->ref = ref;
+                   __entry->why = why;
                    __entry->aux = aux;
                           ),
 
-           TP_printk("c=%08x %s u=%d sp=%pSR a=%p",
+           TP_printk("c=%08x %s r=%d a=%lx",
                      __entry->call,
-                     __print_symbolic(__entry->op, rxrpc_call_traces),
-                     __entry->usage,
-                     __entry->where,
+                     __print_symbolic(__entry->why, rxrpc_call_traces),
+                     __entry->ref,
                      __entry->aux)
            );
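The shape of this refactor at call sites: instead of an opcode plus a __builtin_return_address() "where" pointer, callers now pass the refcount after the operation and an enum saying why it changed. A hedged sketch of what a get-side caller might look like against the new prototype (the helper name and surrounding code are assumptions; only the tracepoint signature comes from the hunk above):

static void rxrpc_get_call_sketch(struct rxrpc_call *call,
				  enum rxrpc_call_trace why)
{
	int r;

	/* __refcount_inc() reports the pre-increment count via r. */
	__refcount_inc(&call->ref, &r);
	trace_rxrpc_call(call->debug_id, r + 1, 0, why);
}
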
 
 TRACE_EVENT(rxrpc_skb,
-           TP_PROTO(struct sk_buff *skb, enum rxrpc_skb_trace op,
-                    int usage, int mod_count, const void *where),
+           TP_PROTO(struct sk_buff *skb, int usage, int mod_count,
+                    enum rxrpc_skb_trace why),
 
-           TP_ARGS(skb, op, usage, mod_count, where),
+           TP_ARGS(skb, usage, mod_count, why),
 
            TP_STRUCT__entry(
                    __field(struct sk_buff *,           skb             )
-                   __field(enum rxrpc_skb_trace,       op              )
                    __field(int,                        usage           )
                    __field(int,                        mod_count       )
-                   __field(const void *,               where           )
+                   __field(enum rxrpc_skb_trace,       why             )
                             ),
 
            TP_fast_assign(
                    __entry->skb = skb;
-                   __entry->op = op;
                    __entry->usage = usage;
                    __entry->mod_count = mod_count;
-                   __entry->where = where;
+                   __entry->why = why;
                           ),
 
-           TP_printk("s=%p Rx %s u=%d m=%d p=%pSR",
+           TP_printk("s=%p Rx %s u=%d m=%d",
                      __entry->skb,
-                     __print_symbolic(__entry->op, rxrpc_skb_traces),
+                     __print_symbolic(__entry->why, rxrpc_skb_traces),
                      __entry->usage,
-                     __entry->mod_count,
-                     __entry->where)
+                     __entry->mod_count)
            );
 
 TRACE_EVENT(rxrpc_rx_packet,
@@ -623,6 +725,7 @@ TRACE_EVENT(rxrpc_txqueue,
                    __field(rxrpc_seq_t,                acks_hard_ack   )
                    __field(rxrpc_seq_t,                tx_bottom       )
                    __field(rxrpc_seq_t,                tx_top          )
+                   __field(rxrpc_seq_t,                tx_prepared     )
                    __field(int,                        tx_winsize      )
                             ),
 
@@ -632,16 +735,18 @@ TRACE_EVENT(rxrpc_txqueue,
                    __entry->acks_hard_ack = call->acks_hard_ack;
                    __entry->tx_bottom = call->tx_bottom;
                    __entry->tx_top = call->tx_top;
+                   __entry->tx_prepared = call->tx_prepared;
                    __entry->tx_winsize = call->tx_winsize;
                           ),
 
-           TP_printk("c=%08x %s f=%08x h=%08x n=%u/%u/%u",
+           TP_printk("c=%08x %s f=%08x h=%08x n=%u/%u/%u/%u",
                      __entry->call,
                      __print_symbolic(__entry->why, rxrpc_txqueue_traces),
                      __entry->tx_bottom,
                      __entry->acks_hard_ack,
                      __entry->tx_top - __entry->tx_bottom,
                      __entry->tx_top - __entry->acks_hard_ack,
+                     __entry->tx_prepared - __entry->tx_bottom,
                      __entry->tx_winsize)
            );
 
@@ -733,6 +838,66 @@ TRACE_EVENT(rxrpc_rx_abort,
                      __entry->abort_code)
            );
 
+TRACE_EVENT(rxrpc_rx_challenge,
+           TP_PROTO(struct rxrpc_connection *conn, rxrpc_serial_t serial,
+                    u32 version, u32 nonce, u32 min_level),
+
+           TP_ARGS(conn, serial, version, nonce, min_level),
+
+           TP_STRUCT__entry(
+                   __field(unsigned int,               conn            )
+                   __field(rxrpc_serial_t,             serial          )
+                   __field(u32,                        version         )
+                   __field(u32,                        nonce           )
+                   __field(u32,                        min_level       )
+                            ),
+
+           TP_fast_assign(
+                   __entry->conn = conn->debug_id;
+                   __entry->serial = serial;
+                   __entry->version = version;
+                   __entry->nonce = nonce;
+                   __entry->min_level = min_level;
+                          ),
+
+           TP_printk("C=%08x CHALLENGE %08x v=%x n=%x ml=%x",
+                     __entry->conn,
+                     __entry->serial,
+                     __entry->version,
+                     __entry->nonce,
+                     __entry->min_level)
+           );
+
+TRACE_EVENT(rxrpc_rx_response,
+           TP_PROTO(struct rxrpc_connection *conn, rxrpc_serial_t serial,
+                    u32 version, u32 kvno, u32 ticket_len),
+
+           TP_ARGS(conn, serial, version, kvno, ticket_len),
+
+           TP_STRUCT__entry(
+                   __field(unsigned int,               conn            )
+                   __field(rxrpc_serial_t,             serial          )
+                   __field(u32,                        version         )
+                   __field(u32,                        kvno            )
+                   __field(u32,                        ticket_len      )
+                            ),
+
+           TP_fast_assign(
+                   __entry->conn = conn->debug_id;
+                   __entry->serial = serial;
+                   __entry->version = version;
+                   __entry->kvno = kvno;
+                   __entry->ticket_len = ticket_len;
+                          ),
+
+           TP_printk("C=%08x RESPONSE %08x v=%x kvno=%x tl=%x",
+                     __entry->conn,
+                     __entry->serial,
+                     __entry->version,
+                     __entry->kvno,
+                     __entry->ticket_len)
+           );
+
 TRACE_EVENT(rxrpc_rx_rwind_change,
            TP_PROTO(struct rxrpc_call *call, rxrpc_serial_t serial,
                     u32 rwind, bool wake),
@@ -1278,6 +1443,44 @@ TRACE_EVENT(rxrpc_congest,
                      __entry->sum.retrans_timeo ? " rTxTo" : "")
            );
 
+TRACE_EVENT(rxrpc_reset_cwnd,
+           TP_PROTO(struct rxrpc_call *call, ktime_t now),
+
+           TP_ARGS(call, now),
+
+           TP_STRUCT__entry(
+                   __field(unsigned int,                       call            )
+                   __field(enum rxrpc_congest_mode,            mode            )
+                   __field(unsigned short,                     cwnd            )
+                   __field(unsigned short,                     extra           )
+                   __field(rxrpc_seq_t,                        hard_ack        )
+                   __field(rxrpc_seq_t,                        prepared        )
+                   __field(ktime_t,                            since_last_tx   )
+                   __field(bool,                               has_data        )
+                            ),
+
+           TP_fast_assign(
+                   __entry->call       = call->debug_id;
+                   __entry->mode       = call->cong_mode;
+                   __entry->cwnd       = call->cong_cwnd;
+                   __entry->extra      = call->cong_extra;
+                   __entry->hard_ack   = call->acks_hard_ack;
+                   __entry->prepared   = call->tx_prepared - call->tx_bottom;
+                   __entry->since_last_tx = ktime_sub(now, call->tx_last_sent);
+                   __entry->has_data   = !list_empty(&call->tx_sendmsg);
+                          ),
+
+           TP_printk("c=%08x q=%08x %s cw=%u+%u pr=%u tm=%llu d=%u",
+                     __entry->call,
+                     __entry->hard_ack,
+                     __print_symbolic(__entry->mode, rxrpc_congest_modes),
+                     __entry->cwnd,
+                     __entry->extra,
+                     __entry->prepared,
+                     ktime_to_ns(__entry->since_last_tx),
+                     __entry->has_data)
+           );
+
 TRACE_EVENT(rxrpc_disconnect_call,
            TP_PROTO(struct rxrpc_call *call),
 
@@ -1352,6 +1555,7 @@ TRACE_EVENT(rxrpc_connect_call,
                    __field(unsigned long,              user_call_ID    )
                    __field(u32,                        cid             )
                    __field(u32,                        call_id         )
+                   __field_struct(struct sockaddr_rxrpc, srx           )
                             ),
 
            TP_fast_assign(
@@ -1359,33 +1563,42 @@ TRACE_EVENT(rxrpc_connect_call,
                    __entry->user_call_ID = call->user_call_ID;
                    __entry->cid = call->cid;
                    __entry->call_id = call->call_id;
+                   __entry->srx = call->dest_srx;
                           ),
 
-           TP_printk("c=%08x u=%p %08x:%08x",
+           TP_printk("c=%08x u=%p %08x:%08x dst=%pISp",
                      __entry->call,
                      (void *)__entry->user_call_ID,
                      __entry->cid,
-                     __entry->call_id)
+                     __entry->call_id,
+                     &__entry->srx.transport)
            );
 
 TRACE_EVENT(rxrpc_resend,
-           TP_PROTO(struct rxrpc_call *call),
+           TP_PROTO(struct rxrpc_call *call, struct sk_buff *ack),
 
-           TP_ARGS(call),
+           TP_ARGS(call, ack),
 
            TP_STRUCT__entry(
                    __field(unsigned int,               call            )
                    __field(rxrpc_seq_t,                seq             )
+                   __field(rxrpc_seq_t,                transmitted     )
+                   __field(rxrpc_serial_t,             ack_serial      )
                             ),
 
            TP_fast_assign(
+                   struct rxrpc_skb_priv *sp = ack ? rxrpc_skb(ack) : NULL;
                    __entry->call = call->debug_id;
                    __entry->seq = call->acks_hard_ack;
+                   __entry->transmitted = call->tx_transmitted;
+                   __entry->ack_serial = sp ? sp->hdr.serial : 0;
                           ),
 
-           TP_printk("c=%08x q=%x",
+           TP_printk("c=%08x r=%x q=%x tq=%x",
                      __entry->call,
-                     __entry->seq)
+                     __entry->ack_serial,
+                     __entry->seq,
+                     __entry->transmitted)
            );
 
 TRACE_EVENT(rxrpc_rx_icmp,
@@ -1586,6 +1799,47 @@ TRACE_EVENT(rxrpc_txbuf,
                      __entry->ref)
            );
 
+TRACE_EVENT(rxrpc_poke_call,
+           TP_PROTO(struct rxrpc_call *call, bool busy,
+                    enum rxrpc_call_poke_trace what),
+
+           TP_ARGS(call, busy, what),
+
+           TP_STRUCT__entry(
+                   __field(unsigned int,               call_debug_id   )
+                   __field(bool,                       busy            )
+                   __field(enum rxrpc_call_poke_trace, what            )
+                            ),
+
+           TP_fast_assign(
+                   __entry->call_debug_id = call->debug_id;
+                   __entry->busy = busy;
+                   __entry->what = what;
+                          ),
+
+           TP_printk("c=%08x %s%s",
+                     __entry->call_debug_id,
+                     __print_symbolic(__entry->what, rxrpc_call_poke_traces),
+                     __entry->busy ? "!" : "")
+           );
+
+TRACE_EVENT(rxrpc_call_poked,
+           TP_PROTO(struct rxrpc_call *call),
+
+           TP_ARGS(call),
+
+           TP_STRUCT__entry(
+                   __field(unsigned int,               call_debug_id   )
+                            ),
+
+           TP_fast_assign(
+                   __entry->call_debug_id = call->debug_id;
+                          ),
+
+           TP_printk("c=%08x",
+                     __entry->call_debug_id)
+           );
+
 #undef EM
 #undef E_
 #endif /* _TRACE_RXRPC_H */
index f89de51..464ca3f 100644 (file)
@@ -5293,7 +5293,7 @@ union bpf_attr {
  *     Return
  *             Nothing. Always succeeds.
  *
- * long bpf_dynptr_read(void *dst, u32 len, struct bpf_dynptr *src, u32 offset, u64 flags)
+ * long bpf_dynptr_read(void *dst, u32 len, const struct bpf_dynptr *src, u32 offset, u64 flags)
  *     Description
  *             Read *len* bytes from *src* into *dst*, starting from *offset*
  *             into *src*.
@@ -5303,7 +5303,7 @@ union bpf_attr {
  *             of *src*'s data, -EINVAL if *src* is an invalid dynptr or if
  *             *flags* is not 0.
  *
- * long bpf_dynptr_write(struct bpf_dynptr *dst, u32 offset, void *src, u32 len, u64 flags)
+ * long bpf_dynptr_write(const struct bpf_dynptr *dst, u32 offset, void *src, u32 len, u64 flags)
  *     Description
  *             Write *len* bytes from *src* into *dst*, starting from *offset*
  *             into *dst*.
@@ -5313,7 +5313,7 @@ union bpf_attr {
  *             of *dst*'s data, -EINVAL if *dst* is an invalid dynptr or if *dst*
  *             is a read-only dynptr or if *flags* is not 0.
  *
- * void *bpf_dynptr_data(struct bpf_dynptr *ptr, u32 offset, u32 len)
+ * void *bpf_dynptr_data(const struct bpf_dynptr *ptr, u32 offset, u32 len)
  *     Description
  *             Get a pointer to the underlying dynptr data.
  *
@@ -5414,7 +5414,7 @@ union bpf_attr {
  *             Drain samples from the specified user ring buffer, and invoke
  *             the provided callback for each such sample:
  *
- *             long (\*callback_fn)(struct bpf_dynptr \*dynptr, void \*ctx);
+ *             long (\*callback_fn)(const struct bpf_dynptr \*dynptr, void \*ctx);
  *
  *             If **callback_fn** returns 0, the helper will continue to try
  *             and drain the next sample, up to a maximum of
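Since these prototypes are what BPF programs compile against, a small hedged example of the dynptr read/write flow may help; the map, section name, and program logic below are illustrative assumptions, not part of the patch:

// SPDX-License-Identifier: GPL-2.0
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

struct {
	__uint(type, BPF_MAP_TYPE_RINGBUF);
	__uint(max_entries, 4096);
} rb SEC(".maps");

SEC("tp/syscalls/sys_enter_getpid")
int dynptr_example(void *ctx)
{
	struct bpf_dynptr ptr;
	__u64 val = 42, out = 0;

	/* Reserve ringbuf space wrapped in a dynptr. */
	if (!bpf_ringbuf_reserve_dynptr(&rb, sizeof(val), 0, &ptr)) {
		/* Write into, then read back from, the reserved region. */
		bpf_dynptr_write(&ptr, 0, &val, sizeof(val), 0);
		bpf_dynptr_read(&out, sizeof(out), &ptr, 0, 0);
		bpf_ringbuf_submit_dynptr(&ptr, 0);
	} else {
		/* The dynptr must be released on the failure path too. */
		bpf_ringbuf_discard_dynptr(&ptr, 0);
	}
	return 0;
}

char LICENSE[] SEC("license") = "GPL";
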
index 70191d9..3782d42 100644 (file)
@@ -658,11 +658,24 @@ enum devlink_resource_unit {
        DEVLINK_RESOURCE_UNIT_ENTRY,
 };
 
+enum devlink_port_fn_attr_cap {
+       DEVLINK_PORT_FN_ATTR_CAP_ROCE_BIT,
+       DEVLINK_PORT_FN_ATTR_CAP_MIGRATABLE_BIT,
+
+       /* Add new caps above */
+       __DEVLINK_PORT_FN_ATTR_CAPS_MAX,
+};
+
+#define DEVLINK_PORT_FN_CAP_ROCE _BITUL(DEVLINK_PORT_FN_ATTR_CAP_ROCE_BIT)
+#define DEVLINK_PORT_FN_CAP_MIGRATABLE \
+       _BITUL(DEVLINK_PORT_FN_ATTR_CAP_MIGRATABLE_BIT)
+
 enum devlink_port_function_attr {
        DEVLINK_PORT_FUNCTION_ATTR_UNSPEC,
        DEVLINK_PORT_FUNCTION_ATTR_HW_ADDR,     /* binary */
        DEVLINK_PORT_FN_ATTR_STATE,     /* u8 */
        DEVLINK_PORT_FN_ATTR_OPSTATE,   /* u8 */
+       DEVLINK_PORT_FN_ATTR_CAPS,      /* bitfield32 */
 
        __DEVLINK_PORT_FUNCTION_ATTR_MAX,
        DEVLINK_PORT_FUNCTION_ATTR_MAX = __DEVLINK_PORT_FUNCTION_ATTR_MAX - 1
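Because DEVLINK_PORT_FN_ATTR_CAPS is a bitfield32, userspace toggles individual capabilities by pairing a value with a selector; only bits set in the selector are acted on. A hedged sketch of the payload (requires a uapi devlink header new enough to carry these macros):

#include <linux/devlink.h>
#include <linux/netlink.h>
#include <stdio.h>

int main(void)
{
	/* Enable RoCE and leave the migratable cap untouched. */
	struct nla_bitfield32 caps = {
		.value    = DEVLINK_PORT_FN_CAP_ROCE,
		.selector = DEVLINK_PORT_FN_CAP_ROCE,
	};

	printf("value=%#x selector=%#x\n", caps.value, caps.selector);
	return 0;
}
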
index aaf7c69..5799a9d 100644 (file)
@@ -51,6 +51,7 @@ enum {
        ETHTOOL_MSG_MODULE_SET,
        ETHTOOL_MSG_PSE_GET,
        ETHTOOL_MSG_PSE_SET,
+       ETHTOOL_MSG_RSS_GET,
 
        /* add new constants above here */
        __ETHTOOL_MSG_USER_CNT,
@@ -97,6 +98,7 @@ enum {
        ETHTOOL_MSG_MODULE_GET_REPLY,
        ETHTOOL_MSG_MODULE_NTF,
        ETHTOOL_MSG_PSE_GET_REPLY,
+       ETHTOOL_MSG_RSS_GET_REPLY,
 
        /* add new constants above here */
        __ETHTOOL_MSG_KERNEL_CNT,
@@ -880,6 +882,18 @@ enum {
        ETHTOOL_A_PSE_MAX = (__ETHTOOL_A_PSE_CNT - 1)
 };
 
+enum {
+       ETHTOOL_A_RSS_UNSPEC,
+       ETHTOOL_A_RSS_HEADER,
+       ETHTOOL_A_RSS_CONTEXT,          /* u32 */
+       ETHTOOL_A_RSS_HFUNC,            /* u32 */
+       ETHTOOL_A_RSS_INDIR,            /* binary */
+       ETHTOOL_A_RSS_HKEY,             /* binary */
+
+       __ETHTOOL_A_RSS_CNT,
+       ETHTOOL_A_RSS_MAX = (__ETHTOOL_A_RSS_CNT - 1),
+};
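These attributes back the new ETHTOOL_MSG_RSS_GET request. A hedged sketch of walking a reply's attributes with libmnl (the callback wiring and error handling are omitted; attribute meanings are taken from the annotations above):

#include <stdio.h>
#include <libmnl/libmnl.h>
#include <linux/ethtool_netlink.h>

static int rss_attr_cb(const struct nlattr *attr, void *data)
{
	switch (mnl_attr_get_type(attr)) {
	case ETHTOOL_A_RSS_HFUNC:	/* u32 hash function */
		printf("hfunc: %u\n", mnl_attr_get_u32(attr));
		break;
	case ETHTOOL_A_RSS_INDIR:	/* binary indirection table */
	case ETHTOOL_A_RSS_HKEY:	/* binary hash key */
		printf("%u-byte blob\n", mnl_attr_get_payload_len(attr));
		break;
	}
	return MNL_CB_OK;
}
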
+
 /* generic netlink info */
 #define ETHTOOL_GENL_NAME "ethtool"
 #define ETHTOOL_GENL_VERSION 1
index a86a7e7..d9de241 100644 (file)
@@ -723,10 +723,31 @@ enum {
 enum {
        MDBE_ATTR_UNSPEC,
        MDBE_ATTR_SOURCE,
+       MDBE_ATTR_SRC_LIST,
+       MDBE_ATTR_GROUP_MODE,
+       MDBE_ATTR_RTPROT,
        __MDBE_ATTR_MAX,
 };
 #define MDBE_ATTR_MAX (__MDBE_ATTR_MAX - 1)
 
+/* per mdb entry source */
+enum {
+       MDBE_SRC_LIST_UNSPEC,
+       MDBE_SRC_LIST_ENTRY,
+       __MDBE_SRC_LIST_MAX,
+};
+#define MDBE_SRC_LIST_MAX (__MDBE_SRC_LIST_MAX - 1)
+
+/* per mdb entry per source attributes
+ * these are embedded in MDBE_SRC_LIST_ENTRY
+ */
+enum {
+       MDBE_SRCATTR_UNSPEC,
+       MDBE_SRCATTR_ADDRESS,
+       __MDBE_SRCATTR_MAX,
+};
+#define MDBE_SRCATTR_MAX (__MDBE_SRCATTR_MAX - 1)
+
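The source list is doubly nested: MDBE_ATTR_SRC_LIST holds MDBE_SRC_LIST_ENTRY items, each of which holds an MDBE_SRCATTR_ADDRESS. A hedged sketch of composing that nesting with libmnl (the RTM_NEWMDB message setup around it is assumed):

#include <arpa/inet.h>
#include <libmnl/libmnl.h>
#include <linux/if_bridge.h>

static void put_src_list(struct nlmsghdr *nlh)
{
	struct nlattr *src_list, *entry;
	struct in_addr src;

	inet_pton(AF_INET, "192.0.2.1", &src);

	src_list = mnl_attr_nest_start(nlh, MDBE_ATTR_SRC_LIST);
	entry = mnl_attr_nest_start(nlh, MDBE_SRC_LIST_ENTRY);
	mnl_attr_put(nlh, MDBE_SRCATTR_ADDRESS, sizeof(src), &src);
	mnl_attr_nest_end(nlh, entry);
	mnl_attr_nest_end(nlh, src_list);
}
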
 /* Embedded inside LINK_XSTATS_TYPE_BRIDGE */
 enum {
        BRIDGE_XSTATS_UNSPEC,
index b6d7b86..287cdc8 100644 (file)
@@ -90,6 +90,8 @@
 #define TUN_F_TSO6     0x04    /* I can handle TSO for IPv6 packets */
 #define TUN_F_TSO_ECN  0x08    /* I can handle TSO with ECN bits. */
 #define TUN_F_UFO      0x10    /* I can handle UFO packets */
+#define TUN_F_USO4     0x20    /* I can handle USO for IPv4 packets */
+#define TUN_F_USO6     0x40    /* I can handle USO for IPv6 packets */
 
 /* Protocol info prepended to the packets (when IFF_NO_PI is not set) */
 #define TUN_PKT_STRIP  0x0001
index 55501e5..a2c66b3 100644 (file)
@@ -31,8 +31,9 @@ enum {
        SOF_TIMESTAMPING_OPT_PKTINFO = (1<<13),
        SOF_TIMESTAMPING_OPT_TX_SWHW = (1<<14),
        SOF_TIMESTAMPING_BIND_PHC = (1 << 15),
+       SOF_TIMESTAMPING_OPT_ID_TCP = (1 << 16),
 
-       SOF_TIMESTAMPING_LAST = SOF_TIMESTAMPING_BIND_PHC,
+       SOF_TIMESTAMPING_LAST = SOF_TIMESTAMPING_OPT_ID_TCP,
        SOF_TIMESTAMPING_MASK = (SOF_TIMESTAMPING_LAST - 1) |
                                 SOF_TIMESTAMPING_LAST
 };
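Since SOF_TIMESTAMPING_LAST is always the single highest flag bit, (LAST - 1) | LAST fills in every bit at or below it. A quick plain-C check of the arithmetic:

#include <assert.h>
#include <stdio.h>

int main(void)
{
	unsigned int last = 1u << 16;		/* SOF_TIMESTAMPING_OPT_ID_TCP */
	unsigned int mask = (last - 1) | last;	/* SOF_TIMESTAMPING_MASK */

	assert(mask == 0x1ffffu);		/* bits 0..16 all set */
	printf("mask=%#x\n", mask);
	return 0;
}
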
index 79e5d68..333807e 100644 (file)
@@ -85,6 +85,7 @@ enum {
        IPSET_ATTR_CADT_LINENO = IPSET_ATTR_LINENO,     /* 9 */
        IPSET_ATTR_MARK,        /* 10 */
        IPSET_ATTR_MARKMASK,    /* 11 */
+       IPSET_ATTR_BITMASK,     /* 12 */
        /* Reserve empty slots */
        IPSET_ATTR_CADT_MAX = 16,
        /* Create-only specific attributes */
@@ -153,6 +154,7 @@ enum ipset_errno {
        IPSET_ERR_COMMENT,
        IPSET_ERR_INVALID_MARKMASK,
        IPSET_ERR_SKBINFO,
+       IPSET_ERR_BITMASK_NETMASK_EXCL,
 
        /* Type specific error codes */
        IPSET_ERR_TYPE_SPECIFIC = 4352,
index edc6dda..c742469 100644 (file)
@@ -16,6 +16,7 @@ enum sctp_conntrack {
        SCTP_CONNTRACK_SHUTDOWN_ACK_SENT,
        SCTP_CONNTRACK_HEARTBEAT_SENT,
        SCTP_CONNTRACK_HEARTBEAT_ACKED,
+       SCTP_CONNTRACK_DATA_SENT,
        SCTP_CONNTRACK_MAX
 };
 
index 6b20fb2..94e7403 100644 (file)
@@ -95,6 +95,7 @@ enum ctattr_timeout_sctp {
        CTA_TIMEOUT_SCTP_SHUTDOWN_ACK_SENT,
        CTA_TIMEOUT_SCTP_HEARTBEAT_SENT,
        CTA_TIMEOUT_SCTP_HEARTBEAT_ACKED,
+       CTA_TIMEOUT_SCTP_DATA_SENT,
        __CTA_TIMEOUT_SCTP_MAX
 };
 #define CTA_TIMEOUT_SCTP_MAX (__CTA_TIMEOUT_SCTP_MAX - 1)
index 94066f8..c5d62ee 100644 (file)
@@ -277,11 +277,25 @@ enum ovs_vport_attr {
        OVS_VPORT_ATTR_PAD,
        OVS_VPORT_ATTR_IFINDEX,
        OVS_VPORT_ATTR_NETNSID,
+       OVS_VPORT_ATTR_UPCALL_STATS,
        __OVS_VPORT_ATTR_MAX
 };
 
 #define OVS_VPORT_ATTR_MAX (__OVS_VPORT_ATTR_MAX - 1)
 
+/**
+ * enum ovs_vport_upcall_attr - attributes for %OVS_VPORT_UPCALL* commands
+ * @OVS_VPORT_UPCALL_ATTR_SUCCESS: 64-bit count of packets successfully upcalled.
+ * @OVS_VPORT_UPCALL_ATTR_FAIL: 64-bit count of packets whose upcall failed.
+ */
+enum ovs_vport_upcall_attr {
+       OVS_VPORT_UPCALL_ATTR_SUCCESS,
+       OVS_VPORT_UPCALL_ATTR_FAIL,
+       __OVS_VPORT_UPCALL_ATTR_MAX
+};
+
+#define OVS_VPORT_UPCALL_ATTR_MAX (__OVS_VPORT_UPCALL_ATTR_MAX - 1)
+
 enum {
        OVS_VXLAN_EXT_UNSPEC,
        OVS_VXLAN_EXT_GBP,      /* Flag or __u32 */
index a7bd48d..af798f4 100644 (file)
@@ -9,6 +9,7 @@
 #define VIRTIO_BT_F_VND_HCI    0       /* Indicates vendor command support */
 #define VIRTIO_BT_F_MSFT_EXT   1       /* Indicates MSFT vendor support */
 #define VIRTIO_BT_F_AOSP_EXT   2       /* Indicates AOSP vendor support */
+#define VIRTIO_BT_F_CONFIG_V2  3       /* Use version 2 of the configuration */
 
 enum virtio_bt_config_type {
        VIRTIO_BT_CONFIG_TYPE_PRIMARY   = 0,
@@ -28,4 +29,11 @@ struct virtio_bt_config {
        __u16 msft_opcode;
 } __attribute__((packed));
 
+struct virtio_bt_config_v2 {
+       __u8  type;
+       __u8  alignment;
+       __u16 vendor;
+       __u16 msft_opcode;
+};
+
 #endif /* _UAPI_LINUX_VIRTIO_BT_H */
index 6cb842e..b4062be 100644 (file)
@@ -57,6 +57,9 @@
                                         * Steering */
 #define VIRTIO_NET_F_CTRL_MAC_ADDR 23  /* Set MAC address */
 #define VIRTIO_NET_F_NOTF_COAL 53      /* Device supports notifications coalescing */
+#define VIRTIO_NET_F_GUEST_USO4        54      /* Guest can handle USOv4 in. */
+#define VIRTIO_NET_F_GUEST_USO6        55      /* Guest can handle USOv6 in. */
+#define VIRTIO_NET_F_HOST_USO  56      /* Host can handle USO in. */
 #define VIRTIO_NET_F_HASH_REPORT  57   /* Supports hash report */
 #define VIRTIO_NET_F_RSS         60    /* Supports RSS RX steering */
 #define VIRTIO_NET_F_RSC_EXT     61    /* extended coalescing info */
@@ -130,6 +133,7 @@ struct virtio_net_hdr_v1 {
 #define VIRTIO_NET_HDR_GSO_TCPV4       1       /* GSO frame, IPv4 TCP (TSO) */
 #define VIRTIO_NET_HDR_GSO_UDP         3       /* GSO frame, IPv4 UDP (UFO) */
 #define VIRTIO_NET_HDR_GSO_TCPV6       4       /* GSO frame, IPv6 TCP */
+#define VIRTIO_NET_HDR_GSO_UDP_L4      5       /* GSO frame, IPv4 & IPv6 UDP (USO) */
 #define VIRTIO_NET_HDR_GSO_ECN         0x80    /* TCP has ECN set */
        __u8 gso_type;
        __virtio16 hdr_len;     /* Ethernet + IP + tcp/udp hdrs */
index 4f84ea7..23543c3 100644 (file)
@@ -519,6 +519,12 @@ struct xfrm_user_offload {
  */
 #define XFRM_OFFLOAD_IPV6      1
 #define XFRM_OFFLOAD_INBOUND   2
+/* The two bits above are relevant only for the state path, while
+ * offload is used for both policy and state flows.
+ *
+ * In policy offload mode, they are free and can be safely reused.
+ */
+#define XFRM_OFFLOAD_PACKET    4
 
 struct xfrm_userpolicy_default {
 #define XFRM_USERPOLICY_UNSPEC 0
index c8496f9..00f88aa 100644 (file)
--- a/ipc/sem.c
+++ b/ipc/sem.c
@@ -2179,14 +2179,15 @@ long __do_semtimedop(int semid, struct sembuf *sops,
                 * scenarios where we were awakened externally, during the
                 * window between wake_q_add() and wake_up_q().
                 */
+               rcu_read_lock();
                error = READ_ONCE(queue.status);
                if (error != -EINTR) {
                        /* see SEM_BARRIER_2 for purpose/pairing */
                        smp_acquire__after_ctrl_dep();
+                       rcu_read_unlock();
                        goto out;
                }
 
-               rcu_read_lock();
                locknum = sem_lock(sma, sops, nsops);
 
                if (!ipc_valid_object(&sma->sem_perm))
index 3094038..6cdf6d9 100644 (file)
@@ -211,7 +211,6 @@ BPF_CALL_2(bpf_cgrp_storage_delete, struct bpf_map *, map, struct cgroup *, cgro
        return ret;
 }
 
-BTF_ID_LIST_SINGLE(cgroup_storage_map_btf_ids, struct, bpf_local_storage_map)
 const struct bpf_map_ops cgrp_storage_map_ops = {
        .map_meta_equal = bpf_map_meta_equal,
        .map_alloc_check = bpf_local_storage_map_alloc_check,
@@ -222,7 +221,7 @@ const struct bpf_map_ops cgrp_storage_map_ops = {
        .map_update_elem = bpf_cgrp_storage_update_elem,
        .map_delete_elem = bpf_cgrp_storage_delete_elem,
        .map_check_btf = bpf_local_storage_map_check_btf,
-       .map_btf_id = &cgroup_storage_map_btf_ids[0],
+       .map_btf_id = &bpf_local_storage_map_btf_id[0],
        .map_owner_storage_ptr = cgroup_storage_ptr,
 };
 
index 6a1d4d2..05f4c66 100644 (file)
@@ -213,8 +213,6 @@ static void inode_storage_map_free(struct bpf_map *map)
        bpf_local_storage_map_free(map, &inode_cache, NULL);
 }
 
-BTF_ID_LIST_SINGLE(inode_storage_map_btf_ids, struct,
-                  bpf_local_storage_map)
 const struct bpf_map_ops inode_storage_map_ops = {
        .map_meta_equal = bpf_map_meta_equal,
        .map_alloc_check = bpf_local_storage_map_alloc_check,
@@ -225,7 +223,7 @@ const struct bpf_map_ops inode_storage_map_ops = {
        .map_update_elem = bpf_fd_inode_storage_update_elem,
        .map_delete_elem = bpf_fd_inode_storage_delete_elem,
        .map_check_btf = bpf_local_storage_map_check_btf,
-       .map_btf_id = &inode_storage_map_btf_ids[0],
+       .map_btf_id = &bpf_local_storage_map_btf_id[0],
        .map_owner_storage_ptr = inode_storage_ptr,
 };
 
index ae0267f..9ea42a4 100644 (file)
@@ -345,11 +345,27 @@ BTF_ID(func, bpf_lsm_task_to_inode)
 BTF_ID(func, bpf_lsm_userns_create)
 BTF_SET_END(sleepable_lsm_hooks)
 
+BTF_SET_START(untrusted_lsm_hooks)
+BTF_ID(func, bpf_lsm_bpf_map_free_security)
+BTF_ID(func, bpf_lsm_bpf_prog_alloc_security)
+BTF_ID(func, bpf_lsm_bpf_prog_free_security)
+BTF_ID(func, bpf_lsm_file_alloc_security)
+BTF_ID(func, bpf_lsm_file_free_security)
+BTF_ID(func, bpf_lsm_sk_alloc_security)
+BTF_ID(func, bpf_lsm_sk_free_security)
+BTF_ID(func, bpf_lsm_task_free)
+BTF_SET_END(untrusted_lsm_hooks)
+
 bool bpf_lsm_is_sleepable_hook(u32 btf_id)
 {
        return btf_id_set_contains(&sleepable_lsm_hooks, btf_id);
 }
 
+bool bpf_lsm_is_trusted(const struct bpf_prog *prog)
+{
+       return !btf_id_set_contains(&untrusted_lsm_hooks, prog->aux->attach_btf_id);
+}
+
 const struct bpf_prog_ops lsm_prog_ops = {
 };
 
index 8e832db..1e48605 100644 (file)
@@ -324,7 +324,7 @@ static void task_storage_map_free(struct bpf_map *map)
        bpf_local_storage_map_free(map, &task_cache, &bpf_task_storage_busy);
 }
 
-BTF_ID_LIST_SINGLE(task_storage_map_btf_ids, struct, bpf_local_storage_map)
+BTF_ID_LIST_GLOBAL_SINGLE(bpf_local_storage_map_btf_id, struct, bpf_local_storage_map)
 const struct bpf_map_ops task_storage_map_ops = {
        .map_meta_equal = bpf_map_meta_equal,
        .map_alloc_check = bpf_local_storage_map_alloc_check,
@@ -335,7 +335,7 @@ const struct bpf_map_ops task_storage_map_ops = {
        .map_update_elem = bpf_pid_task_storage_update_elem,
        .map_delete_elem = bpf_pid_task_storage_delete_elem,
        .map_check_btf = bpf_local_storage_map_check_btf,
-       .map_btf_id = &task_storage_map_btf_ids[0],
+       .map_btf_id = &bpf_local_storage_map_btf_id[0],
        .map_owner_storage_ptr = task_storage_ptr,
 };
 
index d11cbf8..f7dd8af 100644 (file)
@@ -19,6 +19,7 @@
 #include <linux/bpf_verifier.h>
 #include <linux/btf.h>
 #include <linux/btf_ids.h>
+#include <linux/bpf_lsm.h>
 #include <linux/skmsg.h>
 #include <linux/perf_event.h>
 #include <linux/bsearch.h>
@@ -205,6 +206,7 @@ enum btf_kfunc_hook {
        BTF_KFUNC_HOOK_STRUCT_OPS,
        BTF_KFUNC_HOOK_TRACING,
        BTF_KFUNC_HOOK_SYSCALL,
+       BTF_KFUNC_HOOK_FMODRET,
        BTF_KFUNC_HOOK_MAX,
 };
 
@@ -5829,6 +5831,7 @@ static bool prog_args_trusted(const struct bpf_prog *prog)
        case BPF_PROG_TYPE_TRACING:
                return atype == BPF_TRACE_RAW_TP || atype == BPF_TRACE_ITER;
        case BPF_PROG_TYPE_LSM:
+               return bpf_lsm_is_trusted(prog);
        case BPF_PROG_TYPE_STRUCT_OPS:
                return true;
        default:
@@ -7606,11 +7609,14 @@ u32 *btf_kfunc_id_set_contains(const struct btf *btf,
        return __btf_kfunc_id_set_contains(btf, hook, kfunc_btf_id);
 }
 
-/* This function must be invoked only from initcalls/module init functions */
-int register_btf_kfunc_id_set(enum bpf_prog_type prog_type,
-                             const struct btf_kfunc_id_set *kset)
+u32 *btf_kfunc_is_modify_return(const struct btf *btf, u32 kfunc_btf_id)
+{
+       return __btf_kfunc_id_set_contains(btf, BTF_KFUNC_HOOK_FMODRET, kfunc_btf_id);
+}
+
+static int __register_btf_kfunc_id_set(enum btf_kfunc_hook hook,
+                                      const struct btf_kfunc_id_set *kset)
 {
-       enum btf_kfunc_hook hook;
        struct btf *btf;
        int ret;
 
@@ -7629,13 +7635,29 @@ int register_btf_kfunc_id_set(enum bpf_prog_type prog_type,
        if (IS_ERR(btf))
                return PTR_ERR(btf);
 
-       hook = bpf_prog_type_to_kfunc_hook(prog_type);
        ret = btf_populate_kfunc_set(btf, hook, kset->set);
        btf_put(btf);
        return ret;
 }
+
+/* This function must be invoked only from initcalls/module init functions */
+int register_btf_kfunc_id_set(enum bpf_prog_type prog_type,
+                             const struct btf_kfunc_id_set *kset)
+{
+       enum btf_kfunc_hook hook;
+
+       hook = bpf_prog_type_to_kfunc_hook(prog_type);
+       return __register_btf_kfunc_id_set(hook, kset);
+}
 EXPORT_SYMBOL_GPL(register_btf_kfunc_id_set);
 
+/* This function must be invoked only from initcalls/module init functions */
+int register_btf_fmodret_id_set(const struct btf_kfunc_id_set *kset)
+{
+       return __register_btf_kfunc_id_set(BTF_KFUNC_HOOK_FMODRET, kset);
+}
+EXPORT_SYMBOL_GPL(register_btf_fmodret_id_set);
+
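For orientation, a hedged sketch of a module using the new fmodret registration path; the BTF set macros and struct come from <linux/btf_ids.h>, while my_test_func and the module boilerplate are assumptions:

#include <linux/btf.h>
#include <linux/btf_ids.h>
#include <linux/module.h>

/* Hypothetical function an fmod_ret BPF program could attach to. */
noinline int my_test_func(int x)
{
	return x;
}

BTF_SET8_START(my_fmodret_ids)
BTF_ID_FLAGS(func, my_test_func)
BTF_SET8_END(my_fmodret_ids)

static const struct btf_kfunc_id_set my_fmodret_set = {
	.owner = THIS_MODULE,
	.set   = &my_fmodret_ids,
};

static int __init my_mod_init(void)
{
	/* Per the comment above, only valid from init context. */
	return register_btf_fmodret_id_set(&my_fmodret_set);
}
module_init(my_mod_init);
MODULE_LICENSE("GPL");
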
 s32 btf_find_dtor_kfunc(struct btf *btf, u32 btf_id)
 {
        struct btf_id_dtor_kfunc_tab *tab = btf->dtor_kfunc_tab;
index a5a5114..af30c6c 100644 (file)
@@ -1404,7 +1404,7 @@ static const struct bpf_func_proto bpf_kptr_xchg_proto = {
 #define DYNPTR_SIZE_MASK       0xFFFFFF
 #define DYNPTR_RDONLY_BIT      BIT(31)
 
-static bool bpf_dynptr_is_rdonly(struct bpf_dynptr_kern *ptr)
+static bool bpf_dynptr_is_rdonly(const struct bpf_dynptr_kern *ptr)
 {
        return ptr->size & DYNPTR_RDONLY_BIT;
 }
@@ -1414,7 +1414,7 @@ static void bpf_dynptr_set_type(struct bpf_dynptr_kern *ptr, enum bpf_dynptr_typ
        ptr->size |= type << DYNPTR_TYPE_SHIFT;
 }
 
-u32 bpf_dynptr_get_size(struct bpf_dynptr_kern *ptr)
+u32 bpf_dynptr_get_size(const struct bpf_dynptr_kern *ptr)
 {
        return ptr->size & DYNPTR_SIZE_MASK;
 }
@@ -1438,7 +1438,7 @@ void bpf_dynptr_set_null(struct bpf_dynptr_kern *ptr)
        memset(ptr, 0, sizeof(*ptr));
 }
 
-static int bpf_dynptr_check_off_len(struct bpf_dynptr_kern *ptr, u32 offset, u32 len)
+static int bpf_dynptr_check_off_len(const struct bpf_dynptr_kern *ptr, u32 offset, u32 len)
 {
        u32 size = bpf_dynptr_get_size(ptr);
 
@@ -1483,7 +1483,7 @@ static const struct bpf_func_proto bpf_dynptr_from_mem_proto = {
        .arg4_type      = ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_LOCAL | MEM_UNINIT,
 };
 
-BPF_CALL_5(bpf_dynptr_read, void *, dst, u32, len, struct bpf_dynptr_kern *, src,
+BPF_CALL_5(bpf_dynptr_read, void *, dst, u32, len, const struct bpf_dynptr_kern *, src,
           u32, offset, u64, flags)
 {
        int err;
@@ -1495,7 +1495,11 @@ BPF_CALL_5(bpf_dynptr_read, void *, dst, u32, len, struct bpf_dynptr_kern *, src
        if (err)
                return err;
 
-       memcpy(dst, src->data + src->offset + offset, len);
+       /* Source and destination may overlap, hence use memmove to
+        * copy the data. E.g. bpf_dynptr_from_mem may create two dynptrs
+        * pointing to overlapping PTR_TO_MAP_VALUE regions.
+        */
+       memmove(dst, src->data + src->offset + offset, len);
 
        return 0;
 }
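The memcpy-to-memmove switch matters because two dynptrs can alias the same map value, making source and destination overlap. A contrived plain-C illustration of why overlapping copies need memmove (not BPF code):

#include <stdio.h>
#include <string.h>

int main(void)
{
	char buf[8] = "abcdefg";

	/* Overlapping ranges: memcpy would be undefined behaviour here;
	 * memmove copies as if through a temporary buffer.
	 */
	memmove(buf + 2, buf, 5);
	printf("%s\n", buf);	/* prints "ababcde" */
	return 0;
}
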
@@ -1506,12 +1510,12 @@ static const struct bpf_func_proto bpf_dynptr_read_proto = {
        .ret_type       = RET_INTEGER,
        .arg1_type      = ARG_PTR_TO_UNINIT_MEM,
        .arg2_type      = ARG_CONST_SIZE_OR_ZERO,
-       .arg3_type      = ARG_PTR_TO_DYNPTR,
+       .arg3_type      = ARG_PTR_TO_DYNPTR | MEM_RDONLY,
        .arg4_type      = ARG_ANYTHING,
        .arg5_type      = ARG_ANYTHING,
 };
 
-BPF_CALL_5(bpf_dynptr_write, struct bpf_dynptr_kern *, dst, u32, offset, void *, src,
+BPF_CALL_5(bpf_dynptr_write, const struct bpf_dynptr_kern *, dst, u32, offset, void *, src,
           u32, len, u64, flags)
 {
        int err;
@@ -1523,7 +1527,11 @@ BPF_CALL_5(bpf_dynptr_write, struct bpf_dynptr_kern *, dst, u32, offset, void *,
        if (err)
                return err;
 
-       memcpy(dst->data + dst->offset + offset, src, len);
+       /* Source and destination may overlap, hence use memmove to
+        * copy the data. E.g. bpf_dynptr_from_mem may create two dynptrs
+        * pointing to overlapping PTR_TO_MAP_VALUE regions.
+        */
+       memmove(dst->data + dst->offset + offset, src, len);
 
        return 0;
 }
@@ -1532,14 +1540,14 @@ static const struct bpf_func_proto bpf_dynptr_write_proto = {
        .func           = bpf_dynptr_write,
        .gpl_only       = false,
        .ret_type       = RET_INTEGER,
-       .arg1_type      = ARG_PTR_TO_DYNPTR,
+       .arg1_type      = ARG_PTR_TO_DYNPTR | MEM_RDONLY,
        .arg2_type      = ARG_ANYTHING,
        .arg3_type      = ARG_PTR_TO_MEM | MEM_RDONLY,
        .arg4_type      = ARG_CONST_SIZE_OR_ZERO,
        .arg5_type      = ARG_ANYTHING,
 };
 
-BPF_CALL_3(bpf_dynptr_data, struct bpf_dynptr_kern *, ptr, u32, offset, u32, len)
+BPF_CALL_3(bpf_dynptr_data, const struct bpf_dynptr_kern *, ptr, u32, offset, u32, len)
 {
        int err;
 
@@ -1560,7 +1568,7 @@ static const struct bpf_func_proto bpf_dynptr_data_proto = {
        .func           = bpf_dynptr_data,
        .gpl_only       = false,
        .ret_type       = RET_PTR_TO_DYNPTR_MEM_OR_NULL,
-       .arg1_type      = ARG_PTR_TO_DYNPTR,
+       .arg1_type      = ARG_PTR_TO_DYNPTR | MEM_RDONLY,
        .arg2_type      = ARG_ANYTHING,
        .arg3_type      = ARG_CONST_ALLOC_SIZE_OR_ZERO,
 };
@@ -1833,8 +1841,59 @@ struct bpf_list_node *bpf_list_pop_back(struct bpf_list_head *head)
  */
 struct task_struct *bpf_task_acquire(struct task_struct *p)
 {
-       refcount_inc(&p->rcu_users);
-       return p;
+       return get_task_struct(p);
+}
+
+/**
+ * bpf_task_acquire_not_zero - Acquire a reference to an RCU-protected task
+ * object. A task acquired by this kfunc that is not stored in a map as a
+ * kptr must be released by calling bpf_task_release().
+ * @p: The task on which a reference is being acquired.
+ */
+struct task_struct *bpf_task_acquire_not_zero(struct task_struct *p)
+{
+       /* For the time being this function returns NULL, as it's not currently
+        * possible to safely acquire a reference to a task with RCU protection
+        * using get_task_struct() and put_task_struct(). This is due to the
+        * slightly odd mechanics of p->rcu_users, and how task RCU protection
+        * works.
+        *
+        * A struct task_struct is refcounted by two different refcount_t
+        * fields:
+        *
+        * 1. p->usage:     The "true" refcount field which tracks a task's
+        *                  lifetime. The task is freed as soon as this
+        *                  refcount drops to 0.
+        *
+        * 2. p->rcu_users: An "RCU users" refcount field which is statically
+        *                  initialized to 2, and is co-located in a union with
+        *                  a struct rcu_head field (p->rcu). p->rcu_users
+        *                  essentially encapsulates a single p->usage
+        *                  refcount, and when p->rcu_users goes to 0, an RCU
+        *                  callback is scheduled on the struct rcu_head which
+        *                  decrements the p->usage refcount.
+        *
+        * There are two important implications to this task refcounting logic
+        * described above. The first is that
+        * refcount_inc_not_zero(&p->rcu_users) cannot be used anywhere, as
+        * after the refcount goes to 0, the RCU callback being scheduled will
+        * cause the memory backing the refcount to again be nonzero due to the
+        * fields sharing a union. The other is that we can't rely on RCU to
+        * guarantee that a task is valid in a BPF program. This is because a
+        * task could have already transitioned to being in the TASK_DEAD
+        * state, had its rcu_users refcount go to 0, and its rcu callback
+        * invoked in which it drops its single p->usage reference. At this
+        * point the task will be freed as soon as the last p->usage reference
+        * goes to 0, without waiting for another RCU gp to elapse. The only
+        * way that a BPF program can guarantee that a task is valid in this
+        * scenario is to hold a p->usage refcount itself.
+        *
+        * Until we're able to resolve this issue, either by pulling
+        * p->rcu_users and p->rcu out of the union, or by getting rid of
+        * p->usage and just using p->rcu_users for refcounting, we'll just
+        * return NULL here.
+        */
+       return NULL;
 }
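The layout the comment above describes reduces to two counters whose storage overlaps. A simplified, non-authoritative sketch of the fields in question, per that description:

/* Not the real task_struct; just the shape the comment describes. */
struct task_refcounts_sketch {
	refcount_t usage;		/* true lifetime refcount */
	union {
		refcount_t rcu_users;	/* starts at 2; when it hits 0 an
					 * RCU callback drops one usage ref */
		struct rcu_head rcu;	/* reused for that callback, which is
					 * why inc_not_zero on rcu_users is
					 * unsafe */
	};
};
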
 
 /**
@@ -1845,33 +1904,15 @@ struct task_struct *bpf_task_acquire(struct task_struct *p)
  */
 struct task_struct *bpf_task_kptr_get(struct task_struct **pp)
 {
-       struct task_struct *p;
-
-       rcu_read_lock();
-       p = READ_ONCE(*pp);
-
-       /* Another context could remove the task from the map and release it at
-        * any time, including after we've done the lookup above. This is safe
-        * because we're in an RCU read region, so the task is guaranteed to
-        * remain valid until at least the rcu_read_unlock() below.
+       /* We must return NULL here until we have clarity on how to properly
+        * leverage RCU for ensuring a task's lifetime. See the comment above
+        * in bpf_task_acquire_not_zero() for more details.
         */
-       if (p && !refcount_inc_not_zero(&p->rcu_users))
-               /* If the task had been removed from the map and freed as
-                * described above, refcount_inc_not_zero() will return false.
-                * The task will be freed at some point after the current RCU
-                * gp has ended, so just return NULL to the user.
-                */
-               p = NULL;
-       rcu_read_unlock();
-
-       return p;
+       return NULL;
 }
 
 /**
- * bpf_task_release - Release the reference acquired on a struct task_struct *.
- * If this kfunc is invoked in an RCU read region, the task_struct is
- * guaranteed to not be freed until the current grace period has ended, even if
- * its refcount drops to 0.
+ * bpf_task_release - Release the reference acquired on a task.
  * @p: The task on which a reference is being released.
  */
 void bpf_task_release(struct task_struct *p)
@@ -1879,7 +1920,7 @@ void bpf_task_release(struct task_struct *p)
        if (!p)
                return;
 
-       put_task_struct_rcu_user(p);
+       put_task_struct(p);
 }
 
 #ifdef CONFIG_CGROUPS
@@ -1927,7 +1968,7 @@ struct cgroup *bpf_cgroup_kptr_get(struct cgroup **cgrpp)
 }
 
 /**
- * bpf_cgroup_release - Release the reference acquired on a struct cgroup *.
+ * bpf_cgroup_release - Release the reference acquired on a cgroup.
  * If this kfunc is invoked in an RCU read region, the cgroup is guaranteed to
  * not be freed until the current grace period has ended, even if its refcount
  * drops to 0.
@@ -2013,6 +2054,7 @@ BTF_ID_FLAGS(func, bpf_list_push_back)
 BTF_ID_FLAGS(func, bpf_list_pop_front, KF_ACQUIRE | KF_RET_NULL)
 BTF_ID_FLAGS(func, bpf_list_pop_back, KF_ACQUIRE | KF_RET_NULL)
 BTF_ID_FLAGS(func, bpf_task_acquire, KF_ACQUIRE | KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_task_acquire_not_zero, KF_ACQUIRE | KF_RCU | KF_RET_NULL)
 BTF_ID_FLAGS(func, bpf_task_kptr_get, KF_ACQUIRE | KF_KPTR_GET | KF_RET_NULL)
 BTF_ID_FLAGS(func, bpf_task_release, KF_RELEASE)
 #ifdef CONFIG_CGROUPS
index 8f0d65f..ebcc3dd 100644 (file)
@@ -171,9 +171,24 @@ static void alloc_bulk(struct bpf_mem_cache *c, int cnt, int node)
        memcg = get_memcg(c);
        old_memcg = set_active_memcg(memcg);
        for (i = 0; i < cnt; i++) {
-               obj = __alloc(c, node);
-               if (!obj)
-                       break;
+               /*
+                * free_by_rcu is only manipulated by irq work refill_work().
+                * IRQ work items on the same CPU run sequentially, so it is
+                * safe to use __llist_del_first() here. If alloc_bulk() is
+                * invoked by the initial prefill, there will be no running
+                * refill_work(), so __llist_del_first() is fine as well.
+                *
+                * In most cases, objects on free_by_rcu are from the same CPU.
+                * If some objects come from other CPUs, it doesn't incur any
+                * harm because NUMA_NO_NODE only expresses a preference for the
+                * current NUMA node; it is not a guarantee.
+                */
+               obj = __llist_del_first(&c->free_by_rcu);
+               if (!obj) {
+                       obj = __alloc(c, node);
+                       if (!obj)
+                               break;
+               }
                if (IS_ENABLED(CONFIG_PREEMPT_RT))
                        /* In RT irq_work runs in per-cpu kthread, so disable
                         * interrupts to avoid preemption and interrupts and
@@ -449,9 +464,17 @@ static void free_mem_alloc(struct bpf_mem_alloc *ma)
 {
        /* waiting_for_gp list was drained, but __free_rcu might
         * still execute. Wait for it now before we free the percpu caches.
+        *
+        * rcu_barrier_tasks_trace() doesn't imply synchronize_rcu_tasks_trace(),
+        * but rcu_barrier_tasks_trace() and rcu_barrier() below are only used
+        * to wait for the pending __free_rcu_tasks_trace() and __free_rcu(),
+        * so if call_rcu(head, __free_rcu) is skipped due to
+        * rcu_trace_implies_rcu_gp(), it will be OK to skip rcu_barrier() by
+        * using rcu_trace_implies_rcu_gp() as well.
         */
        rcu_barrier_tasks_trace();
-       rcu_barrier();
+       if (!rcu_trace_implies_rcu_gp())
+               rcu_barrier();
        free_mem_alloc_no_barrier(ma);
 }
 
index 4e7f1d0..a5255a0 100644 (file)
@@ -451,6 +451,11 @@ static bool reg_type_not_null(enum bpf_reg_type type)
                type == PTR_TO_SOCK_COMMON;
 }
 
+static bool type_is_ptr_alloc_obj(u32 type)
+{
+       return base_type(type) == PTR_TO_BTF_ID && type_flag(type) & MEM_ALLOC;
+}
+
 static struct btf_record *reg_btf_record(const struct bpf_reg_state *reg)
 {
        struct btf_record *rec = NULL;
@@ -458,7 +463,7 @@ static struct btf_record *reg_btf_record(const struct bpf_reg_state *reg)
 
        if (reg->type == PTR_TO_MAP_VALUE) {
                rec = reg->map_ptr->record;
-       } else if (reg->type == (PTR_TO_BTF_ID | MEM_ALLOC)) {
+       } else if (type_is_ptr_alloc_obj(reg->type)) {
                meta = btf_find_struct_meta(reg->btf, reg->btf_id);
                if (meta)
                        rec = meta->record;
@@ -587,7 +592,7 @@ static const char *reg_type_str(struct bpf_verifier_env *env,
                [PTR_TO_BUF]            = "buf",
                [PTR_TO_FUNC]           = "func",
                [PTR_TO_MAP_KEY]        = "map_key",
-               [PTR_TO_DYNPTR]         = "dynptr_ptr",
+               [CONST_PTR_TO_DYNPTR]   = "dynptr_ptr",
        };
 
        if (type & PTR_MAYBE_NULL) {
@@ -720,6 +725,28 @@ static bool dynptr_type_refcounted(enum bpf_dynptr_type type)
        return type == BPF_DYNPTR_TYPE_RINGBUF;
 }
 
+static void __mark_dynptr_reg(struct bpf_reg_state *reg,
+                             enum bpf_dynptr_type type,
+                             bool first_slot);
+
+static void __mark_reg_not_init(const struct bpf_verifier_env *env,
+                               struct bpf_reg_state *reg);
+
+static void mark_dynptr_stack_regs(struct bpf_reg_state *sreg1,
+                                  struct bpf_reg_state *sreg2,
+                                  enum bpf_dynptr_type type)
+{
+       __mark_dynptr_reg(sreg1, type, true);
+       __mark_dynptr_reg(sreg2, type, false);
+}
+
+static void mark_dynptr_cb_reg(struct bpf_reg_state *reg,
+                              enum bpf_dynptr_type type)
+{
+       __mark_dynptr_reg(reg, type, true);
+}
+
 static int mark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
                                   enum bpf_arg_type arg_type, int insn_idx)
 {
@@ -741,9 +768,8 @@ static int mark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_reg_
        if (type == BPF_DYNPTR_TYPE_INVALID)
                return -EINVAL;
 
-       state->stack[spi].spilled_ptr.dynptr.first_slot = true;
-       state->stack[spi].spilled_ptr.dynptr.type = type;
-       state->stack[spi - 1].spilled_ptr.dynptr.type = type;
+       mark_dynptr_stack_regs(&state->stack[spi].spilled_ptr,
+                              &state->stack[spi - 1].spilled_ptr, type);
 
        if (dynptr_type_refcounted(type)) {
                /* The id is used to track proper releasing */
@@ -751,8 +777,8 @@ static int mark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_reg_
                if (id < 0)
                        return id;
 
-               state->stack[spi].spilled_ptr.id = id;
-               state->stack[spi - 1].spilled_ptr.id = id;
+               state->stack[spi].spilled_ptr.ref_obj_id = id;
+               state->stack[spi - 1].spilled_ptr.ref_obj_id = id;
        }
 
        return 0;
@@ -774,25 +800,23 @@ static int unmark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_re
        }
 
        /* Invalidate any slices associated with this dynptr */
-       if (dynptr_type_refcounted(state->stack[spi].spilled_ptr.dynptr.type)) {
-               release_reference(env, state->stack[spi].spilled_ptr.id);
-               state->stack[spi].spilled_ptr.id = 0;
-               state->stack[spi - 1].spilled_ptr.id = 0;
-       }
-
-       state->stack[spi].spilled_ptr.dynptr.first_slot = false;
-       state->stack[spi].spilled_ptr.dynptr.type = 0;
-       state->stack[spi - 1].spilled_ptr.dynptr.type = 0;
+       if (dynptr_type_refcounted(state->stack[spi].spilled_ptr.dynptr.type))
+               WARN_ON_ONCE(release_reference(env, state->stack[spi].spilled_ptr.ref_obj_id));
 
+       __mark_reg_not_init(env, &state->stack[spi].spilled_ptr);
+       __mark_reg_not_init(env, &state->stack[spi - 1].spilled_ptr);
        return 0;
 }
 
 static bool is_dynptr_reg_valid_uninit(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
 {
        struct bpf_func_state *state = func(env, reg);
-       int spi = get_spi(reg->off);
-       int i;
+       int spi, i;
+
+       if (reg->type == CONST_PTR_TO_DYNPTR)
+               return false;
 
+       spi = get_spi(reg->off);
        if (!is_spi_bounds_valid(state, spi, BPF_DYNPTR_NR_SLOTS))
                return true;
 
@@ -805,13 +829,17 @@ static bool is_dynptr_reg_valid_uninit(struct bpf_verifier_env *env, struct bpf_
        return true;
 }
 
-bool is_dynptr_reg_valid_init(struct bpf_verifier_env *env,
-                             struct bpf_reg_state *reg)
+static bool is_dynptr_reg_valid_init(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
 {
        struct bpf_func_state *state = func(env, reg);
-       int spi = get_spi(reg->off);
+       int spi;
        int i;
 
+       /* This already represents first slot of initialized bpf_dynptr */
+       if (reg->type == CONST_PTR_TO_DYNPTR)
+               return true;
+
+       spi = get_spi(reg->off);
        if (!is_spi_bounds_valid(state, spi, BPF_DYNPTR_NR_SLOTS) ||
            !state->stack[spi].spilled_ptr.dynptr.first_slot)
                return false;
@@ -825,21 +853,24 @@ bool is_dynptr_reg_valid_init(struct bpf_verifier_env *env,
        return true;
 }
 
-bool is_dynptr_type_expected(struct bpf_verifier_env *env,
-                            struct bpf_reg_state *reg,
-                            enum bpf_arg_type arg_type)
+static bool is_dynptr_type_expected(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
+                                   enum bpf_arg_type arg_type)
 {
        struct bpf_func_state *state = func(env, reg);
        enum bpf_dynptr_type dynptr_type;
-       int spi = get_spi(reg->off);
+       int spi;
 
        /* ARG_PTR_TO_DYNPTR takes any type of dynptr */
        if (arg_type == ARG_PTR_TO_DYNPTR)
                return true;
 
        dynptr_type = arg_to_dynptr_type(arg_type);
-
-       return state->stack[spi].spilled_ptr.dynptr.type == dynptr_type;
+       if (reg->type == CONST_PTR_TO_DYNPTR) {
+               return reg->dynptr.type == dynptr_type;
+       } else {
+               spi = get_spi(reg->off);
+               return state->stack[spi].spilled_ptr.dynptr.type == dynptr_type;
+       }
 }
 
 /* The reg state of a pointer or a bounded scalar was saved when
@@ -1351,9 +1382,6 @@ static const int caller_saved[CALLER_SAVED_REGS] = {
        BPF_REG_0, BPF_REG_1, BPF_REG_2, BPF_REG_3, BPF_REG_4, BPF_REG_5
 };
 
-static void __mark_reg_not_init(const struct bpf_verifier_env *env,
-                               struct bpf_reg_state *reg);
-
 /* This helper doesn't clear reg->id */
 static void ___mark_reg_known(struct bpf_reg_state *reg, u64 imm)
 {
@@ -1416,6 +1444,19 @@ static void mark_reg_known_zero(struct bpf_verifier_env *env,
        __mark_reg_known_zero(regs + regno);
 }
 
+static void __mark_dynptr_reg(struct bpf_reg_state *reg, enum bpf_dynptr_type type,
+                             bool first_slot)
+{
+       /* reg->type has no meaning for STACK_DYNPTR, but when we set reg for
+        * callback arguments, it does need to be CONST_PTR_TO_DYNPTR, so simply
+        * set it unconditionally as it is ignored for STACK_DYNPTR anyway.
+        */
+       __mark_reg_known_zero(reg);
+       reg->type = CONST_PTR_TO_DYNPTR;
+       reg->dynptr.type = type;
+       reg->dynptr.first_slot = first_slot;
+}
+
 static void mark_ptr_not_null_reg(struct bpf_reg_state *reg)
 {
        if (base_type(reg->type) == PTR_TO_MAP_VALUE) {
@@ -2525,6 +2566,16 @@ static int check_reg_arg(struct bpf_verifier_env *env, u32 regno,
        return 0;
 }
 
+static void mark_jmp_point(struct bpf_verifier_env *env, int idx)
+{
+       env->insn_aux_data[idx].jmp_point = true;
+}
+
+static bool is_jmp_point(struct bpf_verifier_env *env, int insn_idx)
+{
+       return env->insn_aux_data[insn_idx].jmp_point;
+}
+
 /* for any branch, call, exit record the history of jmps in the given state */
 static int push_jmp_history(struct bpf_verifier_env *env,
                            struct bpf_verifier_state *cur)
@@ -2533,6 +2584,9 @@ static int push_jmp_history(struct bpf_verifier_env *env,
        struct bpf_idx_pair *p;
        size_t alloc_size;
 
+       if (!is_jmp_point(env, env->insn_idx))
+               return 0;
+
        cnt++;
        alloc_size = kmalloc_size_roundup(size_mul(cnt, sizeof(*p)));
        p = krealloc(cur->jmp_history, alloc_size, GFP_USER);
@@ -4275,7 +4329,7 @@ static bool is_trusted_reg(const struct bpf_reg_state *reg)
                return true;
 
        /* If a register is not referenced, it is trusted if it has the
-        * MEM_ALLOC, MEM_RCU or PTR_TRUSTED type modifiers, and no others. Some of the
+        * MEM_ALLOC or PTR_TRUSTED type modifiers, and no others. Some of the
         * other type modifiers may be safe, but we elect to take an opt-in
         * approach here as some (e.g. PTR_UNTRUSTED and PTR_MAYBE_NULL) are
         * not.
@@ -4287,6 +4341,11 @@ static bool is_trusted_reg(const struct bpf_reg_state *reg)
               !bpf_type_has_unsafe_modifiers(reg->type);
 }
 
+static bool is_rcu_reg(const struct bpf_reg_state *reg)
+{
+       return reg->type & MEM_RCU;
+}
+
 static int check_pkt_ptr_alignment(struct bpf_verifier_env *env,
                                   const struct bpf_reg_state *reg,
                                   int off, int size, bool strict)
@@ -4703,6 +4762,18 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env,
        u32 btf_id;
        int ret;
 
+       if (!env->allow_ptr_leaks) {
+               verbose(env,
+                       "'struct %s' access is allowed only to CAP_PERFMON and CAP_SYS_ADMIN\n",
+                       tname);
+               return -EPERM;
+       }
+       if (!env->prog->gpl_compatible && btf_is_kernel(reg->btf)) {
+               verbose(env,
+                       "Cannot access kernel 'struct %s' from non-GPL compatible program\n",
+                       tname);
+               return -EINVAL;
+       }
        if (off < 0) {
                verbose(env,
                        "R%d is ptr_%s invalid negative access: off=%d\n",
@@ -4773,14 +4844,16 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env,
 
        if (flag & MEM_RCU) {
                /* Mark value register as MEM_RCU only if it is protected by
-                * bpf_rcu_read_lock() and the ptr reg is trusted. MEM_RCU
+                * bpf_rcu_read_lock() and the ptr reg is rcu or trusted. MEM_RCU
                 * itself can already indicate trustedness inside the rcu
-                * read lock region. Also mark it as PTR_TRUSTED.
+                * read lock region. Also mark rcu pointer as PTR_MAYBE_NULL since
+                * it could be null in some cases.
                 */
-               if (!env->cur_state->active_rcu_lock || !is_trusted_reg(reg))
+               if (!env->cur_state->active_rcu_lock ||
+                   !(is_trusted_reg(reg) || is_rcu_reg(reg)))
                        flag &= ~MEM_RCU;
                else
-                       flag |= PTR_TRUSTED;
+                       flag |= PTR_MAYBE_NULL;
        } else if (reg->type & MEM_RCU) {
                /* ptr (reg) is marked as MEM_RCU, but the struct field is not tagged
                 * with __rcu. Mark the flag as PTR_UNTRUSTED conservatively.
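
For context, a minimal sketch of the BPF-side pattern this enables, modeled on the selftests for this series (the attach point, field, and kfunc declarations are assumptions, not part of this diff):

    #include "vmlinux.h"
    #include <bpf/bpf_helpers.h>
    #include <bpf/bpf_tracing.h>

    void bpf_rcu_read_lock(void) __ksym;
    void bpf_rcu_read_unlock(void) __ksym;

    int css_seen;

    SEC("?fentry.s/__x64_sys_getpgid")
    int BPF_PROG(rcu_read)
    {
            struct task_struct *task = bpf_get_current_task_btf();
            struct css_set *cgroups;

            bpf_rcu_read_lock();
            /* An __rcu-tagged load: the verifier marks the result
             * MEM_RCU | PTR_MAYBE_NULL, so it must be NULL-checked
             * before use, and it becomes untrusted once
             * bpf_rcu_read_unlock() runs.
             */
            cgroups = task->cgroups;
            if (cgroups)
                    css_seen = 1;
            bpf_rcu_read_unlock();
            return 0;
    }

    char _license[] SEC("license") = "GPL";
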
@@ -4823,9 +4896,9 @@ static int check_ptr_to_map_access(struct bpf_verifier_env *env,
        t = btf_type_by_id(btf_vmlinux, *map->ops->map_btf_id);
        tname = btf_name_by_offset(btf_vmlinux, t->name_off);
 
-       if (!env->allow_ptr_to_map_access) {
+       if (!env->allow_ptr_leaks) {
                verbose(env,
-                       "%s access is allowed only to CAP_PERFMON and CAP_SYS_ADMIN\n",
+                       "'struct %s' access is allowed only to CAP_PERFMON and CAP_SYS_ADMIN\n",
                        tname);
                return -EPERM;
        }
@@ -5726,7 +5799,7 @@ static int process_spin_lock(struct bpf_verifier_env *env, int regno,
                cur->active_lock.ptr = NULL;
                cur->active_lock.id = 0;
 
-               for (i = 0; i < fstate->acquired_refs; i++) {
+               for (i = fstate->acquired_refs - 1; i >= 0; i--) {
                        int err;
 
                        /* Complain on error because this reference state cannot
@@ -5822,6 +5895,119 @@ static int process_kptr_func(struct bpf_verifier_env *env, int regno,
        return 0;
 }
 
+/* There are two register types representing a bpf_dynptr, one is PTR_TO_STACK
+ * which points to a stack slot, and the other is CONST_PTR_TO_DYNPTR.
+ *
+ * In both cases we deal with the first 8 bytes, but need to mark the next 8
+ * bytes as STACK_DYNPTR in case of PTR_TO_STACK. In case of
+ * CONST_PTR_TO_DYNPTR, we are guaranteed to get the beginning of the object.
+ *
+ * Mutability of bpf_dynptr is at two levels, one is at the level of struct
+ * bpf_dynptr itself, i.e. whether the helper is receiving a pointer to struct
+ * bpf_dynptr or pointer to const struct bpf_dynptr. In the former case, it can
+ * mutate the view of the dynptr and also possibly destroy it. In the latter
+ * case, it cannot mutate the bpf_dynptr itself but it can still mutate the
+ * memory that dynptr points to.
+ *
+ * The verifier will keep track both levels of mutation (bpf_dynptr's in
+ * reg->type and the memory's in reg->dynptr.type), but there is no support for
+ * readonly dynptr view yet, hence only the first case is tracked and checked.
+ *
+ * This is consistent with how C applies the const modifier to a struct object,
+ * where the pointer itself inside bpf_dynptr becomes const but not what it
+ * points to.
+ *
+ * Helpers which do not mutate the bpf_dynptr set MEM_RDONLY in their argument
+ * type, and declare it as 'const struct bpf_dynptr *' in their prototype.
+ */
+int process_dynptr_func(struct bpf_verifier_env *env, int regno,
+                       enum bpf_arg_type arg_type, struct bpf_call_arg_meta *meta)
+{
+       struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
+
+       /* MEM_UNINIT and MEM_RDONLY are exclusive, when applied to an
+        * ARG_PTR_TO_DYNPTR (or ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_*):
+        */
+       if ((arg_type & (MEM_UNINIT | MEM_RDONLY)) == (MEM_UNINIT | MEM_RDONLY)) {
+               verbose(env, "verifier internal error: misconfigured dynptr helper type flags\n");
+               return -EFAULT;
+       }
+       /* CONST_PTR_TO_DYNPTR already has fixed and var_off as 0 due to
+        * check_func_arg_reg_off's logic. We only need to check offset
+        * alignment for PTR_TO_STACK.
+        */
+       if (reg->type == PTR_TO_STACK && (reg->off % BPF_REG_SIZE)) {
+               verbose(env, "cannot pass in dynptr at an offset=%d\n", reg->off);
+               return -EINVAL;
+       }
+       /*  MEM_UNINIT - Points to memory that is an appropriate candidate for
+        *               constructing a mutable bpf_dynptr object.
+        *
+        *               Currently, this is only possible with PTR_TO_STACK
+        *               pointing to a region of at least 16 bytes which doesn't
+        *               contain an existing bpf_dynptr.
+        *
+        *  MEM_RDONLY - Points to an initialized bpf_dynptr that will not be
+        *               mutated or destroyed. However, the memory it points to
+        *               may be mutated.
+        *
+        *  None       - Points to an initialized dynptr that can be mutated and
+        *               destroyed, including mutation of the memory it points
+        *               to.
+        */
+       if (arg_type & MEM_UNINIT) {
+               if (!is_dynptr_reg_valid_uninit(env, reg)) {
+                       verbose(env, "Dynptr has to be an uninitialized dynptr\n");
+                       return -EINVAL;
+               }
+
+               /* We only support one dynptr being uninitialized at the moment,
+                * which is sufficient for the helper functions we have right now.
+                */
+               if (meta->uninit_dynptr_regno) {
+                       verbose(env, "verifier internal error: multiple uninitialized dynptr args\n");
+                       return -EFAULT;
+               }
+
+               meta->uninit_dynptr_regno = regno;
+       } else /* MEM_RDONLY and None case from above */ {
+               /* For the reg->type == PTR_TO_STACK case, bpf_dynptr is never const */
+               if (reg->type == CONST_PTR_TO_DYNPTR && !(arg_type & MEM_RDONLY)) {
+                       verbose(env, "cannot pass pointer to const bpf_dynptr, the helper mutates it\n");
+                       return -EINVAL;
+               }
+
+               if (!is_dynptr_reg_valid_init(env, reg)) {
+                       verbose(env,
+                               "Expected an initialized dynptr as arg #%d\n",
+                               regno);
+                       return -EINVAL;
+               }
+
+               /* Fold modifiers (in this case, MEM_RDONLY) when checking expected type */
+               if (!is_dynptr_type_expected(env, reg, arg_type & ~MEM_RDONLY)) {
+                       const char *err_extra = "";
+
+                       switch (arg_type & DYNPTR_TYPE_FLAG_MASK) {
+                       case DYNPTR_TYPE_LOCAL:
+                               err_extra = "local";
+                               break;
+                       case DYNPTR_TYPE_RINGBUF:
+                               err_extra = "ringbuf";
+                               break;
+                       default:
+                               err_extra = "<unknown>";
+                               break;
+                       }
+                       verbose(env,
+                               "Expected a dynptr of type %s as arg #%d\n",
+                               err_extra, regno);
+                       return -EINVAL;
+               }
+       }
+       return 0;
+}
+
 static bool arg_type_is_mem_size(enum bpf_arg_type type)
 {
        return type == ARG_CONST_SIZE ||
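
To make the argument modes of process_dynptr_func() concrete, a hedged sketch of the stack-dynptr lifecycle these checks govern (map name, section, and sizes are illustrative):

    #include "vmlinux.h"
    #include <bpf/bpf_helpers.h>

    struct {
            __uint(type, BPF_MAP_TYPE_RINGBUF);
            __uint(max_entries, 4096);
    } rb SEC(".maps");

    SEC("?tp/syscalls/sys_enter_getpgid")
    int stack_dynptr(void *ctx)
    {
            struct bpf_dynptr ptr;  /* 16 bytes: two 8-byte stack slots */
            __u64 val = 42;

            /* &ptr is the MEM_UNINIT argument: the two slots must not
             * already hold a dynptr, and they become STACK_DYNPTR on
             * success.
             */
            if (bpf_ringbuf_reserve_dynptr(&rb, sizeof(val), 0, &ptr))
                    goto out;

            /* "None" case: an initialized dynptr the helper may mutate. */
            bpf_dynptr_write(&ptr, 0, &val, sizeof(val), 0);
    out:
            /* Release argument: unmark_stack_slots_dynptr() runs here. */
            bpf_ringbuf_submit_dynptr(&ptr, 0);
            return 0;
    }

    char _license[] SEC("license") = "GPL";
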
@@ -5945,7 +6131,7 @@ static const struct bpf_reg_types btf_ptr_types = {
        .types = {
                PTR_TO_BTF_ID,
                PTR_TO_BTF_ID | PTR_TRUSTED,
-               PTR_TO_BTF_ID | MEM_RCU | PTR_TRUSTED,
+               PTR_TO_BTF_ID | MEM_RCU,
        },
 };
 static const struct bpf_reg_types percpu_btf_ptr_types = {
@@ -5962,7 +6148,7 @@ static const struct bpf_reg_types kptr_types = { .types = { PTR_TO_MAP_VALUE } }
 static const struct bpf_reg_types dynptr_types = {
        .types = {
                PTR_TO_STACK,
-               PTR_TO_DYNPTR | DYNPTR_TYPE_LOCAL,
+               CONST_PTR_TO_DYNPTR,
        }
 };
 
@@ -6091,17 +6277,38 @@ int check_func_arg_reg_off(struct bpf_verifier_env *env,
                           const struct bpf_reg_state *reg, int regno,
                           enum bpf_arg_type arg_type)
 {
-       enum bpf_reg_type type = reg->type;
-       bool fixed_off_ok = false;
+       u32 type = reg->type;
 
-       switch ((u32)type) {
-       /* Pointer types where reg offset is explicitly allowed: */
-       case PTR_TO_STACK:
-               if (arg_type_is_dynptr(arg_type) && reg->off % BPF_REG_SIZE) {
-                       verbose(env, "cannot pass in dynptr at an offset\n");
+       /* When referenced register is passed to release function, its fixed
+        * offset must be 0.
+        *
+        * We will check that an arg_type_is_release() reg has a ref_obj_id
+        * when storing meta->release_regno.
+        */
+       if (arg_type_is_release(arg_type)) {
+               /* ARG_PTR_TO_DYNPTR with OBJ_RELEASE is a bit special, as it
+                * may not directly point to the object being released, but to
+                * dynptr pointing to such object, which might be at some offset
+                * on the stack. In that case, we simply fall back to the
+                * default handling.
+                */
+               if (arg_type_is_dynptr(arg_type) && type == PTR_TO_STACK)
+                       return 0;
+               /* Doing check_ptr_off_reg check for the offset will catch this
+                * because fixed_off_ok is false, but checking here allows us
+                * to give the user a better error message.
+                */
+               if (reg->off) {
+                       verbose(env, "R%d must have zero offset when passed to release func or trusted arg to kfunc\n",
+                               regno);
                        return -EINVAL;
                }
-               fallthrough;
+               return __check_ptr_off_reg(env, reg, regno, false);
+       }
+
+       switch (type) {
+       /* Pointer types where both fixed and variable offset is explicitly allowed: */
+       case PTR_TO_STACK:
        case PTR_TO_PACKET:
        case PTR_TO_PACKET_META:
        case PTR_TO_MAP_KEY:
@@ -6112,47 +6319,38 @@ int check_func_arg_reg_off(struct bpf_verifier_env *env,
        case PTR_TO_BUF:
        case PTR_TO_BUF | MEM_RDONLY:
        case SCALAR_VALUE:
-               /* Some of the argument types nevertheless require a
-                * zero register offset.
-                */
-               if (base_type(arg_type) != ARG_PTR_TO_RINGBUF_MEM)
-                       return 0;
-               break;
+               return 0;
        /* All the rest must be rejected, except PTR_TO_BTF_ID which allows
         * fixed offset.
         */
        case PTR_TO_BTF_ID:
        case PTR_TO_BTF_ID | MEM_ALLOC:
        case PTR_TO_BTF_ID | PTR_TRUSTED:
-       case PTR_TO_BTF_ID | MEM_RCU | PTR_TRUSTED:
+       case PTR_TO_BTF_ID | MEM_RCU:
        case PTR_TO_BTF_ID | MEM_ALLOC | PTR_TRUSTED:
                /* When referenced PTR_TO_BTF_ID is passed to release function,
-                * it's fixed offset must be 0. In the other cases, fixed offset
-                * can be non-zero.
+                * its fixed offset must be 0. In the other cases, fixed offset
+                * can be non-zero. This was already checked above. So pass
+                * fixed_off_ok as true to allow fixed offset for all other
+                * cases. var_off must always be 0 for PTR_TO_BTF_ID, hence we
+                * still need to do checks instead of returning.
                 */
-               if (arg_type_is_release(arg_type) && reg->off) {
-                       verbose(env, "R%d must have zero offset when passed to release func\n",
-                               regno);
-                       return -EINVAL;
-               }
-               /* For arg is release pointer, fixed_off_ok must be false, but
-                * we already checked and rejected reg->off != 0 above, so set
-                * to true to allow fixed offset for all other cases.
-                */
-               fixed_off_ok = true;
-               break;
+               return __check_ptr_off_reg(env, reg, regno, true);
        default:
-               break;
+               return __check_ptr_off_reg(env, reg, regno, false);
        }
-       return __check_ptr_off_reg(env, reg, regno, fixed_off_ok);
 }
 
-static u32 stack_slot_get_id(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
+static u32 dynptr_ref_obj_id(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
 {
        struct bpf_func_state *state = func(env, reg);
-       int spi = get_spi(reg->off);
+       int spi;
 
-       return state->stack[spi].spilled_ptr.id;
+       if (reg->type == CONST_PTR_TO_DYNPTR)
+               return reg->ref_obj_id;
+
+       spi = get_spi(reg->off);
+       return state->stack[spi].spilled_ptr.ref_obj_id;
 }
 
 static int check_func_arg(struct bpf_verifier_env *env, u32 arg,
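
As a hedged illustration of the release-argument check above, a program of this shape (ringbuf map `rb` and section name assumed) now fails with the new, earlier message:

    #include "vmlinux.h"
    #include <bpf/bpf_helpers.h>

    struct {
            __uint(type, BPF_MAP_TYPE_RINGBUF);
            __uint(max_entries, 4096);
    } rb SEC(".maps");

    SEC("?tp/syscalls/sys_enter_getpgid")
    int bad_release(void *ctx)
    {
            char *sample = bpf_ringbuf_reserve(&rb, 16, 0);

            if (!sample)
                    return 0;
            /* Non-zero fixed offset on a release argument is rejected
             * up front: "R1 must have zero offset when passed to
             * release func or trusted arg to kfunc".
             */
            bpf_ringbuf_submit(sample + 8, 0);
            return 0;
    }

    char _license[] SEC("license") = "GPL";
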
@@ -6217,11 +6415,22 @@ skip_type_check:
        if (arg_type_is_release(arg_type)) {
                if (arg_type_is_dynptr(arg_type)) {
                        struct bpf_func_state *state = func(env, reg);
-                       int spi = get_spi(reg->off);
+                       int spi;
 
-                       if (!is_spi_bounds_valid(state, spi, BPF_DYNPTR_NR_SLOTS) ||
-                           !state->stack[spi].spilled_ptr.id) {
-                               verbose(env, "arg %d is an unacquired reference\n", regno);
+                       /* Only dynptr created on stack can be released, thus
+                        * the get_spi and stack state checks for spilled_ptr
+                        * should only be done before process_dynptr_func for
+                        * PTR_TO_STACK.
+                        */
+                       if (reg->type == PTR_TO_STACK) {
+                               spi = get_spi(reg->off);
+                               if (!is_spi_bounds_valid(state, spi, BPF_DYNPTR_NR_SLOTS) ||
+                                   !state->stack[spi].spilled_ptr.ref_obj_id) {
+                                       verbose(env, "arg %d is an unacquired reference\n", regno);
+                                       return -EINVAL;
+                               }
+                       } else {
+                               verbose(env, "cannot release unowned const bpf_dynptr\n");
                                return -EINVAL;
                        }
                } else if (!reg->ref_obj_id && !register_is_null(reg)) {
@@ -6318,19 +6527,22 @@ skip_type_check:
                break;
        case ARG_PTR_TO_SPIN_LOCK:
                if (meta->func_id == BPF_FUNC_spin_lock) {
-                       if (process_spin_lock(env, regno, true))
-                               return -EACCES;
+                       err = process_spin_lock(env, regno, true);
+                       if (err)
+                               return err;
                } else if (meta->func_id == BPF_FUNC_spin_unlock) {
-                       if (process_spin_lock(env, regno, false))
-                               return -EACCES;
+                       err = process_spin_lock(env, regno, false);
+                       if (err)
+                               return err;
                } else {
                        verbose(env, "verifier internal error\n");
                        return -EFAULT;
                }
                break;
        case ARG_PTR_TO_TIMER:
-               if (process_timer_func(env, regno, meta))
-                       return -EACCES;
+               err = process_timer_func(env, regno, meta);
+               if (err)
+                       return err;
                break;
        case ARG_PTR_TO_FUNC:
                meta->subprogno = reg->subprogno;
@@ -6353,52 +6565,9 @@ skip_type_check:
                err = check_mem_size_reg(env, reg, regno, true, meta);
                break;
        case ARG_PTR_TO_DYNPTR:
-               /* We only need to check for initialized / uninitialized helper
-                * dynptr args if the dynptr is not PTR_TO_DYNPTR, as the
-                * assumption is that if it is, that a helper function
-                * initialized the dynptr on behalf of the BPF program.
-                */
-               if (base_type(reg->type) == PTR_TO_DYNPTR)
-                       break;
-               if (arg_type & MEM_UNINIT) {
-                       if (!is_dynptr_reg_valid_uninit(env, reg)) {
-                               verbose(env, "Dynptr has to be an uninitialized dynptr\n");
-                               return -EINVAL;
-                       }
-
-                       /* We only support one dynptr being uninitialized at the moment,
-                        * which is sufficient for the helper functions we have right now.
-                        */
-                       if (meta->uninit_dynptr_regno) {
-                               verbose(env, "verifier internal error: multiple uninitialized dynptr args\n");
-                               return -EFAULT;
-                       }
-
-                       meta->uninit_dynptr_regno = regno;
-               } else if (!is_dynptr_reg_valid_init(env, reg)) {
-                       verbose(env,
-                               "Expected an initialized dynptr as arg #%d\n",
-                               arg + 1);
-                       return -EINVAL;
-               } else if (!is_dynptr_type_expected(env, reg, arg_type)) {
-                       const char *err_extra = "";
-
-                       switch (arg_type & DYNPTR_TYPE_FLAG_MASK) {
-                       case DYNPTR_TYPE_LOCAL:
-                               err_extra = "local";
-                               break;
-                       case DYNPTR_TYPE_RINGBUF:
-                               err_extra = "ringbuf";
-                               break;
-                       default:
-                               err_extra = "<unknown>";
-                               break;
-                       }
-                       verbose(env,
-                               "Expected a dynptr of type %s as arg #%d\n",
-                               err_extra, arg + 1);
-                       return -EINVAL;
-               }
+               err = process_dynptr_func(env, regno, arg_type, meta);
+               if (err)
+                       return err;
                break;
        case ARG_CONST_ALLOC_SIZE_OR_ZERO:
                if (!tnum_is_const(reg->var_off)) {
@@ -6465,8 +6634,9 @@ skip_type_check:
                break;
        }
        case ARG_PTR_TO_KPTR:
-               if (process_kptr_func(env, regno, meta))
-                       return -EACCES;
+               err = process_kptr_func(env, regno, meta);
+               if (err)
+                       return err;
                break;
        }
 
@@ -7234,11 +7404,10 @@ static int set_user_ringbuf_callback_state(struct bpf_verifier_env *env,
 {
        /* bpf_user_ringbuf_drain(struct bpf_map *map, void *callback_fn, void
         *                        callback_ctx, u64 flags);
-        * callback_fn(struct bpf_dynptr_t* dynptr, void *callback_ctx);
+        * callback_fn(const struct bpf_dynptr_t* dynptr, void *callback_ctx);
         */
        __mark_reg_not_init(env, &callee->regs[BPF_REG_0]);
-       callee->regs[BPF_REG_1].type = PTR_TO_DYNPTR | DYNPTR_TYPE_LOCAL;
-       __mark_reg_known_zero(&callee->regs[BPF_REG_1]);
+       mark_dynptr_cb_reg(&callee->regs[BPF_REG_1], BPF_DYNPTR_TYPE_LOCAL);
        callee->regs[BPF_REG_2] = caller->regs[BPF_REG_3];
 
        /* unused */
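
On the callback side, a sketch of what mark_dynptr_cb_reg() now types for bpf_user_ringbuf_drain() callbacks (map and section names are assumptions):

    #include "vmlinux.h"
    #include <bpf/bpf_helpers.h>

    struct {
            __uint(type, BPF_MAP_TYPE_USER_RINGBUF);
            __uint(max_entries, 4096);
    } user_rb SEC(".maps");

    static long drain_cb(struct bpf_dynptr *dynptr, void *ctx)
    {
            __u32 val;

            /* R1 here is CONST_PTR_TO_DYNPTR: reading through it is
             * fine, but helpers that mutate or release the dynptr
             * itself are rejected.
             */
            bpf_dynptr_read(&val, sizeof(val), dynptr, 0, 0);
            return 0;
    }

    SEC("?syscall")
    int drain(void *ctx)
    {
            bpf_user_ringbuf_drain(&user_rb, drain_cb, NULL, 0);
            return 0;
    }

    char _license[] SEC("license") = "GPL";
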
@@ -7632,7 +7801,15 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
 
        regs = cur_regs(env);
 
+       /* This can only be set for PTR_TO_STACK, as CONST_PTR_TO_DYNPTR cannot
+        * be reinitialized by any dynptr helper. Hence, mark_stack_slots_dynptr
+        * is safe to do directly.
+        */
        if (meta.uninit_dynptr_regno) {
+               if (regs[meta.uninit_dynptr_regno].type == CONST_PTR_TO_DYNPTR) {
+                       verbose(env, "verifier internal error: CONST_PTR_TO_DYNPTR cannot be initialized\n");
+                       return -EFAULT;
+               }
                /* we write BPF_DW bits (8 bytes) at a time */
                for (i = 0; i < BPF_DYNPTR_SIZE; i += 8) {
                        err = check_mem_access(env, insn_idx, meta.uninit_dynptr_regno,
@@ -7650,15 +7827,24 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
 
        if (meta.release_regno) {
                err = -EINVAL;
-               if (arg_type_is_dynptr(fn->arg_type[meta.release_regno - BPF_REG_1]))
+               /* This can only be set for PTR_TO_STACK, as CONST_PTR_TO_DYNPTR cannot
+                * be released by any dynptr helper. Hence, unmark_stack_slots_dynptr
+                * is safe to do directly.
+                */
+               if (arg_type_is_dynptr(fn->arg_type[meta.release_regno - BPF_REG_1])) {
+                       if (regs[meta.release_regno].type == CONST_PTR_TO_DYNPTR) {
+                               verbose(env, "verifier internal error: CONST_PTR_TO_DYNPTR cannot be released\n");
+                               return -EFAULT;
+                       }
                        err = unmark_stack_slots_dynptr(env, &regs[meta.release_regno]);
-               else if (meta.ref_obj_id)
+               } else if (meta.ref_obj_id) {
                        err = release_reference(env, meta.ref_obj_id);
-               /* meta.ref_obj_id can only be 0 if register that is meant to be
-                * released is NULL, which must be > R0.
-                */
-               else if (register_is_null(&regs[meta.release_regno]))
+               } else if (register_is_null(&regs[meta.release_regno])) {
+                       /* meta.ref_obj_id can only be 0 if register that is meant to be
+                        * released is NULL, which must be > R0.
+                        */
                        err = 0;
+               }
                if (err) {
                        verbose(env, "func %s#%d reference has not been acquired before\n",
                                func_id_name(func_id), func_id);
@@ -7732,11 +7918,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
                                        return -EFAULT;
                                }
 
-                               if (base_type(reg->type) != PTR_TO_DYNPTR)
-                                       /* Find the id of the dynptr we're
-                                        * tracking the reference of
-                                        */
-                                       meta.ref_obj_id = stack_slot_get_id(env, reg);
+                               meta.ref_obj_id = dynptr_ref_obj_id(env, reg);
                                break;
                        }
                }
@@ -8026,6 +8208,11 @@ static bool is_kfunc_destructive(struct bpf_kfunc_call_arg_meta *meta)
        return meta->kfunc_flags & KF_DESTRUCTIVE;
 }
 
+static bool is_kfunc_rcu(struct bpf_kfunc_call_arg_meta *meta)
+{
+       return meta->kfunc_flags & KF_RCU;
+}
+
 static bool is_kfunc_arg_kptr_get(struct bpf_kfunc_call_arg_meta *meta, int arg)
 {
        return arg == 0 && (meta->kfunc_flags & KF_KPTR_GET);
@@ -8710,13 +8897,20 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
                switch (kf_arg_type) {
                case KF_ARG_PTR_TO_ALLOC_BTF_ID:
                case KF_ARG_PTR_TO_BTF_ID:
-                       if (!is_kfunc_trusted_args(meta))
+                       if (!is_kfunc_trusted_args(meta) && !is_kfunc_rcu(meta))
                                break;
 
                        if (!is_trusted_reg(reg)) {
-                               verbose(env, "R%d must be referenced or trusted\n", regno);
-                               return -EINVAL;
+                               if (!is_kfunc_rcu(meta)) {
+                                       verbose(env, "R%d must be referenced or trusted\n", regno);
+                                       return -EINVAL;
+                               }
+                               if (!is_rcu_reg(reg)) {
+                               verbose(env, "R%d must be an rcu pointer\n", regno);
+                                       return -EINVAL;
+                               }
                        }
+
                        fallthrough;
                case KF_ARG_PTR_TO_CTX:
                        /* Trusted arguments have the same offset checks as release arguments */
@@ -8780,22 +8974,15 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
                                return ret;
                        break;
                case KF_ARG_PTR_TO_DYNPTR:
-                       if (reg->type != PTR_TO_STACK) {
-                               verbose(env, "arg#%d expected pointer to stack\n", i);
+                       if (reg->type != PTR_TO_STACK &&
+                           reg->type != CONST_PTR_TO_DYNPTR) {
+                               verbose(env, "arg#%d expected pointer to stack or dynptr_ptr\n", i);
                                return -EINVAL;
                        }
 
-                       if (!is_dynptr_reg_valid_init(env, reg)) {
-                               verbose(env, "arg#%d pointer type %s %s must be valid and initialized\n",
-                                       i, btf_type_str(ref_t), ref_tname);
-                               return -EINVAL;
-                       }
-
-                       if (!is_dynptr_type_expected(env, reg, ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_LOCAL)) {
-                               verbose(env, "arg#%d pointer type %s %s points to unsupported dynamic pointer type\n",
-                                       i, btf_type_str(ref_t), ref_tname);
-                               return -EINVAL;
-                       }
+                       ret = process_dynptr_func(env, regno, ARG_PTR_TO_DYNPTR | MEM_RDONLY, NULL);
+                       if (ret < 0)
+                               return ret;
                        break;
                case KF_ARG_PTR_TO_LIST_HEAD:
                        if (reg->type != PTR_TO_MAP_VALUE &&
@@ -8827,7 +9014,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
                case KF_ARG_PTR_TO_BTF_ID:
                        /* Only base_type is checked, further checks are done here */
                        if ((base_type(reg->type) != PTR_TO_BTF_ID ||
-                            bpf_type_has_unsafe_modifiers(reg->type)) &&
+                            (bpf_type_has_unsafe_modifiers(reg->type) && !is_rcu_reg(reg))) &&
                            !reg2btf_ids[base_type(reg->type)]) {
                                verbose(env, "arg#%d is %s ", i, reg_type_str(env, reg->type));
                                verbose(env, "expected %s or socket\n",
@@ -8942,7 +9129,7 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
                } else if (rcu_unlock) {
                        bpf_for_each_reg_in_vstate(env->cur_state, state, reg, ({
                                if (reg->type & MEM_RCU) {
-                                       reg->type &= ~(MEM_RCU | PTR_TRUSTED);
+                                       reg->type &= ~(MEM_RCU | PTR_MAYBE_NULL);
                                        reg->type |= PTR_UNTRUSTED;
                                }
                        }));
@@ -11282,7 +11469,7 @@ static void mark_ptr_or_null_reg(struct bpf_func_state *state,
                                 bool is_null)
 {
        if (type_may_be_null(reg->type) && reg->id == id &&
-           !WARN_ON_ONCE(!reg->id)) {
+           (is_rcu_reg(reg) || !WARN_ON_ONCE(!reg->id))) {
                /* Old offset (both fixed and variable parts) should have been
                 * known-zero, because we don't allow pointer arithmetic on
                 * pointers that might be NULL. If we see this happening, don't
@@ -12104,11 +12291,16 @@ static struct bpf_verifier_state_list **explored_state(
        return &env->explored_states[(idx ^ state->callsite) % state_htab_size(env)];
 }
 
-static void init_explored_state(struct bpf_verifier_env *env, int idx)
+static void mark_prune_point(struct bpf_verifier_env *env, int idx)
 {
        env->insn_aux_data[idx].prune_point = true;
 }
 
+static bool is_prune_point(struct bpf_verifier_env *env, int insn_idx)
+{
+       return env->insn_aux_data[insn_idx].prune_point;
+}
+
 enum {
        DONE_EXPLORING = 0,
        KEEP_EXPLORING = 1,
@@ -12137,9 +12329,11 @@ static int push_insn(int t, int w, int e, struct bpf_verifier_env *env,
                return -EINVAL;
        }
 
-       if (e == BRANCH)
+       if (e == BRANCH) {
                /* mark branch target for state pruning */
-               init_explored_state(env, w);
+               mark_prune_point(env, w);
+               mark_jmp_point(env, w);
+       }
 
        if (insn_state[w] == 0) {
                /* tree-edge */
@@ -12166,8 +12360,7 @@ static int push_insn(int t, int w, int e, struct bpf_verifier_env *env,
        return DONE_EXPLORING;
 }
 
-static int visit_func_call_insn(int t, int insn_cnt,
-                               struct bpf_insn *insns,
+static int visit_func_call_insn(int t, struct bpf_insn *insns,
                                struct bpf_verifier_env *env,
                                bool visit_callee)
 {
@@ -12177,10 +12370,12 @@ static int visit_func_call_insn(int t, int insn_cnt,
        if (ret)
                return ret;
 
-       if (t + 1 < insn_cnt)
-               init_explored_state(env, t + 1);
+       mark_prune_point(env, t + 1);
+       /* when we exit from subprog, we need to record non-linear history */
+       mark_jmp_point(env, t + 1);
+
        if (visit_callee) {
-               init_explored_state(env, t);
+               mark_prune_point(env, t);
                ret = push_insn(t, t + insns[t].imm + 1, BRANCH, env,
                                /* It's ok to allow recursion from CFG point of
                                 * view. __check_func_call() will do the actual
@@ -12196,13 +12391,13 @@ static int visit_func_call_insn(int t, int insn_cnt,
  *  DONE_EXPLORING - the instruction was fully explored
  *  KEEP_EXPLORING - there is still work to be done before it is fully explored
  */
-static int visit_insn(int t, int insn_cnt, struct bpf_verifier_env *env)
+static int visit_insn(int t, struct bpf_verifier_env *env)
 {
        struct bpf_insn *insns = env->prog->insnsi;
        int ret;
 
        if (bpf_pseudo_func(insns + t))
-               return visit_func_call_insn(t, insn_cnt, insns, env, true);
+               return visit_func_call_insn(t, insns, env, true);
 
        /* All non-branch instructions have a single fall-through edge. */
        if (BPF_CLASS(insns[t].code) != BPF_JMP &&
@@ -12215,13 +12410,13 @@ static int visit_insn(int t, int insn_cnt, struct bpf_verifier_env *env)
 
        case BPF_CALL:
                if (insns[t].imm == BPF_FUNC_timer_set_callback)
-                       /* Mark this call insn to trigger is_state_visited() check
-                        * before call itself is processed by __check_func_call().
-                        * Otherwise new async state will be pushed for further
-                        * exploration.
+                       /* Mark this call insn as a prune point to trigger
+                        * is_state_visited() check before call itself is
+                        * processed by __check_func_call(). Otherwise new
+                        * async state will be pushed for further exploration.
                         */
-                       init_explored_state(env, t);
-               return visit_func_call_insn(t, insn_cnt, insns, env,
+                       mark_prune_point(env, t);
+               return visit_func_call_insn(t, insns, env,
                                            insns[t].src_reg == BPF_PSEUDO_CALL);
 
        case BPF_JA:
@@ -12234,22 +12429,15 @@ static int visit_insn(int t, int insn_cnt, struct bpf_verifier_env *env)
                if (ret)
                        return ret;
 
-               /* unconditional jmp is not a good pruning point,
-                * but it's marked, since backtracking needs
-                * to record jmp history in is_state_visited().
-                */
-               init_explored_state(env, t + insns[t].off + 1);
-               /* tell verifier to check for equivalent states
-                * after every call and jump
-                */
-               if (t + 1 < insn_cnt)
-                       init_explored_state(env, t + 1);
+               mark_prune_point(env, t + insns[t].off + 1);
+               mark_jmp_point(env, t + insns[t].off + 1);
 
                return ret;
 
        default:
                /* conditional jump with two edges */
-               init_explored_state(env, t);
+               mark_prune_point(env, t);
+
                ret = push_insn(t, t + 1, FALLTHROUGH, env, true);
                if (ret)
                        return ret;
@@ -12285,7 +12473,7 @@ static int check_cfg(struct bpf_verifier_env *env)
        while (env->cfg.cur_stack > 0) {
                int t = insn_stack[env->cfg.cur_stack - 1];
 
-               ret = visit_insn(t, insn_cnt, env);
+               ret = visit_insn(t, env);
                switch (ret) {
                case DONE_EXPLORING:
                        insn_state[t] = EXPLORED;
@@ -12876,15 +13064,6 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold,
 
        equal = memcmp(rold, rcur, offsetof(struct bpf_reg_state, parent)) == 0;
 
-       if (rold->type == PTR_TO_STACK)
-               /* two stack pointers are equal only if they're pointing to
-                * the same stack frame, since fp-8 in foo != fp-8 in bar
-                */
-               return equal && rold->frameno == rcur->frameno;
-
-       if (equal)
-               return true;
-
        if (rold->type == NOT_INIT)
                /* explored state can't have used this */
                return true;
@@ -12892,6 +13071,8 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold,
                return false;
        switch (base_type(rold->type)) {
        case SCALAR_VALUE:
+               if (equal)
+                       return true;
                if (env->explore_alu_limits)
                        return false;
                if (rcur->type == SCALAR_VALUE) {
@@ -12938,7 +13119,8 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold,
                 */
                return memcmp(rold, rcur, offsetof(struct bpf_reg_state, id)) == 0 &&
                       range_within(rold, rcur) &&
-                      tnum_in(rold->var_off, rcur->var_off);
+                      tnum_in(rold->var_off, rcur->var_off) &&
+                      check_ids(rold->id, rcur->id, idmap);
        case PTR_TO_PACKET_META:
        case PTR_TO_PACKET:
                if (rcur->type != rold->type)
@@ -12962,20 +13144,14 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold,
                /* new val must satisfy old val knowledge */
                return range_within(rold, rcur) &&
                       tnum_in(rold->var_off, rcur->var_off);
-       case PTR_TO_CTX:
-       case CONST_PTR_TO_MAP:
-       case PTR_TO_PACKET_END:
-       case PTR_TO_FLOW_KEYS:
-       case PTR_TO_SOCKET:
-       case PTR_TO_SOCK_COMMON:
-       case PTR_TO_TCP_SOCK:
-       case PTR_TO_XDP_SOCK:
-               /* Only valid matches are exact, which memcmp() above
-                * would have accepted
+       case PTR_TO_STACK:
+               /* two stack pointers are equal only if they're pointing to
+                * the same stack frame, since fp-8 in foo != fp-8 in bar
                 */
+               return equal && rold->frameno == rcur->frameno;
        default:
-               /* Don't know what's going on, just say it's not safe */
-               return false;
+               /* Only valid matches are exact, which memcmp() above would have accepted */
+               return equal;
        }
 
        /* Shouldn't get here; if we do, say it's not safe */
@@ -13085,7 +13261,6 @@ static bool func_states_equal(struct bpf_verifier_env *env, struct bpf_func_stat
 {
        int i;
 
-       memset(env->idmap_scratch, 0, sizeof(env->idmap_scratch));
        for (i = 0; i < MAX_BPF_REG; i++)
                if (!regsafe(env, &old->regs[i], &cur->regs[i],
                             env->idmap_scratch))
@@ -13109,14 +13284,25 @@ static bool states_equal(struct bpf_verifier_env *env,
        if (old->curframe != cur->curframe)
                return false;
 
+       memset(env->idmap_scratch, 0, sizeof(env->idmap_scratch));
+
        /* Verification state from speculative execution simulation
         * must never prune a non-speculative execution one.
         */
        if (old->speculative && !cur->speculative)
                return false;
 
-       if (old->active_lock.ptr != cur->active_lock.ptr ||
-           old->active_lock.id != cur->active_lock.id)
+       if (old->active_lock.ptr != cur->active_lock.ptr)
+               return false;
+
+       /* Old and cur active_lock's have to be either both present
+        * or both absent.
+        */
+       if (!!old->active_lock.id != !!cur->active_lock.id)
+               return false;
+
+       if (old->active_lock.id &&
+           !check_ids(old->active_lock.id, cur->active_lock.id, env->idmap_scratch))
                return false;
 
        if (old->active_rcu_lock != cur->active_rcu_lock)
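
For reference, check_ids() itself is unchanged by this patch; paraphrased, it enforces a consistent old-to-cur id mapping through the (now freshly zeroed) idmap_scratch, which is what both the active_lock.id check above and the PTR_TO_MAP_VALUE case in regsafe() rely on:

    /* Paraphrase of kernel/bpf/verifier.c:check_ids(): succeed only if
     * old_id maps to cur_id consistently across every comparison made
     * during one states_equal() invocation.
     */
    static bool check_ids(u32 old_id, u32 cur_id, struct bpf_id_pair *idmap)
    {
            unsigned int i;

            for (i = 0; i < BPF_ID_MAP_SIZE; i++) {
                    if (!idmap[i].old) {
                            /* First sighting: record the mapping. */
                            idmap[i].old = old_id;
                            idmap[i].cur = cur_id;
                            return true;
                    }
                    if (idmap[i].old == old_id)
                            return idmap[i].cur == cur_id;
            }
            /* Out of slots: conservatively report a mismatch. */
            return false;
    }
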
@@ -13283,13 +13469,6 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
        int i, j, err, states_cnt = 0;
        bool add_new_state = env->test_state_freq ? true : false;
 
-       cur->last_insn_idx = env->prev_insn_idx;
-       if (!env->insn_aux_data[insn_idx].prune_point)
-               /* this 'insn_idx' instruction wasn't marked, so we will not
-                * be doing state search here
-                */
-               return 0;
-
        /* bpf progs typically have pruning point every 4 instructions
         * http://vger.kernel.org/bpfconf2019.html#session-1
         * Do not add new state for future pruning if the verifier hasn't seen
@@ -13424,10 +13603,10 @@ next:
                env->max_states_per_insn = states_cnt;
 
        if (!env->bpf_capable && states_cnt > BPF_COMPLEXITY_LIMIT_STATES)
-               return push_jmp_history(env, cur);
+               return 0;
 
        if (!add_new_state)
-               return push_jmp_history(env, cur);
+               return 0;
 
        /* There were no equivalent states, remember the current one.
         * Technically the current state is not proven to be safe yet,
@@ -13567,21 +13746,31 @@ static int do_check(struct bpf_verifier_env *env)
                        return -E2BIG;
                }
 
-               err = is_state_visited(env, env->insn_idx);
-               if (err < 0)
-                       return err;
-               if (err == 1) {
-                       /* found equivalent state, can prune the search */
-                       if (env->log.level & BPF_LOG_LEVEL) {
-                               if (do_print_state)
-                                       verbose(env, "\nfrom %d to %d%s: safe\n",
-                                               env->prev_insn_idx, env->insn_idx,
-                                               env->cur_state->speculative ?
-                                               " (speculative execution)" : "");
-                               else
-                                       verbose(env, "%d: safe\n", env->insn_idx);
+               state->last_insn_idx = env->prev_insn_idx;
+
+               if (is_prune_point(env, env->insn_idx)) {
+                       err = is_state_visited(env, env->insn_idx);
+                       if (err < 0)
+                               return err;
+                       if (err == 1) {
+                               /* found equivalent state, can prune the search */
+                               if (env->log.level & BPF_LOG_LEVEL) {
+                                       if (do_print_state)
+                                               verbose(env, "\nfrom %d to %d%s: safe\n",
+                                                       env->prev_insn_idx, env->insn_idx,
+                                                       env->cur_state->speculative ?
+                                                       " (speculative execution)" : "");
+                                       else
+                                               verbose(env, "%d: safe\n", env->insn_idx);
+                               }
+                               goto process_bpf_exit;
                        }
-                       goto process_bpf_exit;
+               }
+
+               if (is_jmp_point(env, env->insn_idx)) {
+                       err = push_jmp_history(env, state);
+                       if (err)
+                               return err;
                }
 
                if (signal_pending(current))
@@ -14123,10 +14312,11 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env,
                case BPF_MAP_TYPE_INODE_STORAGE:
                case BPF_MAP_TYPE_SK_STORAGE:
                case BPF_MAP_TYPE_TASK_STORAGE:
+               case BPF_MAP_TYPE_CGRP_STORAGE:
                        break;
                default:
                        verbose(env,
-                               "Sleepable programs can only use array, hash, and ringbuf maps\n");
+                               "Sleepable programs can only use array, hash, ringbuf and local storage maps\n");
                        return -EINVAL;
                }
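
A hedged sketch of what this newly permits, modeled on the cgrp_ls_sleepable selftest (map and program names are illustrative):

    #include "vmlinux.h"
    #include <bpf/bpf_helpers.h>

    struct {
            __uint(type, BPF_MAP_TYPE_CGRP_STORAGE);
            __uint(map_flags, BPF_F_NO_PREALLOC);
            __type(key, int);
            __type(value, long);
    } cgrp_data SEC(".maps");

    SEC("iter.s/cgroup")    /* sleepable: now allowed with CGRP_STORAGE */
    int cgroup_iter(struct bpf_iter__cgroup *ctx)
    {
            struct cgroup *cgrp = ctx->cgroup;
            long *val;

            if (!cgrp)
                    return 0;
            val = bpf_cgrp_storage_get(&cgrp_data, cgrp, 0,
                                       BPF_LOCAL_STORAGE_GET_F_CREATE);
            if (val)
                    (*val)++;
            return 0;
    }

    char _license[] SEC("license") = "GPL";
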
 
@@ -14782,6 +14972,10 @@ static int opt_subreg_zext_lo32_rnd_hi32(struct bpf_verifier_env *env,
                if (!bpf_jit_needs_zext() && !is_cmpxchg_insn(&insn))
                        continue;
 
+               /* Zero-extension is done by the caller. */
+               if (bpf_pseudo_kfunc_call(&insn))
+                       continue;
+
                if (WARN_ON(load_reg == -1)) {
                        verbose(env, "verifier bug. zext_dst is set, but no reg is defined\n");
                        return -EFAULT;
@@ -15292,7 +15486,7 @@ static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
        }
 
        /* insn->imm has the btf func_id. Replace it with
-        * an address (relative to __bpf_base_call).
+        * an address (relative to __bpf_call_base).
         */
        desc = find_kfunc_desc(env->prog, insn->imm, insn->off);
        if (!desc) {
@@ -16464,12 +16658,22 @@ int bpf_check_attach_target(struct bpf_verifier_log *log,
                        ret = -EINVAL;
                        switch (prog->type) {
                        case BPF_PROG_TYPE_TRACING:
-                               /* fentry/fexit/fmod_ret progs can be sleepable only if they are
+
+                               /* fentry/fexit/fmod_ret progs can be sleepable if they are
                                 * attached to ALLOW_ERROR_INJECTION and are not in denylist.
                                 */
                                if (!check_non_sleepable_error_inject(btf_id) &&
                                    within_error_injection_list(addr))
                                        ret = 0;
+                               /* fentry/fexit/fmod_ret progs can also be sleepable if they are
+                                * in the fmodret id set with the KF_SLEEPABLE flag.
+                                */
+                               else {
+                                       u32 *flags = btf_kfunc_is_modify_return(btf, btf_id);
+
+                                       if (flags && (*flags & KF_SLEEPABLE))
+                                               ret = 0;
+                               }
                                break;
                        case BPF_PROG_TYPE_LSM:
                                /* LSM progs check that they are attached to bpf_lsm_*() funcs.
@@ -16490,7 +16694,10 @@ int bpf_check_attach_target(struct bpf_verifier_log *log,
                                bpf_log(log, "can't modify return codes of BPF programs\n");
                                return -EINVAL;
                        }
-                       ret = check_attach_modify_return(addr, tname);
+                       ret = -EINVAL;
+                       if (btf_kfunc_is_modify_return(btf, btf_id) ||
+                           !check_attach_modify_return(addr, tname))
+                               ret = 0;
                        if (ret) {
                                bpf_log(log, "%s() is not modifiable\n", tname);
                                return ret;
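
Assuming the registration helper introduced alongside btf_kfunc_is_modify_return() in this series, the module side would look roughly like this sketch (my_attach_point is hypothetical):

    BTF_SET8_START(my_fmodret_ids)
    BTF_ID_FLAGS(func, my_attach_point, KF_SLEEPABLE)
    BTF_SET8_END(my_fmodret_ids)

    static const struct btf_kfunc_id_set my_fmodret_set = {
            .owner = THIS_MODULE,
            .set   = &my_fmodret_ids,
    };

    /* in module init: a sleepable fmod_ret program may then attach to
     * my_attach_point() without relying on ALLOW_ERROR_INJECTION().
     */
    ret = register_btf_fmodret_id_set(&my_fmodret_set);
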
@@ -16679,7 +16886,6 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr)
 
        env->allow_ptr_leaks = bpf_allow_ptr_leaks();
        env->allow_uninit_stack = bpf_allow_uninit_stack();
-       env->allow_ptr_to_map_access = bpf_allow_ptr_to_map_access();
        env->bypass_spec_v1 = bpf_bypass_spec_v1();
        env->bypass_spec_v4 = bpf_bypass_spec_v4();
        env->bpf_capable = bpf_capable();
index fd40208..367b0a4 100644 (file)
@@ -167,7 +167,6 @@ struct cgroup_mgctx {
 extern spinlock_t css_set_lock;
 extern struct cgroup_subsys *cgroup_subsys[];
 extern struct list_head cgroup_roots;
-extern struct file_system_type cgroup_fs_type;
 
 /* iterate across the hierarchies */
 #define for_each_root(root)                                            \
index 7091bbf..7f04f99 100644 (file)
@@ -2291,6 +2291,7 @@ event_sched_out(struct perf_event *event,
                    !event->pending_work) {
                        event->pending_work = 1;
                        dec = false;
+                       WARN_ON_ONCE(!atomic_long_inc_not_zero(&event->refcount));
                        task_work_add(current, &event->pending_task, TWA_RESUME);
                }
                if (dec)
@@ -2336,6 +2337,7 @@ group_sched_out(struct perf_event *group_event,
 
 #define DETACH_GROUP   0x01UL
 #define DETACH_CHILD   0x02UL
+#define DETACH_DEAD    0x04UL
 
 /*
  * Cross CPU call to remove a performance event
@@ -2356,12 +2358,20 @@ __perf_remove_from_context(struct perf_event *event,
                update_cgrp_time_from_cpuctx(cpuctx, false);
        }
 
+       /*
+        * Ensure event_sched_out() switches to OFF, at the very least
+        * this avoids raising perf_pending_task() at this time.
+        */
+       if (flags & DETACH_DEAD)
+               event->pending_disable = 1;
        event_sched_out(event, cpuctx, ctx);
        if (flags & DETACH_GROUP)
                perf_group_detach(event);
        if (flags & DETACH_CHILD)
                perf_child_detach(event);
        list_del_event(event, ctx);
+       if (flags & DETACH_DEAD)
+               event->state = PERF_EVENT_STATE_DEAD;
 
        if (!ctx->nr_events && ctx->is_active) {
                if (ctx == &cpuctx->ctx)
@@ -5121,9 +5131,7 @@ int perf_event_release_kernel(struct perf_event *event)
 
        ctx = perf_event_ctx_lock(event);
        WARN_ON_ONCE(ctx->parent_ctx);
-       perf_remove_from_context(event, DETACH_GROUP);
 
-       raw_spin_lock_irq(&ctx->lock);
        /*
         * Mark this event as STATE_DEAD, there is no external reference to it
         * anymore.
@@ -5135,8 +5143,7 @@ int perf_event_release_kernel(struct perf_event *event)
         * Thus this guarantees that we will in fact observe and kill _ALL_
         * child events.
         */
-       event->state = PERF_EVENT_STATE_DEAD;
-       raw_spin_unlock_irq(&ctx->lock);
+       perf_remove_from_context(event, DETACH_GROUP|DETACH_DEAD);
 
        perf_event_ctx_unlock(event, ctx);
 
@@ -6577,6 +6584,8 @@ static void perf_pending_task(struct callback_head *head)
        if (rctx >= 0)
                perf_swevent_put_recursion_context(rctx);
        preempt_enable_notrace();
+
+       put_event(event);
 }
 
 #ifdef CONFIG_GUEST_PERF_EVENTS
index 188c305..c6d9dec 100644 (file)
@@ -267,13 +267,14 @@ int proc_dostring(struct ctl_table *table, int write,
                        ppos);
 }
 
-static size_t proc_skip_spaces(char **buf)
+static void proc_skip_spaces(char **buf, size_t *size)
 {
-       size_t ret;
-       char *tmp = skip_spaces(*buf);
-       ret = tmp - *buf;
-       *buf = tmp;
-       return ret;
+       while (*size) {
+               if (!isspace(**buf))
+                       break;
+               (*size)--;
+               (*buf)++;
+       }
 }
 
 static void proc_skip_char(char **buf, size_t *size, const char v)
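
A userspace model of the fix, for clarity: bounded by size, the skip can no longer run past an unterminated buffer the way the skip_spaces()-based version (which assumed a NUL-terminated C string) could:

    #include <ctype.h>
    #include <stddef.h>

    /* Model of the bounded skip: never reads more than *size bytes. */
    static void skip_spaces_bounded(char **buf, size_t *size)
    {
            while (*size && isspace((unsigned char)**buf)) {
                    (*size)--;
                    (*buf)++;
            }
    }

Callers also simplify, since the byte count is decremented in place instead of being subtracted from the return value.
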
@@ -342,13 +343,12 @@ static int proc_get_long(char **buf, size_t *size,
                          unsigned long *val, bool *neg,
                          const char *perm_tr, unsigned perm_tr_len, char *tr)
 {
-       int len;
        char *p, tmp[TMPBUFLEN];
+       ssize_t len = *size;
 
-       if (!*size)
+       if (len <= 0)
                return -EINVAL;
 
-       len = *size;
        if (len > TMPBUFLEN - 1)
                len = TMPBUFLEN - 1;
 
@@ -521,7 +521,7 @@ static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table,
                bool neg;
 
                if (write) {
-                       left -= proc_skip_spaces(&p);
+                       proc_skip_spaces(&p, &left);
 
                        if (!left)
                                break;
@@ -548,7 +548,7 @@ static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table,
        if (!write && !first && left && !err)
                proc_put_char(&buffer, &left, '\n');
        if (write && !err && left)
-               left -= proc_skip_spaces(&p);
+               proc_skip_spaces(&p, &left);
        if (write && first)
                return err ? : -EINVAL;
        *lenp -= left;
@@ -590,7 +590,7 @@ static int do_proc_douintvec_w(unsigned int *tbl_data,
        if (left > PAGE_SIZE - 1)
                left = PAGE_SIZE - 1;
 
-       left -= proc_skip_spaces(&p);
+       proc_skip_spaces(&p, &left);
        if (!left) {
                err = -EINVAL;
                goto out_free;
@@ -610,7 +610,7 @@ static int do_proc_douintvec_w(unsigned int *tbl_data,
        }
 
        if (!err && left)
-               left -= proc_skip_spaces(&p);
+               proc_skip_spaces(&p, &left);
 
 out_free:
        if (err)
@@ -1075,7 +1075,7 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table,
                if (write) {
                        bool neg;
 
-                       left -= proc_skip_spaces(&p);
+                       proc_skip_spaces(&p, &left);
                        if (!left)
                                break;
 
@@ -1104,7 +1104,7 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table,
        if (!write && !first && left && !err)
                proc_put_char(&buffer, &left, '\n');
        if (write && !err)
-               left -= proc_skip_spaces(&p);
+               proc_skip_spaces(&p, &left);
        if (write && first)
                return err ? : -EINVAL;
        *lenp -= left;
index 9bbf8a4..cc969ef 100644 (file)
@@ -24,6 +24,7 @@ config LINEAR_RANGES
 
 config PACKING
        bool "Generic bitfield packing and unpacking"
+       select BITREVERSE
        default n
        help
          This option provides the packing() helper function, which permits
index a100541..3638b34 100644 (file)
@@ -399,6 +399,7 @@ config FRAME_WARN
        default 2048 if GCC_PLUGIN_LATENT_ENTROPY
        default 2048 if PARISC
        default 1536 if (!64BIT && XTENSA)
+       default 1280 if KASAN && !64BIT
        default 1024 if !64BIT
        default 2048 if 64BIT
        help
@@ -1874,8 +1875,14 @@ config NETDEV_NOTIFIER_ERROR_INJECT
          If unsure, say N.
 
 config FUNCTION_ERROR_INJECTION
-       def_bool y
+       bool "Fault-injection of functions"
        depends on HAVE_FUNCTION_ERROR_INJECTION && KPROBES
+       help
+         Add fault injections into various functions that are annotated with
+         ALLOW_ERROR_INJECTION() in the kernel. BPF may also modify the return
+         value of these functions. This is useful to test error paths of code.
+
+         If unsure, say N.
 
 config FAULT_INJECTION
        bool "Fault-injection framework"
index 9a72f4b..a961692 100644 (file)
@@ -7,6 +7,7 @@
 #include <linux/bitops.h>
 #include <linux/errno.h>
 #include <linux/types.h>
+#include <linux/bitrev.h>
 
 static int get_le_offset(int offset)
 {
@@ -29,19 +30,6 @@ static int get_reverse_lsw32_offset(int offset, size_t len)
        return word_index * 4 + offset;
 }
 
-static u64 bit_reverse(u64 val, unsigned int width)
-{
-       u64 new_val = 0;
-       unsigned int bit;
-       unsigned int i;
-
-       for (i = 0; i < width; i++) {
-               bit = (val & (1 << i)) != 0;
-               new_val |= (bit << (width - i - 1));
-       }
-       return new_val;
-}
-
 static void adjust_for_msb_right_quirk(u64 *to_write, int *box_start_bit,
                                       int *box_end_bit, u8 *box_mask)
 {
@@ -49,7 +37,7 @@ static void adjust_for_msb_right_quirk(u64 *to_write, int *box_start_bit,
        int new_box_start_bit, new_box_end_bit;
 
        *to_write >>= *box_end_bit;
-       *to_write = bit_reverse(*to_write, box_bit_width);
+       *to_write = bitrev8(*to_write) >> (8 - box_bit_width);
        *to_write <<= *box_end_bit;
 
        new_box_end_bit   = box_bit_width - *box_start_bit - 1;
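
The open-coded bit_reverse() is replaced with bitrev8() from <linux/bitrev.h>, which is why the Kconfig hunk above adds "select BITREVERSE". This is sufficient because a quirk box is at most one byte wide (note the u8 box mask in the signature): reversing the low box_bit_width bits is a full 8-bit reversal followed by a right shift. A worked example with box_bit_width = 5:

	/*
	 * *to_write        = 0b10110     (five significant bits, 0x16)
	 * bitrev8(0x16)    = 0b01101000  (all eight bits reversed, 0x68)
	 * 0x68 >> (8 - 5)  = 0b01101     (the 5-bit reversal, 0x0d)
	 */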
index e12bbfb..6ae2ba8 100644 (file)
@@ -231,6 +231,7 @@ static int rhashtable_rehash_one(struct rhashtable *ht,
        struct rhash_head *head, *next, *entry;
        struct rhash_head __rcu **pprev = NULL;
        unsigned int new_hash;
+       unsigned long flags;
 
        if (new_tbl->nest)
                goto out;
@@ -253,13 +254,14 @@ static int rhashtable_rehash_one(struct rhashtable *ht,
 
        new_hash = head_hashfn(ht, new_tbl, entry);
 
-       rht_lock_nested(new_tbl, &new_tbl->buckets[new_hash], SINGLE_DEPTH_NESTING);
+       flags = rht_lock_nested(new_tbl, &new_tbl->buckets[new_hash],
+                               SINGLE_DEPTH_NESTING);
 
        head = rht_ptr(new_tbl->buckets + new_hash, new_tbl, new_hash);
 
        RCU_INIT_POINTER(entry->next, head);
 
-       rht_assign_unlock(new_tbl, &new_tbl->buckets[new_hash], entry);
+       rht_assign_unlock(new_tbl, &new_tbl->buckets[new_hash], entry, flags);
 
        if (pprev)
                rcu_assign_pointer(*pprev, next);
@@ -276,18 +278,19 @@ static int rhashtable_rehash_chain(struct rhashtable *ht,
 {
        struct bucket_table *old_tbl = rht_dereference(ht->tbl, ht);
        struct rhash_lock_head __rcu **bkt = rht_bucket_var(old_tbl, old_hash);
+       unsigned long flags;
        int err;
 
        if (!bkt)
                return 0;
-       rht_lock(old_tbl, bkt);
+       flags = rht_lock(old_tbl, bkt);
 
        while (!(err = rhashtable_rehash_one(ht, bkt, old_hash)))
                ;
 
        if (err == -ENOENT)
                err = 0;
-       rht_unlock(old_tbl, bkt);
+       rht_unlock(old_tbl, bkt, flags);
 
        return err;
 }
@@ -590,6 +593,7 @@ static void *rhashtable_try_insert(struct rhashtable *ht, const void *key,
        struct bucket_table *new_tbl;
        struct bucket_table *tbl;
        struct rhash_lock_head __rcu **bkt;
+       unsigned long flags;
        unsigned int hash;
        void *data;
 
@@ -607,7 +611,7 @@ static void *rhashtable_try_insert(struct rhashtable *ht, const void *key,
                        new_tbl = rht_dereference_rcu(tbl->future_tbl, ht);
                        data = ERR_PTR(-EAGAIN);
                } else {
-                       rht_lock(tbl, bkt);
+                       flags = rht_lock(tbl, bkt);
                        data = rhashtable_lookup_one(ht, bkt, tbl,
                                                     hash, key, obj);
                        new_tbl = rhashtable_insert_one(ht, bkt, tbl,
@@ -615,7 +619,7 @@ static void *rhashtable_try_insert(struct rhashtable *ht, const void *key,
                        if (PTR_ERR(new_tbl) != -EEXIST)
                                data = ERR_CAST(new_tbl);
 
-                       rht_unlock(tbl, bkt);
+                       rht_unlock(tbl, bkt, flags);
                }
        } while (!IS_ERR_OR_NULL(new_tbl));
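
The rhashtable hunks thread an IRQ-state cookie from rht_lock()/rht_lock_nested() through to rht_unlock() and rht_assign_unlock(). The helper bodies are not shown in this diff; a sketch of what they plausibly look like, assuming the goal is to make the bucket bit-spinlocks usable from IRQ-safe contexts:

	static inline unsigned long rht_lock(struct bucket_table *tbl,
					     struct rhash_lock_head __rcu **bkt)
	{
		unsigned long flags;

		/* Disable interrupts around the bit spinlock so an IRQ
		 * arriving on this CPU cannot deadlock on the same bucket.
		 */
		local_irq_save(flags);
		bit_spin_lock(0, (unsigned long *)bkt);
		lock_map_acquire(&tbl->dep_map);
		return flags;
	}

	static inline void rht_unlock(struct bucket_table *tbl,
				      struct rhash_lock_head __rcu **bkt,
				      unsigned long flags)
	{
		lock_map_release(&tbl->dep_map);
		bit_spin_unlock(0, (unsigned long *)bkt);
		local_irq_restore(flags);
	}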
 
index c51f7f5..1f6da31 100644 (file)
@@ -985,28 +985,28 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
                }
 
                /*
+                * Be careful not to clear PageLRU until after we're
+                * sure the page is not being freed elsewhere -- the
+                * page release code relies on it.
+                */
+               if (unlikely(!get_page_unless_zero(page)))
+                       goto isolate_fail;
+
+               /*
                 * Migration will fail if an anonymous page is pinned in memory,
                 * so avoid taking lru_lock and isolating it unnecessarily in an
                 * admittedly racy check.
                 */
                mapping = page_mapping(page);
-               if (!mapping && page_count(page) > page_mapcount(page))
-                       goto isolate_fail;
+               if (!mapping && (page_count(page) - 1) > total_mapcount(page))
+                       goto isolate_fail_put;
 
                /*
                 * Only allow to migrate anonymous pages in GFP_NOFS context
                 * because those do not depend on fs locks.
                 */
                if (!(cc->gfp_mask & __GFP_FS) && mapping)
-                       goto isolate_fail;
-
-               /*
-                * Be careful not to clear PageLRU until after we're
-                * sure the page is not being freed elsewhere -- the
-                * page release code relies on it.
-                */
-               if (unlikely(!get_page_unless_zero(page)))
-                       goto isolate_fail;
+                       goto isolate_fail_put;
 
                /* Only take pages on LRU: a check now makes later tests safe */
                if (!PageLRU(page))
index 5ce4033..07e5f1b 100644 (file)
@@ -2283,12 +2283,54 @@ static struct damos *damon_sysfs_mk_scheme(
                        &wmarks);
 }
 
+static void damon_sysfs_update_scheme(struct damos *scheme,
+               struct damon_sysfs_scheme *sysfs_scheme)
+{
+       struct damon_sysfs_access_pattern *access_pattern =
+               sysfs_scheme->access_pattern;
+       struct damon_sysfs_quotas *sysfs_quotas = sysfs_scheme->quotas;
+       struct damon_sysfs_weights *sysfs_weights = sysfs_quotas->weights;
+       struct damon_sysfs_watermarks *sysfs_wmarks = sysfs_scheme->watermarks;
+
+       scheme->pattern.min_sz_region = access_pattern->sz->min;
+       scheme->pattern.max_sz_region = access_pattern->sz->max;
+       scheme->pattern.min_nr_accesses = access_pattern->nr_accesses->min;
+       scheme->pattern.max_nr_accesses = access_pattern->nr_accesses->max;
+       scheme->pattern.min_age_region = access_pattern->age->min;
+       scheme->pattern.max_age_region = access_pattern->age->max;
+
+       scheme->action = sysfs_scheme->action;
+
+       scheme->quota.ms = sysfs_quotas->ms;
+       scheme->quota.sz = sysfs_quotas->sz;
+       scheme->quota.reset_interval = sysfs_quotas->reset_interval_ms;
+       scheme->quota.weight_sz = sysfs_weights->sz;
+       scheme->quota.weight_nr_accesses = sysfs_weights->nr_accesses;
+       scheme->quota.weight_age = sysfs_weights->age;
+
+       scheme->wmarks.metric = sysfs_wmarks->metric;
+       scheme->wmarks.interval = sysfs_wmarks->interval_us;
+       scheme->wmarks.high = sysfs_wmarks->high;
+       scheme->wmarks.mid = sysfs_wmarks->mid;
+       scheme->wmarks.low = sysfs_wmarks->low;
+}
+
 static int damon_sysfs_set_schemes(struct damon_ctx *ctx,
                struct damon_sysfs_schemes *sysfs_schemes)
 {
-       int i;
+       struct damos *scheme, *next;
+       int i = 0;
+
+       damon_for_each_scheme_safe(scheme, next, ctx) {
+               if (i < sysfs_schemes->nr)
+                       damon_sysfs_update_scheme(scheme,
+                                       sysfs_schemes->schemes_arr[i]);
+               else
+                       damon_destroy_scheme(scheme);
+               i++;
+       }
 
-       for (i = 0; i < sysfs_schemes->nr; i++) {
+       for (; i < sysfs_schemes->nr; i++) {
                struct damos *scheme, *next;
 
                scheme = damon_sysfs_mk_scheme(sysfs_schemes->schemes_arr[i]);
index f1385c3..e36ca75 100644 (file)
@@ -5206,17 +5206,22 @@ void __unmap_hugepage_range_final(struct mmu_gather *tlb,
 
        __unmap_hugepage_range(tlb, vma, start, end, ref_page, zap_flags);
 
-       /*
-        * Unlock and free the vma lock before releasing i_mmap_rwsem.  When
-        * the vma_lock is freed, this makes the vma ineligible for pmd
-        * sharing.  And, i_mmap_rwsem is required to set up pmd sharing.
-        * This is important as page tables for this unmapped range will
-        * be asynchrously deleted.  If the page tables are shared, there
-        * will be issues when accessed by someone else.
-        */
-       __hugetlb_vma_unlock_write_free(vma);
-
-       i_mmap_unlock_write(vma->vm_file->f_mapping);
+       if (zap_flags & ZAP_FLAG_UNMAP) {       /* final unmap */
+               /*
+                * Unlock and free the vma lock before releasing i_mmap_rwsem.
+                * When the vma_lock is freed, this makes the vma ineligible
+                * for pmd sharing.  And, i_mmap_rwsem is required to set up
+                * pmd sharing.  This is important as page tables for this
+                * unmapped range will be asynchronously deleted.  If the page
+                * tables are shared, there will be issues when accessed by
+                * someone else.
+                */
+               __hugetlb_vma_unlock_write_free(vma);
+               i_mmap_unlock_write(vma->vm_file->f_mapping);
+       } else {
+               i_mmap_unlock_write(vma->vm_file->f_mapping);
+               hugetlb_vma_unlock_write(vma);
+       }
 }
 
 void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
index a8d5ef2..3703a56 100644 (file)
@@ -1051,6 +1051,7 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
        _pmd = pmdp_collapse_flush(vma, address, pmd);
        spin_unlock(pmd_ptl);
        mmu_notifier_invalidate_range_end(&range);
+       tlb_remove_table_sync_one();
 
        spin_lock(pte_ptl);
        result =  __collapse_huge_page_isolate(vma, address, pte, cc,
@@ -1379,16 +1380,43 @@ static int set_huge_pmd(struct vm_area_struct *vma, unsigned long addr,
        return SCAN_SUCCEED;
 }
 
+/*
+ * A note about locking:
+ * Trying to take the page table spinlocks would be useless here because those
+ * are only used to synchronize:
+ *
+ *  - modifying terminal entries (ones that point to a data page, not to another
+ *    page table)
+ *  - installing *new* non-terminal entries
+ *
+ * Instead, we need roughly the same kind of protection as free_pgtables() or
+ * mm_take_all_locks() (but only for a single VMA):
+ * The mmap lock together with this VMA's rmap locks covers all paths towards
+ * the page table entries we're messing with here, except for hardware page
+ * table walks and lockless_pages_from_mm().
+ */
 static void collapse_and_free_pmd(struct mm_struct *mm, struct vm_area_struct *vma,
                                  unsigned long addr, pmd_t *pmdp)
 {
-       spinlock_t *ptl;
        pmd_t pmd;
+       struct mmu_notifier_range range;
 
        mmap_assert_write_locked(mm);
-       ptl = pmd_lock(vma->vm_mm, pmdp);
+       if (vma->vm_file)
+               lockdep_assert_held_write(&vma->vm_file->f_mapping->i_mmap_rwsem);
+       /*
+        * All anon_vmas attached to the VMA have the same root and are
+        * therefore locked by the same lock.
+        */
+       if (vma->anon_vma)
+               lockdep_assert_held_write(&vma->anon_vma->root->rwsem);
+
+       mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, NULL, mm, addr,
+                               addr + HPAGE_PMD_SIZE);
+       mmu_notifier_invalidate_range_start(&range);
        pmd = pmdp_collapse_flush(vma, addr, pmdp);
-       spin_unlock(ptl);
+       tlb_remove_table_sync_one();
+       mmu_notifier_invalidate_range_end(&range);
        mm_dec_nr_ptes(mm);
        page_table_check_pte_clear_range(mm, addr, pmd);
        pte_free(mm, pmd_pgtable(pmd));
@@ -1439,6 +1467,14 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
        if (!hugepage_vma_check(vma, vma->vm_flags, false, false, false))
                return SCAN_VMA_CHECK;
 
+       /*
+        * Symmetry with retract_page_tables(): Exclude MAP_PRIVATE mappings
+        * that got written to. Without this, we'd have to also lock the
+        * anon_vma if one exists.
+        */
+       if (vma->anon_vma)
+               return SCAN_VMA_CHECK;
+
        /* Keep pmd pgtable for uffd-wp; see comment in retract_page_tables() */
        if (userfaultfd_wp(vma))
                return SCAN_PTE_UFFD_WP;
@@ -1472,6 +1508,20 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
                goto drop_hpage;
        }
 
+       /*
+        * We need to lock the mapping so that from here on, only GUP-fast and
+        * hardware page walks can access the parts of the page tables that
+        * we're operating on.
+        * See collapse_and_free_pmd().
+        */
+       i_mmap_lock_write(vma->vm_file->f_mapping);
+
+       /*
+        * This spinlock should be unnecessary: Nobody else should be accessing
+        * the page tables under spinlock protection here, only
+        * lockless_pages_from_mm() and the hardware page walker can access page
+        * tables while all the high-level locks are held in write mode.
+        */
        start_pte = pte_offset_map_lock(mm, pmd, haddr, &ptl);
        result = SCAN_FAIL;
 
@@ -1526,6 +1576,8 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
        /* step 4: remove pte entries */
        collapse_and_free_pmd(mm, vma, haddr, pmd);
 
+       i_mmap_unlock_write(vma->vm_file->f_mapping);
+
 maybe_install_pmd:
        /* step 5: install pmd entry */
        result = install_pmd
@@ -1539,6 +1591,7 @@ drop_hpage:
 
 abort:
        pte_unmap_unlock(start_pte, ptl);
+       i_mmap_unlock_write(vma->vm_file->f_mapping);
        goto drop_hpage;
 }
 
@@ -1595,7 +1648,8 @@ static int retract_page_tables(struct address_space *mapping, pgoff_t pgoff,
                 * An alternative would be drop the check, but check that page
                 * table is clear before calling pmdp_collapse_flush() under
                 * ptl. It has higher chance to recover THP for the VMA, but
-                * has higher cost too.
+                * has higher cost too. It would also probably require locking
+                * the anon_vma.
                 */
                if (vma->anon_vma) {
                        result = SCAN_PAGE_ANON;
index c7105ec..b913ba6 100644 (file)
@@ -772,8 +772,8 @@ static int madvise_free_single_vma(struct vm_area_struct *vma,
  * Application no longer needs these pages.  If the pages are dirty,
  * it's OK to just throw them away.  The app will be more careful about
  * data it wants to keep.  Be sure to free swap resources too.  The
- * zap_page_range call sets things up for shrink_active_list to actually free
- * these pages later if no one else has touched them in the meantime,
+ * zap_page_range_single call sets things up for shrink_active_list to actually
+ * free these pages later if no one else has touched them in the meantime,
  * although we could add these pages to a global reuse list for
  * shrink_active_list to pick up before reclaiming other pages.
  *
@@ -790,7 +790,7 @@ static int madvise_free_single_vma(struct vm_area_struct *vma,
 static long madvise_dontneed_single_vma(struct vm_area_struct *vma,
                                        unsigned long start, unsigned long end)
 {
-       zap_page_range(vma, start, end - start);
+       zap_page_range_single(vma, start, end - start, NULL);
        return 0;
 }
 
index a1a35c1..266a1ab 100644 (file)
@@ -4832,6 +4832,7 @@ static ssize_t memcg_write_event_control(struct kernfs_open_file *of,
        unsigned int efd, cfd;
        struct fd efile;
        struct fd cfile;
+       struct dentry *cdentry;
        const char *name;
        char *endp;
        int ret;
@@ -4886,6 +4887,16 @@ static ssize_t memcg_write_event_control(struct kernfs_open_file *of,
                goto out_put_cfile;
 
        /*
+        * The control file must be a regular cgroup1 file. As a regular cgroup
+        * file can't be renamed, it's safe to access its name afterwards.
+        */
+       cdentry = cfile.file->f_path.dentry;
+       if (cdentry->d_sb->s_type != &cgroup_fs_type || !d_is_reg(cdentry)) {
+               ret = -EINVAL;
+               goto out_put_cfile;
+       }
+
+       /*
         * Determine the event callbacks and set them in @event.  This used
         * to be done via struct cftype but cgroup core no longer knows
         * about these events.  The following is crude but the whole thing
@@ -4893,7 +4904,7 @@ static ssize_t memcg_write_event_control(struct kernfs_open_file *of,
         *
         * DO NOT ADD NEW FILES.
         */
-       name = cfile.file->f_path.dentry->d_name.name;
+       name = cdentry->d_name.name;
 
        if (!strcmp(name, "memory.usage_in_bytes")) {
                event->register_event = mem_cgroup_usage_register_event;
@@ -4917,7 +4928,7 @@ static ssize_t memcg_write_event_control(struct kernfs_open_file *of,
         * automatically removed on cgroup destruction but the removal is
         * asynchronous, so take an extra ref on @css.
         */
-       cfile_css = css_tryget_online_from_dir(cfile.file->f_path.dentry->d_parent,
+       cfile_css = css_tryget_online_from_dir(cdentry->d_parent,
                                               &memory_cgrp_subsys);
        ret = -EINVAL;
        if (IS_ERR(cfile_css))
index 8a6d5c8..8c84209 100644 (file)
@@ -1341,15 +1341,6 @@ copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
        return ret;
 }
 
-/*
- * Parameter block passed down to zap_pte_range in exceptional cases.
- */
-struct zap_details {
-       struct folio *single_folio;     /* Locked folio to be unmapped */
-       bool even_cows;                 /* Zap COWed private pages too? */
-       zap_flags_t zap_flags;          /* Extra flags for zapping */
-};
-
 /* Whether we should zap all COWed (private) pages too */
 static inline bool should_zap_cows(struct zap_details *details)
 {
@@ -1720,7 +1711,7 @@ void unmap_vmas(struct mmu_gather *tlb, struct maple_tree *mt,
 {
        struct mmu_notifier_range range;
        struct zap_details details = {
-               .zap_flags = ZAP_FLAG_DROP_MARKER,
+               .zap_flags = ZAP_FLAG_DROP_MARKER | ZAP_FLAG_UNMAP,
                /* Careful - we need to zap private pages too! */
                .even_cows = true,
        };
@@ -1774,19 +1765,27 @@ void zap_page_range(struct vm_area_struct *vma, unsigned long start,
  *
  * The range must fit into one VMA.
  */
-static void zap_page_range_single(struct vm_area_struct *vma, unsigned long address,
+void zap_page_range_single(struct vm_area_struct *vma, unsigned long address,
                unsigned long size, struct zap_details *details)
 {
+       const unsigned long end = address + size;
        struct mmu_notifier_range range;
        struct mmu_gather tlb;
 
        lru_add_drain();
        mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
-                               address, address + size);
+                               address, end);
+       if (is_vm_hugetlb_page(vma))
+               adjust_range_if_pmd_sharing_possible(vma, &range.start,
+                                                    &range.end);
        tlb_gather_mmu(&tlb, vma->vm_mm);
        update_hiwater_rss(vma->vm_mm);
        mmu_notifier_invalidate_range_start(&range);
-       unmap_single_vma(&tlb, vma, address, range.end, details);
+       /*
+        * unmap 'address-end' not 'range.start-range.end' as range
+        * could have been expanded for hugetlb pmd sharing.
+        */
+       unmap_single_vma(&tlb, vma, address, end, details);
        mmu_notifier_invalidate_range_end(&range);
        tlb_finish_mmu(&tlb);
 }
index 74a84eb..a5eb2f1 100644 (file)
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1779,9 +1779,6 @@ get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
                 */
                pgoff = 0;
                get_area = shmem_get_unmapped_area;
-       } else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) {
-               /* Ensures that larger anonymous mappings are THP aligned. */
-               get_area = thp_get_unmapped_area;
        }
 
        addr = get_area(file, addr, len, pgoff, flags);
index add4244..3a2c3f8 100644 (file)
@@ -153,7 +153,7 @@ static void tlb_remove_table_smp_sync(void *arg)
        /* Simply deliver the interrupt */
 }
 
-static void tlb_remove_table_sync_one(void)
+void tlb_remove_table_sync_one(void)
 {
        /*
         * This isn't an RCU grace period and hence the page-tables cannot be
@@ -177,8 +177,6 @@ static void tlb_remove_table_free(struct mmu_table_batch *batch)
 
 #else /* !CONFIG_MMU_GATHER_RCU_TABLE_FREE */
 
-static void tlb_remove_table_sync_one(void) { }
-
 static void tlb_remove_table_free(struct mmu_table_batch *batch)
 {
        __tlb_remove_table_free(batch);
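
Dropping the static qualifier (and the empty !CONFIG_MMU_GATHER_RCU_TABLE_FREE stub) lets the khugepaged hunks above call tlb_remove_table_sync_one() to serialize against lockless page-table walkers such as GUP-fast. The function body is only partially visible here; judging from the tlb_remove_table_smp_sync() callback in this hunk, the mechanism is an IPI broadcast, roughly:

	void tlb_remove_table_sync_one(void)
	{
		/* IPI every other CPU and wait: once each has taken the
		 * interrupt, no lockless walker can still be traversing
		 * the page table that is about to be freed.
		 */
		smp_call_function(tlb_remove_table_smp_sync, NULL, 1);
	}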
index 026199c..8fcc5fa 100644 (file)
@@ -3987,7 +3987,7 @@ static void walk_pmd_range_locked(pud_t *pud, unsigned long next, struct vm_area
                        goto next;
 
                if (!pmd_trans_huge(pmd[i])) {
-                       if (IS_ENABLED(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG) &&
+                       if (arch_has_hw_nonleaf_pmd_young() &&
                            get_cap(LRU_GEN_NONLEAF_YOUNG))
                                pmdp_test_and_clear_young(vma, addr, pmd + i);
                        goto next;
@@ -4085,14 +4085,14 @@ restart:
 #endif
                walk->mm_stats[MM_NONLEAF_TOTAL]++;
 
-#ifdef CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG
-               if (get_cap(LRU_GEN_NONLEAF_YOUNG)) {
+               if (arch_has_hw_nonleaf_pmd_young() &&
+                   get_cap(LRU_GEN_NONLEAF_YOUNG)) {
                        if (!pmd_young(val))
                                continue;
 
                        walk_pmd_range_locked(pud, addr, vma, args, bitmap, &pos);
                }
-#endif
+
                if (!walk->force_scan && !test_bloom_filter(walk->lruvec, walk->max_seq, pmd + i))
                        continue;
 
@@ -5392,7 +5392,7 @@ static ssize_t show_enabled(struct kobject *kobj, struct kobj_attribute *attr, c
        if (arch_has_hw_pte_young() && get_cap(LRU_GEN_MM_WALK))
                caps |= BIT(LRU_GEN_MM_WALK);
 
-       if (IS_ENABLED(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG) && get_cap(LRU_GEN_NONLEAF_YOUNG))
+       if (arch_has_hw_nonleaf_pmd_young() && get_cap(LRU_GEN_NONLEAF_YOUNG))
                caps |= BIT(LRU_GEN_NONLEAF_YOUNG);
 
        return snprintf(buf, PAGE_SIZE, "0x%04x\n", caps);
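
These vmscan hunks swap a compile-time IS_ENABLED() test for a runtime predicate, so an architecture can veto the non-leaf PMD accessed bit even when the Kconfig symbol is set. The helper's definition is not part of this diff; a plausible generic fallback, following the usual overridable pgtable-helper pattern:

	#ifndef arch_has_hw_nonleaf_pmd_young
	/* Default to the compile-time answer; an architecture may
	 * override this to also check runtime conditions, e.g. running
	 * as a guest where the PMD accessed bit is not maintained by
	 * hardware.
	 */
	static inline bool arch_has_hw_nonleaf_pmd_young(void)
	{
		return IS_ENABLED(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG);
	}
	#endif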
index 215af9b..c57d643 100644 (file)
@@ -972,6 +972,7 @@ static int get_l2cap_conn(char *buf, bdaddr_t *addr, u8 *addr_type,
        hci_dev_lock(hdev);
        hcon = hci_conn_hash_lookup_le(hdev, addr, *addr_type);
        hci_dev_unlock(hdev);
+       hci_dev_put(hdev);
 
        if (!hcon)
                return -ENOENT;
index ae3bdc6..da7cac0 100644 (file)
@@ -78,6 +78,17 @@ config BT_LE
          Bluetooth Low Energy includes support for the low-energy
          physical layer available with Bluetooth version 4.0 or later.
 
+config BT_LE_L2CAP_ECRED
+       bool "Bluetooth L2CAP Enhanced Credit Flow Control"
+       depends on BT_LE
+       default y
+       help
+         Bluetooth Low Energy L2CAP Enhanced Credit Flow Control is
+         available with Bluetooth version 5.2 or later.
+
+         This can be overridden by passing bluetooth.enable_ecred=[1|0]
+         on the kernel command line.
+
 config BT_6LOWPAN
        tristate "Bluetooth 6LoWPAN support"
        depends on BT_LE && 6LOWPAN
index dc65974..1c3c7ff 100644 (file)
@@ -737,7 +737,7 @@ static int __init bt_init(void)
 
        err = bt_sysfs_init();
        if (err < 0)
-               return err;
+               goto cleanup_led;
 
        err = sock_register(&bt_sock_family_ops);
        if (err)
@@ -773,6 +773,8 @@ unregister_socket:
        sock_unregister(PF_BLUETOOTH);
 cleanup_sysfs:
        bt_sysfs_cleanup();
+cleanup_led:
+       bt_leds_cleanup();
        return err;
 }
 
index 3820153..3cc135b 100644 (file)
@@ -72,9 +72,8 @@ static void hci_read_codec_capabilities(struct hci_dev *hdev, __u8 transport,
                                continue;
                        }
 
-                       skb = __hci_cmd_sync(hdev, HCI_OP_READ_LOCAL_CODEC_CAPS,
-                                            sizeof(*cmd), cmd,
-                                            HCI_CMD_TIMEOUT);
+                       skb = __hci_cmd_sync_sk(hdev, HCI_OP_READ_LOCAL_CODEC_CAPS,
+                                               sizeof(*cmd), cmd, 0, HCI_CMD_TIMEOUT, NULL);
                        if (IS_ERR(skb)) {
                                bt_dev_err(hdev, "Failed to read codec capabilities (%ld)",
                                           PTR_ERR(skb));
@@ -127,8 +126,8 @@ void hci_read_supported_codecs(struct hci_dev *hdev)
        struct hci_op_read_local_codec_caps caps;
        __u8 i;
 
-       skb = __hci_cmd_sync(hdev, HCI_OP_READ_LOCAL_CODECS, 0, NULL,
-                            HCI_CMD_TIMEOUT);
+       skb = __hci_cmd_sync_sk(hdev, HCI_OP_READ_LOCAL_CODECS, 0, NULL,
+                               0, HCI_CMD_TIMEOUT, NULL);
 
        if (IS_ERR(skb)) {
                bt_dev_err(hdev, "Failed to read local supported codecs (%ld)",
@@ -158,7 +157,8 @@ void hci_read_supported_codecs(struct hci_dev *hdev)
        for (i = 0; i < std_codecs->num; i++) {
                caps.id = std_codecs->codec[i];
                caps.direction = 0x00;
-               hci_read_codec_capabilities(hdev, LOCAL_CODEC_ACL_MASK, &caps);
+               hci_read_codec_capabilities(hdev,
+                                           LOCAL_CODEC_ACL_MASK | LOCAL_CODEC_SCO_MASK, &caps);
        }
 
        skb_pull(skb, flex_array_size(std_codecs, codec, std_codecs->num)
@@ -178,7 +178,8 @@ void hci_read_supported_codecs(struct hci_dev *hdev)
                caps.cid = vnd_codecs->codec[i].cid;
                caps.vid = vnd_codecs->codec[i].vid;
                caps.direction = 0x00;
-               hci_read_codec_capabilities(hdev, LOCAL_CODEC_ACL_MASK, &caps);
+               hci_read_codec_capabilities(hdev,
+                                           LOCAL_CODEC_ACL_MASK | LOCAL_CODEC_SCO_MASK, &caps);
        }
 
 error:
@@ -194,8 +195,8 @@ void hci_read_supported_codecs_v2(struct hci_dev *hdev)
        struct hci_op_read_local_codec_caps caps;
        __u8 i;
 
-       skb = __hci_cmd_sync(hdev, HCI_OP_READ_LOCAL_CODECS_V2, 0, NULL,
-                            HCI_CMD_TIMEOUT);
+       skb = __hci_cmd_sync_sk(hdev, HCI_OP_READ_LOCAL_CODECS_V2, 0, NULL,
+                               0, HCI_CMD_TIMEOUT, NULL);
 
        if (IS_ERR(skb)) {
                bt_dev_err(hdev, "Failed to read local supported codecs (%ld)",
index a6c1286..d3e542c 100644 (file)
@@ -824,11 +824,10 @@ static int hci_le_terminate_big(struct hci_dev *hdev, u8 big, u8 bis)
 
        bt_dev_dbg(hdev, "big 0x%2.2x bis 0x%2.2x", big, bis);
 
-       d = kmalloc(sizeof(*d), GFP_KERNEL);
+       d = kzalloc(sizeof(*d), GFP_KERNEL);
        if (!d)
                return -ENOMEM;
 
-       memset(d, 0, sizeof(*d));
        d->big = big;
        d->bis = bis;
 
@@ -861,11 +860,10 @@ static int hci_le_big_terminate(struct hci_dev *hdev, u8 big, u16 sync_handle)
 
        bt_dev_dbg(hdev, "big 0x%2.2x sync_handle 0x%4.4x", big, sync_handle);
 
-       d = kmalloc(sizeof(*d), GFP_KERNEL);
+       d = kzalloc(sizeof(*d), GFP_KERNEL);
        if (!d)
                return -ENOMEM;
 
-       memset(d, 0, sizeof(*d));
        d->big = big;
        d->sync_handle = sync_handle;
 
@@ -1881,7 +1879,7 @@ static int hci_create_cis_sync(struct hci_dev *hdev, void *data)
                        continue;
 
                /* Check if all CIS(s) belonging to a CIG are ready */
-               if (conn->link->state != BT_CONNECTED ||
+               if (!conn->link || conn->link->state != BT_CONNECTED ||
                    conn->state != BT_CONNECT) {
                        cmd.cp.num_cis = 0;
                        break;
@@ -2046,19 +2044,12 @@ int hci_pa_create_sync(struct hci_dev *hdev, bdaddr_t *dst, __u8 dst_type,
        if (hci_dev_test_and_set_flag(hdev, HCI_PA_SYNC))
                return -EBUSY;
 
-       cp = kmalloc(sizeof(*cp), GFP_KERNEL);
+       cp = kzalloc(sizeof(*cp), GFP_KERNEL);
        if (!cp) {
                hci_dev_clear_flag(hdev, HCI_PA_SYNC);
                return -ENOMEM;
        }
 
-       /* Convert from ISO socket address type to HCI address type  */
-       if (dst_type == BDADDR_LE_PUBLIC)
-               dst_type = ADDR_LE_DEV_PUBLIC;
-       else
-               dst_type = ADDR_LE_DEV_RANDOM;
-
-       memset(cp, 0, sizeof(*cp));
        cp->sid = sid;
        cp->addr_type = dst_type;
        bacpy(&cp->addr, dst);
index 0540555..b65c3aa 100644 (file)
@@ -2660,7 +2660,7 @@ int hci_register_dev(struct hci_dev *hdev)
 
        error = hci_register_suspend_notifier(hdev);
        if (error)
-               goto err_wqueue;
+               BT_WARN("register suspend notifier failed error:%d\n", error);
 
        queue_work(hdev->req_workqueue, &hdev->power_on);
 
@@ -2764,7 +2764,8 @@ int hci_register_suspend_notifier(struct hci_dev *hdev)
 {
        int ret = 0;
 
-       if (!test_bit(HCI_QUIRK_NO_SUSPEND_NOTIFIER, &hdev->quirks)) {
+       if (!hdev->suspend_notifier.notifier_call &&
+           !test_bit(HCI_QUIRK_NO_SUSPEND_NOTIFIER, &hdev->quirks)) {
                hdev->suspend_notifier.notifier_call = hci_suspend_notifier;
                ret = register_pm_notifier(&hdev->suspend_notifier);
        }
@@ -2776,8 +2777,11 @@ int hci_unregister_suspend_notifier(struct hci_dev *hdev)
 {
        int ret = 0;
 
-       if (!test_bit(HCI_QUIRK_NO_SUSPEND_NOTIFIER, &hdev->quirks))
+       if (hdev->suspend_notifier.notifier_call) {
                ret = unregister_pm_notifier(&hdev->suspend_notifier);
+               if (!ret)
+                       hdev->suspend_notifier.notifier_call = NULL;
+       }
 
        return ret;
 }
@@ -3981,7 +3985,7 @@ void hci_req_cmd_complete(struct hci_dev *hdev, u16 opcode, u8 status,
                        *req_complete_skb = bt_cb(skb)->hci.req_complete_skb;
                else
                        *req_complete = bt_cb(skb)->hci.req_complete;
-               kfree_skb(skb);
+               dev_kfree_skb_irq(skb);
        }
        spin_unlock_irqrestore(&hdev->cmd_q.lock, flags);
 }
index 3f401ec..b7f6829 100644 (file)
@@ -757,7 +757,7 @@ static ssize_t force_static_address_write(struct file *file,
        bool enable;
        int err;
 
-       if (test_bit(HCI_UP, &hdev->flags))
+       if (hdev_is_powered(hdev))
                return -EBUSY;
 
        err = kstrtobool_from_user(user_buf, count, &enable);
index faca701..0594af4 100644 (file)
@@ -801,9 +801,6 @@ static u8 hci_cc_write_auth_payload_timeout(struct hci_dev *hdev, void *data,
 
        bt_dev_dbg(hdev, "status 0x%2.2x", rp->status);
 
-       if (rp->status)
-               return rp->status;
-
        sent = hci_sent_cmd_data(hdev, HCI_OP_WRITE_AUTH_PAYLOAD_TO);
        if (!sent)
                return rp->status;
@@ -811,9 +808,17 @@ static u8 hci_cc_write_auth_payload_timeout(struct hci_dev *hdev, void *data,
        hci_dev_lock(hdev);
 
        conn = hci_conn_hash_lookup_handle(hdev, __le16_to_cpu(rp->handle));
-       if (conn)
+       if (!conn) {
+               rp->status = 0xff;
+               goto unlock;
+       }
+
+       if (!rp->status)
                conn->auth_payload_timeout = get_unaligned_le16(sent + 2);
 
+       hci_encrypt_cfm(conn, 0);
+
+unlock:
        hci_dev_unlock(hdev);
 
        return rp->status;
@@ -3680,8 +3685,13 @@ static void hci_encrypt_change_evt(struct hci_dev *hdev, void *data,
 
                cp.handle = cpu_to_le16(conn->handle);
                cp.timeout = cpu_to_le16(hdev->auth_payload_timeout);
-               hci_send_cmd(conn->hdev, HCI_OP_WRITE_AUTH_PAYLOAD_TO,
-                            sizeof(cp), &cp);
+               if (hci_send_cmd(conn->hdev, HCI_OP_WRITE_AUTH_PAYLOAD_TO,
+                                sizeof(cp), &cp)) {
+                       bt_dev_err(hdev, "write auth payload timeout failed");
+                       goto notify;
+               }
+
+               goto unlock;
        }
 
 notify:
@@ -6494,7 +6504,7 @@ static void hci_le_ext_adv_report_evt(struct hci_dev *hdev, void *data,
                                        info->length))
                        break;
 
-               evt_type = __le16_to_cpu(info->type);
+               evt_type = __le16_to_cpu(info->type) & LE_EXT_ADV_EVT_TYPE_MASK;
                legacy_evt_type = ext_evt_type_to_legacy(hdev, evt_type);
                if (legacy_evt_type != LE_ADV_INVALID) {
                        process_adv_report(hdev, legacy_evt_type, &info->bdaddr,
index 5a0296a..f7e006a 100644 (file)
@@ -269,7 +269,7 @@ void hci_req_add_ev(struct hci_request *req, u16 opcode, u32 plen,
 void hci_req_add(struct hci_request *req, u16 opcode, u32 plen,
                 const void *param)
 {
-       bt_dev_err(req->hdev, "HCI_REQ-0x%4.4x", opcode);
+       bt_dev_dbg(req->hdev, "HCI_REQ-0x%4.4x", opcode);
        hci_req_add_ev(req, opcode, plen, param, 0);
 }
 
index 76c3107..9e2d7e4 100644 (file)
@@ -12,6 +12,7 @@
 #include <net/bluetooth/mgmt.h>
 
 #include "hci_request.h"
+#include "hci_codec.h"
 #include "hci_debugfs.h"
 #include "smp.h"
 #include "eir.h"
@@ -3054,6 +3055,7 @@ int hci_update_name_sync(struct hci_dev *hdev)
  * Enable Authentication
  * lmp_bredr_capable(Set Fast Connectable -> Set Scan Type -> Set Class ->
  * Set Name -> Set EIR)
+ * HCI_FORCE_STATIC_ADDR | BDADDR_ANY && !HCI_BREDR_ENABLED (Set Static Address)
  */
 int hci_powered_update_sync(struct hci_dev *hdev)
 {
@@ -3093,6 +3095,23 @@ int hci_powered_update_sync(struct hci_dev *hdev)
                hci_update_eir_sync(hdev);
        }
 
+       /* If forcing static address is in use or there is no public
+        * address, use the static address as random address (but skip
+        * the HCI command if the current random address is already the
+        * static one).
+        *
+        * In case BR/EDR has been disabled on a dual-mode controller
+        * and a static address has been configured, then use that
+        * address instead of the public BR/EDR address.
+        */
+       if (hci_dev_test_flag(hdev, HCI_FORCE_STATIC_ADDR) ||
+           (!bacmp(&hdev->bdaddr, BDADDR_ANY) &&
+           !hci_dev_test_flag(hdev, HCI_BREDR_ENABLED))) {
+               if (bacmp(&hdev->static_addr, BDADDR_ANY))
+                       return hci_set_random_addr_sync(hdev,
+                                                       &hdev->static_addr);
+       }
+
        return 0;
 }
 
@@ -3780,7 +3799,8 @@ static int hci_read_page_scan_activity_sync(struct hci_dev *hdev)
 static int hci_read_def_err_data_reporting_sync(struct hci_dev *hdev)
 {
        if (!(hdev->commands[18] & 0x04) ||
-           !(hdev->features[0][6] & LMP_ERR_DATA_REPORTING))
+           !(hdev->features[0][6] & LMP_ERR_DATA_REPORTING) ||
+           test_bit(HCI_QUIRK_BROKEN_ERR_DATA_REPORTING, &hdev->quirks))
                return 0;
 
        return __hci_cmd_sync_status(hdev, HCI_OP_READ_DEF_ERR_DATA_REPORTING,
@@ -4238,11 +4258,12 @@ static int hci_set_event_mask_page_2_sync(struct hci_dev *hdev)
 /* Read local codec list if the HCI command is supported */
 static int hci_read_local_codecs_sync(struct hci_dev *hdev)
 {
-       if (!(hdev->commands[29] & 0x20))
-               return 0;
+       if (hdev->commands[45] & 0x04)
+               hci_read_supported_codecs_v2(hdev);
+       else if (hdev->commands[29] & 0x20)
+               hci_read_supported_codecs(hdev);
 
-       return __hci_cmd_sync_status(hdev, HCI_OP_READ_LOCAL_CODECS, 0, NULL,
-                                    HCI_CMD_TIMEOUT);
+       return 0;
 }
 
 /* Read local pairing options if the HCI command is supported */
@@ -4258,7 +4279,7 @@ static int hci_read_local_pairing_opts_sync(struct hci_dev *hdev)
 /* Get MWS transport configuration if the HCI command is supported */
 static int hci_get_mws_transport_config_sync(struct hci_dev *hdev)
 {
-       if (!(hdev->commands[30] & 0x08))
+       if (!mws_transport_config_capable(hdev))
                return 0;
 
        return __hci_cmd_sync_status(hdev, HCI_OP_GET_MWS_TRANSPORT_CONFIG,
@@ -4298,7 +4319,8 @@ static int hci_set_err_data_report_sync(struct hci_dev *hdev)
        bool enabled = hci_dev_test_flag(hdev, HCI_WIDEBAND_SPEECH_ENABLED);
 
        if (!(hdev->commands[18] & 0x08) ||
-           !(hdev->features[0][6] & LMP_ERR_DATA_REPORTING))
+           !(hdev->features[0][6] & LMP_ERR_DATA_REPORTING) ||
+           test_bit(HCI_QUIRK_BROKEN_ERR_DATA_REPORTING, &hdev->quirks))
                return 0;
 
        if (enabled == hdev->err_data_reporting)
@@ -4457,6 +4479,9 @@ static const struct {
        HCI_QUIRK_BROKEN(STORED_LINK_KEY,
                         "HCI Delete Stored Link Key command is advertised, "
                         "but not supported."),
+       HCI_QUIRK_BROKEN(ERR_DATA_REPORTING,
+                        "HCI Read Default Erroneous Data Reporting command is "
+                        "advertised, but not supported."),
        HCI_QUIRK_BROKEN(READ_TRANSMIT_POWER,
                         "HCI Read Transmit Power Level command is advertised, "
                         "but not supported."),
@@ -4696,6 +4721,7 @@ int hci_dev_open_sync(struct hci_dev *hdev)
                        hdev->flush(hdev);
 
                if (hdev->sent_cmd) {
+                       cancel_delayed_work_sync(&hdev->cmd_timer);
                        kfree_skb(hdev->sent_cmd);
                        hdev->sent_cmd = NULL;
                }
index f825857..035bb5d 100644 (file)
@@ -261,36 +261,42 @@ static int iso_connect_bis(struct sock *sk)
 
        if (!bis_capable(hdev)) {
                err = -EOPNOTSUPP;
-               goto done;
+               goto unlock;
        }
 
        /* Fail if out PHYs are marked as disabled */
        if (!iso_pi(sk)->qos.out.phy) {
                err = -EINVAL;
-               goto done;
+               goto unlock;
        }
 
-       hcon = hci_connect_bis(hdev, &iso_pi(sk)->dst, iso_pi(sk)->dst_type,
+       hcon = hci_connect_bis(hdev, &iso_pi(sk)->dst,
+                              le_addr_type(iso_pi(sk)->dst_type),
                               &iso_pi(sk)->qos, iso_pi(sk)->base_len,
                               iso_pi(sk)->base);
        if (IS_ERR(hcon)) {
                err = PTR_ERR(hcon);
-               goto done;
+               goto unlock;
        }
 
        conn = iso_conn_add(hcon);
        if (!conn) {
                hci_conn_drop(hcon);
                err = -ENOMEM;
-               goto done;
+               goto unlock;
        }
 
+       hci_dev_unlock(hdev);
+       hci_dev_put(hdev);
+
+       lock_sock(sk);
+
        /* Update source addr of the socket */
        bacpy(&iso_pi(sk)->src, &hcon->src);
 
        err = iso_chan_add(conn, sk, NULL);
        if (err)
-               goto done;
+               goto release;
 
        if (hcon->state == BT_CONNECTED) {
                iso_sock_clear_timer(sk);
@@ -300,7 +306,11 @@ static int iso_connect_bis(struct sock *sk)
                iso_sock_set_timer(sk, sk->sk_sndtimeo);
        }
 
-done:
+release:
+       release_sock(sk);
+       return err;
+
+unlock:
        hci_dev_unlock(hdev);
        hci_dev_put(hdev);
        return err;
@@ -324,13 +334,13 @@ static int iso_connect_cis(struct sock *sk)
 
        if (!cis_central_capable(hdev)) {
                err = -EOPNOTSUPP;
-               goto done;
+               goto unlock;
        }
 
        /* Fail if either PHYs are marked as disabled */
        if (!iso_pi(sk)->qos.in.phy && !iso_pi(sk)->qos.out.phy) {
                err = -EINVAL;
-               goto done;
+               goto unlock;
        }
 
        /* Just bind if DEFER_SETUP has been set */
@@ -340,7 +350,7 @@ static int iso_connect_cis(struct sock *sk)
                                    &iso_pi(sk)->qos);
                if (IS_ERR(hcon)) {
                        err = PTR_ERR(hcon);
-                       goto done;
+                       goto unlock;
                }
        } else {
                hcon = hci_connect_cis(hdev, &iso_pi(sk)->dst,
@@ -348,7 +358,7 @@ static int iso_connect_cis(struct sock *sk)
                                       &iso_pi(sk)->qos);
                if (IS_ERR(hcon)) {
                        err = PTR_ERR(hcon);
-                       goto done;
+                       goto unlock;
                }
        }
 
@@ -356,15 +366,20 @@ static int iso_connect_cis(struct sock *sk)
        if (!conn) {
                hci_conn_drop(hcon);
                err = -ENOMEM;
-               goto done;
+               goto unlock;
        }
 
+       hci_dev_unlock(hdev);
+       hci_dev_put(hdev);
+
+       lock_sock(sk);
+
        /* Update source addr of the socket */
        bacpy(&iso_pi(sk)->src, &hcon->src);
 
        err = iso_chan_add(conn, sk, NULL);
        if (err)
-               goto done;
+               goto release;
 
        if (hcon->state == BT_CONNECTED) {
                iso_sock_clear_timer(sk);
@@ -377,7 +392,11 @@ static int iso_connect_cis(struct sock *sk)
                iso_sock_set_timer(sk, sk->sk_sndtimeo);
        }
 
-done:
+release:
+       release_sock(sk);
+       return err;
+
+unlock:
        hci_dev_unlock(hdev);
        hci_dev_put(hdev);
        return err;
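
Both connect paths are reshuffled the same way. A comment-style sketch of the lock ordering these hunks appear to establish (the assumption being that the old code ran with the socket lock held across hci_dev_lock(), risking a circular locking dependency against paths that take the two locks in the opposite order):

	/*
	 *	hci_dev_lock(hdev);
	 *	... look up hdev, create hcon and conn ...
	 *	hci_dev_unlock(hdev);		// drop hdev->lock first
	 *	hci_dev_put(hdev);
	 *
	 *	lock_sock(sk);			// only then lock the socket
	 *	... update iso_pi(sk)->src, add the channel ...
	 *	release_sock(sk);
	 */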
@@ -831,20 +850,23 @@ static int iso_sock_connect(struct socket *sock, struct sockaddr *addr,
        bacpy(&iso_pi(sk)->dst, &sa->iso_bdaddr);
        iso_pi(sk)->dst_type = sa->iso_bdaddr_type;
 
+       release_sock(sk);
+
        if (bacmp(&iso_pi(sk)->dst, BDADDR_ANY))
                err = iso_connect_cis(sk);
        else
                err = iso_connect_bis(sk);
 
        if (err)
-               goto done;
+               return err;
+
+       lock_sock(sk);
 
        if (!test_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags)) {
                err = bt_sock_wait_state(sk, BT_CONNECTED,
                                         sock_sndtimeo(sk, flags & O_NONBLOCK));
        }
 
-done:
        release_sock(sk);
        return err;
 }
@@ -875,10 +897,12 @@ static int iso_listen_bis(struct sock *sk)
 
        hci_dev_lock(hdev);
 
-       err = hci_pa_create_sync(hdev, &iso_pi(sk)->dst, iso_pi(sk)->dst_type,
+       err = hci_pa_create_sync(hdev, &iso_pi(sk)->dst,
+                                le_addr_type(iso_pi(sk)->dst_type),
                                 iso_pi(sk)->bc_sid);
 
        hci_dev_unlock(hdev);
+       hci_dev_put(hdev);
 
        return err;
 }
@@ -1098,28 +1122,22 @@ static int iso_sock_recvmsg(struct socket *sock, struct msghdr *msg,
 {
        struct sock *sk = sock->sk;
        struct iso_pinfo *pi = iso_pi(sk);
-       int err;
 
        BT_DBG("sk %p", sk);
 
-       lock_sock(sk);
-
        if (test_and_clear_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags)) {
                switch (sk->sk_state) {
                case BT_CONNECT2:
+                       lock_sock(sk);
                        iso_conn_defer_accept(pi->conn->hcon);
                        sk->sk_state = BT_CONFIG;
                        release_sock(sk);
                        return 0;
                case BT_CONNECT:
-                       err = iso_connect_cis(sk);
-                       release_sock(sk);
-                       return err;
+                       return iso_connect_cis(sk);
                }
        }
 
-       release_sock(sk);
-
        return bt_sock_recvmsg(sock, msg, len, flags);
 }
 
index 9c24947..a3e0dc6 100644 (file)
@@ -45,7 +45,7 @@
 #define LE_FLOWCTL_MAX_CREDITS 65535
 
 bool disable_ertm;
-bool enable_ecred;
+bool enable_ecred = IS_ENABLED(CONFIG_BT_LE_L2CAP_ECRED);
 
 static u32 l2cap_feat_mask = L2CAP_FEAT_FIXED_CHAN | L2CAP_FEAT_UCD;
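
The new initializer ties the runtime default to the CONFIG_BT_LE_L2CAP_ECRED option added earlier in this series. For reference, a sketch of the module-parameter glue this value feeds into (the permission bits and description string here are assumptions, not part of this diff):

	module_param(enable_ecred, bool, 0644);
	MODULE_PARM_DESC(enable_ecred, "Enable enhanced credit flow control mode");

so a default-enabled build can still be overridden by booting with bluetooth.enable_ecred=0.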
 
@@ -4453,7 +4453,8 @@ static inline int l2cap_config_req(struct l2cap_conn *conn,
 
        chan->ident = cmd->ident;
        l2cap_send_cmd(conn, cmd->ident, L2CAP_CONF_RSP, len, rsp);
-       chan->num_conf_rsp++;
+       if (chan->num_conf_rsp < L2CAP_CONF_MAX_CONF_RSP)
+               chan->num_conf_rsp++;
 
        /* Reset config buffer. */
        chan->conf_len = 0;
index 469a0c9..53a796a 100644 (file)
@@ -170,7 +170,7 @@ __u8 bt_status(int err)
        case -EMLINK:
                return 0x09;
 
-       case EALREADY:
+       case -EALREADY:
                return 0x0b;
 
        case -EBUSY:
@@ -191,7 +191,7 @@ __u8 bt_status(int err)
        case -ECONNABORTED:
                return 0x16;
 
-       case ELOOP:
+       case -ELOOP:
                return 0x17;
 
        case -EPROTONOSUPPORT:
index a92e7e4..0dd30a3 100644 (file)
@@ -8859,7 +8859,7 @@ static int add_ext_adv_params(struct sock *sk, struct hci_dev *hdev,
         * extra parameters we don't know about will be ignored in this request.
         */
        if (data_len < MGMT_ADD_EXT_ADV_PARAMS_MIN_SIZE)
-               return mgmt_cmd_status(sk, hdev->id, MGMT_OP_ADD_ADVERTISING,
+               return mgmt_cmd_status(sk, hdev->id, MGMT_OP_ADD_EXT_ADV_PARAMS,
                                       MGMT_STATUS_INVALID_PARAMS);
 
        flags = __le32_to_cpu(cp->flags);
index 7324764..8d6fce9 100644 (file)
@@ -590,7 +590,7 @@ int rfcomm_dlc_send(struct rfcomm_dlc *d, struct sk_buff *skb)
 
                ret = rfcomm_dlc_send_frag(d, frag);
                if (ret < 0) {
-                       kfree_skb(frag);
+                       dev_kfree_skb_irq(frag);
                        goto unlock;
                }
 
index 6094ef7..2723623 100644 (file)
@@ -489,7 +489,6 @@ int noinline bpf_fentry_test1(int a)
        return a + 1;
 }
 EXPORT_SYMBOL_GPL(bpf_fentry_test1);
-ALLOW_ERROR_INJECTION(bpf_fentry_test1, ERRNO);
 
 int noinline bpf_fentry_test2(int a, u64 b)
 {
@@ -733,7 +732,15 @@ noinline void bpf_kfunc_call_test_destructive(void)
 
 __diag_pop();
 
-ALLOW_ERROR_INJECTION(bpf_modify_return_test, ERRNO);
+BTF_SET8_START(bpf_test_modify_return_ids)
+BTF_ID_FLAGS(func, bpf_modify_return_test)
+BTF_ID_FLAGS(func, bpf_fentry_test1, KF_SLEEPABLE)
+BTF_SET8_END(bpf_test_modify_return_ids)
+
+static const struct btf_kfunc_id_set bpf_test_modify_return_set = {
+       .owner = THIS_MODULE,
+       .set   = &bpf_test_modify_return_ids,
+};
 
 BTF_SET8_START(test_sk_check_kfunc_ids)
 BTF_ID_FLAGS(func, bpf_kfunc_call_test1)
@@ -1128,7 +1135,7 @@ int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr,
        }
        sock_init_data(NULL, sk);
 
-       skb = build_skb(data, 0);
+       skb = slab_build_skb(data);
        if (!skb) {
                kfree(data);
                kfree(ctx);
@@ -1666,7 +1673,8 @@ static int __init bpf_prog_test_run_init(void)
        };
        int ret;
 
-       ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, &bpf_prog_test_kfunc_set);
+       ret = register_btf_fmodret_id_set(&bpf_test_modify_return_set);
+       ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, &bpf_prog_test_kfunc_set);
        ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &bpf_prog_test_kfunc_set);
        ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SYSCALL, &bpf_prog_test_kfunc_set);
        return ret ?: register_btf_id_dtor_kfuncs(bpf_prog_test_dtor_kfunc,
index 321be94..00e5743 100644 (file)
@@ -663,6 +663,28 @@ errout:
        rtnl_set_sk_err(net, RTNLGRP_MDB, err);
 }
 
+static const struct nla_policy
+br_mdbe_src_list_entry_pol[MDBE_SRCATTR_MAX + 1] = {
+       [MDBE_SRCATTR_ADDRESS] = NLA_POLICY_RANGE(NLA_BINARY,
+                                                 sizeof(struct in_addr),
+                                                 sizeof(struct in6_addr)),
+};
+
+static const struct nla_policy
+br_mdbe_src_list_pol[MDBE_SRC_LIST_MAX + 1] = {
+       [MDBE_SRC_LIST_ENTRY] = NLA_POLICY_NESTED(br_mdbe_src_list_entry_pol),
+};
+
+static const struct nla_policy br_mdbe_attrs_pol[MDBE_ATTR_MAX + 1] = {
+       [MDBE_ATTR_SOURCE] = NLA_POLICY_RANGE(NLA_BINARY,
+                                             sizeof(struct in_addr),
+                                             sizeof(struct in6_addr)),
+       [MDBE_ATTR_GROUP_MODE] = NLA_POLICY_RANGE(NLA_U8, MCAST_EXCLUDE,
+                                                 MCAST_INCLUDE),
+       [MDBE_ATTR_SRC_LIST] = NLA_POLICY_NESTED(br_mdbe_src_list_pol),
+       [MDBE_ATTR_RTPROT] = NLA_POLICY_MIN(NLA_U8, RTPROT_STATIC),
+};
+
 static bool is_valid_mdb_entry(struct br_mdb_entry *entry,
                               struct netlink_ext_ack *extack)
 {
@@ -748,79 +770,6 @@ static bool is_valid_mdb_source(struct nlattr *attr, __be16 proto,
        return true;
 }
 
-static const struct nla_policy br_mdbe_attrs_pol[MDBE_ATTR_MAX + 1] = {
-       [MDBE_ATTR_SOURCE] = NLA_POLICY_RANGE(NLA_BINARY,
-                                             sizeof(struct in_addr),
-                                             sizeof(struct in6_addr)),
-};
-
-static int br_mdb_parse(struct sk_buff *skb, struct nlmsghdr *nlh,
-                       struct net_device **pdev, struct br_mdb_entry **pentry,
-                       struct nlattr **mdb_attrs, struct netlink_ext_ack *extack)
-{
-       struct net *net = sock_net(skb->sk);
-       struct br_mdb_entry *entry;
-       struct br_port_msg *bpm;
-       struct nlattr *tb[MDBA_SET_ENTRY_MAX+1];
-       struct net_device *dev;
-       int err;
-
-       err = nlmsg_parse_deprecated(nlh, sizeof(*bpm), tb,
-                                    MDBA_SET_ENTRY_MAX, NULL, NULL);
-       if (err < 0)
-               return err;
-
-       bpm = nlmsg_data(nlh);
-       if (bpm->ifindex == 0) {
-               NL_SET_ERR_MSG_MOD(extack, "Invalid bridge ifindex");
-               return -EINVAL;
-       }
-
-       dev = __dev_get_by_index(net, bpm->ifindex);
-       if (dev == NULL) {
-               NL_SET_ERR_MSG_MOD(extack, "Bridge device doesn't exist");
-               return -ENODEV;
-       }
-
-       if (!netif_is_bridge_master(dev)) {
-               NL_SET_ERR_MSG_MOD(extack, "Device is not a bridge");
-               return -EOPNOTSUPP;
-       }
-
-       *pdev = dev;
-
-       if (!tb[MDBA_SET_ENTRY]) {
-               NL_SET_ERR_MSG_MOD(extack, "Missing MDBA_SET_ENTRY attribute");
-               return -EINVAL;
-       }
-       if (nla_len(tb[MDBA_SET_ENTRY]) != sizeof(struct br_mdb_entry)) {
-               NL_SET_ERR_MSG_MOD(extack, "Invalid MDBA_SET_ENTRY attribute length");
-               return -EINVAL;
-       }
-
-       entry = nla_data(tb[MDBA_SET_ENTRY]);
-       if (!is_valid_mdb_entry(entry, extack))
-               return -EINVAL;
-       *pentry = entry;
-
-       if (tb[MDBA_SET_ENTRY_ATTRS]) {
-               err = nla_parse_nested(mdb_attrs, MDBE_ATTR_MAX,
-                                      tb[MDBA_SET_ENTRY_ATTRS],
-                                      br_mdbe_attrs_pol, extack);
-               if (err)
-                       return err;
-               if (mdb_attrs[MDBE_ATTR_SOURCE] &&
-                   !is_valid_mdb_source(mdb_attrs[MDBE_ATTR_SOURCE],
-                                        entry->addr.proto, extack))
-                       return -EINVAL;
-       } else {
-               memset(mdb_attrs, 0,
-                      sizeof(struct nlattr *) * (MDBE_ATTR_MAX + 1));
-       }
-
-       return 0;
-}
-
 static struct net_bridge_mcast *
 __br_mdb_choose_context(struct net_bridge *br,
                        const struct br_mdb_entry *entry,
@@ -853,213 +802,669 @@ out:
        return brmctx;
 }
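
The rewritten helpers below all take a br_mdb_config that packages what the deleted br_mdb_parse() used to extract piecemeal. Its definition is not visible in this excerpt; a plausible shape, inferred purely from the field accesses that follow (ordering and completeness are guesses):

	struct br_mdb_config {
		struct net_bridge	*br;
		struct net_bridge_port	*p;
		struct br_mdb_entry	*entry;
		struct br_ip		group;
		bool			src_entry;
		u8			filter_mode;
		u8			rt_protocol;
		u16			nlflags;
	};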
 
-static int br_mdb_add_group(struct net_bridge *br, struct net_bridge_port *port,
-                           struct br_mdb_entry *entry,
-                           struct nlattr **mdb_attrs,
-                           struct netlink_ext_ack *extack)
+static int br_mdb_replace_group_sg(const struct br_mdb_config *cfg,
+                                  struct net_bridge_mdb_entry *mp,
+                                  struct net_bridge_port_group *pg,
+                                  struct net_bridge_mcast *brmctx,
+                                  unsigned char flags)
+{
+       unsigned long now = jiffies;
+
+       pg->flags = flags;
+       pg->rt_protocol = cfg->rt_protocol;
+       if (!(flags & MDB_PG_FLAGS_PERMANENT) && !cfg->src_entry)
+               mod_timer(&pg->timer,
+                         now + brmctx->multicast_membership_interval);
+       else
+               del_timer(&pg->timer);
+
+       br_mdb_notify(cfg->br->dev, mp, pg, RTM_NEWMDB);
+
+       return 0;
+}
+
+static int br_mdb_add_group_sg(const struct br_mdb_config *cfg,
+                              struct net_bridge_mdb_entry *mp,
+                              struct net_bridge_mcast *brmctx,
+                              unsigned char flags,
+                              struct netlink_ext_ack *extack)
 {
-       struct net_bridge_mdb_entry *mp, *star_mp;
        struct net_bridge_port_group __rcu **pp;
        struct net_bridge_port_group *p;
-       struct net_bridge_mcast *brmctx;
-       struct br_ip group, star_group;
        unsigned long now = jiffies;
-       unsigned char flags = 0;
-       u8 filter_mode;
 
-       __mdb_entry_to_br_ip(entry, &group, mdb_attrs);
+       for (pp = &mp->ports;
+            (p = mlock_dereference(*pp, cfg->br)) != NULL;
+            pp = &p->next) {
+               if (p->key.port == cfg->p) {
+                       if (!(cfg->nlflags & NLM_F_REPLACE)) {
+                               NL_SET_ERR_MSG_MOD(extack, "(S, G) group is already joined by port");
+                               return -EEXIST;
+                       }
+                       return br_mdb_replace_group_sg(cfg, mp, p, brmctx,
+                                                      flags);
+               }
+               if ((unsigned long)p->key.port < (unsigned long)cfg->p)
+                       break;
+       }
 
-       brmctx = __br_mdb_choose_context(br, entry, extack);
-       if (!brmctx)
-               return -EINVAL;
+       p = br_multicast_new_port_group(cfg->p, &cfg->group, *pp, flags, NULL,
+                                       MCAST_INCLUDE, cfg->rt_protocol);
+       if (unlikely(!p)) {
+               NL_SET_ERR_MSG_MOD(extack, "Couldn't allocate new (S, G) port group");
+               return -ENOMEM;
+       }
+       rcu_assign_pointer(*pp, p);
+       if (!(flags & MDB_PG_FLAGS_PERMANENT) && !cfg->src_entry)
+               mod_timer(&p->timer,
+                         now + brmctx->multicast_membership_interval);
+       br_mdb_notify(cfg->br->dev, mp, p, RTM_NEWMDB);
 
-       /* host join errors which can happen before creating the group */
-       if (!port && !br_group_is_l2(&group)) {
-               /* don't allow any flags for host-joined IP groups */
-               if (entry->state) {
-                       NL_SET_ERR_MSG_MOD(extack, "Flags are not allowed for host groups");
-                       return -EINVAL;
-               }
-               if (!br_multicast_is_star_g(&group)) {
-                       NL_SET_ERR_MSG_MOD(extack, "Groups with sources cannot be manually host joined");
-                       return -EINVAL;
-               }
+       /* All of (*, G) EXCLUDE ports need to be added to the new (S, G) for
+        * proper replication.
+        */
+       if (br_multicast_should_handle_mode(brmctx, cfg->group.proto)) {
+               struct net_bridge_mdb_entry *star_mp;
+               struct br_ip star_group;
+
+               star_group = p->key.addr;
+               memset(&star_group.src, 0, sizeof(star_group.src));
+               star_mp = br_mdb_ip_get(cfg->br, &star_group);
+               if (star_mp)
+                       br_multicast_sg_add_exclude_ports(star_mp, p);
        }
 
-       if (br_group_is_l2(&group) && entry->state != MDB_PERMANENT) {
-               NL_SET_ERR_MSG_MOD(extack, "Only permanent L2 entries allowed");
-               return -EINVAL;
+       return 0;
+}
+
+static int br_mdb_add_group_src_fwd(const struct br_mdb_config *cfg,
+                                   struct br_ip *src_ip,
+                                   struct net_bridge_mcast *brmctx,
+                                   struct netlink_ext_ack *extack)
+{
+       struct net_bridge_mdb_entry *sgmp;
+       struct br_mdb_config sg_cfg;
+       struct br_ip sg_ip;
+       u8 flags = 0;
+
+       sg_ip = cfg->group;
+       sg_ip.src = src_ip->src;
+       sgmp = br_multicast_new_group(cfg->br, &sg_ip);
+       if (IS_ERR(sgmp)) {
+               NL_SET_ERR_MSG_MOD(extack, "Failed to add (S, G) MDB entry");
+               return PTR_ERR(sgmp);
        }
 
-       mp = br_multicast_new_group(br, &group);
-       if (IS_ERR(mp))
-               return PTR_ERR(mp);
+       if (cfg->entry->state == MDB_PERMANENT)
+               flags |= MDB_PG_FLAGS_PERMANENT;
+       if (cfg->filter_mode == MCAST_EXCLUDE)
+               flags |= MDB_PG_FLAGS_BLOCKED;
+
+       memset(&sg_cfg, 0, sizeof(sg_cfg));
+       sg_cfg.br = cfg->br;
+       sg_cfg.p = cfg->p;
+       sg_cfg.entry = cfg->entry;
+       sg_cfg.group = sg_ip;
+       sg_cfg.src_entry = true;
+       sg_cfg.filter_mode = MCAST_INCLUDE;
+       sg_cfg.rt_protocol = cfg->rt_protocol;
+       sg_cfg.nlflags = cfg->nlflags;
+       return br_mdb_add_group_sg(&sg_cfg, sgmp, brmctx, flags, extack);
+}
 
-       /* host join */
-       if (!port) {
-               if (mp->host_joined) {
-                       NL_SET_ERR_MSG_MOD(extack, "Group is already joined by host");
-                       return -EEXIST;
+static int br_mdb_add_group_src(const struct br_mdb_config *cfg,
+                               struct net_bridge_port_group *pg,
+                               struct net_bridge_mcast *brmctx,
+                               struct br_mdb_src_entry *src,
+                               struct netlink_ext_ack *extack)
+{
+       struct net_bridge_group_src *ent;
+       unsigned long now = jiffies;
+       int err;
+
+       ent = br_multicast_find_group_src(pg, &src->addr);
+       if (!ent) {
+               ent = br_multicast_new_group_src(pg, &src->addr);
+               if (!ent) {
+                       NL_SET_ERR_MSG_MOD(extack, "Failed to add new source entry");
+                       return -ENOSPC;
                }
+       } else if (!(cfg->nlflags & NLM_F_REPLACE)) {
+               NL_SET_ERR_MSG_MOD(extack, "Source entry already exists");
+               return -EEXIST;
+       }
 
-               br_multicast_host_join(brmctx, mp, false);
-               br_mdb_notify(br->dev, mp, NULL, RTM_NEWMDB);
+       if (cfg->filter_mode == MCAST_INCLUDE &&
+           cfg->entry->state == MDB_TEMPORARY)
+               mod_timer(&ent->timer, now + br_multicast_gmi(brmctx));
+       else
+               del_timer(&ent->timer);
 
-               return 0;
+       /* Install a (S, G) forwarding entry for the source. */
+       err = br_mdb_add_group_src_fwd(cfg, &src->addr, brmctx, extack);
+       if (err)
+               goto err_del_sg;
+
+       ent->flags = BR_SGRP_F_INSTALLED | BR_SGRP_F_USER_ADDED;
+
+       return 0;
+
+err_del_sg:
+       __br_multicast_del_group_src(ent);
+       return err;
+}
+
+static void br_mdb_del_group_src(struct net_bridge_port_group *pg,
+                                struct br_mdb_src_entry *src)
+{
+       struct net_bridge_group_src *ent;
+
+       ent = br_multicast_find_group_src(pg, &src->addr);
+       if (WARN_ON_ONCE(!ent))
+               return;
+       br_multicast_del_group_src(ent, false);
+}
+
+static int br_mdb_add_group_srcs(const struct br_mdb_config *cfg,
+                                struct net_bridge_port_group *pg,
+                                struct net_bridge_mcast *brmctx,
+                                struct netlink_ext_ack *extack)
+{
+       int i, err;
+
+       for (i = 0; i < cfg->num_src_entries; i++) {
+               err = br_mdb_add_group_src(cfg, pg, brmctx,
+                                          &cfg->src_entries[i], extack);
+               if (err)
+                       goto err_del_group_srcs;
        }
 
+       return 0;
+
+err_del_group_srcs:
+       for (i--; i >= 0; i--)
+               br_mdb_del_group_src(pg, &cfg->src_entries[i]);
+       return err;
+}
+
+static int br_mdb_replace_group_srcs(const struct br_mdb_config *cfg,
+                                    struct net_bridge_port_group *pg,
+                                    struct net_bridge_mcast *brmctx,
+                                    struct netlink_ext_ack *extack)
+{
+       struct net_bridge_group_src *ent;
+       struct hlist_node *tmp;
+       int err;
+
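+       /* Mark-and-sweep replace: flag every current source for deletion,
+        * re-add the requested set (br_mdb_add_group_src() overwrites the
+        * flags of entries that survive), then sweep whatever is still
+        * flagged below.
+        */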
+       hlist_for_each_entry(ent, &pg->src_list, node)
+               ent->flags |= BR_SGRP_F_DELETE;
+
+       err = br_mdb_add_group_srcs(cfg, pg, brmctx, extack);
+       if (err)
+               goto err_clear_delete;
+
+       hlist_for_each_entry_safe(ent, tmp, &pg->src_list, node) {
+               if (ent->flags & BR_SGRP_F_DELETE)
+                       br_multicast_del_group_src(ent, false);
+       }
+
+       return 0;
+
+err_clear_delete:
+       hlist_for_each_entry(ent, &pg->src_list, node)
+               ent->flags &= ~BR_SGRP_F_DELETE;
+       return err;
+}
+
+static int br_mdb_replace_group_star_g(const struct br_mdb_config *cfg,
+                                      struct net_bridge_mdb_entry *mp,
+                                      struct net_bridge_port_group *pg,
+                                      struct net_bridge_mcast *brmctx,
+                                      unsigned char flags,
+                                      struct netlink_ext_ack *extack)
+{
+       unsigned long now = jiffies;
+       int err;
+
+       err = br_mdb_replace_group_srcs(cfg, pg, brmctx, extack);
+       if (err)
+               return err;
+
+       pg->flags = flags;
+       pg->filter_mode = cfg->filter_mode;
+       pg->rt_protocol = cfg->rt_protocol;
+       if (!(flags & MDB_PG_FLAGS_PERMANENT) &&
+           cfg->filter_mode == MCAST_EXCLUDE)
+               mod_timer(&pg->timer,
+                         now + brmctx->multicast_membership_interval);
+       else
+               del_timer(&pg->timer);
+
+       br_mdb_notify(cfg->br->dev, mp, pg, RTM_NEWMDB);
+
+       if (br_multicast_should_handle_mode(brmctx, cfg->group.proto))
+               br_multicast_star_g_handle_mode(pg, cfg->filter_mode);
+
+       return 0;
+}
+
+static int br_mdb_add_group_star_g(const struct br_mdb_config *cfg,
+                                  struct net_bridge_mdb_entry *mp,
+                                  struct net_bridge_mcast *brmctx,
+                                  unsigned char flags,
+                                  struct netlink_ext_ack *extack)
+{
+       struct net_bridge_port_group __rcu **pp;
+       struct net_bridge_port_group *p;
+       unsigned long now = jiffies;
+       int err;
+
        for (pp = &mp->ports;
-            (p = mlock_dereference(*pp, br)) != NULL;
+            (p = mlock_dereference(*pp, cfg->br)) != NULL;
             pp = &p->next) {
-               if (p->key.port == port) {
-                       NL_SET_ERR_MSG_MOD(extack, "Group is already joined by port");
-                       return -EEXIST;
+               if (p->key.port == cfg->p) {
+                       if (!(cfg->nlflags & NLM_F_REPLACE)) {
+                               NL_SET_ERR_MSG_MOD(extack, "(*, G) group is already joined by port");
+                               return -EEXIST;
+                       }
+                       return br_mdb_replace_group_star_g(cfg, mp, p, brmctx,
+                                                          flags, extack);
                }
-               if ((unsigned long)p->key.port < (unsigned long)port)
+               if ((unsigned long)p->key.port < (unsigned long)cfg->p)
                        break;
        }
 
-       filter_mode = br_multicast_is_star_g(&group) ? MCAST_EXCLUDE :
-                                                      MCAST_INCLUDE;
-
-       if (entry->state == MDB_PERMANENT)
-               flags |= MDB_PG_FLAGS_PERMANENT;
-
-       p = br_multicast_new_port_group(port, &group, *pp, flags, NULL,
-                                       filter_mode, RTPROT_STATIC);
+       p = br_multicast_new_port_group(cfg->p, &cfg->group, *pp, flags, NULL,
+                                       cfg->filter_mode, cfg->rt_protocol);
        if (unlikely(!p)) {
-               NL_SET_ERR_MSG_MOD(extack, "Couldn't allocate new port group");
+               NL_SET_ERR_MSG_MOD(extack, "Couldn't allocate new (*, G) port group");
                return -ENOMEM;
        }
+
+       err = br_mdb_add_group_srcs(cfg, p, brmctx, extack);
+       if (err)
+               goto err_del_port_group;
+
        rcu_assign_pointer(*pp, p);
-       if (entry->state == MDB_TEMPORARY)
+       if (!(flags & MDB_PG_FLAGS_PERMANENT) &&
+           cfg->filter_mode == MCAST_EXCLUDE)
                mod_timer(&p->timer,
                          now + brmctx->multicast_membership_interval);
-       br_mdb_notify(br->dev, mp, p, RTM_NEWMDB);
-       /* if we are adding a new EXCLUDE port group (*,G) it needs to be also
-        * added to all S,G entries for proper replication, if we are adding
-        * a new INCLUDE port (S,G) then all of *,G EXCLUDE ports need to be
-        * added to it for proper replication
+       br_mdb_notify(cfg->br->dev, mp, p, RTM_NEWMDB);
+       /* If we are adding a new EXCLUDE port group (*, G), it needs to be
+        * also added to all (S, G) entries for proper replication.
         */
-       if (br_multicast_should_handle_mode(brmctx, group.proto)) {
-               switch (filter_mode) {
-               case MCAST_EXCLUDE:
-                       br_multicast_star_g_handle_mode(p, MCAST_EXCLUDE);
-                       break;
-               case MCAST_INCLUDE:
-                       star_group = p->key.addr;
-                       memset(&star_group.src, 0, sizeof(star_group.src));
-                       star_mp = br_mdb_ip_get(br, &star_group);
-                       if (star_mp)
-                               br_multicast_sg_add_exclude_ports(star_mp, p);
-                       break;
+       if (br_multicast_should_handle_mode(brmctx, cfg->group.proto) &&
+           cfg->filter_mode == MCAST_EXCLUDE)
+               br_multicast_star_g_handle_mode(p, MCAST_EXCLUDE);
+
+       return 0;
+
+err_del_port_group:
+       hlist_del_init(&p->mglist);
+       kfree(p);
+       return err;
+}
+
+static int br_mdb_add_group(const struct br_mdb_config *cfg,
+                           struct netlink_ext_ack *extack)
+{
+       struct br_mdb_entry *entry = cfg->entry;
+       struct net_bridge_port *port = cfg->p;
+       struct net_bridge_mdb_entry *mp;
+       struct net_bridge *br = cfg->br;
+       struct net_bridge_mcast *brmctx;
+       struct br_ip group = cfg->group;
+       unsigned char flags = 0;
+
+       brmctx = __br_mdb_choose_context(br, entry, extack);
+       if (!brmctx)
+               return -EINVAL;
+
+       mp = br_multicast_new_group(br, &group);
+       if (IS_ERR(mp))
+               return PTR_ERR(mp);
+
+       /* host join */
+       if (!port) {
+               if (mp->host_joined) {
+                       NL_SET_ERR_MSG_MOD(extack, "Group is already joined by host");
+                       return -EEXIST;
                }
+
+               br_multicast_host_join(brmctx, mp, false);
+               br_mdb_notify(br->dev, mp, NULL, RTM_NEWMDB);
+
+               return 0;
        }
 
-       return 0;
+       if (entry->state == MDB_PERMANENT)
+               flags |= MDB_PG_FLAGS_PERMANENT;
+
+       if (br_multicast_is_star_g(&group))
+               return br_mdb_add_group_star_g(cfg, mp, brmctx, flags, extack);
+       else
+               return br_mdb_add_group_sg(cfg, mp, brmctx, flags, extack);
 }
 
-static int __br_mdb_add(struct net *net, struct net_bridge *br,
-                       struct net_bridge_port *p,
-                       struct br_mdb_entry *entry,
-                       struct nlattr **mdb_attrs,
+static int __br_mdb_add(const struct br_mdb_config *cfg,
                        struct netlink_ext_ack *extack)
 {
        int ret;
 
-       spin_lock_bh(&br->multicast_lock);
-       ret = br_mdb_add_group(br, p, entry, mdb_attrs, extack);
-       spin_unlock_bh(&br->multicast_lock);
+       spin_lock_bh(&cfg->br->multicast_lock);
+       ret = br_mdb_add_group(cfg, extack);
+       spin_unlock_bh(&cfg->br->multicast_lock);
 
        return ret;
 }
 
-static int br_mdb_add(struct sk_buff *skb, struct nlmsghdr *nlh,
-                     struct netlink_ext_ack *extack)
+static int br_mdb_config_src_entry_init(struct nlattr *src_entry,
+                                       struct br_mdb_src_entry *src,
+                                       __be16 proto,
+                                       struct netlink_ext_ack *extack)
+{
+       struct nlattr *tb[MDBE_SRCATTR_MAX + 1];
+       int err;
+
+       err = nla_parse_nested(tb, MDBE_SRCATTR_MAX, src_entry,
+                              br_mdbe_src_list_entry_pol, extack);
+       if (err)
+               return err;
+
+       if (NL_REQ_ATTR_CHECK(extack, src_entry, tb, MDBE_SRCATTR_ADDRESS))
+               return -EINVAL;
+
+       if (!is_valid_mdb_source(tb[MDBE_SRCATTR_ADDRESS], proto, extack))
+               return -EINVAL;
+
+       src->addr.proto = proto;
+       nla_memcpy(&src->addr.src, tb[MDBE_SRCATTR_ADDRESS],
+                  nla_len(tb[MDBE_SRCATTR_ADDRESS]));
+
+       return 0;
+}
+
+static int br_mdb_config_src_list_init(struct nlattr *src_list,
+                                      struct br_mdb_config *cfg,
+                                      struct netlink_ext_ack *extack)
+{
+       struct nlattr *src_entry;
+       int rem, err;
+       int i = 0;
+
+       nla_for_each_nested(src_entry, src_list, rem)
+               cfg->num_src_entries++;
+
+       if (cfg->num_src_entries >= PG_SRC_ENT_LIMIT) {
+               NL_SET_ERR_MSG_FMT_MOD(extack, "Exceeded maximum number of source entries (%u)",
+                                      PG_SRC_ENT_LIMIT - 1);
+               return -EINVAL;
+       }
+
+       cfg->src_entries = kcalloc(cfg->num_src_entries,
+                                  sizeof(struct br_mdb_src_entry), GFP_KERNEL);
+       if (!cfg->src_entries)
+               return -ENOMEM;
+
+       nla_for_each_nested(src_entry, src_list, rem) {
+               err = br_mdb_config_src_entry_init(src_entry,
+                                                  &cfg->src_entries[i],
+                                                  cfg->entry->addr.proto,
+                                                  extack);
+               if (err)
+                       goto err_src_entry_init;
+               i++;
+       }
+
+       return 0;
+
+err_src_entry_init:
+       kfree(cfg->src_entries);
+       return err;
+}
+
+static void br_mdb_config_src_list_fini(struct br_mdb_config *cfg)
+{
+       kfree(cfg->src_entries);
+}
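+
+/* Illustrative only: the nesting this parser expects, written with the
+ * kernel's own nla helpers for brevity (real requests are built by
+ * userspace, IPv4 sources shown). MDBE_SRC_LIST_ENTRY is assumed from
+ * the matching uapi change, which is outside this hunk, and the
+ * function itself is a hypothetical sketch, not part of this patch.
+ */
+static int __maybe_unused example_fill_src_list(struct sk_buff *skb,
+                                               const __be32 *srcs, int n)
+{
+       struct nlattr *list, *entry;
+       int i;
+
+       list = nla_nest_start(skb, MDBE_ATTR_SRC_LIST);
+       if (!list)
+               return -EMSGSIZE;
+       for (i = 0; i < n; i++) {
+               entry = nla_nest_start(skb, MDBE_SRC_LIST_ENTRY);
+               if (!entry)
+                       return -EMSGSIZE;
+               /* One address per entry; checked by is_valid_mdb_source() */
+               if (nla_put_in_addr(skb, MDBE_SRCATTR_ADDRESS, srcs[i]))
+                       return -EMSGSIZE;
+               nla_nest_end(skb, entry);
+       }
+       nla_nest_end(skb, list);
+       return 0;
+}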
+
+static int br_mdb_config_attrs_init(struct nlattr *set_attrs,
+                                   struct br_mdb_config *cfg,
+                                   struct netlink_ext_ack *extack)
 {
        struct nlattr *mdb_attrs[MDBE_ATTR_MAX + 1];
-       struct net *net = sock_net(skb->sk);
-       struct net_bridge_vlan_group *vg;
-       struct net_bridge_port *p = NULL;
-       struct net_device *dev, *pdev;
-       struct br_mdb_entry *entry;
-       struct net_bridge_vlan *v;
-       struct net_bridge *br;
        int err;
 
-       err = br_mdb_parse(skb, nlh, &dev, &entry, mdb_attrs, extack);
-       if (err < 0)
+       err = nla_parse_nested(mdb_attrs, MDBE_ATTR_MAX, set_attrs,
+                              br_mdbe_attrs_pol, extack);
+       if (err)
+               return err;
+
+       if (mdb_attrs[MDBE_ATTR_SOURCE] &&
+           !is_valid_mdb_source(mdb_attrs[MDBE_ATTR_SOURCE],
+                                cfg->entry->addr.proto, extack))
+               return -EINVAL;
+
+       __mdb_entry_to_br_ip(cfg->entry, &cfg->group, mdb_attrs);
+
+       if (mdb_attrs[MDBE_ATTR_GROUP_MODE]) {
+               if (!cfg->p) {
+                       NL_SET_ERR_MSG_MOD(extack, "Filter mode cannot be set for host groups");
+                       return -EINVAL;
+               }
+               if (!br_multicast_is_star_g(&cfg->group)) {
+                       NL_SET_ERR_MSG_MOD(extack, "Filter mode can only be set for (*, G) entries");
+                       return -EINVAL;
+               }
+               cfg->filter_mode = nla_get_u8(mdb_attrs[MDBE_ATTR_GROUP_MODE]);
+       } else {
+               cfg->filter_mode = MCAST_EXCLUDE;
+       }
+
+       if (mdb_attrs[MDBE_ATTR_SRC_LIST]) {
+               if (!cfg->p) {
+                       NL_SET_ERR_MSG_MOD(extack, "Source list cannot be set for host groups");
+                       return -EINVAL;
+               }
+               if (!br_multicast_is_star_g(&cfg->group)) {
+                       NL_SET_ERR_MSG_MOD(extack, "Source list can only be set for (*, G) entries");
+                       return -EINVAL;
+               }
+               if (!mdb_attrs[MDBE_ATTR_GROUP_MODE]) {
+                       NL_SET_ERR_MSG_MOD(extack, "Source list cannot be set without filter mode");
+                       return -EINVAL;
+               }
+               err = br_mdb_config_src_list_init(mdb_attrs[MDBE_ATTR_SRC_LIST],
+                                                 cfg, extack);
+               if (err)
+                       return err;
+       }
+
+       if (!cfg->num_src_entries && cfg->filter_mode == MCAST_INCLUDE) {
+               NL_SET_ERR_MSG_MOD(extack, "Cannot add (*, G) INCLUDE with an empty source list");
+               return -EINVAL;
+       }
+
+       if (mdb_attrs[MDBE_ATTR_RTPROT]) {
+               if (!cfg->p) {
+                       NL_SET_ERR_MSG_MOD(extack, "Protocol cannot be set for host groups");
+                       return -EINVAL;
+               }
+               cfg->rt_protocol = nla_get_u8(mdb_attrs[MDBE_ATTR_RTPROT]);
+       }
+
+       return 0;
+}
+
+static int br_mdb_config_init(struct net *net, const struct nlmsghdr *nlh,
+                             struct br_mdb_config *cfg,
+                             struct netlink_ext_ack *extack)
+{
+       struct nlattr *tb[MDBA_SET_ENTRY_MAX + 1];
+       struct br_port_msg *bpm;
+       struct net_device *dev;
+       int err;
+
+       err = nlmsg_parse_deprecated(nlh, sizeof(*bpm), tb,
+                                    MDBA_SET_ENTRY_MAX, NULL, extack);
+       if (err)
                return err;
 
-       br = netdev_priv(dev);
+       memset(cfg, 0, sizeof(*cfg));
+       cfg->filter_mode = MCAST_EXCLUDE;
+       cfg->rt_protocol = RTPROT_STATIC;
+       cfg->nlflags = nlh->nlmsg_flags;
+
+       bpm = nlmsg_data(nlh);
+       if (!bpm->ifindex) {
+               NL_SET_ERR_MSG_MOD(extack, "Invalid bridge ifindex");
+               return -EINVAL;
+       }
+
+       dev = __dev_get_by_index(net, bpm->ifindex);
+       if (!dev) {
+               NL_SET_ERR_MSG_MOD(extack, "Bridge device doesn't exist");
+               return -ENODEV;
+       }
+
+       if (!netif_is_bridge_master(dev)) {
+               NL_SET_ERR_MSG_MOD(extack, "Device is not a bridge");
+               return -EOPNOTSUPP;
+       }
+
+       cfg->br = netdev_priv(dev);
 
-       if (!netif_running(br->dev)) {
+       if (!netif_running(cfg->br->dev)) {
                NL_SET_ERR_MSG_MOD(extack, "Bridge device is not running");
                return -EINVAL;
        }
 
-       if (!br_opt_get(br, BROPT_MULTICAST_ENABLED)) {
+       if (!br_opt_get(cfg->br, BROPT_MULTICAST_ENABLED)) {
                NL_SET_ERR_MSG_MOD(extack, "Bridge's multicast processing is disabled");
                return -EINVAL;
        }
 
-       if (entry->ifindex != br->dev->ifindex) {
-               pdev = __dev_get_by_index(net, entry->ifindex);
+       if (NL_REQ_ATTR_CHECK(extack, NULL, tb, MDBA_SET_ENTRY)) {
+               NL_SET_ERR_MSG_MOD(extack, "Missing MDBA_SET_ENTRY attribute");
+               return -EINVAL;
+       }
+       if (nla_len(tb[MDBA_SET_ENTRY]) != sizeof(struct br_mdb_entry)) {
+               NL_SET_ERR_MSG_MOD(extack, "Invalid MDBA_SET_ENTRY attribute length");
+               return -EINVAL;
+       }
+
+       cfg->entry = nla_data(tb[MDBA_SET_ENTRY]);
+       if (!is_valid_mdb_entry(cfg->entry, extack))
+               return -EINVAL;
+
+       if (cfg->entry->ifindex != cfg->br->dev->ifindex) {
+               struct net_device *pdev;
+
+               pdev = __dev_get_by_index(net, cfg->entry->ifindex);
                if (!pdev) {
                        NL_SET_ERR_MSG_MOD(extack, "Port net device doesn't exist");
                        return -ENODEV;
                }
 
-               p = br_port_get_rtnl(pdev);
-               if (!p) {
+               cfg->p = br_port_get_rtnl(pdev);
+               if (!cfg->p) {
                        NL_SET_ERR_MSG_MOD(extack, "Net device is not a bridge port");
                        return -EINVAL;
                }
 
-               if (p->br != br) {
+               if (cfg->p->br != cfg->br) {
                        NL_SET_ERR_MSG_MOD(extack, "Port belongs to a different bridge device");
                        return -EINVAL;
                }
-               if (p->state == BR_STATE_DISABLED && entry->state != MDB_PERMANENT) {
+       }
+
+       if (tb[MDBA_SET_ENTRY_ATTRS])
+               return br_mdb_config_attrs_init(tb[MDBA_SET_ENTRY_ATTRS], cfg,
+                                               extack);
+       else
+               __mdb_entry_to_br_ip(cfg->entry, &cfg->group, NULL);
+
+       return 0;
+}
+
+static void br_mdb_config_fini(struct br_mdb_config *cfg)
+{
+       br_mdb_config_src_list_fini(cfg);
+}
+
+static int br_mdb_add(struct sk_buff *skb, struct nlmsghdr *nlh,
+                     struct netlink_ext_ack *extack)
+{
+       struct net *net = sock_net(skb->sk);
+       struct net_bridge_vlan_group *vg;
+       struct net_bridge_vlan *v;
+       struct br_mdb_config cfg;
+       int err;
+
+       err = br_mdb_config_init(net, nlh, &cfg, extack);
+       if (err)
+               return err;
+
+       err = -EINVAL;
+       /* host join errors which can happen before creating the group */
+       if (!cfg.p && !br_group_is_l2(&cfg.group)) {
+               /* don't allow any flags for host-joined IP groups */
+               if (cfg.entry->state) {
+                       NL_SET_ERR_MSG_MOD(extack, "Flags are not allowed for host groups");
+                       goto out;
+               }
+               if (!br_multicast_is_star_g(&cfg.group)) {
+                       NL_SET_ERR_MSG_MOD(extack, "Groups with sources cannot be manually host joined");
+                       goto out;
+               }
+       }
+
+       if (br_group_is_l2(&cfg.group) && cfg.entry->state != MDB_PERMANENT) {
+               NL_SET_ERR_MSG_MOD(extack, "Only permanent L2 entries allowed");
+               goto out;
+       }
+
+       if (cfg.p) {
+               if (cfg.p->state == BR_STATE_DISABLED && cfg.entry->state != MDB_PERMANENT) {
                        NL_SET_ERR_MSG_MOD(extack, "Port is in disabled state and entry is not permanent");
-                       return -EINVAL;
+                       goto out;
                }
-               vg = nbp_vlan_group(p);
+               vg = nbp_vlan_group(cfg.p);
        } else {
-               vg = br_vlan_group(br);
+               vg = br_vlan_group(cfg.br);
        }
 
        /* If vlan filtering is enabled and VLAN is not specified
         * install mdb entry on all vlans configured on the port.
         */
-       if (br_vlan_enabled(br->dev) && vg && entry->vid == 0) {
+       if (br_vlan_enabled(cfg.br->dev) && vg && cfg.entry->vid == 0) {
                list_for_each_entry(v, &vg->vlan_list, vlist) {
-                       entry->vid = v->vid;
-                       err = __br_mdb_add(net, br, p, entry, mdb_attrs, extack);
+                       cfg.entry->vid = v->vid;
+                       cfg.group.vid = v->vid;
+                       err = __br_mdb_add(&cfg, extack);
                        if (err)
                                break;
                }
        } else {
-               err = __br_mdb_add(net, br, p, entry, mdb_attrs, extack);
+               err = __br_mdb_add(&cfg, extack);
        }
 
+out:
+       br_mdb_config_fini(&cfg);
        return err;
 }
 
-static int __br_mdb_del(struct net_bridge *br, struct br_mdb_entry *entry,
-                       struct nlattr **mdb_attrs)
+static int __br_mdb_del(const struct br_mdb_config *cfg)
 {
+       struct br_mdb_entry *entry = cfg->entry;
+       struct net_bridge *br = cfg->br;
        struct net_bridge_mdb_entry *mp;
        struct net_bridge_port_group *p;
        struct net_bridge_port_group __rcu **pp;
-       struct br_ip ip;
+       struct br_ip ip = cfg->group;
        int err = -EINVAL;
 
-       if (!netif_running(br->dev) || !br_opt_get(br, BROPT_MULTICAST_ENABLED))
-               return -EINVAL;
-
-       __mdb_entry_to_br_ip(entry, &ip, mdb_attrs);
-
        spin_lock_bh(&br->multicast_lock);
        mp = br_mdb_ip_get(br, &ip);
        if (!mp)
@@ -1094,53 +1499,35 @@ unlock:
 static int br_mdb_del(struct sk_buff *skb, struct nlmsghdr *nlh,
                      struct netlink_ext_ack *extack)
 {
-       struct nlattr *mdb_attrs[MDBE_ATTR_MAX + 1];
        struct net *net = sock_net(skb->sk);
        struct net_bridge_vlan_group *vg;
-       struct net_bridge_port *p = NULL;
-       struct net_device *dev, *pdev;
-       struct br_mdb_entry *entry;
        struct net_bridge_vlan *v;
-       struct net_bridge *br;
+       struct br_mdb_config cfg;
        int err;
 
-       err = br_mdb_parse(skb, nlh, &dev, &entry, mdb_attrs, extack);
-       if (err < 0)
+       err = br_mdb_config_init(net, nlh, &cfg, extack);
+       if (err)
                return err;
 
-       br = netdev_priv(dev);
-
-       if (entry->ifindex != br->dev->ifindex) {
-               pdev = __dev_get_by_index(net, entry->ifindex);
-               if (!pdev)
-                       return -ENODEV;
-
-               p = br_port_get_rtnl(pdev);
-               if (!p) {
-                       NL_SET_ERR_MSG_MOD(extack, "Net device is not a bridge port");
-                       return -EINVAL;
-               }
-               if (p->br != br) {
-                       NL_SET_ERR_MSG_MOD(extack, "Port belongs to a different bridge device");
-                       return -EINVAL;
-               }
-               vg = nbp_vlan_group(p);
-       } else {
-               vg = br_vlan_group(br);
-       }
+       if (cfg.p)
+               vg = nbp_vlan_group(cfg.p);
+       else
+               vg = br_vlan_group(cfg.br);
 
        /* If vlan filtering is enabled and VLAN is not specified
         * delete mdb entry on all vlans configured on the port.
         */
-       if (br_vlan_enabled(br->dev) && vg && entry->vid == 0) {
+       if (br_vlan_enabled(cfg.br->dev) && vg && cfg.entry->vid == 0) {
                list_for_each_entry(v, &vg->vlan_list, vlist) {
-                       entry->vid = v->vid;
-                       err = __br_mdb_del(br, entry, mdb_attrs);
+                       cfg.entry->vid = v->vid;
+                       cfg.group.vid = v->vid;
+                       err = __br_mdb_del(&cfg);
                }
        } else {
-               err = __br_mdb_del(br, entry, mdb_attrs);
+               err = __br_mdb_del(&cfg);
        }
 
+       br_mdb_config_fini(&cfg);
        return err;
 }
 
index 5e988f0..48170bd 100644 (file)
@@ -552,7 +552,8 @@ static void br_multicast_fwd_src_remove(struct net_bridge_group_src *src,
                        continue;
 
                if (p->rt_protocol != RTPROT_KERNEL &&
-                   (p->flags & MDB_PG_FLAGS_PERMANENT))
+                   (p->flags & MDB_PG_FLAGS_PERMANENT) &&
+                   !(src->flags & BR_SGRP_F_USER_ADDED))
                        break;
 
                if (fastleave)
@@ -650,18 +651,23 @@ static void br_multicast_destroy_group_src(struct net_bridge_mcast_gc *gc)
        kfree_rcu(src, rcu);
 }
 
-void br_multicast_del_group_src(struct net_bridge_group_src *src,
-                               bool fastleave)
+void __br_multicast_del_group_src(struct net_bridge_group_src *src)
 {
        struct net_bridge *br = src->pg->key.port->br;
 
-       br_multicast_fwd_src_remove(src, fastleave);
        hlist_del_init_rcu(&src->node);
        src->pg->src_ents--;
        hlist_add_head(&src->mcast_gc.gc_node, &br->mcast_gc_list);
        queue_work(system_long_wq, &br->mcast_gc_work);
 }
 
+void br_multicast_del_group_src(struct net_bridge_group_src *src,
+                               bool fastleave)
+{
+       br_multicast_fwd_src_remove(src, fastleave);
+       __br_multicast_del_group_src(src);
+}
+
 static void br_multicast_destroy_port_group(struct net_bridge_mcast_gc *gc)
 {
        struct net_bridge_port_group *pg;
@@ -1232,7 +1238,7 @@ br_multicast_find_group_src(struct net_bridge_port_group *pg, struct br_ip *ip)
        return NULL;
 }
 
-static struct net_bridge_group_src *
+struct net_bridge_group_src *
 br_multicast_new_group_src(struct net_bridge_port_group *pg, struct br_ip *src_ip)
 {
        struct net_bridge_group_src *grp_src;
@@ -1273,7 +1279,7 @@ br_multicast_new_group_src(struct net_bridge_port_group *pg, struct br_ip *src_i
 
 struct net_bridge_port_group *br_multicast_new_port_group(
                        struct net_bridge_port *port,
-                       struct br_ip *group,
+                       const struct br_ip *group,
                        struct net_bridge_port_group __rcu *next,
                        unsigned char flags,
                        const unsigned char *src,
index 4c4fda9..15ef7fd 100644 (file)
@@ -92,6 +92,23 @@ struct bridge_mcast_stats {
        struct br_mcast_stats mstats;
        struct u64_stats_sync syncp;
 };
+
+struct br_mdb_src_entry {
+       struct br_ip                    addr;
+};
+
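+/* Parsed form of an RTM_NEWMDB/RTM_DELMDB request. Filled in by
+ * br_mdb_config_init(); br_mdb_config_fini() frees the kcalloc()'d
+ * src_entries array built from MDBE_ATTR_SRC_LIST.
+ */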
+struct br_mdb_config {
+       struct net_bridge               *br;
+       struct net_bridge_port          *p;
+       struct br_mdb_entry             *entry;
+       struct br_ip                    group;
+       bool                            src_entry;
+       u8                              filter_mode;
+       u16                             nlflags;
+       struct br_mdb_src_entry         *src_entries;
+       int                             num_src_entries;
+       u8                              rt_protocol;
+};
 #endif
 
 /* net_bridge_mcast_port must be always defined due to forwarding stubs */
@@ -293,6 +310,7 @@ struct net_bridge_fdb_flush_desc {
 #define BR_SGRP_F_DELETE       BIT(0)
 #define BR_SGRP_F_SEND         BIT(1)
 #define BR_SGRP_F_INSTALLED    BIT(2)
+#define BR_SGRP_F_USER_ADDED   BIT(3)
 
 struct net_bridge_mcast_gc {
        struct hlist_node               gc_node;
@@ -934,7 +952,8 @@ br_mdb_ip_get(struct net_bridge *br, struct br_ip *dst);
 struct net_bridge_mdb_entry *
 br_multicast_new_group(struct net_bridge *br, struct br_ip *group);
 struct net_bridge_port_group *
-br_multicast_new_port_group(struct net_bridge_port *port, struct br_ip *group,
+br_multicast_new_port_group(struct net_bridge_port *port,
+                           const struct br_ip *group,
                            struct net_bridge_port_group __rcu *next,
                            unsigned char flags, const unsigned char *src,
                            u8 filter_mode, u8 rt_protocol);
@@ -966,6 +985,10 @@ void br_multicast_sg_add_exclude_ports(struct net_bridge_mdb_entry *star_mp,
                                       struct net_bridge_port_group *sg);
 struct net_bridge_group_src *
 br_multicast_find_group_src(struct net_bridge_port_group *pg, struct br_ip *ip);
+struct net_bridge_group_src *
+br_multicast_new_group_src(struct net_bridge_port_group *pg,
+                          struct br_ip *src_ip);
+void __br_multicast_del_group_src(struct net_bridge_group_src *src);
 void br_multicast_del_group_src(struct net_bridge_group_src *src,
                                bool fastleave);
 void br_multicast_ctx_init(struct net_bridge *br,
index 7324296..5c5dd43 100644 (file)
@@ -366,42 +366,12 @@ static int nf_ct_bridge_refrag_post(struct net *net, struct sock *sk,
        return br_dev_queue_push_xmit(net, sk, skb);
 }
 
-static unsigned int nf_ct_bridge_confirm(struct sk_buff *skb)
-{
-       enum ip_conntrack_info ctinfo;
-       struct nf_conn *ct;
-       int protoff;
-
-       ct = nf_ct_get(skb, &ctinfo);
-       if (!ct || ctinfo == IP_CT_RELATED_REPLY)
-               return nf_conntrack_confirm(skb);
-
-       switch (skb->protocol) {
-       case htons(ETH_P_IP):
-               protoff = skb_network_offset(skb) + ip_hdrlen(skb);
-               break;
-       case htons(ETH_P_IPV6): {
-               unsigned char pnum = ipv6_hdr(skb)->nexthdr;
-               __be16 frag_off;
-
-               protoff = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &pnum,
-                                          &frag_off);
-               if (protoff < 0 || (frag_off & htons(~0x7)) != 0)
-                       return nf_conntrack_confirm(skb);
-               }
-               break;
-       default:
-               return NF_ACCEPT;
-       }
-       return nf_confirm(skb, protoff, ct, ctinfo);
-}
-
 static unsigned int nf_ct_bridge_post(void *priv, struct sk_buff *skb,
                                      const struct nf_hook_state *state)
 {
        int ret;
 
-       ret = nf_ct_bridge_confirm(skb);
+       ret = nf_confirm(priv, skb, state);
        if (ret != NF_ACCEPT)
                return ret;
 
index 27dcdcc..7343fd4 100644 (file)
@@ -446,7 +446,6 @@ int can_rx_register(struct net *net, struct net_device *dev, canid_t can_id,
        struct hlist_head *rcv_list;
        struct can_dev_rcv_lists *dev_rcv_lists;
        struct can_rcv_lists_stats *rcv_lists_stats = net->can.rcv_lists_stats;
-       int err = 0;
 
        /* insert new receiver  (dev,canid,mask) -> (func,data) */
 
@@ -481,7 +480,7 @@ int can_rx_register(struct net *net, struct net_device *dev, canid_t can_id,
                                               rcv_lists_stats->rcv_entries);
        spin_unlock_bh(&net->can.rcvlists_lock);
 
-       return err;
+       return 0;
 }
 EXPORT_SYMBOL(can_rx_register);
 
@@ -677,7 +676,7 @@ static void can_receive(struct sk_buff *skb, struct net_device *dev)
 static int can_rcv(struct sk_buff *skb, struct net_device *dev,
                   struct packet_type *pt, struct net_device *orig_dev)
 {
-       if (unlikely(dev->type != ARPHRD_CAN || (!can_is_can_skb(skb)))) {
+       if (unlikely(dev->type != ARPHRD_CAN || !can_get_ml_priv(dev) || !can_is_can_skb(skb))) {
                pr_warn_once("PF_CAN: dropped non conform CAN skbuff: dev type %d, len %d\n",
                             dev->type, skb->len);
 
@@ -692,7 +691,7 @@ static int can_rcv(struct sk_buff *skb, struct net_device *dev,
 static int canfd_rcv(struct sk_buff *skb, struct net_device *dev,
                     struct packet_type *pt, struct net_device *orig_dev)
 {
-       if (unlikely(dev->type != ARPHRD_CAN || (!can_is_canfd_skb(skb)))) {
+       if (unlikely(dev->type != ARPHRD_CAN || !can_get_ml_priv(dev) || !can_is_canfd_skb(skb))) {
                pr_warn_once("PF_CAN: dropped non conform CAN FD skbuff: dev type %d, len %d\n",
                             dev->type, skb->len);
 
@@ -707,7 +706,7 @@ static int canfd_rcv(struct sk_buff *skb, struct net_device *dev,
 static int canxl_rcv(struct sk_buff *skb, struct net_device *dev,
                     struct packet_type *pt, struct net_device *orig_dev)
 {
-       if (unlikely(dev->type != ARPHRD_CAN || (!can_is_canxl_skb(skb)))) {
+       if (unlikely(dev->type != ARPHRD_CAN || !can_get_ml_priv(dev) || !can_is_canxl_skb(skb))) {
                pr_warn_once("PF_CAN: dropped non conform CAN XL skbuff: dev type %d, len %d\n",
                             dev->type, skb->len);
 
index 3eb7d3e..81071cd 100644 (file)
@@ -857,6 +857,7 @@ static int raw_sendmsg(struct socket *sock, struct msghdr *msg, size_t size)
 
        skb->dev = dev;
        skb->priority = sk->sk_priority;
+       skb->mark = sk->sk_mark;
        skb->tstamp = sockc.transmit_time;
 
        skb_setup_tx_timestamp(skb, sockc.tsflags);
index 9d2288c..bb378c3 100644 (file)
@@ -310,7 +310,6 @@ bpf_sk_storage_ptr(void *owner)
        return &sk->sk_bpf_storage;
 }
 
-BTF_ID_LIST_SINGLE(sk_storage_map_btf_ids, struct, bpf_local_storage_map)
 const struct bpf_map_ops sk_storage_map_ops = {
        .map_meta_equal = bpf_map_meta_equal,
        .map_alloc_check = bpf_local_storage_map_alloc_check,
@@ -321,7 +320,7 @@ const struct bpf_map_ops sk_storage_map_ops = {
        .map_update_elem = bpf_fd_sk_storage_update_elem,
        .map_delete_elem = bpf_fd_sk_storage_delete_elem,
        .map_check_btf = bpf_local_storage_map_check_btf,
-       .map_btf_id = &sk_storage_map_btf_ids[0],
+       .map_btf_id = &bpf_local_storage_map_btf_id[0],
        .map_local_storage_charge = bpf_sk_storage_charge,
        .map_local_storage_uncharge = bpf_sk_storage_uncharge,
        .map_owner_storage_ptr = bpf_sk_storage_ptr,
index 7627c47..b76fb37 100644 (file)
@@ -10517,6 +10517,22 @@ void netdev_set_default_ethtool_ops(struct net_device *dev,
 }
 EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops);
 
+/**
+ * netdev_sw_irq_coalesce_default_on() - enable SW IRQ coalescing by default
+ * @dev: netdev to enable the IRQ coalescing on
+ *
+ * Sets a conservative default for SW IRQ coalescing. Users can
+ * override the default values via sysfs attributes.
+ */
+void netdev_sw_irq_coalesce_default_on(struct net_device *dev)
+{
+       WARN_ON(dev->reg_state == NETREG_REGISTERED);
+
+       dev->gro_flush_timeout = 20000;
+       dev->napi_defer_hard_irqs = 1;
+}
+EXPORT_SYMBOL_GPL(netdev_sw_irq_coalesce_default_on);
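+
+/* Sketch of the intended driver usage (illustrative, not part of this
+ * change): call the helper between allocation and registration, since
+ * the WARN_ON above fires once the device is registered.
+ *
+ *     dev = alloc_etherdev(sizeof(*priv));
+ *     if (!dev)
+ *             return -ENOMEM;
+ *     netdev_sw_irq_coalesce_default_on(dev);
+ *     err = register_netdev(dev);
+ */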
+
 void netdev_freemem(struct net_device *dev)
 {
        char *addr = (char *)dev - dev->padded;
index 907df71..6004bd0 100644 (file)
@@ -195,11 +195,16 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(devlink_hwmsg);
 EXPORT_TRACEPOINT_SYMBOL_GPL(devlink_hwerr);
 EXPORT_TRACEPOINT_SYMBOL_GPL(devlink_trap_report);
 
+#define DEVLINK_PORT_FN_CAPS_VALID_MASK \
+       (_BITUL(__DEVLINK_PORT_FN_ATTR_CAPS_MAX) - 1)
+
 static const struct nla_policy devlink_function_nl_policy[DEVLINK_PORT_FUNCTION_ATTR_MAX + 1] = {
        [DEVLINK_PORT_FUNCTION_ATTR_HW_ADDR] = { .type = NLA_BINARY },
        [DEVLINK_PORT_FN_ATTR_STATE] =
                NLA_POLICY_RANGE(NLA_U8, DEVLINK_PORT_FN_STATE_INACTIVE,
                                 DEVLINK_PORT_FN_STATE_ACTIVE),
+       [DEVLINK_PORT_FN_ATTR_CAPS] =
+               NLA_POLICY_BITFIELD32(DEVLINK_PORT_FN_CAPS_VALID_MASK),
 };
 
 static const struct nla_policy devlink_selftest_nl_policy[DEVLINK_ATTR_SELFTEST_ID_MAX + 1] = {
@@ -680,6 +685,87 @@ devlink_sb_tc_index_get_from_attrs(struct devlink_sb *devlink_sb,
        return 0;
 }
 
+static void devlink_port_fn_cap_fill(struct nla_bitfield32 *caps,
+                                    u32 cap, bool is_enable)
+{
+       caps->selector |= cap;
+       if (is_enable)
+               caps->value |= cap;
+}
+
+static int devlink_port_fn_roce_fill(const struct devlink_ops *ops,
+                                    struct devlink_port *devlink_port,
+                                    struct nla_bitfield32 *caps,
+                                    struct netlink_ext_ack *extack)
+{
+       bool is_enable;
+       int err;
+
+       if (!ops->port_fn_roce_get)
+               return 0;
+
+       err = ops->port_fn_roce_get(devlink_port, &is_enable, extack);
+       if (err) {
+               if (err == -EOPNOTSUPP)
+                       return 0;
+               return err;
+       }
+
+       devlink_port_fn_cap_fill(caps, DEVLINK_PORT_FN_CAP_ROCE, is_enable);
+       return 0;
+}
+
+static int devlink_port_fn_migratable_fill(const struct devlink_ops *ops,
+                                          struct devlink_port *devlink_port,
+                                          struct nla_bitfield32 *caps,
+                                          struct netlink_ext_ack *extack)
+{
+       bool is_enable;
+       int err;
+
+       if (!ops->port_fn_migratable_get ||
+           devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_PCI_VF)
+               return 0;
+
+       err = ops->port_fn_migratable_get(devlink_port, &is_enable, extack);
+       if (err) {
+               if (err == -EOPNOTSUPP)
+                       return 0;
+               return err;
+       }
+
+       devlink_port_fn_cap_fill(caps, DEVLINK_PORT_FN_CAP_MIGRATABLE, is_enable);
+       return 0;
+}
+
+static int devlink_port_fn_caps_fill(const struct devlink_ops *ops,
+                                    struct devlink_port *devlink_port,
+                                    struct sk_buff *msg,
+                                    struct netlink_ext_ack *extack,
+                                    bool *msg_updated)
+{
+       struct nla_bitfield32 caps = {};
+       int err;
+
+       err = devlink_port_fn_roce_fill(ops, devlink_port, &caps, extack);
+       if (err)
+               return err;
+
+       err = devlink_port_fn_migratable_fill(ops, devlink_port, &caps, extack);
+       if (err)
+               return err;
+
+       if (!caps.selector)
+               return 0;
+       err = nla_put_bitfield32(msg, DEVLINK_PORT_FN_ATTR_CAPS, caps.value,
+                                caps.selector);
+       if (err)
+               return err;
+
+       *msg_updated = true;
+       return 0;
+}
+
 static int
 devlink_sb_tc_index_get_from_info(struct devlink_sb *devlink_sb,
                                  struct genl_info *info,
@@ -1264,6 +1350,51 @@ static int devlink_port_fn_state_fill(const struct devlink_ops *ops,
 }
 
 static int
+devlink_port_fn_mig_set(struct devlink_port *devlink_port, bool enable,
+                       struct netlink_ext_ack *extack)
+{
+       const struct devlink_ops *ops = devlink_port->devlink->ops;
+
+       return ops->port_fn_migratable_set(devlink_port, enable, extack);
+}
+
+static int
+devlink_port_fn_roce_set(struct devlink_port *devlink_port, bool enable,
+                        struct netlink_ext_ack *extack)
+{
+       const struct devlink_ops *ops = devlink_port->devlink->ops;
+
+       return ops->port_fn_roce_set(devlink_port, enable, extack);
+}
+
+static int devlink_port_fn_caps_set(struct devlink_port *devlink_port,
+                                   const struct nlattr *attr,
+                                   struct netlink_ext_ack *extack)
+{
+       struct nla_bitfield32 caps;
+       u32 caps_value;
+       int err;
+
+       caps = nla_get_bitfield32(attr);
+       caps_value = caps.value & caps.selector;
+       if (caps.selector & DEVLINK_PORT_FN_CAP_ROCE) {
+               err = devlink_port_fn_roce_set(devlink_port,
+                                              caps_value & DEVLINK_PORT_FN_CAP_ROCE,
+                                              extack);
+               if (err)
+                       return err;
+       }
+       if (caps.selector & DEVLINK_PORT_FN_CAP_MIGRATABLE) {
+               err = devlink_port_fn_mig_set(devlink_port, caps_value &
+                                             DEVLINK_PORT_FN_CAP_MIGRATABLE,
+                                             extack);
+               if (err)
+                       return err;
+       }
+       return 0;
+}
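+
+/* Wire-format note (illustrative): DEVLINK_PORT_FN_ATTR_CAPS is an
+ * NLA_BITFIELD32, so a request names the bits it wants to touch in
+ * .selector and their new state in .value; unselected bits are left
+ * untouched by the masking above. E.g. "RoCE on, migratable off":
+ *
+ *     struct nla_bitfield32 caps = {
+ *             .selector = DEVLINK_PORT_FN_CAP_ROCE |
+ *                         DEVLINK_PORT_FN_CAP_MIGRATABLE,
+ *             .value    = DEVLINK_PORT_FN_CAP_ROCE,
+ *     };
+ */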
+
+static int
 devlink_nl_port_function_attrs_put(struct sk_buff *msg, struct devlink_port *port,
                                   struct netlink_ext_ack *extack)
 {
@@ -1281,6 +1412,10 @@ devlink_nl_port_function_attrs_put(struct sk_buff *msg, struct devlink_port *por
                                           &msg_updated);
        if (err)
                goto out;
+       err = devlink_port_fn_caps_fill(ops, port, msg, extack,
+                                       &msg_updated);
+       if (err)
+               goto out;
        err = devlink_port_fn_state_fill(ops, port, msg, extack, &msg_updated);
 out:
        if (err || !msg_updated)
@@ -1632,11 +1767,6 @@ static int devlink_port_function_hw_addr_set(struct devlink_port *port,
                }
        }
 
-       if (!ops->port_function_hw_addr_set) {
-               NL_SET_ERR_MSG_MOD(extack, "Port doesn't support function attributes");
-               return -EOPNOTSUPP;
-       }
-
        return ops->port_function_hw_addr_set(port, hw_addr, hw_addr_len,
                                              extack);
 }
@@ -1650,12 +1780,52 @@ static int devlink_port_fn_state_set(struct devlink_port *port,
 
        state = nla_get_u8(attr);
        ops = port->devlink->ops;
-       if (!ops->port_fn_state_set) {
-               NL_SET_ERR_MSG_MOD(extack,
-                                  "Function does not support state setting");
+       return ops->port_fn_state_set(port, state, extack);
+}
+
+static int devlink_port_function_validate(struct devlink_port *devlink_port,
+                                         struct nlattr **tb,
+                                         struct netlink_ext_ack *extack)
+{
+       const struct devlink_ops *ops = devlink_port->devlink->ops;
+       struct nlattr *attr;
+
+       if (tb[DEVLINK_PORT_FUNCTION_ATTR_HW_ADDR] &&
+           !ops->port_function_hw_addr_set) {
+               NL_SET_ERR_MSG_ATTR(extack, tb[DEVLINK_PORT_FUNCTION_ATTR_HW_ADDR],
+                                   "Port doesn't support function attributes");
                return -EOPNOTSUPP;
        }
-       return ops->port_fn_state_set(port, state, extack);
+       if (tb[DEVLINK_PORT_FN_ATTR_STATE] && !ops->port_fn_state_set) {
+               NL_SET_ERR_MSG_ATTR(extack, tb[DEVLINK_PORT_FN_ATTR_STATE],
+                                   "Function does not support state setting");
+               return -EOPNOTSUPP;
+       }
+       attr = tb[DEVLINK_PORT_FN_ATTR_CAPS];
+       if (attr) {
+               struct nla_bitfield32 caps;
+
+               caps = nla_get_bitfield32(attr);
+               if (caps.selector & DEVLINK_PORT_FN_CAP_ROCE &&
+                   !ops->port_fn_roce_set) {
+                       NL_SET_ERR_MSG_ATTR(extack, attr,
+                                           "Port doesn't support RoCE function attribute");
+                       return -EOPNOTSUPP;
+               }
+               if (caps.selector & DEVLINK_PORT_FN_CAP_MIGRATABLE) {
+                       if (!ops->port_fn_migratable_set) {
+                               NL_SET_ERR_MSG_ATTR(extack, attr,
+                                                   "Port doesn't support migratable function attribute");
+                               return -EOPNOTSUPP;
+                       }
+                       if (devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_PCI_VF) {
+                               NL_SET_ERR_MSG_ATTR(extack, attr,
+                                                   "Migratable function attribute supported for VFs only");
+                               return -EOPNOTSUPP;
+                       }
+               }
+       }
+       return 0;
 }
 
 static int devlink_port_function_set(struct devlink_port *port,
@@ -1672,12 +1842,24 @@ static int devlink_port_function_set(struct devlink_port *port,
                return err;
        }
 
+       err = devlink_port_function_validate(port, tb, extack);
+       if (err)
+               return err;
+
        attr = tb[DEVLINK_PORT_FUNCTION_ATTR_HW_ADDR];
        if (attr) {
                err = devlink_port_function_hw_addr_set(port, attr, extack);
                if (err)
                        return err;
        }
+
+       attr = tb[DEVLINK_PORT_FN_ATTR_CAPS];
+       if (attr) {
+               err = devlink_port_fn_caps_set(port, attr, extack);
+               if (err)
+                       return err;
+       }
+
        /* Keep this as the last function attribute set, so that when
         * multiple port function attributes are set along with state,
         * Those can be applied first before activating the state.
@@ -4259,9 +4441,10 @@ static int devlink_resource_put(struct devlink *devlink, struct sk_buff *skb,
            nla_put_u64_64bit(skb, DEVLINK_ATTR_RESOURCE_ID, resource->id,
                              DEVLINK_ATTR_PAD))
                goto nla_put_failure;
-       if (resource->size != resource->size_new)
-               nla_put_u64_64bit(skb, DEVLINK_ATTR_RESOURCE_SIZE_NEW,
-                                 resource->size_new, DEVLINK_ATTR_PAD);
+       if (resource->size != resource->size_new &&
+           nla_put_u64_64bit(skb, DEVLINK_ATTR_RESOURCE_SIZE_NEW,
+                             resource->size_new, DEVLINK_ATTR_PAD))
+               goto nla_put_failure;
        if (devlink_resource_occ_put(resource, skb))
                goto nla_put_failure;
        if (devlink_resource_size_params_put(resource, skb))
index bc9c9be..bb14a03 100644 (file)
@@ -316,6 +316,8 @@ void metadata_dst_free(struct metadata_dst *md_dst)
        if (md_dst->type == METADATA_IP_TUNNEL)
                dst_cache_destroy(&md_dst->u.tun_info.dst_cache);
 #endif
+       if (md_dst->type == METADATA_XFRM)
+               dst_release(md_dst->u.xfrm_info.dst_orig);
        kfree(md_dst);
 }
 EXPORT_SYMBOL_GPL(metadata_dst_free);
@@ -340,16 +342,18 @@ EXPORT_SYMBOL_GPL(metadata_dst_alloc_percpu);
 
 void metadata_dst_free_percpu(struct metadata_dst __percpu *md_dst)
 {
-#ifdef CONFIG_DST_CACHE
        int cpu;
 
        for_each_possible_cpu(cpu) {
                struct metadata_dst *one_md_dst = per_cpu_ptr(md_dst, cpu);
 
+#ifdef CONFIG_DST_CACHE
                if (one_md_dst->type == METADATA_IP_TUNNEL)
                        dst_cache_destroy(&one_md_dst->u.tun_info.dst_cache);
-       }
 #endif
+               if (one_md_dst->type == METADATA_XFRM)
+                       dst_release(one_md_dst->u.xfrm_info.dst_orig);
+       }
        free_percpu(md_dst);
 }
 EXPORT_SYMBOL_GPL(metadata_dst_free_percpu);
index 655411c..2a140b3 100644 (file)
@@ -80,14 +80,14 @@ static int failover_slave_register(struct net_device *slave_dev)
                goto err_upper_link;
        }
 
-       slave_dev->priv_flags |= IFF_FAILOVER_SLAVE;
+       slave_dev->priv_flags |= (IFF_FAILOVER_SLAVE | IFF_NO_ADDRCONF);
 
        if (fops && fops->slave_register &&
            !fops->slave_register(slave_dev, failover_dev))
                return NOTIFY_OK;
 
        netdev_upper_dev_unlink(slave_dev, failover_dev);
-       slave_dev->priv_flags &= ~IFF_FAILOVER_SLAVE;
+       slave_dev->priv_flags &= ~(IFF_FAILOVER_SLAVE | IFF_NO_ADDRCONF);
 err_upper_link:
        netdev_rx_handler_unregister(slave_dev);
 done:
@@ -121,7 +121,7 @@ int failover_slave_unregister(struct net_device *slave_dev)
 
        netdev_rx_handler_unregister(slave_dev);
        netdev_upper_dev_unlink(slave_dev, failover_dev);
-       slave_dev->priv_flags &= ~IFF_FAILOVER_SLAVE;
+       slave_dev->priv_flags &= ~(IFF_FAILOVER_SLAVE | IFF_NO_ADDRCONF);
 
        if (fops && fops->slave_unregister &&
            !fops->slave_unregister(slave_dev, failover_dev))
index 37baaa6..9293586 100644 (file)
@@ -80,6 +80,7 @@
 #include <net/tls.h>
 #include <net/xdp.h>
 #include <net/mptcp.h>
+#include <net/netfilter/nf_conntrack_bpf.h>
 
 static const struct bpf_func_proto *
 bpf_sk_base_func_proto(enum bpf_func_id func_id);
@@ -5630,6 +5631,15 @@ static const struct bpf_func_proto bpf_bind_proto = {
 };
 
 #ifdef CONFIG_XFRM
+
+#if (IS_BUILTIN(CONFIG_XFRM_INTERFACE) && IS_ENABLED(CONFIG_DEBUG_INFO_BTF)) || \
+    (IS_MODULE(CONFIG_XFRM_INTERFACE) && IS_ENABLED(CONFIG_DEBUG_INFO_BTF_MODULES))
+
+struct metadata_dst __percpu *xfrm_bpf_md_dst;
+EXPORT_SYMBOL_GPL(xfrm_bpf_md_dst);
+
+#endif
+
 BPF_CALL_5(bpf_skb_get_xfrm_state, struct sk_buff *, skb, u32, index,
           struct bpf_xfrm_state *, to, u32, size, u64, flags)
 {
@@ -7992,6 +8002,19 @@ xdp_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
        default:
                return bpf_sk_base_func_proto(func_id);
        }
+
+#if IS_MODULE(CONFIG_NF_CONNTRACK) && IS_ENABLED(CONFIG_DEBUG_INFO_BTF_MODULES)
+       /* The nf_conn___init type is used in the NF_CONNTRACK kfuncs. The
+        * kfuncs are defined in two different modules, and we want to be able
+        * to use them interchangeably with the same BTF type ID. Because modules
+        * can't de-duplicate BTF IDs between each other, we need the type to be
+        * referenced in the vmlinux BTF or the verifier will get confused about
+        * the different types. So we add this dummy type reference which will
+        * be included in vmlinux BTF, allowing both modules to refer to the
+        * same type ID.
+        */
+       BTF_TYPE_EMIT(struct nf_conn___init);
+#endif
 }
 
 const struct bpf_func_proto bpf_sock_map_update_proto __weak;
index 4bf95e3..3cbba70 100644 (file)
@@ -270,12 +270,10 @@ static struct sk_buff *napi_skb_cache_get(void)
        return skb;
 }
 
-/* Caller must provide SKB that is memset cleared */
-static void __build_skb_around(struct sk_buff *skb, void *data,
-                              unsigned int frag_size)
+static inline void __finalize_skb_around(struct sk_buff *skb, void *data,
+                                        unsigned int size)
 {
        struct skb_shared_info *shinfo;
-       unsigned int size = frag_size ? : ksize(data);
 
        size -= SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
 
@@ -297,15 +295,71 @@ static void __build_skb_around(struct sk_buff *skb, void *data,
        skb_set_kcov_handle(skb, kcov_common_handle());
 }
 
+static inline void *__slab_build_skb(struct sk_buff *skb, void *data,
+                                    unsigned int *size)
+{
+       void *resized;
+
+       /* Must find the allocation size (and grow it to match). */
+       *size = ksize(data);
+       /* krealloc() will immediately return "data" when
+        * "ksize(data)" is requested: it is the existing upper
+        * bound. As a result, GFP_ATOMIC will be ignored. Note
+        * that this "new" pointer needs to be passed back to the
+        * caller for use so the __alloc_size hinting will be
+        * tracked correctly.
+        */
+       resized = krealloc(data, *size, GFP_ATOMIC);
+       WARN_ON_ONCE(resized != data);
+       return resized;
+}
+
+/* build_skb() variant which can operate on slab buffers.
+ * Note that this should be used sparingly as slab buffers
+ * cannot be combined efficiently by GRO!
+ */
+struct sk_buff *slab_build_skb(void *data)
+{
+       struct sk_buff *skb;
+       unsigned int size;
+
+       skb = kmem_cache_alloc(skbuff_head_cache, GFP_ATOMIC);
+       if (unlikely(!skb))
+               return NULL;
+
+       memset(skb, 0, offsetof(struct sk_buff, tail));
+       data = __slab_build_skb(skb, data, &size);
+       __finalize_skb_around(skb, data, size);
+
+       return skb;
+}
+EXPORT_SYMBOL(slab_build_skb);
+
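+/* Illustrative caller (hypothetical sketch): wrap a kmalloc()'d receive
+ * buffer that already holds 'len' frame bytes. No size argument is
+ * passed; the helper derives it from ksize(). On failure the buffer is
+ * not freed and stays owned by the caller.
+ */
+static struct sk_buff *example_rx_to_skb(void *rx_buf, unsigned int len)
+{
+       struct sk_buff *skb = slab_build_skb(rx_buf);
+
+       if (unlikely(!skb))
+               return NULL;
+       skb_put(skb, len);
+       return skb;
+}
+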
+/* Caller must provide SKB that is memset cleared */
+static void __build_skb_around(struct sk_buff *skb, void *data,
+                              unsigned int frag_size)
+{
+       unsigned int size = frag_size;
+
+       /* frag_size == 0 is considered deprecated now. Callers
+        * using a slab buffer should use slab_build_skb() instead.
+        */
+       if (WARN_ONCE(size == 0, "Use slab_build_skb() instead"))
+               data = __slab_build_skb(skb, data, &size);
+
+       __finalize_skb_around(skb, data, size);
+}
+
 /**
  * __build_skb - build a network buffer
  * @data: data buffer provided by caller
- * @frag_size: size of data, or 0 if head was kmalloced
+ * @frag_size: size of data (must not be 0)
  *
  * Allocate a new &sk_buff. Caller provides space holding head and
- * skb_shared_info. @data must have been allocated by kmalloc() only if
- * @frag_size is 0, otherwise data should come from the page allocator
- *  or vmalloc()
+ * skb_shared_info. @data must have been allocated from the page
+ * allocator or vmalloc(). (A @frag_size of 0 to indicate a kmalloc()
+ * allocation is deprecated, and callers should use slab_build_skb()
+ * instead.)
  * The return is the new skb buffer.
  * On a failure the return is %NULL, and @data is not freed.
  * Notes :
index e6b9ced..53d0251 100644 (file)
@@ -886,13 +886,16 @@ int sk_psock_msg_verdict(struct sock *sk, struct sk_psock *psock,
        ret = sk_psock_map_verd(ret, msg->sk_redir);
        psock->apply_bytes = msg->apply_bytes;
        if (ret == __SK_REDIRECT) {
-               if (psock->sk_redir)
+               if (psock->sk_redir) {
                        sock_put(psock->sk_redir);
-               psock->sk_redir = msg->sk_redir;
-               if (!psock->sk_redir) {
+                       psock->sk_redir = NULL;
+               }
+               if (!msg->sk_redir) {
                        ret = __SK_DROP;
                        goto out;
                }
+               psock->redir_ingress = sk_msg_to_ingress(msg);
+               psock->sk_redir = msg->sk_redir;
                sock_hold(psock->sk_redir);
        }
 out:
index 4571914..b0ab841 100644 (file)
@@ -901,13 +901,20 @@ int sock_set_timestamping(struct sock *sk, int optname,
        if (val & ~SOF_TIMESTAMPING_MASK)
                return -EINVAL;
 
+       if (val & SOF_TIMESTAMPING_OPT_ID_TCP &&
+           !(val & SOF_TIMESTAMPING_OPT_ID))
+               return -EINVAL;
+
        if (val & SOF_TIMESTAMPING_OPT_ID &&
            !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
                if (sk_is_tcp(sk)) {
                        if ((1 << sk->sk_state) &
                            (TCPF_CLOSE | TCPF_LISTEN))
                                return -EINVAL;
-                       atomic_set(&sk->sk_tskey, tcp_sk(sk)->snd_una);
+                       if (val & SOF_TIMESTAMPING_OPT_ID_TCP)
+                               atomic_set(&sk->sk_tskey, tcp_sk(sk)->write_seq);
+                       else
+                               atomic_set(&sk->sk_tskey, tcp_sk(sk)->snd_una);
                } else {
                        atomic_set(&sk->sk_tskey, 0);
                }
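
In userspace terms, the new SOF_TIMESTAMPING_OPT_ID_TCP flag is only accepted together with SOF_TIMESTAMPING_OPT_ID, and on an established TCP socket it seeds the ID counter from write_seq (bytes queued by the application) rather than snd_una (bytes acked). A minimal sketch, assuming kernel headers that already define the new flag:

    #include <linux/net_tstamp.h>
    #include <sys/socket.h>

    static int enable_tx_ids(int fd)
    {
            int val = SOF_TIMESTAMPING_TX_SOFTWARE |
                      SOF_TIMESTAMPING_OPT_ID |
                      SOF_TIMESTAMPING_OPT_ID_TCP; /* EINVAL without OPT_ID */

            return setsockopt(fd, SOL_SOCKET, SO_TIMESTAMPING,
                              &val, sizeof(val));
    }
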
index 81beb16..22fa2c5 100644 (file)
@@ -349,11 +349,13 @@ static void sock_map_free(struct bpf_map *map)
 
                sk = xchg(psk, NULL);
                if (sk) {
+                       sock_hold(sk);
                        lock_sock(sk);
                        rcu_read_lock();
                        sock_map_unref(sk, psk);
                        rcu_read_unlock();
                        release_sock(sk);
+                       sock_put(sk);
                }
        }
 
index 4148f6d..e00796e 100644 (file)
@@ -5,14 +5,6 @@
 #include <net/tso.h>
 #include <asm/unaligned.h>
 
-/* Calculate expected number of TX descriptors */
-int tso_count_descs(const struct sk_buff *skb)
-{
-       /* The Marvell Way */
-       return skb_shinfo(skb)->gso_segs * 2 + skb_shinfo(skb)->nr_frags;
-}
-EXPORT_SYMBOL(tso_count_descs);
-
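
The helper does not leave the kernel API; it is presumably reintroduced as a static inline in include/net/tso.h, which is not visible in this excerpt. A reconstruction from the deleted lines:

    /* Calculate expected number of TX descriptors */
    static inline int tso_count_descs(const struct sk_buff *skb)
    {
            /* The Marvell Way */
            return skb_shinfo(skb)->gso_segs * 2 + skb_shinfo(skb)->nr_frags;
    }
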
 void tso_build_hdr(const struct sk_buff *skb, char *hdr, struct tso_t *tso,
                   int size, bool is_last)
 {
index 383721e..b2fba1a 100644 (file)
@@ -33,6 +33,9 @@ static bool dsa_skb_defer_rx_timestamp(struct dsa_slave_priv *p,
        struct dsa_switch *ds = p->dp->ds;
        unsigned int type;
 
+       if (!ds->ops->port_rxtstamp)
+               return false;
+
        if (skb_headroom(skb) < ETH_HLEN)
                return false;
 
@@ -45,10 +48,7 @@ static bool dsa_skb_defer_rx_timestamp(struct dsa_slave_priv *p,
        if (type == PTP_CLASS_NONE)
                return false;
 
-       if (likely(ds->ops->port_rxtstamp))
-               return ds->ops->port_rxtstamp(ds, p->dp->index, skb, type);
-
-       return false;
+       return ds->ops->port_rxtstamp(ds, p->dp->index, skb, type);
 }
 
 static int dsa_switch_rcv(struct sk_buff *skb, struct net_device *dev,
index 7188429..03a1fb9 100644 (file)
@@ -51,7 +51,8 @@ static struct sk_buff *hellcreek_rcv(struct sk_buff *skb,
                return NULL;
        }
 
-       pskb_trim_rcsum(skb, skb->len - HELLCREEK_TAG_LEN);
+       if (pskb_trim_rcsum(skb, skb->len - HELLCREEK_TAG_LEN))
+               return NULL;
 
        dsa_default_offload_fwd_mark(skb);
 
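
The same fix recurs for the ksz and sja1110 taggers below: pskb_trim_rcsum() can fail (it may have to reallocate a cloned skb), and the old code ignored that, leaving a frame with a stale length and checksum. The generic shape, with tag_len standing in for each tagger's constant:

    if (pskb_trim_rcsum(skb, skb->len - tag_len))
            return NULL;    /* returning NULL from a tagger drops the frame */
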
index 0f6ae14..080e5c3 100644 (file)
@@ -27,7 +27,8 @@ static struct sk_buff *ksz_common_rcv(struct sk_buff *skb,
        if (!skb->dev)
                return NULL;
 
-       pskb_trim_rcsum(skb, skb->len - len);
+       if (pskb_trim_rcsum(skb, skb->len - len))
+               return NULL;
 
        dsa_default_offload_fwd_mark(skb);
 
index f14f51b..1c2ceba 100644 (file)
@@ -670,7 +670,8 @@ static struct sk_buff *sja1110_rcv_inband_control_extension(struct sk_buff *skb,
                 * padding and trailer we need to account for the fact that
                 * skb->data points to skb_mac_header(skb) + ETH_HLEN.
                 */
-               pskb_trim_rcsum(skb, start_of_padding - ETH_HLEN);
+               if (pskb_trim_rcsum(skb, start_of_padding - ETH_HLEN))
+                       return NULL;
        /* Trap-to-host frame, no timestamp trailer */
        } else {
                *source_port = SJA1110_RX_HEADER_SRC_PORT(rx_header);
index e02daa7..2edc8b7 100644 (file)
@@ -398,7 +398,7 @@ EXPORT_SYMBOL(alloc_etherdev_mqs);
 
 ssize_t sysfs_format_mac(char *buf, const unsigned char *addr, int len)
 {
-       return scnprintf(buf, PAGE_SIZE, "%*phC\n", len, addr);
+       return sysfs_emit(buf, "%*phC\n", len, addr);
 }
 EXPORT_SYMBOL(sysfs_format_mac);
 
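
sysfs_emit() is the preferred helper for sysfs show routines: unlike scnprintf(), it verifies that the buffer really is the page-sized sysfs buffer and clamps output to PAGE_SIZE. Any show callback follows the same shape; an invented example:

    static ssize_t mac_show(struct device *dev, struct device_attribute *attr,
                            char *buf)
    {
            struct foo_priv *p = dev_get_drvdata(dev);  /* hypothetical */

            return sysfs_emit(buf, "%pM\n", p->hwaddr);
    }
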
index 72ab094..228f13d 100644 (file)
@@ -4,7 +4,7 @@ obj-y                           += ioctl.o common.o
 
 obj-$(CONFIG_ETHTOOL_NETLINK)  += ethtool_nl.o
 
-ethtool_nl-y   := netlink.o bitset.o strset.o linkinfo.o linkmodes.o \
+ethtool_nl-y   := netlink.o bitset.o strset.o linkinfo.o linkmodes.o rss.o \
                   linkstate.o debug.o wol.o features.o privflags.o rings.o \
                   channels.o coalesce.o pause.o eee.o tsinfo.o cabletest.o \
                   tunnels.o fec.o eeprom.o stats.o phc_vclocks.o module.o \
index 21cfe85..6f399af 100644 (file)
@@ -417,6 +417,7 @@ const char sof_timestamping_names[][ETH_GSTRING_LEN] = {
        [const_ilog2(SOF_TIMESTAMPING_OPT_PKTINFO)]  = "option-pktinfo",
        [const_ilog2(SOF_TIMESTAMPING_OPT_TX_SWHW)]  = "option-tx-swhw",
        [const_ilog2(SOF_TIMESTAMPING_BIND_PHC)]     = "bind-phc",
+       [const_ilog2(SOF_TIMESTAMPING_OPT_ID_TCP)]   = "option-id-tcp",
 };
 static_assert(ARRAY_SIZE(sof_timestamping_names) == __SOF_TIMESTAMPING_CNT);
 
index 1a4c113..aee98be 100644 (file)
@@ -287,6 +287,7 @@ ethnl_default_requests[__ETHTOOL_MSG_USER_CNT] = {
        [ETHTOOL_MSG_PHC_VCLOCKS_GET]   = &ethnl_phc_vclocks_request_ops,
        [ETHTOOL_MSG_MODULE_GET]        = &ethnl_module_request_ops,
        [ETHTOOL_MSG_PSE_GET]           = &ethnl_pse_request_ops,
+       [ETHTOOL_MSG_RSS_GET]           = &ethnl_rss_request_ops,
 };
 
 static struct ethnl_dump_ctx *ethnl_dump_context(struct netlink_callback *cb)
@@ -1040,6 +1041,12 @@ static const struct genl_ops ethtool_genl_ops[] = {
                .policy = ethnl_pse_set_policy,
                .maxattr = ARRAY_SIZE(ethnl_pse_set_policy) - 1,
        },
+       {
+               .cmd    = ETHTOOL_MSG_RSS_GET,
+               .doit   = ethnl_default_doit,
+               .policy = ethnl_rss_get_policy,
+               .maxattr = ARRAY_SIZE(ethnl_rss_get_policy) - 1,
+       },
 };
 
 static const struct genl_multicast_group ethtool_nl_mcgrps[] = {
index 1bfd374..3753787 100644 (file)
@@ -346,6 +346,7 @@ extern const struct ethnl_request_ops ethnl_stats_request_ops;
 extern const struct ethnl_request_ops ethnl_phc_vclocks_request_ops;
 extern const struct ethnl_request_ops ethnl_module_request_ops;
 extern const struct ethnl_request_ops ethnl_pse_request_ops;
+extern const struct ethnl_request_ops ethnl_rss_request_ops;
 
 extern const struct nla_policy ethnl_header_policy[ETHTOOL_A_HEADER_FLAGS + 1];
 extern const struct nla_policy ethnl_header_policy_stats[ETHTOOL_A_HEADER_FLAGS + 1];
@@ -386,6 +387,7 @@ extern const struct nla_policy ethnl_module_get_policy[ETHTOOL_A_MODULE_HEADER +
 extern const struct nla_policy ethnl_module_set_policy[ETHTOOL_A_MODULE_POWER_MODE_POLICY + 1];
 extern const struct nla_policy ethnl_pse_get_policy[ETHTOOL_A_PSE_HEADER + 1];
 extern const struct nla_policy ethnl_pse_set_policy[ETHTOOL_A_PSE_MAX + 1];
+extern const struct nla_policy ethnl_rss_get_policy[ETHTOOL_A_RSS_CONTEXT + 1];
 
 int ethnl_set_linkinfo(struct sk_buff *skb, struct genl_info *info);
 int ethnl_set_linkmodes(struct sk_buff *skb, struct genl_info *info);
diff --git a/net/ethtool/rss.c b/net/ethtool/rss.c
new file mode 100644 (file)
index 0000000..ebe6145
--- /dev/null
@@ -0,0 +1,153 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include "netlink.h"
+#include "common.h"
+
+struct rss_req_info {
+       struct ethnl_req_info           base;
+       u32                             rss_context;
+};
+
+struct rss_reply_data {
+       struct ethnl_reply_data         base;
+       u32                             indir_size;
+       u32                             hkey_size;
+       u32                             hfunc;
+       u32                             *indir_table;
+       u8                              *hkey;
+};
+
+#define RSS_REQINFO(__req_base) \
+       container_of(__req_base, struct rss_req_info, base)
+
+#define RSS_REPDATA(__reply_base) \
+       container_of(__reply_base, struct rss_reply_data, base)
+
+const struct nla_policy ethnl_rss_get_policy[] = {
+       [ETHTOOL_A_RSS_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy),
+       [ETHTOOL_A_RSS_CONTEXT] = { .type = NLA_U32 },
+};
+
+static int
+rss_parse_request(struct ethnl_req_info *req_info, struct nlattr **tb,
+                 struct netlink_ext_ack *extack)
+{
+       struct rss_req_info *request = RSS_REQINFO(req_info);
+
+       if (tb[ETHTOOL_A_RSS_CONTEXT])
+               request->rss_context = nla_get_u32(tb[ETHTOOL_A_RSS_CONTEXT]);
+
+       return 0;
+}
+
+static int
+rss_prepare_data(const struct ethnl_req_info *req_base,
+                struct ethnl_reply_data *reply_base, struct genl_info *info)
+{
+       struct rss_reply_data *data = RSS_REPDATA(reply_base);
+       struct rss_req_info *request = RSS_REQINFO(req_base);
+       struct net_device *dev = reply_base->dev;
+       const struct ethtool_ops *ops;
+       u32 total_size, indir_bytes;
+       u8 dev_hfunc = 0;
+       u8 *rss_config;
+       int ret;
+
+       ops = dev->ethtool_ops;
+       if (!ops->get_rxfh)
+               return -EOPNOTSUPP;
+
+       /* Some drivers don't handle rss_context */
+       if (request->rss_context && !ops->get_rxfh_context)
+               return -EOPNOTSUPP;
+
+       ret = ethnl_ops_begin(dev);
+       if (ret < 0)
+               return ret;
+
+       data->indir_size = 0;
+       data->hkey_size = 0;
+       if (ops->get_rxfh_indir_size)
+               data->indir_size = ops->get_rxfh_indir_size(dev);
+       if (ops->get_rxfh_key_size)
+               data->hkey_size = ops->get_rxfh_key_size(dev);
+
+       indir_bytes = data->indir_size * sizeof(u32);
+       total_size = indir_bytes + data->hkey_size;
+       rss_config = kzalloc(total_size, GFP_KERNEL);
+       if (!rss_config) {
+               ret = -ENOMEM;
+               goto out_ops;
+       }
+
+       if (data->indir_size)
+               data->indir_table = (u32 *)rss_config;
+
+       if (data->hkey_size)
+               data->hkey = rss_config + indir_bytes;
+
+       if (request->rss_context)
+               ret = ops->get_rxfh_context(dev, data->indir_table, data->hkey,
+                                           &dev_hfunc, request->rss_context);
+       else
+               ret = ops->get_rxfh(dev, data->indir_table, data->hkey,
+                                   &dev_hfunc);
+
+       if (ret)
+               goto out_ops;
+
+       data->hfunc = dev_hfunc;
+out_ops:
+       ethnl_ops_complete(dev);
+       return ret;
+}
+
+static int
+rss_reply_size(const struct ethnl_req_info *req_base,
+              const struct ethnl_reply_data *reply_base)
+{
+       const struct rss_reply_data *data = RSS_REPDATA(reply_base);
+       int len;
+
+       len = nla_total_size(sizeof(u32)) +     /* _RSS_HFUNC */
+             nla_total_size(sizeof(u32) * data->indir_size) + /* _RSS_INDIR */
+             nla_total_size(data->hkey_size);  /* _RSS_HKEY */
+
+       return len;
+}
+
+static int
+rss_fill_reply(struct sk_buff *skb, const struct ethnl_req_info *req_base,
+              const struct ethnl_reply_data *reply_base)
+{
+       const struct rss_reply_data *data = RSS_REPDATA(reply_base);
+
+       if (nla_put_u32(skb, ETHTOOL_A_RSS_HFUNC, data->hfunc) ||
+           nla_put(skb, ETHTOOL_A_RSS_INDIR,
+                   sizeof(u32) * data->indir_size, data->indir_table) ||
+           nla_put(skb, ETHTOOL_A_RSS_HKEY, data->hkey_size, data->hkey))
+               return -EMSGSIZE;
+
+       return 0;
+}
+
+static void rss_cleanup_data(struct ethnl_reply_data *reply_base)
+{
+       const struct rss_reply_data *data = RSS_REPDATA(reply_base);
+
+       kfree(data->indir_table);
+}
+
+const struct ethnl_request_ops ethnl_rss_request_ops = {
+       .request_cmd            = ETHTOOL_MSG_RSS_GET,
+       .reply_cmd              = ETHTOOL_MSG_RSS_GET_REPLY,
+       .hdr_attr               = ETHTOOL_A_RSS_HEADER,
+       .req_info_size          = sizeof(struct rss_req_info),
+       .reply_data_size        = sizeof(struct rss_reply_data),
+
+       .parse_request          = rss_parse_request,
+       .prepare_data           = rss_prepare_data,
+       .reply_size             = rss_reply_size,
+       .fill_reply             = rss_fill_reply,
+       .cleanup_data           = rss_cleanup_data,
+};
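
rss_prepare_data() builds entirely on the classic ethtool_ops callbacks, so any driver that already implements them becomes visible through ETHTOOL_MSG_RSS_GET with no further work. A minimal, invented implementation of the hooks it consumes:

    #define FOO_RSS_INDIR   128
    #define FOO_RSS_KEY     40

    struct foo_priv {                       /* hypothetical driver state */
            u32 rss_indir[FOO_RSS_INDIR];
            u8  rss_key[FOO_RSS_KEY];
    };

    static u32 foo_get_rxfh_indir_size(struct net_device *dev)
    {
            return FOO_RSS_INDIR;
    }

    static u32 foo_get_rxfh_key_size(struct net_device *dev)
    {
            return FOO_RSS_KEY;
    }

    static int foo_get_rxfh(struct net_device *dev, u32 *indir, u8 *key,
                            u8 *hfunc)
    {
            struct foo_priv *priv = netdev_priv(dev);

            if (indir)
                    memcpy(indir, priv->rss_indir, sizeof(priv->rss_indir));
            if (key)
                    memcpy(key, priv->rss_key, sizeof(priv->rss_key));
            if (hfunc)
                    *hfunc = ETH_RSS_HASH_TOP;      /* Toeplitz */

            return 0;
    }
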
index b33d1b5..248ad5e 100644 (file)
@@ -26,10 +26,12 @@ static struct genl_family nl802154_fam;
 /* multicast groups */
 enum nl802154_multicast_groups {
        NL802154_MCGRP_CONFIG,
+       NL802154_MCGRP_SCAN,
 };
 
 static const struct genl_multicast_group nl802154_mcgrps[] = {
        [NL802154_MCGRP_CONFIG] = { .name = "config", },
+       [NL802154_MCGRP_SCAN] = { .name = "scan", },
 };
 
 /* returns ERR_PTR values */
@@ -216,6 +218,9 @@ static const struct nla_policy nl802154_policy[NL802154_ATTR_MAX+1] = {
 
        [NL802154_ATTR_PID] = { .type = NLA_U32 },
        [NL802154_ATTR_NETNS_FD] = { .type = NLA_U32 },
+
+       [NL802154_ATTR_COORDINATOR] = { .type = NLA_NESTED },
+
 #ifdef CONFIG_IEEE802154_NL802154_EXPERIMENTAL
        [NL802154_ATTR_SEC_ENABLED] = { .type = NLA_U8, },
        [NL802154_ATTR_SEC_OUT_LEVEL] = { .type = NLA_U32, },
@@ -1281,6 +1286,104 @@ static int nl802154_wpan_phy_netns(struct sk_buff *skb, struct genl_info *info)
        return err;
 }
 
+static int nl802154_prep_scan_event_msg(struct sk_buff *msg,
+                                       struct cfg802154_registered_device *rdev,
+                                       struct wpan_dev *wpan_dev,
+                                       u32 portid, u32 seq, int flags, u8 cmd,
+                                       struct ieee802154_coord_desc *desc)
+{
+       struct nlattr *nla;
+       void *hdr;
+
+       hdr = nl802154hdr_put(msg, portid, seq, flags, cmd);
+       if (!hdr)
+               return -ENOBUFS;
+
+       if (nla_put_u32(msg, NL802154_ATTR_WPAN_PHY, rdev->wpan_phy_idx))
+               goto nla_put_failure;
+
+       if (wpan_dev->netdev &&
+           nla_put_u32(msg, NL802154_ATTR_IFINDEX, wpan_dev->netdev->ifindex))
+               goto nla_put_failure;
+
+       if (nla_put_u64_64bit(msg, NL802154_ATTR_WPAN_DEV,
+                             wpan_dev_id(wpan_dev), NL802154_ATTR_PAD))
+               goto nla_put_failure;
+
+       nla = nla_nest_start_noflag(msg, NL802154_ATTR_COORDINATOR);
+       if (!nla)
+               goto nla_put_failure;
+
+       if (nla_put(msg, NL802154_COORD_PANID, IEEE802154_PAN_ID_LEN,
+                   &desc->addr.pan_id))
+               goto nla_put_failure;
+
+       if (desc->addr.mode == IEEE802154_ADDR_SHORT) {
+               if (nla_put(msg, NL802154_COORD_ADDR,
+                           IEEE802154_SHORT_ADDR_LEN,
+                           &desc->addr.short_addr))
+                       goto nla_put_failure;
+       } else {
+               if (nla_put(msg, NL802154_COORD_ADDR,
+                           IEEE802154_EXTENDED_ADDR_LEN,
+                           &desc->addr.extended_addr))
+                       goto nla_put_failure;
+       }
+
+       if (nla_put_u8(msg, NL802154_COORD_CHANNEL, desc->channel))
+               goto nla_put_failure;
+
+       if (nla_put_u8(msg, NL802154_COORD_PAGE, desc->page))
+               goto nla_put_failure;
+
+       if (nla_put_u16(msg, NL802154_COORD_SUPERFRAME_SPEC,
+                       desc->superframe_spec))
+               goto nla_put_failure;
+
+       if (nla_put_u8(msg, NL802154_COORD_LINK_QUALITY, desc->link_quality))
+               goto nla_put_failure;
+
+       if (desc->gts_permit && nla_put_flag(msg, NL802154_COORD_GTS_PERMIT))
+               goto nla_put_failure;
+
+       /* TODO: NL802154_COORD_PAYLOAD_DATA if any */
+
+       nla_nest_end(msg, nla);
+
+       genlmsg_end(msg, hdr);
+
+       return 0;
+
+ nla_put_failure:
+       genlmsg_cancel(msg, hdr);
+
+       return -EMSGSIZE;
+}
+
+int nl802154_scan_event(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev,
+                       struct ieee802154_coord_desc *desc)
+{
+       struct cfg802154_registered_device *rdev = wpan_phy_to_rdev(wpan_phy);
+       struct sk_buff *msg;
+       int ret;
+
+       msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC);
+       if (!msg)
+               return -ENOMEM;
+
+       ret = nl802154_prep_scan_event_msg(msg, rdev, wpan_dev, 0, 0, 0,
+                                          NL802154_CMD_SCAN_EVENT,
+                                          desc);
+       if (ret < 0) {
+               nlmsg_free(msg);
+               return ret;
+       }
+
+       return genlmsg_multicast_netns(&nl802154_fam, wpan_phy_net(wpan_phy),
+                                      msg, 0, NL802154_MCGRP_SCAN, GFP_ATOMIC);
+}
+EXPORT_SYMBOL_GPL(nl802154_scan_event);
+
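
A hedged sketch of the producer side: a scan implementation that has just parsed a beacon could report the coordinator like this. The ieee802154_coord_desc fields are taken from the nesting code above; everything else is invented:

    static void foo_report_coord(struct wpan_phy *phy, struct wpan_dev *wpan_dev,
                                 __le16 pan_id, __le16 short_addr,
                                 u8 page, u8 channel, u8 lqi)
    {
            struct ieee802154_coord_desc desc = {
                    .addr.mode       = IEEE802154_ADDR_SHORT,
                    .addr.pan_id     = pan_id,
                    .addr.short_addr = short_addr,
                    .page            = page,
                    .channel         = channel,
                    .link_quality    = lqi,
            };

            if (nl802154_scan_event(phy, wpan_dev, &desc))
                    pr_debug("failed to forward scan event to userspace\n");
    }
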
 #ifdef CONFIG_IEEE802154_NL802154_EXPERIMENTAL
 static const struct nla_policy nl802154_dev_addr_policy[NL802154_DEV_ADDR_ATTR_MAX + 1] = {
        [NL802154_DEV_ADDR_ATTR_PAN_ID] = { .type = NLA_U16 },
index 8c4b6d0..89b8055 100644 (file)
@@ -4,5 +4,7 @@
 
 int nl802154_init(void);
 void nl802154_exit(void);
+int nl802154_scan_event(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev,
+                       struct ieee802154_coord_desc *desc);
 
 #endif /* __IEEE802154_NL802154_H */
index f361d3d..b5736ef 100644 (file)
@@ -841,6 +841,9 @@ static int rtm_to_fib_config(struct net *net, struct sk_buff *skb,
                return -EINVAL;
        }
 
+       if (!cfg->fc_table)
+               cfg->fc_table = RT_TABLE_MAIN;
+
        return 0;
 errout:
        return err;
index 19a6620..ce9ff3c 100644 (file)
@@ -423,6 +423,7 @@ static struct fib_info *fib_find_info(struct fib_info *nfi)
                    nfi->fib_prefsrc == fi->fib_prefsrc &&
                    nfi->fib_priority == fi->fib_priority &&
                    nfi->fib_type == fi->fib_type &&
+                   nfi->fib_tb_id == fi->fib_tb_id &&
                    memcmp(nfi->fib_metrics, fi->fib_metrics,
                           sizeof(u32) * RTAX_MAX) == 0 &&
                    !((nfi->fib_flags ^ fi->fib_flags) & ~RTNH_COMPARE_MASK) &&
index a4ccef3..ffff46c 100644 (file)
@@ -1492,24 +1492,6 @@ static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev)
        struct ip_tunnel_parm *p = &t->parms;
        __be16 o_flags = p->o_flags;
 
-       if (t->erspan_ver <= 2) {
-               if (t->erspan_ver != 0 && !t->collect_md)
-                       o_flags |= TUNNEL_KEY;
-
-               if (nla_put_u8(skb, IFLA_GRE_ERSPAN_VER, t->erspan_ver))
-                       goto nla_put_failure;
-
-               if (t->erspan_ver == 1) {
-                       if (nla_put_u32(skb, IFLA_GRE_ERSPAN_INDEX, t->index))
-                               goto nla_put_failure;
-               } else if (t->erspan_ver == 2) {
-                       if (nla_put_u8(skb, IFLA_GRE_ERSPAN_DIR, t->dir))
-                               goto nla_put_failure;
-                       if (nla_put_u16(skb, IFLA_GRE_ERSPAN_HWID, t->hwid))
-                               goto nla_put_failure;
-               }
-       }
-
        if (nla_put_u32(skb, IFLA_GRE_LINK, p->link) ||
            nla_put_be16(skb, IFLA_GRE_IFLAGS,
                         gre_tnl_flags_to_gre_flags(p->i_flags)) ||
@@ -1550,6 +1532,34 @@ nla_put_failure:
        return -EMSGSIZE;
 }
 
+static int erspan_fill_info(struct sk_buff *skb, const struct net_device *dev)
+{
+       struct ip_tunnel *t = netdev_priv(dev);
+
+       if (t->erspan_ver <= 2) {
+               if (t->erspan_ver != 0 && !t->collect_md)
+                       t->parms.o_flags |= TUNNEL_KEY;
+
+               if (nla_put_u8(skb, IFLA_GRE_ERSPAN_VER, t->erspan_ver))
+                       goto nla_put_failure;
+
+               if (t->erspan_ver == 1) {
+                       if (nla_put_u32(skb, IFLA_GRE_ERSPAN_INDEX, t->index))
+                               goto nla_put_failure;
+               } else if (t->erspan_ver == 2) {
+                       if (nla_put_u8(skb, IFLA_GRE_ERSPAN_DIR, t->dir))
+                               goto nla_put_failure;
+                       if (nla_put_u16(skb, IFLA_GRE_ERSPAN_HWID, t->hwid))
+                               goto nla_put_failure;
+               }
+       }
+
+       return ipgre_fill_info(skb, dev);
+
+nla_put_failure:
+       return -EMSGSIZE;
+}
+
 static void erspan_setup(struct net_device *dev)
 {
        struct ip_tunnel *t = netdev_priv(dev);
@@ -1628,7 +1638,7 @@ static struct rtnl_link_ops erspan_link_ops __read_mostly = {
        .changelink     = erspan_changelink,
        .dellink        = ip_tunnel_dellink,
        .get_size       = ipgre_get_size,
-       .fill_info      = ipgre_fill_info,
+       .fill_info      = erspan_fill_info,
        .get_link_net   = ip_tunnel_get_link_net,
 };
 
index bb9854c..409ec2a 100644 (file)
 #include <net/transp_v6.h>
 #endif
 
+#define ping_portaddr_for_each_entry(__sk, node, list) \
+       hlist_nulls_for_each_entry(__sk, node, list, sk_nulls_node)
+#define ping_portaddr_for_each_entry_rcu(__sk, node, list) \
+       hlist_nulls_for_each_entry_rcu(__sk, node, list, sk_nulls_node)
+
 struct ping_table {
        struct hlist_nulls_head hash[PING_HTABLE_SIZE];
        spinlock_t              lock;
@@ -192,7 +197,7 @@ static struct sock *ping_lookup(struct net *net, struct sk_buff *skb, u16 ident)
                return NULL;
        }
 
-       ping_portaddr_for_each_entry(sk, hnode, hslot) {
+       ping_portaddr_for_each_entry_rcu(sk, hnode, hslot) {
                isk = inet_sk(sk);
 
                pr_debug("iterate\n");
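
The _rcu variant matters because ping_lookup() walks the hash under rcu_read_lock() rather than ping_table.lock; hlist_nulls_for_each_entry_rcu() supplies the required dependency-ordered loads. Writers, by contrast, can keep the plain iterator as long as they hold the lock; an assumed sketch of that side:

    static bool foo_ident_in_use(struct hlist_nulls_head *hslot, u16 ident)
    {
            struct hlist_nulls_node *node;
            struct sock *sk;
            bool found = false;

            spin_lock(&ping_table.lock);
            ping_portaddr_for_each_entry(sk, node, hslot) {
                    if (inet_sk(sk)->inet_num == ident) {
                            found = true;
                            break;
                    }
            }
            spin_unlock(&ping_table.lock);

            return found;
    }
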
index cf9c3e8..94aad38 100644 (file)
@@ -45,8 +45,11 @@ static int bpf_tcp_ingress(struct sock *sk, struct sk_psock *psock,
                tmp->sg.end = i;
                if (apply) {
                        apply_bytes -= size;
-                       if (!apply_bytes)
+                       if (!apply_bytes) {
+                               if (sge->length)
+                                       sk_msg_iter_var_prev(i);
                                break;
+                       }
                }
        } while (i != msg->sg.end);
 
@@ -131,10 +134,9 @@ static int tcp_bpf_push_locked(struct sock *sk, struct sk_msg *msg,
        return ret;
 }
 
-int tcp_bpf_sendmsg_redir(struct sock *sk, struct sk_msg *msg,
-                         u32 bytes, int flags)
+int tcp_bpf_sendmsg_redir(struct sock *sk, bool ingress,
+                         struct sk_msg *msg, u32 bytes, int flags)
 {
-       bool ingress = sk_msg_to_ingress(msg);
        struct sk_psock *psock = sk_psock_get(sk);
        int ret;
 
@@ -276,10 +278,10 @@ msg_bytes_ready:
 static int tcp_bpf_send_verdict(struct sock *sk, struct sk_psock *psock,
                                struct sk_msg *msg, int *copied, int flags)
 {
-       bool cork = false, enospc = sk_msg_full(msg);
+       bool cork = false, enospc = sk_msg_full(msg), redir_ingress;
        struct sock *sk_redir;
        u32 tosend, origsize, sent, delta = 0;
-       u32 eval = __SK_NONE;
+       u32 eval;
        int ret;
 
 more_data:
@@ -310,6 +312,7 @@ more_data:
        tosend = msg->sg.size;
        if (psock->apply_bytes && psock->apply_bytes < tosend)
                tosend = psock->apply_bytes;
+       eval = __SK_NONE;
 
        switch (psock->eval) {
        case __SK_PASS:
@@ -321,6 +324,7 @@ more_data:
                sk_msg_apply_bytes(psock, tosend);
                break;
        case __SK_REDIRECT:
+               redir_ingress = psock->redir_ingress;
                sk_redir = psock->sk_redir;
                sk_msg_apply_bytes(psock, tosend);
                if (!psock->apply_bytes) {
@@ -337,7 +341,8 @@ more_data:
                release_sock(sk);
 
                origsize = msg->sg.size;
-               ret = tcp_bpf_sendmsg_redir(sk_redir, msg, tosend, flags);
+               ret = tcp_bpf_sendmsg_redir(sk_redir, redir_ingress,
+                                           msg, tosend, flags);
                sent = origsize - msg->sg.size;
 
                if (eval == __SK_REDIRECT)
index aedde65..1f01e15 100644 (file)
@@ -387,7 +387,8 @@ static struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb,
        if (!pskb_may_pull(skb, sizeof(struct udphdr)))
                goto out;
 
-       if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4)
+       if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4 &&
+           !skb_gso_ok(skb, features | NETIF_F_GSO_ROBUST))
                return __udp_gso_segment(skb, features, false);
 
        mss = skb_shinfo(skb)->gso_size;
index 9c3f520..c338dfb 100644 (file)
@@ -3320,7 +3320,7 @@ static void addrconf_addr_gen(struct inet6_dev *idev, bool prefix_route)
                return;
 
        /* no link local addresses on devices flagged as slaves */
-       if (idev->dev->flags & IFF_SLAVE)
+       if (idev->dev->priv_flags & IFF_NO_ADDRCONF)
                return;
 
        ipv6_addr_set(&addr, htonl(0xFE800000), 0, 0, 0);
@@ -3560,7 +3560,7 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event,
                if (idev && idev->cnf.disable_ipv6)
                        break;
 
-               if (dev->flags & IFF_SLAVE) {
+               if (dev->priv_flags & IFF_NO_ADDRCONF) {
                        if (event == NETDEV_UP && !IS_ERR_OR_NULL(idev) &&
                            dev->flags & IFF_UP && dev->flags & IFF_MULTICAST)
                                ipv6_mc_up(idev);
index 3ee3456..00dc2e3 100644 (file)
@@ -77,7 +77,7 @@ static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb,
        struct sk_buff *segs = ERR_PTR(-EINVAL);
        struct ipv6hdr *ipv6h;
        const struct net_offload *ops;
-       int proto, nexthdr;
+       int proto, err;
        struct frag_hdr *fptr;
        unsigned int payload_len;
        u8 *prevhdr;
@@ -87,28 +87,9 @@ static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb,
        bool gso_partial;
 
        skb_reset_network_header(skb);
-       nexthdr = ipv6_has_hopopt_jumbo(skb);
-       if (nexthdr) {
-               const int hophdr_len = sizeof(struct hop_jumbo_hdr);
-               int err;
-
-               err = skb_cow_head(skb, 0);
-               if (err < 0)
-                       return ERR_PTR(err);
-
-               /* remove the HBH header.
-                * Layout: [Ethernet header][IPv6 header][HBH][TCP header]
-                */
-               memmove(skb_mac_header(skb) + hophdr_len,
-                       skb_mac_header(skb),
-                       ETH_HLEN + sizeof(struct ipv6hdr));
-               skb->data += hophdr_len;
-               skb->len -= hophdr_len;
-               skb->network_header += hophdr_len;
-               skb->mac_header += hophdr_len;
-               ipv6h = (struct ipv6hdr *)skb->data;
-               ipv6h->nexthdr = nexthdr;
-       }
+       err = ipv6_hopopt_jumbo_remove(skb);
+       if (err)
+               return ERR_PTR(err);
        nhoff = skb_network_header(skb) - skb_mac_header(skb);
        if (unlikely(!pskb_may_pull(skb, sizeof(*ipv6h))))
                goto out;
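
The deleted block moves behind a helper. Below is a reconstruction from the removed lines; the real definition (presumably ipv6_hopopt_jumbo_remove() in a shared header) is outside this excerpt:

    static inline int ipv6_hopopt_jumbo_remove(struct sk_buff *skb)
    {
            const int hophdr_len = sizeof(struct hop_jumbo_hdr);
            int nexthdr = ipv6_has_hopopt_jumbo(skb);
            struct ipv6hdr *ipv6h;
            int err;

            if (!nexthdr)
                    return 0;

            err = skb_cow_head(skb, 0);
            if (err < 0)
                    return err;

            /* Remove the HBH header.
             * Layout: [Ethernet header][IPv6 header][HBH][TCP header]
             */
            memmove(skb_mac_header(skb) + hophdr_len, skb_mac_header(skb),
                    ETH_HLEN + sizeof(struct ipv6hdr));

            skb->data += hophdr_len;
            skb->len -= hophdr_len;
            skb->network_header += hophdr_len;
            skb->mac_header += hophdr_len;

            ipv6h = (struct ipv6hdr *)skb->data;
            ipv6h->nexthdr = nexthdr;

            return 0;
    }
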
index e195076..60fd91b 100644 (file)
@@ -920,6 +920,9 @@ int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
                if (err < 0)
                        goto fail;
 
+               /* We prevent @rt from being freed. */
+               rcu_read_lock();
+
                for (;;) {
                        /* Prepare header of the next frame,
                         * before previous one went down. */
@@ -943,6 +946,7 @@ int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
                if (err == 0) {
                        IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
                                      IPSTATS_MIB_FRAGOKS);
+                       rcu_read_unlock();
                        return 0;
                }
 
@@ -950,6 +954,7 @@ int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
 
                IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
                              IPSTATS_MIB_FRAGFAILS);
+               rcu_read_unlock();
                return err;
 
 slow_path_clean:
index e0e10f6..c39c1e3 100644 (file)
@@ -42,7 +42,8 @@ static struct sk_buff *udp6_ufo_fragment(struct sk_buff *skb,
                if (!pskb_may_pull(skb, sizeof(struct udphdr)))
                        goto out;
 
-               if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4)
+               if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4 &&
+                   !skb_gso_ok(skb, features | NETIF_F_GSO_ROBUST))
                        return __udp_gso_segment(skb, features, true);
 
                mss = skb_shinfo(skb)->gso_size;
index d9b5088..ac0b280 100644 (file)
@@ -254,7 +254,6 @@ ieee802154_check_concurrent_iface(struct ieee802154_sub_if_data *sdata,
                                  enum nl802154_iftype iftype)
 {
        struct ieee802154_local *local = sdata->local;
-       struct wpan_dev *wpan_dev = &sdata->wpan_dev;
        struct ieee802154_sub_if_data *nsdata;
 
        /* we hold the RTNL here so can safely walk the list */
@@ -262,13 +261,13 @@ ieee802154_check_concurrent_iface(struct ieee802154_sub_if_data *sdata,
                if (nsdata != sdata && ieee802154_sdata_running(nsdata)) {
                        int ret;
 
-                       /* TODO currently we don't support multiple node types
-                        * we need to run skb_clone at rx path. Check if there
-                        * exist really an use case if we need to support
-                        * multiple node types at the same time.
+                       /* TODO currently we don't support multiple node/coord
+                        * types, as we would need to run skb_clone in the rx
+                        * path. Check whether there really is a use case for
+                        * supporting multiple node/coord types at the same
+                        * time.
                         */
-                       if (wpan_dev->iftype == NL802154_IFTYPE_NODE &&
-                           nsdata->wpan_dev.iftype == NL802154_IFTYPE_NODE)
+                       if (sdata->wpan_dev.iftype != NL802154_IFTYPE_MONITOR &&
+                           nsdata->wpan_dev.iftype != NL802154_IFTYPE_MONITOR)
                                return -EBUSY;
 
                        /* check all phy mac sublayer settings are the same.
@@ -565,6 +564,7 @@ ieee802154_setup_sdata(struct ieee802154_sub_if_data *sdata,
        wpan_dev->short_addr = cpu_to_le16(IEEE802154_ADDR_BROADCAST);
 
        switch (type) {
+       case NL802154_IFTYPE_COORD:
        case NL802154_IFTYPE_NODE:
                ieee802154_be64_to_le64(&wpan_dev->extended_addr,
                                        sdata->dev->dev_addr);
@@ -624,6 +624,7 @@ ieee802154_if_add(struct ieee802154_local *local, const char *name,
        ieee802154_le64_to_be64(ndev->perm_addr,
                                &local->hw.phy->perm_extended_addr);
        switch (type) {
+       case NL802154_IFTYPE_COORD:
        case NL802154_IFTYPE_NODE:
                ndev->type = ARPHRD_IEEE802154;
                if (ieee802154_is_valid_extended_unicast_addr(extended_addr)) {
@@ -650,6 +651,7 @@ ieee802154_if_add(struct ieee802154_local *local, const char *name,
        sdata->dev = ndev;
        sdata->wpan_dev.wpan_phy = local->hw.phy;
        sdata->local = local;
+       INIT_LIST_HEAD(&sdata->wpan_dev.list);
 
        /* setup type-dependent data */
        ret = ieee802154_setup_sdata(sdata, type);
index 40fab08..3ed31da 100644 (file)
@@ -107,7 +107,7 @@ ieee802154_alloc_hw(size_t priv_data_len, const struct ieee802154_ops *ops)
        phy->supported.lbt = NL802154_SUPPORTED_BOOL_FALSE;
 
        /* always supported */
-       phy->supported.iftypes = BIT(NL802154_IFTYPE_NODE);
+       phy->supported.iftypes = BIT(NL802154_IFTYPE_NODE) | BIT(NL802154_IFTYPE_COORD);
 
        return &local->hw;
 }
index 0724aac..c2aae2a 100644 (file)
@@ -208,6 +208,7 @@ __ieee802154_rx_handle_packet(struct ieee802154_local *local,
        int ret;
        struct ieee802154_sub_if_data *sdata;
        struct ieee802154_hdr hdr;
+       struct sk_buff *skb2;
 
        ret = ieee802154_parse_frame_start(skb, &hdr);
        if (ret) {
@@ -217,7 +218,7 @@ __ieee802154_rx_handle_packet(struct ieee802154_local *local,
        }
 
        list_for_each_entry_rcu(sdata, &local->interfaces, list) {
-               if (sdata->wpan_dev.iftype != NL802154_IFTYPE_NODE)
+               if (sdata->wpan_dev.iftype == NL802154_IFTYPE_MONITOR)
                        continue;
 
                if (!ieee802154_sdata_running(sdata))
@@ -230,12 +231,12 @@ __ieee802154_rx_handle_packet(struct ieee802154_local *local,
                    sdata->required_filtering == IEEE802154_FILTERING_4_FRAME_FIELDS)
                        continue;
 
-               ieee802154_subif_frame(sdata, skb, &hdr);
-               skb = NULL;
-               break;
+               skb2 = skb_clone(skb, GFP_ATOMIC);
+               if (skb2) {
+                       skb2->dev = sdata->dev;
+                       ieee802154_subif_frame(sdata, skb2, &hdr);
+               }
        }
-
-       kfree_skb(skb);
 }
 
 static void
@@ -274,7 +275,7 @@ void ieee802154_rx(struct ieee802154_local *local, struct sk_buff *skb)
        WARN_ON_ONCE(softirq_count() == 0);
 
        if (local->suspended)
-               goto drop;
+               goto free_skb;
 
        /* TODO: When a transceiver omits the checksum here, we
         * add an own calculated one. This is currently an ugly
@@ -292,20 +293,17 @@ void ieee802154_rx(struct ieee802154_local *local, struct sk_buff *skb)
        /* Level 1 filtering: Check the FCS by software when relevant */
        if (local->hw.phy->filtering == IEEE802154_FILTERING_NONE) {
                crc = crc_ccitt(0, skb->data, skb->len);
-               if (crc) {
-                       rcu_read_unlock();
+               if (crc)
                        goto drop;
-               }
        }
        /* remove crc */
        skb_trim(skb, skb->len - 2);
 
        __ieee802154_rx_handle_packet(local, skb);
 
-       rcu_read_unlock();
-
-       return;
 drop:
+       rcu_read_unlock();
+free_skb:
        kfree_skb(skb);
 }
 
index df855c3..689396d 100644 (file)
@@ -264,6 +264,31 @@ TRACE_EVENT(802154_drv_set_promiscuous_mode,
                  BOOL_TO_STR(__entry->on))
 );
 
+TRACE_EVENT(802154_new_scan_event,
+       TP_PROTO(struct ieee802154_coord_desc *desc),
+       TP_ARGS(desc),
+       TP_STRUCT__entry(
+               __field(__le16, pan_id)
+               __field(__le64, addr)
+               __field(u8, channel)
+               __field(u8, page)
+       ),
+       TP_fast_assign(
+               __entry->page = desc->page;
+               __entry->channel = desc->channel;
+               __entry->pan_id = desc->addr.pan_id;
+               __entry->addr = desc->addr.extended_addr;
+       ),
+       TP_printk("panid: %u, coord_addr: 0x%llx, page: %u, channel: %u",
+                 __le16_to_cpu(__entry->pan_id), __le64_to_cpu(__entry->addr),
+                 __entry->page, __entry->channel)
+);
+
+DEFINE_EVENT(802154_new_scan_event, 802154_scan_event,
+       TP_PROTO(struct ieee802154_coord_desc *desc),
+       TP_ARGS(desc)
+);
+
 #endif /* !__MAC802154_DRIVER_TRACE || TRACE_HEADER_MULTI_READ */
 
 #undef TRACE_INCLUDE_PATH
index eef69d0..2ea7eae 100644 (file)
@@ -1190,7 +1190,7 @@ static int mptcp_pm_parse_pm_addr_attr(struct nlattr *tb[],
 
        if (!tb[MPTCP_PM_ADDR_ATTR_FAMILY]) {
                if (!require_family)
-                       return err;
+                       return 0;
 
                NL_SET_ERR_MSG_ATTR(info->extack, attr,
                                    "missing family");
@@ -1224,7 +1224,7 @@ static int mptcp_pm_parse_pm_addr_attr(struct nlattr *tb[],
        if (tb[MPTCP_PM_ADDR_ATTR_PORT])
                addr->port = htons(nla_get_u16(tb[MPTCP_PM_ADDR_ATTR_PORT]));
 
-       return err;
+       return 0;
 }
 
 int mptcp_pm_parse_addr(struct nlattr *attr, struct genl_info *info,
@@ -2094,7 +2094,7 @@ void mptcp_event_addr_removed(const struct mptcp_sock *msk, uint8_t id)
        return;
 
 nla_put_failure:
-       kfree_skb(skb);
+       nlmsg_free(skb);
 }
 
 void mptcp_event_addr_announced(const struct sock *ssk,
@@ -2151,7 +2151,7 @@ void mptcp_event_addr_announced(const struct sock *ssk,
        return;
 
 nla_put_failure:
-       kfree_skb(skb);
+       nlmsg_free(skb);
 }
 
 void mptcp_event_pm_listener(const struct sock *ssk,
@@ -2203,7 +2203,7 @@ void mptcp_event_pm_listener(const struct sock *ssk,
        return;
 
 nla_put_failure:
-       kfree_skb(skb);
+       nlmsg_free(skb);
 }
 
 void mptcp_event(enum mptcp_event_type type, const struct mptcp_sock *msk,
@@ -2261,7 +2261,7 @@ void mptcp_event(enum mptcp_event_type type, const struct mptcp_sock *msk,
        return;
 
 nla_put_failure:
-       kfree_skb(skb);
+       nlmsg_free(skb);
 }
 
 static const struct genl_small_ops mptcp_pm_ops[] = {
index a47423e..d4b1e6e 100644 (file)
@@ -740,7 +740,7 @@ static int mptcp_setsockopt_v4_set_tos(struct mptcp_sock *msk, int optname,
        }
        release_sock(sk);
 
-       return err;
+       return 0;
 }
 
 static int mptcp_setsockopt_v4(struct mptcp_sock *msk, int optname,
index dda8b76..fd2236e 100644 (file)
@@ -228,7 +228,8 @@ static int ncsi_cmd_handler_oem(struct sk_buff *skb,
        len += max(payload, padding_bytes);
 
        cmd = skb_put_zero(skb, len);
-       memcpy(&cmd->mfr_id, nca->data, nca->payload);
+       unsafe_memcpy(&cmd->mfr_id, nca->data, nca->payload,
+                     /* skb allocated with enough to load the payload */);
        ncsi_cmd_build_header(&cmd->cmd.common, nca);
 
        return 0;
index 0846bd7..f71b41c 100644 (file)
@@ -459,6 +459,9 @@ config NF_NAT_REDIRECT
 config NF_NAT_MASQUERADE
        bool
 
+config NF_NAT_OVS
+       bool
+
 config NETFILTER_SYNPROXY
        tristate
 
index 1d4db19..3754eb0 100644 (file)
@@ -59,6 +59,7 @@ obj-$(CONFIG_NF_LOG_SYSLOG) += nf_log_syslog.o
 obj-$(CONFIG_NF_NAT) += nf_nat.o
 nf_nat-$(CONFIG_NF_NAT_REDIRECT) += nf_nat_redirect.o
 nf_nat-$(CONFIG_NF_NAT_MASQUERADE) += nf_nat_masquerade.o
+nf_nat-$(CONFIG_NF_NAT_OVS) += nf_nat_ovs.o
 
 ifeq ($(CONFIG_NF_NAT),m)
 nf_nat-$(CONFIG_DEBUG_INFO_BTF_MODULES) += nf_nat_bpf.o
index 7499192..7c23995 100644 (file)
@@ -159,6 +159,17 @@ htable_size(u8 hbits)
        (SET_WITH_TIMEOUT(set) &&       \
         ip_set_timeout_expired(ext_timeout(d, set)))
 
+#if defined(IP_SET_HASH_WITH_NETMASK) || defined(IP_SET_HASH_WITH_BITMASK)
+static const union nf_inet_addr onesmask = {
+       .all[0] = 0xffffffff,
+       .all[1] = 0xffffffff,
+       .all[2] = 0xffffffff,
+       .all[3] = 0xffffffff
+};
+
+static const union nf_inet_addr zeromask = {};
+#endif
+
 #endif /* _IP_SET_HASH_GEN_H */
 
 #ifndef MTYPE
@@ -283,8 +294,9 @@ struct htype {
        u32 markmask;           /* markmask value for mark mask to store */
 #endif
        u8 bucketsize;          /* max elements in an array block */
-#ifdef IP_SET_HASH_WITH_NETMASK
+#if defined(IP_SET_HASH_WITH_NETMASK) || defined(IP_SET_HASH_WITH_BITMASK)
        u8 netmask;             /* netmask value for subnets to store */
+       union nf_inet_addr bitmask;     /* stores bitmask */
 #endif
        struct list_head ad;    /* Resize add|del backlist */
        struct mtype_elem next; /* temporary storage for uadd */
@@ -459,8 +471,8 @@ mtype_same_set(const struct ip_set *a, const struct ip_set *b)
        /* Resizing changes htable_bits, so we ignore it */
        return x->maxelem == y->maxelem &&
               a->timeout == b->timeout &&
-#ifdef IP_SET_HASH_WITH_NETMASK
-              x->netmask == y->netmask &&
+#if defined(IP_SET_HASH_WITH_NETMASK) || defined(IP_SET_HASH_WITH_BITMASK)
+              nf_inet_addr_cmp(&x->bitmask, &y->bitmask) &&
 #endif
 #ifdef IP_SET_HASH_WITH_MARKMASK
               x->markmask == y->markmask &&
@@ -1264,9 +1276,21 @@ mtype_head(struct ip_set *set, struct sk_buff *skb)
                          htonl(jhash_size(htable_bits))) ||
            nla_put_net32(skb, IPSET_ATTR_MAXELEM, htonl(h->maxelem)))
                goto nla_put_failure;
+#ifdef IP_SET_HASH_WITH_BITMASK
+       /* If netmask is set to anything other than HOST_MASK, we know that
+        * the user supplied a netmask and not a bitmask. The two options
+        * are mutually exclusive.
+        */
+       if (h->netmask == HOST_MASK && !nf_inet_addr_cmp(&onesmask, &h->bitmask)) {
+               if (set->family == NFPROTO_IPV4) {
+                       if (nla_put_ipaddr4(skb, IPSET_ATTR_BITMASK, h->bitmask.ip))
+                               goto nla_put_failure;
+               } else if (set->family == NFPROTO_IPV6) {
+                       if (nla_put_ipaddr6(skb, IPSET_ATTR_BITMASK, &h->bitmask.in6))
+                               goto nla_put_failure;
+               }
+       }
+#endif
 #ifdef IP_SET_HASH_WITH_NETMASK
-       if (h->netmask != HOST_MASK &&
-           nla_put_u8(skb, IPSET_ATTR_NETMASK, h->netmask))
+       if (h->netmask != HOST_MASK && nla_put_u8(skb, IPSET_ATTR_NETMASK, h->netmask))
                goto nla_put_failure;
 #endif
 #ifdef IP_SET_HASH_WITH_MARKMASK
@@ -1429,8 +1453,10 @@ IPSET_TOKEN(HTYPE, _create)(struct net *net, struct ip_set *set,
        u32 markmask;
 #endif
        u8 hbits;
-#ifdef IP_SET_HASH_WITH_NETMASK
-       u8 netmask;
+#if defined(IP_SET_HASH_WITH_NETMASK) || defined(IP_SET_HASH_WITH_BITMASK)
+       int ret __attribute__((unused)) = 0;
+       u8 netmask = set->family == NFPROTO_IPV4 ? 32 : 128;
+       union nf_inet_addr bitmask = onesmask;
 #endif
        size_t hsize;
        struct htype *h;
@@ -1468,7 +1494,6 @@ IPSET_TOKEN(HTYPE, _create)(struct net *net, struct ip_set *set,
 #endif
 
 #ifdef IP_SET_HASH_WITH_NETMASK
-       netmask = set->family == NFPROTO_IPV4 ? 32 : 128;
        if (tb[IPSET_ATTR_NETMASK]) {
                netmask = nla_get_u8(tb[IPSET_ATTR_NETMASK]);
 
@@ -1476,6 +1501,33 @@ IPSET_TOKEN(HTYPE, _create)(struct net *net, struct ip_set *set,
                    (set->family == NFPROTO_IPV6 && netmask > 128) ||
                    netmask == 0)
                        return -IPSET_ERR_INVALID_NETMASK;
+
+               /* we convert netmask to bitmask and store it */
+               if (set->family == NFPROTO_IPV4)
+                       bitmask.ip = ip_set_netmask(netmask);
+               else
+                       ip6_netmask(&bitmask, netmask);
+       }
+#endif
+
+#ifdef IP_SET_HASH_WITH_BITMASK
+       if (tb[IPSET_ATTR_BITMASK]) {
+               /* bitmask and netmask do the same thing, allow only one of these options */
+               if (tb[IPSET_ATTR_NETMASK])
+                       return -IPSET_ERR_BITMASK_NETMASK_EXCL;
+
+               if (set->family == NFPROTO_IPV4) {
+                       ret = ip_set_get_ipaddr4(tb[IPSET_ATTR_BITMASK], &bitmask.ip);
+                       if (ret || !bitmask.ip)
+                               return -IPSET_ERR_INVALID_NETMASK;
+               } else if (set->family == NFPROTO_IPV6) {
+                       ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_BITMASK], &bitmask);
+                       if (ret || ipv6_addr_any(&bitmask.in6))
+                               return -IPSET_ERR_INVALID_NETMASK;
+               }
+
+               if (nf_inet_addr_cmp(&bitmask, &zeromask))
+                       return -IPSET_ERR_INVALID_NETMASK;
        }
 #endif
 
@@ -1518,7 +1570,8 @@ IPSET_TOKEN(HTYPE, _create)(struct net *net, struct ip_set *set,
        for (i = 0; i < ahash_numof_locks(hbits); i++)
                spin_lock_init(&t->hregion[i].lock);
        h->maxelem = maxelem;
-#ifdef IP_SET_HASH_WITH_NETMASK
+#if defined(IP_SET_HASH_WITH_NETMASK) || defined(IP_SET_HASH_WITH_BITMASK)
+       h->bitmask = bitmask;
        h->netmask = netmask;
 #endif
 #ifdef IP_SET_HASH_WITH_MARKMASK
index 75d556d..e30513c 100644 (file)
@@ -24,7 +24,8 @@
 /*                             2          Comments support */
 /*                             3          Forceadd support */
 /*                             4          skbinfo support */
-#define IPSET_TYPE_REV_MAX     5       /* bucketsize, initval support  */
+/*                             5          bucketsize, initval support  */
+#define IPSET_TYPE_REV_MAX     6       /* bitmask support  */
 
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>");
@@ -34,6 +35,7 @@ MODULE_ALIAS("ip_set_hash:ip");
 /* Type specific function prefix */
 #define HTYPE          hash_ip
 #define IP_SET_HASH_WITH_NETMASK
+#define IP_SET_HASH_WITH_BITMASK
 
 /* IPv4 variant */
 
@@ -86,7 +88,7 @@ hash_ip4_kadt(struct ip_set *set, const struct sk_buff *skb,
        __be32 ip;
 
        ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &ip);
-       ip &= ip_set_netmask(h->netmask);
+       ip &= h->bitmask.ip;
        if (ip == 0)
                return -EINVAL;
 
@@ -119,7 +121,7 @@ hash_ip4_uadt(struct ip_set *set, struct nlattr *tb[],
        if (ret)
                return ret;
 
-       ip &= ip_set_hostmask(h->netmask);
+       ip &= ntohl(h->bitmask.ip);
        e.ip = htonl(ip);
        if (e.ip == 0)
                return -IPSET_ERR_HASH_ELEM;
@@ -185,12 +187,6 @@ hash_ip6_data_equal(const struct hash_ip6_elem *ip1,
        return ipv6_addr_equal(&ip1->ip.in6, &ip2->ip.in6);
 }
 
-static void
-hash_ip6_netmask(union nf_inet_addr *ip, u8 prefix)
-{
-       ip6_netmask(ip, prefix);
-}
-
 static bool
 hash_ip6_data_list(struct sk_buff *skb, const struct hash_ip6_elem *e)
 {
@@ -227,7 +223,7 @@ hash_ip6_kadt(struct ip_set *set, const struct sk_buff *skb,
        struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);
 
        ip6addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip.in6);
-       hash_ip6_netmask(&e.ip, h->netmask);
+       nf_inet_addr_mask_inplace(&e.ip, &h->bitmask);
        if (ipv6_addr_any(&e.ip.in6))
                return -EINVAL;
 
@@ -266,7 +262,7 @@ hash_ip6_uadt(struct ip_set *set, struct nlattr *tb[],
        if (ret)
                return ret;
 
-       hash_ip6_netmask(&e.ip, h->netmask);
+       nf_inet_addr_mask_inplace(&e.ip, &h->bitmask);
        if (ipv6_addr_any(&e.ip.in6))
                return -IPSET_ERR_HASH_ELEM;
 
@@ -293,6 +289,7 @@ static struct ip_set_type hash_ip_type __read_mostly = {
                [IPSET_ATTR_RESIZE]     = { .type = NLA_U8  },
                [IPSET_ATTR_TIMEOUT]    = { .type = NLA_U32 },
                [IPSET_ATTR_NETMASK]    = { .type = NLA_U8  },
+               [IPSET_ATTR_BITMASK]    = { .type = NLA_NESTED },
                [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 },
        },
        .adt_policy     = {
index 7303138..2ffbd0b 100644 (file)
@@ -26,7 +26,8 @@
 /*                             3    Comments support added */
 /*                             4    Forceadd support added */
 /*                             5    skbinfo support added */
-#define IPSET_TYPE_REV_MAX     6 /* bucketsize, initval support added */
+/*                             6    bucketsize, initval support added */
+#define IPSET_TYPE_REV_MAX     7 /* bitmask support added */
 
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>");
@@ -35,6 +36,8 @@ MODULE_ALIAS("ip_set_hash:ip,port");
 
 /* Type specific function prefix */
 #define HTYPE          hash_ipport
+#define IP_SET_HASH_WITH_NETMASK
+#define IP_SET_HASH_WITH_BITMASK
 
 /* IPv4 variant */
 
@@ -92,12 +95,16 @@ hash_ipport4_kadt(struct ip_set *set, const struct sk_buff *skb,
        ipset_adtfn adtfn = set->variant->adt[adt];
        struct hash_ipport4_elem e = { .ip = 0 };
        struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);
+       const struct MTYPE *h = set->data;
 
        if (!ip_set_get_ip4_port(skb, opt->flags & IPSET_DIM_TWO_SRC,
                                 &e.port, &e.proto))
                return -EINVAL;
 
        ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip);
+       e.ip &= h->bitmask.ip;
+       if (e.ip == 0)
+               return -EINVAL;
        return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags);
 }
 
@@ -129,6 +136,10 @@ hash_ipport4_uadt(struct ip_set *set, struct nlattr *tb[],
        if (ret)
                return ret;
 
+       e.ip &= h->bitmask.ip;
+       if (e.ip == 0)
+               return -EINVAL;
+
        e.port = nla_get_be16(tb[IPSET_ATTR_PORT]);
 
        if (tb[IPSET_ATTR_PROTO]) {
@@ -253,12 +264,17 @@ hash_ipport6_kadt(struct ip_set *set, const struct sk_buff *skb,
        ipset_adtfn adtfn = set->variant->adt[adt];
        struct hash_ipport6_elem e = { .ip = { .all = { 0 } } };
        struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);
+       const struct MTYPE *h = set->data;
 
        if (!ip_set_get_ip6_port(skb, opt->flags & IPSET_DIM_TWO_SRC,
                                 &e.port, &e.proto))
                return -EINVAL;
 
        ip6addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip.in6);
+       nf_inet_addr_mask_inplace(&e.ip, &h->bitmask);
+       if (ipv6_addr_any(&e.ip.in6))
+               return -EINVAL;
+
        return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags);
 }
 
@@ -298,6 +314,10 @@ hash_ipport6_uadt(struct ip_set *set, struct nlattr *tb[],
        if (ret)
                return ret;
 
+       nf_inet_addr_mask_inplace(&e.ip, &h->bitmask);
+       if (ipv6_addr_any(&e.ip.in6))
+               return -EINVAL;
+
        e.port = nla_get_be16(tb[IPSET_ATTR_PORT]);
 
        if (tb[IPSET_ATTR_PROTO]) {
@@ -356,6 +376,8 @@ static struct ip_set_type hash_ipport_type __read_mostly = {
                [IPSET_ATTR_PROTO]      = { .type = NLA_U8 },
                [IPSET_ATTR_TIMEOUT]    = { .type = NLA_U32 },
                [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 },
+               [IPSET_ATTR_NETMASK]    = { .type = NLA_U8 },
+               [IPSET_ATTR_BITMASK]    = { .type = NLA_NESTED },
        },
        .adt_policy     = {
                [IPSET_ATTR_IP]         = { .type = NLA_NESTED },
index 3d09eef..cdfb78c 100644 (file)
@@ -23,7 +23,8 @@
 #define IPSET_TYPE_REV_MIN     0
 /*                             1          Forceadd support added */
 /*                             2          skbinfo support added */
-#define IPSET_TYPE_REV_MAX     3       /* bucketsize, initval support added */
+/*                             3          bucketsize, initval support added */
+#define IPSET_TYPE_REV_MAX     4       /* bitmask support added */
 
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Oliver Smith <oliver@8.c.9.b.0.7.4.0.1.0.0.2.ip6.arpa>");
@@ -33,6 +34,8 @@ MODULE_ALIAS("ip_set_hash:net,net");
 /* Type specific function prefix */
 #define HTYPE          hash_netnet
 #define IP_SET_HASH_WITH_NETS
+#define IP_SET_HASH_WITH_NETMASK
+#define IP_SET_HASH_WITH_BITMASK
 #define IPSET_NET_COUNT 2
 
 /* IPv4 variants */
@@ -153,8 +156,8 @@ hash_netnet4_kadt(struct ip_set *set, const struct sk_buff *skb,
 
        ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip[0]);
        ip4addrptr(skb, opt->flags & IPSET_DIM_TWO_SRC, &e.ip[1]);
-       e.ip[0] &= ip_set_netmask(e.cidr[0]);
-       e.ip[1] &= ip_set_netmask(e.cidr[1]);
+       e.ip[0] &= (ip_set_netmask(e.cidr[0]) & h->bitmask.ip);
+       e.ip[1] &= (ip_set_netmask(e.cidr[1]) & h->bitmask.ip);
 
        return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags);
 }
@@ -213,8 +216,8 @@ hash_netnet4_uadt(struct ip_set *set, struct nlattr *tb[],
 
        if (adt == IPSET_TEST || !(tb[IPSET_ATTR_IP_TO] ||
                                   tb[IPSET_ATTR_IP2_TO])) {
-               e.ip[0] = htonl(ip & ip_set_hostmask(e.cidr[0]));
-               e.ip[1] = htonl(ip2_from & ip_set_hostmask(e.cidr[1]));
+               e.ip[0] = htonl(ip & ntohl(h->bitmask.ip) & ip_set_hostmask(e.cidr[0]));
+               e.ip[1] = htonl(ip2_from & ntohl(h->bitmask.ip) & ip_set_hostmask(e.cidr[1]));
                ret = adtfn(set, &e, &ext, &ext, flags);
                return ip_set_enomatch(ret, flags, adt, set) ? -ret :
                       ip_set_eexist(ret, flags) ? 0 : ret;
@@ -404,6 +407,11 @@ hash_netnet6_kadt(struct ip_set *set, const struct sk_buff *skb,
        ip6_netmask(&e.ip[0], e.cidr[0]);
        ip6_netmask(&e.ip[1], e.cidr[1]);
 
+       nf_inet_addr_mask_inplace(&e.ip[0], &h->bitmask);
+       nf_inet_addr_mask_inplace(&e.ip[1], &h->bitmask);
+       if (e.cidr[0] == HOST_MASK && ipv6_addr_any(&e.ip[0].in6))
+               return -EINVAL;
+
        return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags);
 }
 
@@ -414,6 +422,7 @@ hash_netnet6_uadt(struct ip_set *set, struct nlattr *tb[],
        ipset_adtfn adtfn = set->variant->adt[adt];
        struct hash_netnet6_elem e = { };
        struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
+       const struct hash_netnet6 *h = set->data;
        int ret;
 
        if (tb[IPSET_ATTR_LINENO])
@@ -453,6 +462,11 @@ hash_netnet6_uadt(struct ip_set *set, struct nlattr *tb[],
        ip6_netmask(&e.ip[0], e.cidr[0]);
        ip6_netmask(&e.ip[1], e.cidr[1]);
 
+       nf_inet_addr_mask_inplace(&e.ip[0], &h->bitmask);
+       nf_inet_addr_mask_inplace(&e.ip[1], &h->bitmask);
+       if (e.cidr[0] == HOST_MASK && ipv6_addr_any(&e.ip[0].in6))
+               return -IPSET_ERR_HASH_ELEM;
+
        if (tb[IPSET_ATTR_CADT_FLAGS]) {
                u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]);
 
@@ -484,6 +498,8 @@ static struct ip_set_type hash_netnet_type __read_mostly = {
                [IPSET_ATTR_RESIZE]     = { .type = NLA_U8  },
                [IPSET_ATTR_TIMEOUT]    = { .type = NLA_U32 },
                [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 },
+               [IPSET_ATTR_NETMASK]    = { .type = NLA_U8 },
+               [IPSET_ATTR_BITMASK]    = { .type = NLA_NESTED },
        },
        .adt_policy     = {
                [IPSET_ATTR_IP]         = { .type = NLA_NESTED },
index 51ad557..2fcc265 100644 (file)
@@ -132,21 +132,21 @@ ip_vs_in_stats(struct ip_vs_conn *cp, struct sk_buff *skb)
 
                s = this_cpu_ptr(dest->stats.cpustats);
                u64_stats_update_begin(&s->syncp);
-               s->cnt.inpkts++;
-               s->cnt.inbytes += skb->len;
+               u64_stats_inc(&s->cnt.inpkts);
+               u64_stats_add(&s->cnt.inbytes, skb->len);
                u64_stats_update_end(&s->syncp);
 
                svc = rcu_dereference(dest->svc);
                s = this_cpu_ptr(svc->stats.cpustats);
                u64_stats_update_begin(&s->syncp);
-               s->cnt.inpkts++;
-               s->cnt.inbytes += skb->len;
+               u64_stats_inc(&s->cnt.inpkts);
+               u64_stats_add(&s->cnt.inbytes, skb->len);
                u64_stats_update_end(&s->syncp);
 
-               s = this_cpu_ptr(ipvs->tot_stats.cpustats);
+               s = this_cpu_ptr(ipvs->tot_stats->s.cpustats);
                u64_stats_update_begin(&s->syncp);
-               s->cnt.inpkts++;
-               s->cnt.inbytes += skb->len;
+               u64_stats_inc(&s->cnt.inpkts);
+               u64_stats_add(&s->cnt.inbytes, skb->len);
                u64_stats_update_end(&s->syncp);
 
                local_bh_enable();
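
u64_stats_inc()/u64_stats_add() put the counters behind the u64_stats API so 32-bit SMP readers cannot observe torn 64-bit values; on 64-bit builds they compile down to plain operations. The matching reader side (assumed, not part of this hunk) pairs fetch/retry with the writer's update_begin/update_end:

    unsigned int start;
    u64 inpkts, inbytes;

    do {
            start   = u64_stats_fetch_begin(&s->syncp);
            inpkts  = u64_stats_read(&s->cnt.inpkts);
            inbytes = u64_stats_read(&s->cnt.inbytes);
    } while (u64_stats_fetch_retry(&s->syncp, start));
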
@@ -168,21 +168,21 @@ ip_vs_out_stats(struct ip_vs_conn *cp, struct sk_buff *skb)
 
                s = this_cpu_ptr(dest->stats.cpustats);
                u64_stats_update_begin(&s->syncp);
-               s->cnt.outpkts++;
-               s->cnt.outbytes += skb->len;
+               u64_stats_inc(&s->cnt.outpkts);
+               u64_stats_add(&s->cnt.outbytes, skb->len);
                u64_stats_update_end(&s->syncp);
 
                svc = rcu_dereference(dest->svc);
                s = this_cpu_ptr(svc->stats.cpustats);
                u64_stats_update_begin(&s->syncp);
-               s->cnt.outpkts++;
-               s->cnt.outbytes += skb->len;
+               u64_stats_inc(&s->cnt.outpkts);
+               u64_stats_add(&s->cnt.outbytes, skb->len);
                u64_stats_update_end(&s->syncp);
 
-               s = this_cpu_ptr(ipvs->tot_stats.cpustats);
+               s = this_cpu_ptr(ipvs->tot_stats->s.cpustats);
                u64_stats_update_begin(&s->syncp);
-               s->cnt.outpkts++;
-               s->cnt.outbytes += skb->len;
+               u64_stats_inc(&s->cnt.outpkts);
+               u64_stats_add(&s->cnt.outbytes, skb->len);
                u64_stats_update_end(&s->syncp);
 
                local_bh_enable();
@@ -200,17 +200,17 @@ ip_vs_conn_stats(struct ip_vs_conn *cp, struct ip_vs_service *svc)
 
        s = this_cpu_ptr(cp->dest->stats.cpustats);
        u64_stats_update_begin(&s->syncp);
-       s->cnt.conns++;
+       u64_stats_inc(&s->cnt.conns);
        u64_stats_update_end(&s->syncp);
 
        s = this_cpu_ptr(svc->stats.cpustats);
        u64_stats_update_begin(&s->syncp);
-       s->cnt.conns++;
+       u64_stats_inc(&s->cnt.conns);
        u64_stats_update_end(&s->syncp);
 
-       s = this_cpu_ptr(ipvs->tot_stats.cpustats);
+       s = this_cpu_ptr(ipvs->tot_stats->s.cpustats);
        u64_stats_update_begin(&s->syncp);
-       s->cnt.conns++;
+       u64_stats_inc(&s->cnt.conns);
        u64_stats_update_end(&s->syncp);
 
        local_bh_enable();
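
The three hunks above all apply the same conversion: open-coded ++/+= on u64 counters become u64_stats_inc()/u64_stats_add() inside the existing update_begin/end pairs, so 32-bit hosts get tear-free 64-bit counters. A self-contained sketch of the writer/reader pairing (hypothetical struct, per-CPU plumbing omitted):

    #include <linux/u64_stats_sync.h>

    struct pkt_counters {
            u64_stats_t             pkts;
            u64_stats_t             bytes;
            struct u64_stats_sync   syncp;
    };

    /* Writer: runs with BH disabled, like the IPVS datapath above */
    static void count_packet(struct pkt_counters *c, unsigned int len)
    {
            u64_stats_update_begin(&c->syncp);
            u64_stats_inc(&c->pkts);
            u64_stats_add(&c->bytes, len);
            u64_stats_update_end(&c->syncp);
    }

    /* Reader: retries if it raced with a writer (a no-op on 64-bit) */
    static void read_packet_counters(struct pkt_counters *c, u64 *pkts, u64 *bytes)
    {
            unsigned int start;

            do {
                    start = u64_stats_fetch_begin(&c->syncp);
                    *pkts = u64_stats_read(&c->pkts);
                    *bytes = u64_stats_read(&c->bytes);
            } while (u64_stats_fetch_retry(&c->syncp, start));
    }
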
@@ -2448,6 +2448,10 @@ static void __exit ip_vs_cleanup(void)
        ip_vs_conn_cleanup();
        ip_vs_protocol_cleanup();
        ip_vs_control_cleanup();
+       /* Shared rcu_barrier(), currently needed by:
+        * - ip_vs_control_cleanup()
+        */
+       rcu_barrier();
        pr_info("ipvs unloaded.\n");
 }
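
This rcu_barrier() pairs with the call_rcu() conversions elsewhere in this patchset (services, dests, tot_stats): a module that queues RCU callbacks must wait for them to finish before its code is unloaded. The general shape, with a hypothetical unregister helper:

    static void __exit example_exit(void)
    {
            example_unregister();   /* stop queueing new call_rcu() callbacks */
            rcu_barrier();          /* wait until every queued callback has run */
    }
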
 
index 4d62059..c9f5985 100644 (file)
@@ -49,8 +49,7 @@
 
 MODULE_ALIAS_GENL_FAMILY(IPVS_GENL_NAME);
 
-/* semaphore for IPVS sockopts. And, [gs]etsockopt may sleep. */
-static DEFINE_MUTEX(__ip_vs_mutex);
+DEFINE_MUTEX(__ip_vs_mutex); /* Serialize configuration with sockopt/netlink */
 
 /* sysctl variables */
 
@@ -241,6 +240,47 @@ static void defense_work_handler(struct work_struct *work)
 }
 #endif
 
+static void est_reload_work_handler(struct work_struct *work)
+{
+       struct netns_ipvs *ipvs =
+               container_of(work, struct netns_ipvs, est_reload_work.work);
+       int genid_done = atomic_read(&ipvs->est_genid_done);
+       unsigned long delay = HZ / 10;  /* retry failed starts after 100ms */
+       bool repeat = false;
+       int genid;
+       int id;
+
+       mutex_lock(&ipvs->est_mutex);
+       genid = atomic_read(&ipvs->est_genid);
+       for (id = 0; id < ipvs->est_kt_count; id++) {
+               struct ip_vs_est_kt_data *kd = ipvs->est_kt_arr[id];
+
+               /* netns cleanup started, abort the delayed work */
+               if (!ipvs->enable)
+                       goto unlock;
+               if (!kd)
+                       continue;
+               /* New config? Stop the kthread tasks */
+               if (genid != genid_done)
+                       ip_vs_est_kthread_stop(kd);
+               if (!kd->task && !ip_vs_est_stopped(ipvs)) {
+                       /* In the calc phase, do not start kthreads with id > 0 */
+                       if ((!id || !ipvs->est_calc_phase) &&
+                           ip_vs_est_kthread_start(ipvs, kd) < 0)
+                               repeat = true;
+               }
+       }
+
+       atomic_set(&ipvs->est_genid_done, genid);
+
+       if (repeat)
+               queue_delayed_work(system_long_wq, &ipvs->est_reload_work,
+                                  delay);
+
+unlock:
+       mutex_unlock(&ipvs->est_mutex);
+}
+
 int
 ip_vs_use_count_inc(void)
 {
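
The handler above is one half of a generation-counter handshake; the other half, ip_vs_est_reload_start() (added later in this diff), bumps est_genid and queues the work. Condensed to its essence (error handling and looping dropped):

    /* Requester, after a cpulist/nice change: */
    atomic_inc(&ipvs->est_genid);
    queue_delayed_work(system_long_wq, &ipvs->est_reload_work, 0);

    /* Worker, under est_mutex, for each kthread context kd: */
    genid = atomic_read(&ipvs->est_genid);
    if (genid != atomic_read(&ipvs->est_genid_done))
            ip_vs_est_kthread_stop(kd);     /* forces a restart below */
    if (!kd->task && !ip_vs_est_stopped(ipvs))
            ip_vs_est_kthread_start(ipvs, kd);
    atomic_set(&ipvs->est_genid_done, genid);
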
@@ -471,7 +511,7 @@ __ip_vs_bind_svc(struct ip_vs_dest *dest, struct ip_vs_service *svc)
 
 static void ip_vs_service_free(struct ip_vs_service *svc)
 {
-       free_percpu(svc->stats.cpustats);
+       ip_vs_stats_release(&svc->stats);
        kfree(svc);
 }
 
@@ -483,17 +523,14 @@ static void ip_vs_service_rcu_free(struct rcu_head *head)
        ip_vs_service_free(svc);
 }
 
-static void __ip_vs_svc_put(struct ip_vs_service *svc, bool do_delay)
+static void __ip_vs_svc_put(struct ip_vs_service *svc)
 {
        if (atomic_dec_and_test(&svc->refcnt)) {
                IP_VS_DBG_BUF(3, "Removing service %u/%s:%u\n",
                              svc->fwmark,
                              IP_VS_DBG_ADDR(svc->af, &svc->addr),
                              ntohs(svc->port));
-               if (do_delay)
-                       call_rcu(&svc->rcu_head, ip_vs_service_rcu_free);
-               else
-                       ip_vs_service_free(svc);
+               call_rcu(&svc->rcu_head, ip_vs_service_rcu_free);
        }
 }
 
@@ -780,14 +817,22 @@ out:
        return dest;
 }
 
+static void ip_vs_dest_rcu_free(struct rcu_head *head)
+{
+       struct ip_vs_dest *dest;
+
+       dest = container_of(head, struct ip_vs_dest, rcu_head);
+       ip_vs_stats_release(&dest->stats);
+       ip_vs_dest_put_and_free(dest);
+}
+
 static void ip_vs_dest_free(struct ip_vs_dest *dest)
 {
        struct ip_vs_service *svc = rcu_dereference_protected(dest->svc, 1);
 
        __ip_vs_dst_cache_reset(dest);
-       __ip_vs_svc_put(svc, false);
-       free_percpu(dest->stats.cpustats);
-       ip_vs_dest_put_and_free(dest);
+       __ip_vs_svc_put(svc);
+       call_rcu(&dest->rcu_head, ip_vs_dest_rcu_free);
 }
 
 /*
@@ -811,12 +856,22 @@ static void ip_vs_trash_cleanup(struct netns_ipvs *ipvs)
        }
 }
 
+static void ip_vs_stats_rcu_free(struct rcu_head *head)
+{
+       struct ip_vs_stats_rcu *rs = container_of(head,
+                                                 struct ip_vs_stats_rcu,
+                                                 rcu_head);
+
+       ip_vs_stats_release(&rs->s);
+       kfree(rs);
+}
+
 static void
 ip_vs_copy_stats(struct ip_vs_kstats *dst, struct ip_vs_stats *src)
 {
 #define IP_VS_SHOW_STATS_COUNTER(c) dst->c = src->kstats.c - src->kstats0.c
 
-       spin_lock_bh(&src->lock);
+       spin_lock(&src->lock);
 
        IP_VS_SHOW_STATS_COUNTER(conns);
        IP_VS_SHOW_STATS_COUNTER(inpkts);
@@ -826,7 +881,7 @@ ip_vs_copy_stats(struct ip_vs_kstats *dst, struct ip_vs_stats *src)
 
        ip_vs_read_estimator(dst, src);
 
-       spin_unlock_bh(&src->lock);
+       spin_unlock(&src->lock);
 }
 
 static void
@@ -847,7 +902,7 @@ ip_vs_export_stats_user(struct ip_vs_stats_user *dst, struct ip_vs_kstats *src)
 static void
 ip_vs_zero_stats(struct ip_vs_stats *stats)
 {
-       spin_lock_bh(&stats->lock);
+       spin_lock(&stats->lock);
 
        /* get current counters as zero point, rates are zeroed */
 
@@ -861,7 +916,48 @@ ip_vs_zero_stats(struct ip_vs_stats *stats)
 
        ip_vs_zero_estimator(stats);
 
-       spin_unlock_bh(&stats->lock);
+       spin_unlock(&stats->lock);
+}
+
+/* Allocate fields after kzalloc */
+int ip_vs_stats_init_alloc(struct ip_vs_stats *s)
+{
+       int i;
+
+       spin_lock_init(&s->lock);
+       s->cpustats = alloc_percpu(struct ip_vs_cpu_stats);
+       if (!s->cpustats)
+               return -ENOMEM;
+
+       for_each_possible_cpu(i) {
+               struct ip_vs_cpu_stats *cs = per_cpu_ptr(s->cpustats, i);
+
+               u64_stats_init(&cs->syncp);
+       }
+       return 0;
+}
+
+struct ip_vs_stats *ip_vs_stats_alloc(void)
+{
+       struct ip_vs_stats *s = kzalloc(sizeof(*s), GFP_KERNEL);
+
+       if (s && ip_vs_stats_init_alloc(s) >= 0)
+               return s;
+       kfree(s);
+       return NULL;
+}
+
+void ip_vs_stats_release(struct ip_vs_stats *stats)
+{
+       free_percpu(stats->cpustats);
+}
+
+void ip_vs_stats_free(struct ip_vs_stats *stats)
+{
+       if (stats) {
+               ip_vs_stats_release(stats);
+               kfree(stats);
+       }
 }
 
 /*
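
These four helpers split the stats lifecycle: ip_vs_stats_init_alloc()/ip_vs_stats_release() operate on an ip_vs_stats embedded in a larger object (svc, dest, tot_stats), while ip_vs_stats_alloc()/ip_vs_stats_free() also own the struct itself (used for kd->calc_stats later in this diff). A usage sketch with a hypothetical caller:

    static struct ip_vs_stats *example_stats_get(void)
    {
            struct ip_vs_stats *s = ip_vs_stats_alloc();

            if (!s)
                    return NULL;
            /* datapath updates s->cpustats under the per-CPU syncp */
            return s;
    }

    static void example_stats_put(struct ip_vs_stats *s)
    {
            ip_vs_stats_free(s);    /* ip_vs_stats_release() + kfree() */
    }
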
@@ -923,7 +1019,7 @@ __ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest,
                if (old_svc != svc) {
                        ip_vs_zero_stats(&dest->stats);
                        __ip_vs_bind_svc(dest, svc);
-                       __ip_vs_svc_put(old_svc, true);
+                       __ip_vs_svc_put(old_svc);
                }
        }
 
@@ -942,7 +1038,6 @@ __ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest,
        spin_unlock_bh(&dest->dst_lock);
 
        if (add) {
-               ip_vs_start_estimator(svc->ipvs, &dest->stats);
                list_add_rcu(&dest->n_list, &svc->destinations);
                svc->num_dests++;
                sched = rcu_dereference_protected(svc->scheduler, 1);
@@ -963,14 +1058,13 @@ static int
 ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
 {
        struct ip_vs_dest *dest;
-       unsigned int atype, i;
+       unsigned int atype;
+       int ret;
 
        EnterFunction(2);
 
 #ifdef CONFIG_IP_VS_IPV6
        if (udest->af == AF_INET6) {
-               int ret;
-
                atype = ipv6_addr_type(&udest->addr.in6);
                if ((!(atype & IPV6_ADDR_UNICAST) ||
                        atype & IPV6_ADDR_LINKLOCAL) &&
@@ -992,15 +1086,13 @@ ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
        if (dest == NULL)
                return -ENOMEM;
 
-       dest->stats.cpustats = alloc_percpu(struct ip_vs_cpu_stats);
-       if (!dest->stats.cpustats)
+       ret = ip_vs_stats_init_alloc(&dest->stats);
+       if (ret < 0)
                goto err_alloc;
 
-       for_each_possible_cpu(i) {
-               struct ip_vs_cpu_stats *ip_vs_dest_stats;
-               ip_vs_dest_stats = per_cpu_ptr(dest->stats.cpustats, i);
-               u64_stats_init(&ip_vs_dest_stats->syncp);
-       }
+       ret = ip_vs_start_estimator(svc->ipvs, &dest->stats);
+       if (ret < 0)
+               goto err_stats;
 
        dest->af = udest->af;
        dest->protocol = svc->protocol;
@@ -1017,15 +1109,17 @@ ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
 
        INIT_HLIST_NODE(&dest->d_list);
        spin_lock_init(&dest->dst_lock);
-       spin_lock_init(&dest->stats.lock);
        __ip_vs_update_dest(svc, dest, udest, 1);
 
        LeaveFunction(2);
        return 0;
 
+err_stats:
+       ip_vs_stats_release(&dest->stats);
+
 err_alloc:
        kfree(dest);
-       return -ENOMEM;
+       return ret;
 }
 
 
@@ -1087,14 +1181,18 @@ ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
                              IP_VS_DBG_ADDR(svc->af, &dest->vaddr),
                              ntohs(dest->vport));
 
+               ret = ip_vs_start_estimator(svc->ipvs, &dest->stats);
+               if (ret < 0)
+                       goto err;
                __ip_vs_update_dest(svc, dest, udest, 1);
-               ret = 0;
        } else {
                /*
                 * Allocate and initialize the dest structure
                 */
                ret = ip_vs_new_dest(svc, udest);
        }
+
+err:
        LeaveFunction(2);
 
        return ret;
@@ -1284,7 +1382,7 @@ static int
 ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u,
                  struct ip_vs_service **svc_p)
 {
-       int ret = 0, i;
+       int ret = 0;
        struct ip_vs_scheduler *sched = NULL;
        struct ip_vs_pe *pe = NULL;
        struct ip_vs_service *svc = NULL;
@@ -1344,18 +1442,9 @@ ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u,
                ret = -ENOMEM;
                goto out_err;
        }
-       svc->stats.cpustats = alloc_percpu(struct ip_vs_cpu_stats);
-       if (!svc->stats.cpustats) {
-               ret = -ENOMEM;
+       ret = ip_vs_stats_init_alloc(&svc->stats);
+       if (ret < 0)
                goto out_err;
-       }
-
-       for_each_possible_cpu(i) {
-               struct ip_vs_cpu_stats *ip_vs_stats;
-               ip_vs_stats = per_cpu_ptr(svc->stats.cpustats, i);
-               u64_stats_init(&ip_vs_stats->syncp);
-       }
-
 
        /* I'm the first user of the service */
        atomic_set(&svc->refcnt, 0);
@@ -1372,7 +1461,6 @@ ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u,
 
        INIT_LIST_HEAD(&svc->destinations);
        spin_lock_init(&svc->sched_lock);
-       spin_lock_init(&svc->stats.lock);
 
        /* Bind the scheduler */
        if (sched) {
@@ -1382,6 +1470,10 @@ ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u,
                sched = NULL;
        }
 
+       ret = ip_vs_start_estimator(ipvs, &svc->stats);
+       if (ret < 0)
+               goto out_err;
+
        /* Bind the ct retriever */
        RCU_INIT_POINTER(svc->pe, pe);
        pe = NULL;
@@ -1394,8 +1486,6 @@ ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u,
        if (svc->pe && svc->pe->conn_out)
                atomic_inc(&ipvs->conn_out_counter);
 
-       ip_vs_start_estimator(ipvs, &svc->stats);
-
        /* Count only IPv4 services for old get/setsockopt interface */
        if (svc->af == AF_INET)
                ipvs->num_services++;
@@ -1406,8 +1496,15 @@ ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u,
        ip_vs_svc_hash(svc);
 
        *svc_p = svc;
-       /* Now there is a service - full throttle */
-       ipvs->enable = 1;
+
+       if (!ipvs->enable) {
+               /* Now there is a service - full throttle */
+               ipvs->enable = 1;
+
+               /* Start estimation for first time */
+               ip_vs_est_reload_start(ipvs);
+       }
+
        return 0;
 
 
@@ -1571,7 +1668,7 @@ static void __ip_vs_del_service(struct ip_vs_service *svc, bool cleanup)
        /*
         *    Free the service if nobody refers to it
         */
-       __ip_vs_svc_put(svc, true);
+       __ip_vs_svc_put(svc);
 
        /* decrease the module use count */
        ip_vs_use_count_dec();
@@ -1761,7 +1858,7 @@ static int ip_vs_zero_all(struct netns_ipvs *ipvs)
                }
        }
 
-       ip_vs_zero_stats(&ipvs->tot_stats);
+       ip_vs_zero_stats(&ipvs->tot_stats->s);
        return 0;
 }
 
@@ -1843,6 +1940,148 @@ proc_do_sync_ports(struct ctl_table *table, int write,
        return rc;
 }
 
+static int ipvs_proc_est_cpumask_set(struct ctl_table *table, void *buffer)
+{
+       struct netns_ipvs *ipvs = table->extra2;
+       cpumask_var_t *valp = table->data;
+       cpumask_var_t newmask;
+       int ret;
+
+       if (!zalloc_cpumask_var(&newmask, GFP_KERNEL))
+               return -ENOMEM;
+
+       ret = cpulist_parse(buffer, newmask);
+       if (ret)
+               goto out;
+
+       mutex_lock(&ipvs->est_mutex);
+
+       if (!ipvs->est_cpulist_valid) {
+               if (!zalloc_cpumask_var(valp, GFP_KERNEL)) {
+                       ret = -ENOMEM;
+                       goto unlock;
+               }
+               ipvs->est_cpulist_valid = 1;
+       }
+       cpumask_and(newmask, newmask, &current->cpus_mask);
+       cpumask_copy(*valp, newmask);
+       /* est_max_threads may depend on cpulist size */
+       ipvs->est_max_threads = ip_vs_est_max_threads(ipvs);
+       ipvs->est_calc_phase = 1;
+       ip_vs_est_reload_start(ipvs);
+
+unlock:
+       mutex_unlock(&ipvs->est_mutex);
+
+out:
+       free_cpumask_var(newmask);
+       return ret;
+}
+
+static int ipvs_proc_est_cpumask_get(struct ctl_table *table, void *buffer,
+                                    size_t size)
+{
+       struct netns_ipvs *ipvs = table->extra2;
+       cpumask_var_t *valp = table->data;
+       struct cpumask *mask;
+       int ret;
+
+       mutex_lock(&ipvs->est_mutex);
+
+       if (ipvs->est_cpulist_valid)
+               mask = *valp;
+       else
+               mask = (struct cpumask *)housekeeping_cpumask(HK_TYPE_KTHREAD);
+       ret = scnprintf(buffer, size, "%*pbl\n", cpumask_pr_args(mask));
+
+       mutex_unlock(&ipvs->est_mutex);
+
+       return ret;
+}
+
+static int ipvs_proc_est_cpulist(struct ctl_table *table, int write,
+                                void *buffer, size_t *lenp, loff_t *ppos)
+{
+       int ret;
+
+       /* Ignore both read and write (append) if *ppos is not 0 */
+       if (*ppos || !*lenp) {
+               *lenp = 0;
+               return 0;
+       }
+       if (write) {
+               /* proc_sys_call_handler() appends terminator */
+               ret = ipvs_proc_est_cpumask_set(table, buffer);
+               if (ret >= 0)
+                       *ppos += *lenp;
+       } else {
+               /* proc_sys_call_handler() allocates 1 byte for terminator */
+               ret = ipvs_proc_est_cpumask_get(table, buffer, *lenp + 1);
+               if (ret >= 0) {
+                       *lenp = ret;
+                       *ppos += *lenp;
+                       ret = 0;
+               }
+       }
+       return ret;
+}
+
+static int ipvs_proc_est_nice(struct ctl_table *table, int write,
+                             void *buffer, size_t *lenp, loff_t *ppos)
+{
+       struct netns_ipvs *ipvs = table->extra2;
+       int *valp = table->data;
+       int val = *valp;
+       int ret;
+
+       struct ctl_table tmp_table = {
+               .data = &val,
+               .maxlen = sizeof(int),
+               .mode = table->mode,
+       };
+
+       ret = proc_dointvec(&tmp_table, write, buffer, lenp, ppos);
+       if (write && ret >= 0) {
+               if (val < MIN_NICE || val > MAX_NICE) {
+                       ret = -EINVAL;
+               } else {
+                       mutex_lock(&ipvs->est_mutex);
+                       if (*valp != val) {
+                               *valp = val;
+                               ip_vs_est_reload_start(ipvs);
+                       }
+                       mutex_unlock(&ipvs->est_mutex);
+               }
+       }
+       return ret;
+}
+
+static int ipvs_proc_run_estimation(struct ctl_table *table, int write,
+                                   void *buffer, size_t *lenp, loff_t *ppos)
+{
+       struct netns_ipvs *ipvs = table->extra2;
+       int *valp = table->data;
+       int val = *valp;
+       int ret;
+
+       struct ctl_table tmp_table = {
+               .data = &val,
+               .maxlen = sizeof(int),
+               .mode = table->mode,
+       };
+
+       ret = proc_dointvec(&tmp_table, write, buffer, lenp, ppos);
+       if (write && ret >= 0) {
+               mutex_lock(&ipvs->est_mutex);
+               if (*valp != val) {
+                       *valp = val;
+                       ip_vs_est_reload_start(ipvs);
+               }
+               mutex_unlock(&ipvs->est_mutex);
+       }
+       return ret;
+}
+
 /*
  *     IPVS sysctl table (under the /proc/sys/net/ipv4/vs/)
  *     Do not change order or insert new entries without
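
Both integer handlers above use the same defensive idiom: proc_dointvec() parses into a stack-local copy via a temporary ctl_table, and the value is published only after validation, under est_mutex, so a rejected write never clobbers the live setting. The idiom in isolation (validate() is a hypothetical range check):

    static int example_handler(struct ctl_table *table, int write,
                               void *buffer, size_t *lenp, loff_t *ppos)
    {
            int *valp = table->data;
            int val = *valp;
            struct ctl_table tmp = {
                    .data   = &val,
                    .maxlen = sizeof(int),
                    .mode   = table->mode,
            };
            int ret;

            ret = proc_dointvec(&tmp, write, buffer, lenp, ppos);
            if (write && ret >= 0) {
                    if (!validate(val))
                            return -EINVAL;
                    *valp = val;    /* publish under the subsystem lock */
            }
            return ret;
    }
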
@@ -2017,7 +2256,19 @@ static struct ctl_table vs_vars[] = {
                .procname       = "run_estimation",
                .maxlen         = sizeof(int),
                .mode           = 0644,
-               .proc_handler   = proc_dointvec,
+               .proc_handler   = ipvs_proc_run_estimation,
+       },
+       {
+               .procname       = "est_cpulist",
+               .maxlen         = NR_CPUS,      /* unused */
+               .mode           = 0644,
+               .proc_handler   = ipvs_proc_est_cpulist,
+       },
+       {
+               .procname       = "est_nice",
+               .maxlen         = sizeof(int),
+               .mode           = 0644,
+               .proc_handler   = ipvs_proc_est_nice,
        },
 #ifdef CONFIG_IP_VS_DEBUG
        {
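
With the entries above, the new knobs appear next to run_estimation under /proc/sys/net/ipv4/vs/: writing a CPU list such as "0-3" to est_cpulist confines the estimator kthreads to those CPUs, and writing a value in the MIN_NICE..MAX_NICE range (-20..19) to est_nice reprioritizes them; both changes take effect through ip_vs_est_reload_start().
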
@@ -2255,7 +2506,7 @@ static int ip_vs_stats_show(struct seq_file *seq, void *v)
        seq_puts(seq,
                 "   Conns  Packets  Packets            Bytes            Bytes\n");
 
-       ip_vs_copy_stats(&show, &net_ipvs(net)->tot_stats);
+       ip_vs_copy_stats(&show, &net_ipvs(net)->tot_stats->s);
        seq_printf(seq, "%8LX %8LX %8LX %16LX %16LX\n\n",
                   (unsigned long long)show.conns,
                   (unsigned long long)show.inpkts,
@@ -2279,7 +2530,7 @@ static int ip_vs_stats_show(struct seq_file *seq, void *v)
 static int ip_vs_stats_percpu_show(struct seq_file *seq, void *v)
 {
        struct net *net = seq_file_single_net(seq);
-       struct ip_vs_stats *tot_stats = &net_ipvs(net)->tot_stats;
+       struct ip_vs_stats *tot_stats = &net_ipvs(net)->tot_stats->s;
        struct ip_vs_cpu_stats __percpu *cpustats = tot_stats->cpustats;
        struct ip_vs_kstats kstats;
        int i;
@@ -2297,11 +2548,11 @@ static int ip_vs_stats_percpu_show(struct seq_file *seq, void *v)
 
                do {
                        start = u64_stats_fetch_begin(&u->syncp);
-                       conns = u->cnt.conns;
-                       inpkts = u->cnt.inpkts;
-                       outpkts = u->cnt.outpkts;
-                       inbytes = u->cnt.inbytes;
-                       outbytes = u->cnt.outbytes;
+                       conns = u64_stats_read(&u->cnt.conns);
+                       inpkts = u64_stats_read(&u->cnt.inpkts);
+                       outpkts = u64_stats_read(&u->cnt.outpkts);
+                       inbytes = u64_stats_read(&u->cnt.inbytes);
+                       outbytes = u64_stats_read(&u->cnt.outbytes);
                } while (u64_stats_fetch_retry(&u->syncp, start));
 
                seq_printf(seq, "%3X %8LX %8LX %8LX %16LX %16LX\n",
@@ -4027,13 +4278,17 @@ static void ip_vs_genl_unregister(void)
 static int __net_init ip_vs_control_net_init_sysctl(struct netns_ipvs *ipvs)
 {
        struct net *net = ipvs->net;
-       int idx;
        struct ctl_table *tbl;
+       int idx, ret;
 
        atomic_set(&ipvs->dropentry, 0);
        spin_lock_init(&ipvs->dropentry_lock);
        spin_lock_init(&ipvs->droppacket_lock);
        spin_lock_init(&ipvs->securetcp_lock);
+       INIT_DELAYED_WORK(&ipvs->defense_work, defense_work_handler);
+       INIT_DELAYED_WORK(&ipvs->expire_nodest_conn_work,
+                         expire_nodest_conn_handler);
+       ipvs->est_stopped = 0;
 
        if (!net_eq(net, &init_net)) {
                tbl = kmemdup(vs_vars, sizeof(vs_vars), GFP_KERNEL);
@@ -4094,31 +4349,44 @@ static int __net_init ip_vs_control_net_init_sysctl(struct netns_ipvs *ipvs)
        tbl[idx++].data = &ipvs->sysctl_schedule_icmp;
        tbl[idx++].data = &ipvs->sysctl_ignore_tunneled;
        ipvs->sysctl_run_estimation = 1;
+       tbl[idx].extra2 = ipvs;
        tbl[idx++].data = &ipvs->sysctl_run_estimation;
+
+       ipvs->est_cpulist_valid = 0;
+       tbl[idx].extra2 = ipvs;
+       tbl[idx++].data = &ipvs->sysctl_est_cpulist;
+
+       ipvs->sysctl_est_nice = IPVS_EST_NICE;
+       tbl[idx].extra2 = ipvs;
+       tbl[idx++].data = &ipvs->sysctl_est_nice;
+
 #ifdef CONFIG_IP_VS_DEBUG
        /* Global sysctls must be ro in non-init netns */
        if (!net_eq(net, &init_net))
                tbl[idx++].mode = 0444;
 #endif
 
+       ret = -ENOMEM;
        ipvs->sysctl_hdr = register_net_sysctl(net, "net/ipv4/vs", tbl);
-       if (ipvs->sysctl_hdr == NULL) {
-               if (!net_eq(net, &init_net))
-                       kfree(tbl);
-               return -ENOMEM;
-       }
-       ip_vs_start_estimator(ipvs, &ipvs->tot_stats);
+       if (!ipvs->sysctl_hdr)
+               goto err;
        ipvs->sysctl_tbl = tbl;
+
+       ret = ip_vs_start_estimator(ipvs, &ipvs->tot_stats->s);
+       if (ret < 0)
+               goto err;
+
        /* Schedule defense work */
-       INIT_DELAYED_WORK(&ipvs->defense_work, defense_work_handler);
        queue_delayed_work(system_long_wq, &ipvs->defense_work,
                           DEFENSE_TIMER_PERIOD);
 
-       /* Init delayed work for expiring no dest conn */
-       INIT_DELAYED_WORK(&ipvs->expire_nodest_conn_work,
-                         expire_nodest_conn_handler);
-
        return 0;
+
+err:
+       unregister_net_sysctl_table(ipvs->sysctl_hdr);
+       if (!net_eq(net, &init_net))
+               kfree(tbl);
+       return ret;
 }
 
 static void __net_exit ip_vs_control_net_cleanup_sysctl(struct netns_ipvs *ipvs)
@@ -4129,7 +4397,10 @@ static void __net_exit ip_vs_control_net_cleanup_sysctl(struct netns_ipvs *ipvs)
        cancel_delayed_work_sync(&ipvs->defense_work);
        cancel_work_sync(&ipvs->defense_work.work);
        unregister_net_sysctl_table(ipvs->sysctl_hdr);
-       ip_vs_stop_estimator(ipvs, &ipvs->tot_stats);
+       ip_vs_stop_estimator(ipvs, &ipvs->tot_stats->s);
+
+       if (ipvs->est_cpulist_valid)
+               free_cpumask_var(ipvs->sysctl_est_cpulist);
 
        if (!net_eq(net, &init_net))
                kfree(ipvs->sysctl_tbl);
@@ -4151,7 +4422,8 @@ static struct notifier_block ip_vs_dst_notifier = {
 
 int __net_init ip_vs_control_net_init(struct netns_ipvs *ipvs)
 {
-       int i, idx;
+       int ret = -ENOMEM;
+       int idx;
 
        /* Initialize rs_table */
        for (idx = 0; idx < IP_VS_RTAB_SIZE; idx++)
@@ -4164,18 +4436,14 @@ int __net_init ip_vs_control_net_init(struct netns_ipvs *ipvs)
        atomic_set(&ipvs->nullsvc_counter, 0);
        atomic_set(&ipvs->conn_out_counter, 0);
 
-       /* procfs stats */
-       ipvs->tot_stats.cpustats = alloc_percpu(struct ip_vs_cpu_stats);
-       if (!ipvs->tot_stats.cpustats)
-               return -ENOMEM;
-
-       for_each_possible_cpu(i) {
-               struct ip_vs_cpu_stats *ipvs_tot_stats;
-               ipvs_tot_stats = per_cpu_ptr(ipvs->tot_stats.cpustats, i);
-               u64_stats_init(&ipvs_tot_stats->syncp);
-       }
+       INIT_DELAYED_WORK(&ipvs->est_reload_work, est_reload_work_handler);
 
-       spin_lock_init(&ipvs->tot_stats.lock);
+       /* procfs stats */
+       ipvs->tot_stats = kzalloc(sizeof(*ipvs->tot_stats), GFP_KERNEL);
+       if (!ipvs->tot_stats)
+               goto out;
+       if (ip_vs_stats_init_alloc(&ipvs->tot_stats->s) < 0)
+               goto err_tot_stats;
 
 #ifdef CONFIG_PROC_FS
        if (!proc_create_net("ip_vs", 0, ipvs->net->proc_net,
@@ -4190,7 +4458,8 @@ int __net_init ip_vs_control_net_init(struct netns_ipvs *ipvs)
                goto err_percpu;
 #endif
 
-       if (ip_vs_control_net_init_sysctl(ipvs))
+       ret = ip_vs_control_net_init_sysctl(ipvs);
+       if (ret < 0)
                goto err;
 
        return 0;
@@ -4207,20 +4476,26 @@ err_stats:
 
 err_vs:
 #endif
-       free_percpu(ipvs->tot_stats.cpustats);
-       return -ENOMEM;
+       ip_vs_stats_release(&ipvs->tot_stats->s);
+
+err_tot_stats:
+       kfree(ipvs->tot_stats);
+
+out:
+       return ret;
 }
 
 void __net_exit ip_vs_control_net_cleanup(struct netns_ipvs *ipvs)
 {
        ip_vs_trash_cleanup(ipvs);
        ip_vs_control_net_cleanup_sysctl(ipvs);
+       cancel_delayed_work_sync(&ipvs->est_reload_work);
 #ifdef CONFIG_PROC_FS
        remove_proc_entry("ip_vs_stats_percpu", ipvs->net->proc_net);
        remove_proc_entry("ip_vs_stats", ipvs->net->proc_net);
        remove_proc_entry("ip_vs", ipvs->net->proc_net);
 #endif
-       free_percpu(ipvs->tot_stats.cpustats);
+       call_rcu(&ipvs->tot_stats->rcu_head, ip_vs_stats_rcu_free);
 }
 
 int __init ip_vs_register_nl_ioctl(void)
@@ -4280,5 +4555,6 @@ void ip_vs_control_cleanup(void)
 {
        EnterFunction(2);
        unregister_netdevice_notifier(&ip_vs_dst_notifier);
+       /* relying on common rcu_barrier() in ip_vs_cleanup() */
        LeaveFunction(2);
 }
index 9a1a7af..df56073 100644 (file)
@@ -30,9 +30,6 @@
   long interval, it is easy to implement a user level daemon which
   periodically reads those statistical counters and measure rate.
 
-  Currently, the measurement is activated by slow timer handler. Hope
-  this measurement will not introduce too much load.
-
   We measure rate during the last 8 seconds every 2 seconds:
 
     avgrate = avgrate*(1-W) + rate*W
     to 32-bit values for conns, packets, bps, cps and pps.
 
   * A lot of code is taken from net/core/gen_estimator.c
- */
-
 
-/*
- * Make a summary from each cpu
+  KEY POINTS:
+  - cpustats counters are updated per-cpu in SoftIRQ context with BH disabled
+  - kthreads read the cpustats to update the estimators (svcs, dests, total)
+  - the state of an estimator can be read (get stats) or modified (zero
+    stats) from process context
+
+  KTHREADS:
+  - estimators are initially added to est_temp_list; later, kthread #0
+    distributes them to one or more kthreads for estimation
+  - kthread contexts are created and attached to the est_kt_arr array
+  - the kthread tasks are started when the first service is added; before
+    that, the total stats are not estimated
+  - when the configuration (cpulist/nice) changes, the tasks are restarted
+    via delayed work (est_reload_work)
+  - kthread tasks are stopped while the cpulist is empty
+  - a kthread context holds lists of estimators (chains) which are
+    processed every 2 seconds
+  - as estimators can be added dynamically and in bursts, we try to spread
+    them over multiple chains, which are estimated at different times
+  - on start, kthread #0 enters a calculation phase to determine the chain
+    limits and the limit of estimators per kthread
+  - est_add_ktid: the ktid where new ests are added; it can point to an
+    empty slot where kthread data should be allocated
  */
-static void ip_vs_read_cpu_stats(struct ip_vs_kstats *sum,
-                                struct ip_vs_cpu_stats __percpu *stats)
-{
-       int i;
-       bool add = false;
-
-       for_each_possible_cpu(i) {
-               struct ip_vs_cpu_stats *s = per_cpu_ptr(stats, i);
-               unsigned int start;
-               u64 conns, inpkts, outpkts, inbytes, outbytes;
 
-               if (add) {
-                       do {
-                               start = u64_stats_fetch_begin(&s->syncp);
-                               conns = s->cnt.conns;
-                               inpkts = s->cnt.inpkts;
-                               outpkts = s->cnt.outpkts;
-                               inbytes = s->cnt.inbytes;
-                               outbytes = s->cnt.outbytes;
-                       } while (u64_stats_fetch_retry(&s->syncp, start));
-                       sum->conns += conns;
-                       sum->inpkts += inpkts;
-                       sum->outpkts += outpkts;
-                       sum->inbytes += inbytes;
-                       sum->outbytes += outbytes;
-               } else {
-                       add = true;
-                       do {
-                               start = u64_stats_fetch_begin(&s->syncp);
-                               sum->conns = s->cnt.conns;
-                               sum->inpkts = s->cnt.inpkts;
-                               sum->outpkts = s->cnt.outpkts;
-                               sum->inbytes = s->cnt.inbytes;
-                               sum->outbytes = s->cnt.outbytes;
-                       } while (u64_stats_fetch_retry(&s->syncp, start));
-               }
-       }
-}
+static struct lock_class_key __ipvs_est_key;
 
+static void ip_vs_est_calc_phase(struct netns_ipvs *ipvs);
+static void ip_vs_est_drain_temp_list(struct netns_ipvs *ipvs);
 
-static void estimation_timer(struct timer_list *t)
+static void ip_vs_chain_estimation(struct hlist_head *chain)
 {
        struct ip_vs_estimator *e;
+       struct ip_vs_cpu_stats *c;
        struct ip_vs_stats *s;
        u64 rate;
-       struct netns_ipvs *ipvs = from_timer(ipvs, t, est_timer);
 
-       if (!sysctl_run_estimation(ipvs))
-               goto skip;
+       hlist_for_each_entry_rcu(e, chain, list) {
+               u64 conns, inpkts, outpkts, inbytes, outbytes;
+               u64 kconns = 0, kinpkts = 0, koutpkts = 0;
+               u64 kinbytes = 0, koutbytes = 0;
+               unsigned int start;
+               int i;
+
+               if (kthread_should_stop())
+                       break;
 
-       spin_lock(&ipvs->est_lock);
-       list_for_each_entry(e, &ipvs->est_list, list) {
                s = container_of(e, struct ip_vs_stats, est);
+               for_each_possible_cpu(i) {
+                       c = per_cpu_ptr(s->cpustats, i);
+                       do {
+                               start = u64_stats_fetch_begin(&c->syncp);
+                               conns = u64_stats_read(&c->cnt.conns);
+                               inpkts = u64_stats_read(&c->cnt.inpkts);
+                               outpkts = u64_stats_read(&c->cnt.outpkts);
+                               inbytes = u64_stats_read(&c->cnt.inbytes);
+                               outbytes = u64_stats_read(&c->cnt.outbytes);
+                       } while (u64_stats_fetch_retry(&c->syncp, start));
+                       kconns += conns;
+                       kinpkts += inpkts;
+                       koutpkts += outpkts;
+                       kinbytes += inbytes;
+                       koutbytes += outbytes;
+               }
 
                spin_lock(&s->lock);
-               ip_vs_read_cpu_stats(&s->kstats, s->cpustats);
+
+               s->kstats.conns = kconns;
+               s->kstats.inpkts = kinpkts;
+               s->kstats.outpkts = koutpkts;
+               s->kstats.inbytes = kinbytes;
+               s->kstats.outbytes = koutbytes;
 
                /* scaled by 2^10, but divided 2 seconds */
                rate = (s->kstats.conns - e->last_conns) << 9;
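
The trailing context shows the rate math that survives the rewrite: samples are taken every 2 seconds and rates are pre-scaled by 2^10, so rate = delta << 9 is (delta << 10) / 2, and the e->cps += (rate - cps) >> 2 updates below implement the file's documented EWMA, avgrate = avgrate*(1-W) + rate*W with W = 2^-2. Distilled into a helper (not in the patch, for illustration):

    /* One EWMA step as used for cps/inpps/outpps/inbps/outbps */
    static inline u64 ewma_step(u64 avg, u64 rate)
    {
            return avg + (((s64)rate - (s64)avg) >> 2);     /* W = 1/4 */
    }
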
@@ -133,30 +141,757 @@ static void estimation_timer(struct timer_list *t)
                e->outbps += ((s64)rate - (s64)e->outbps) >> 2;
                spin_unlock(&s->lock);
        }
-       spin_unlock(&ipvs->est_lock);
+}
 
-skip:
-       mod_timer(&ipvs->est_timer, jiffies + 2*HZ);
+static void ip_vs_tick_estimation(struct ip_vs_est_kt_data *kd, int row)
+{
+       struct ip_vs_est_tick_data *td;
+       int cid;
+
+       rcu_read_lock();
+       td = rcu_dereference(kd->ticks[row]);
+       if (!td)
+               goto out;
+       for_each_set_bit(cid, td->present, IPVS_EST_TICK_CHAINS) {
+               if (kthread_should_stop())
+                       break;
+               ip_vs_chain_estimation(&td->chains[cid]);
+               cond_resched_rcu();
+               td = rcu_dereference(kd->ticks[row]);
+               if (!td)
+                       break;
+       }
+
+out:
+       rcu_read_unlock();
 }
 
-void ip_vs_start_estimator(struct netns_ipvs *ipvs, struct ip_vs_stats *stats)
+static int ip_vs_estimation_kthread(void *data)
 {
-       struct ip_vs_estimator *est = &stats->est;
+       struct ip_vs_est_kt_data *kd = data;
+       struct netns_ipvs *ipvs = kd->ipvs;
+       int row = kd->est_row;
+       unsigned long now;
+       int id = kd->id;
+       long gap;
+
+       if (id > 0) {
+               if (!ipvs->est_chain_max)
+                       return 0;
+       } else {
+               if (!ipvs->est_chain_max) {
+                       ipvs->est_calc_phase = 1;
+                       /* commit est_calc_phase before reading est_genid */
+                       smp_mb();
+               }
+
+               /* kthread 0 will handle the calc phase */
+               if (ipvs->est_calc_phase)
+                       ip_vs_est_calc_phase(ipvs);
+       }
+
+       while (1) {
+               if (!id && !hlist_empty(&ipvs->est_temp_list))
+                       ip_vs_est_drain_temp_list(ipvs);
+               set_current_state(TASK_IDLE);
+               if (kthread_should_stop())
+                       break;
+
+               /* before estimation, check if we should sleep */
+               now = jiffies;
+               gap = kd->est_timer - now;
+               if (gap > 0) {
+                       if (gap > IPVS_EST_TICK) {
+                               kd->est_timer = now - IPVS_EST_TICK;
+                               gap = IPVS_EST_TICK;
+                       }
+                       schedule_timeout(gap);
+               } else {
+                       __set_current_state(TASK_RUNNING);
+                       if (gap < -8 * IPVS_EST_TICK)
+                               kd->est_timer = now;
+               }
+
+               if (kd->tick_len[row])
+                       ip_vs_tick_estimation(kd, row);
+
+               row++;
+               if (row >= IPVS_EST_NTICKS)
+                       row = 0;
+               WRITE_ONCE(kd->est_row, row);
+               kd->est_timer += IPVS_EST_TICK;
+       }
+       __set_current_state(TASK_RUNNING);
+
+       return 0;
+}
+
+/* Schedule stop/start for kthread tasks */
+void ip_vs_est_reload_start(struct netns_ipvs *ipvs)
+{
+       /* Ignore reloads before the first service is added */
+       if (!ipvs->enable)
+               return;
+       ip_vs_est_stopped_recalc(ipvs);
+       /* Bump the kthread configuration genid */
+       atomic_inc(&ipvs->est_genid);
+       queue_delayed_work(system_long_wq, &ipvs->est_reload_work, 0);
+}
+
+/* Start kthread task with current configuration */
+int ip_vs_est_kthread_start(struct netns_ipvs *ipvs,
+                           struct ip_vs_est_kt_data *kd)
+{
+       unsigned long now;
+       int ret = 0;
+       long gap;
+
+       lockdep_assert_held(&ipvs->est_mutex);
+
+       if (kd->task)
+               goto out;
+       now = jiffies;
+       gap = kd->est_timer - now;
+       /* Sync est_timer if task is starting later */
+       if (abs(gap) > 4 * IPVS_EST_TICK)
+               kd->est_timer = now;
+       kd->task = kthread_create(ip_vs_estimation_kthread, kd, "ipvs-e:%d:%d",
+                                 ipvs->gen, kd->id);
+       if (IS_ERR(kd->task)) {
+               ret = PTR_ERR(kd->task);
+               kd->task = NULL;
+               goto out;
+       }
+
+       set_user_nice(kd->task, sysctl_est_nice(ipvs));
+       set_cpus_allowed_ptr(kd->task, sysctl_est_cpulist(ipvs));
+
+       pr_info("starting estimator thread %d...\n", kd->id);
+       wake_up_process(kd->task);
+
+out:
+       return ret;
+}
+
+void ip_vs_est_kthread_stop(struct ip_vs_est_kt_data *kd)
+{
+       if (kd->task) {
+               pr_info("stopping estimator thread %d...\n", kd->id);
+               kthread_stop(kd->task);
+               kd->task = NULL;
+       }
+}
+
+/* Apply parameters to kthread */
+static void ip_vs_est_set_params(struct netns_ipvs *ipvs,
+                                struct ip_vs_est_kt_data *kd)
+{
+       kd->chain_max = ipvs->est_chain_max;
+       /* We are using single chain on RCU preemption */
+       if (IPVS_EST_TICK_CHAINS == 1)
+               kd->chain_max *= IPVS_EST_CHAIN_FACTOR;
+       kd->tick_max = IPVS_EST_TICK_CHAINS * kd->chain_max;
+       kd->est_max_count = IPVS_EST_NTICKS * kd->tick_max;
+}
+
+/* Create and start an estimation kthread in a free or new array slot */
+static int ip_vs_est_add_kthread(struct netns_ipvs *ipvs)
+{
+       struct ip_vs_est_kt_data *kd = NULL;
+       int id = ipvs->est_kt_count;
+       int ret = -ENOMEM;
+       void *arr = NULL;
+       int i;
+
+       if ((unsigned long)ipvs->est_kt_count >= ipvs->est_max_threads &&
+           ipvs->enable && ipvs->est_max_threads)
+               return -EINVAL;
+
+       mutex_lock(&ipvs->est_mutex);
+
+       for (i = 0; i < id; i++) {
+               if (!ipvs->est_kt_arr[i])
+                       break;
+       }
+       if (i >= id) {
+               arr = krealloc_array(ipvs->est_kt_arr, id + 1,
+                                    sizeof(struct ip_vs_est_kt_data *),
+                                    GFP_KERNEL);
+               if (!arr)
+                       goto out;
+               ipvs->est_kt_arr = arr;
+       } else {
+               id = i;
+       }
 
-       INIT_LIST_HEAD(&est->list);
+       kd = kzalloc(sizeof(*kd), GFP_KERNEL);
+       if (!kd)
+               goto out;
+       kd->ipvs = ipvs;
+       bitmap_fill(kd->avail, IPVS_EST_NTICKS);
+       kd->est_timer = jiffies;
+       kd->id = id;
+       ip_vs_est_set_params(ipvs, kd);
+
+       /* Pre-allocate stats used in calc phase */
+       if (!id && !kd->calc_stats) {
+               kd->calc_stats = ip_vs_stats_alloc();
+               if (!kd->calc_stats)
+                       goto out;
+       }
+
+       /* Start kthread tasks only when services are present */
+       if (ipvs->enable && !ip_vs_est_stopped(ipvs)) {
+               ret = ip_vs_est_kthread_start(ipvs, kd);
+               if (ret < 0)
+                       goto out;
+       }
+
+       if (arr)
+               ipvs->est_kt_count++;
+       ipvs->est_kt_arr[id] = kd;
+       kd = NULL;
+       /* Use most recent kthread for new ests */
+       ipvs->est_add_ktid = id;
+       ret = 0;
+
+out:
+       mutex_unlock(&ipvs->est_mutex);
+       if (kd) {
+               ip_vs_stats_free(kd->calc_stats);
+               kfree(kd);
+       }
 
-       spin_lock_bh(&ipvs->est_lock);
-       list_add(&est->list, &ipvs->est_list);
-       spin_unlock_bh(&ipvs->est_lock);
+       return ret;
 }
 
+/* Select the ktid where new ests are added: an available, unused or new slot */
+static void ip_vs_est_update_ktid(struct netns_ipvs *ipvs)
+{
+       int ktid, best = ipvs->est_kt_count;
+       struct ip_vs_est_kt_data *kd;
+
+       for (ktid = 0; ktid < ipvs->est_kt_count; ktid++) {
+               kd = ipvs->est_kt_arr[ktid];
+               if (kd) {
+                       if (kd->est_count < kd->est_max_count) {
+                               best = ktid;
+                               break;
+                       }
+               } else if (ktid < best) {
+                       best = ktid;
+               }
+       }
+       ipvs->est_add_ktid = best;
+}
+
+/* Add estimator to current kthread (est_add_ktid) */
+static int ip_vs_enqueue_estimator(struct netns_ipvs *ipvs,
+                                  struct ip_vs_estimator *est)
+{
+       struct ip_vs_est_kt_data *kd = NULL;
+       struct ip_vs_est_tick_data *td;
+       int ktid, row, crow, cid, ret;
+       int delay = est->ktrow;
+
+       BUILD_BUG_ON_MSG(IPVS_EST_TICK_CHAINS > 127,
+                        "Too many chains for ktcid");
+
+       if (ipvs->est_add_ktid < ipvs->est_kt_count) {
+               kd = ipvs->est_kt_arr[ipvs->est_add_ktid];
+               if (kd)
+                       goto add_est;
+       }
+
+       ret = ip_vs_est_add_kthread(ipvs);
+       if (ret < 0)
+               goto out;
+       kd = ipvs->est_kt_arr[ipvs->est_add_ktid];
+
+add_est:
+       ktid = kd->id;
+       /* For a small number of estimators prefer to use few ticks,
+        * otherwise try to add into the last estimated row.
+        * est_row and add_row point just after the row we should use.
+        */
+       if (kd->est_count >= 2 * kd->tick_max || delay < IPVS_EST_NTICKS - 1)
+               crow = READ_ONCE(kd->est_row);
+       else
+               crow = kd->add_row;
+       crow += delay;
+       if (crow >= IPVS_EST_NTICKS)
+               crow -= IPVS_EST_NTICKS;
+       /* Assume initial delay? */
+       if (delay >= IPVS_EST_NTICKS - 1) {
+               /* Preserve initial delay or decrease it if no space in tick */
+               row = crow;
+               if (crow < IPVS_EST_NTICKS - 1) {
+                       crow++;
+                       row = find_last_bit(kd->avail, crow);
+               }
+               if (row >= crow)
+                       row = find_last_bit(kd->avail, IPVS_EST_NTICKS);
+       } else {
+               /* Preserve delay or increase it if no space in tick */
+               row = IPVS_EST_NTICKS;
+               if (crow > 0)
+                       row = find_next_bit(kd->avail, IPVS_EST_NTICKS, crow);
+               if (row >= IPVS_EST_NTICKS)
+                       row = find_first_bit(kd->avail, IPVS_EST_NTICKS);
+       }
+
+       td = rcu_dereference_protected(kd->ticks[row], 1);
+       if (!td) {
+               td = kzalloc(sizeof(*td), GFP_KERNEL);
+               if (!td) {
+                       ret = -ENOMEM;
+                       goto out;
+               }
+               rcu_assign_pointer(kd->ticks[row], td);
+       }
+
+       cid = find_first_zero_bit(td->full, IPVS_EST_TICK_CHAINS);
+
+       kd->est_count++;
+       kd->tick_len[row]++;
+       if (!td->chain_len[cid])
+               __set_bit(cid, td->present);
+       td->chain_len[cid]++;
+       est->ktid = ktid;
+       est->ktrow = row;
+       est->ktcid = cid;
+       hlist_add_head_rcu(&est->list, &td->chains[cid]);
+
+       if (td->chain_len[cid] >= kd->chain_max) {
+               __set_bit(cid, td->full);
+               if (kd->tick_len[row] >= kd->tick_max)
+                       __clear_bit(row, kd->avail);
+       }
+
+       /* Update est_add_ktid to point to first available/empty kt slot */
+       if (kd->est_count == kd->est_max_count)
+               ip_vs_est_update_ktid(ipvs);
+
+       ret = 0;
+
+out:
+       return ret;
+}
+
+/* Start estimation for stats */
+int ip_vs_start_estimator(struct netns_ipvs *ipvs, struct ip_vs_stats *stats)
+{
+       struct ip_vs_estimator *est = &stats->est;
+       int ret;
+
+       if (!ipvs->est_max_threads && ipvs->enable)
+               ipvs->est_max_threads = ip_vs_est_max_threads(ipvs);
+
+       est->ktid = -1;
+       est->ktrow = IPVS_EST_NTICKS - 1;       /* Initial delay */
+
+       /* We prefer this code to be short: kthread #0 will requeue the
+        * estimator to an available chain. If tasks are disabled, we
+        * will not allocate much memory, just for kt #0.
+        */
+       ret = 0;
+       if (!ipvs->est_kt_count || !ipvs->est_kt_arr[0])
+               ret = ip_vs_est_add_kthread(ipvs);
+       if (ret >= 0)
+               hlist_add_head(&est->list, &ipvs->est_temp_list);
+       else
+               INIT_HLIST_NODE(&est->list);
+       return ret;
+}
+
+static void ip_vs_est_kthread_destroy(struct ip_vs_est_kt_data *kd)
+{
+       if (kd) {
+               if (kd->task) {
+                       pr_info("stop unused estimator thread %d...\n", kd->id);
+                       kthread_stop(kd->task);
+               }
+               ip_vs_stats_free(kd->calc_stats);
+               kfree(kd);
+       }
+}
+
+/* Unlink estimator from chain */
 void ip_vs_stop_estimator(struct netns_ipvs *ipvs, struct ip_vs_stats *stats)
 {
        struct ip_vs_estimator *est = &stats->est;
+       struct ip_vs_est_tick_data *td;
+       struct ip_vs_est_kt_data *kd;
+       int ktid = est->ktid;
+       int row = est->ktrow;
+       int cid = est->ktcid;
+
+       /* Failed to add to a chain? */
+       if (hlist_unhashed(&est->list))
+               return;
+
+       /* On return the estimator can be freed, so dequeue it now */
+
+       /* In est_temp_list? */
+       if (ktid < 0) {
+               hlist_del(&est->list);
+               goto end_kt0;
+       }
+
+       hlist_del_rcu(&est->list);
+       kd = ipvs->est_kt_arr[ktid];
+       td = rcu_dereference_protected(kd->ticks[row], 1);
+       __clear_bit(cid, td->full);
+       td->chain_len[cid]--;
+       if (!td->chain_len[cid])
+               __clear_bit(cid, td->present);
+       kd->tick_len[row]--;
+       __set_bit(row, kd->avail);
+       if (!kd->tick_len[row]) {
+               RCU_INIT_POINTER(kd->ticks[row], NULL);
+               kfree_rcu(td);
+       }
+       kd->est_count--;
+       if (kd->est_count) {
+               /* This kt slot may have just become available, prefer it */
+               if (ktid < ipvs->est_add_ktid)
+                       ipvs->est_add_ktid = ktid;
+               return;
+       }
 
-       spin_lock_bh(&ipvs->est_lock);
-       list_del(&est->list);
-       spin_unlock_bh(&ipvs->est_lock);
+       if (ktid > 0) {
+               mutex_lock(&ipvs->est_mutex);
+               ip_vs_est_kthread_destroy(kd);
+               ipvs->est_kt_arr[ktid] = NULL;
+               if (ktid == ipvs->est_kt_count - 1) {
+                       ipvs->est_kt_count--;
+                       while (ipvs->est_kt_count > 1 &&
+                              !ipvs->est_kt_arr[ipvs->est_kt_count - 1])
+                               ipvs->est_kt_count--;
+               }
+               mutex_unlock(&ipvs->est_mutex);
+
+               /* This slot is now empty, prefer another available kt slot */
+               if (ktid == ipvs->est_add_ktid)
+                       ip_vs_est_update_ktid(ipvs);
+       }
+
+end_kt0:
+       /* kt #0 is freed last: all other kthreads gone, all chains empty */
+       if (ipvs->est_kt_count == 1 && hlist_empty(&ipvs->est_temp_list)) {
+               kd = ipvs->est_kt_arr[0];
+               if (!kd || !kd->est_count) {
+                       mutex_lock(&ipvs->est_mutex);
+                       if (kd) {
+                               ip_vs_est_kthread_destroy(kd);
+                               ipvs->est_kt_arr[0] = NULL;
+                       }
+                       ipvs->est_kt_count--;
+                       mutex_unlock(&ipvs->est_mutex);
+                       ipvs->est_add_ktid = 0;
+               }
+       }
+}
+
+/* Register all ests from est_temp_list to kthreads */
+static void ip_vs_est_drain_temp_list(struct netns_ipvs *ipvs)
+{
+       struct ip_vs_estimator *est;
+
+       while (1) {
+               int max = 16;
+
+               mutex_lock(&__ip_vs_mutex);
+
+               while (max-- > 0) {
+                       est = hlist_entry_safe(ipvs->est_temp_list.first,
+                                              struct ip_vs_estimator, list);
+                       if (est) {
+                               if (kthread_should_stop())
+                                       goto unlock;
+                               hlist_del_init(&est->list);
+                               if (ip_vs_enqueue_estimator(ipvs, est) >= 0)
+                                       continue;
+                               est->ktid = -1;
+                               hlist_add_head(&est->list,
+                                              &ipvs->est_temp_list);
+                               /* Abort, some entries will not be estimated
+                                * until the next attempt
+                                */
+                       }
+                       goto unlock;
+               }
+               mutex_unlock(&__ip_vs_mutex);
+               cond_resched();
+       }
+
+unlock:
+       mutex_unlock(&__ip_vs_mutex);
+}
+
+/* Calculate limits for all kthreads */
+static int ip_vs_est_calc_limits(struct netns_ipvs *ipvs, int *chain_max)
+{
+       DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
+       struct ip_vs_est_kt_data *kd;
+       struct hlist_head chain;
+       struct ip_vs_stats *s;
+       int cache_factor = 4;
+       int i, loops, ntest;
+       s32 min_est = 0;
+       ktime_t t1, t2;
+       s64 diff, val;
+       int max = 8;
+       int ret = 1;
+
+       INIT_HLIST_HEAD(&chain);
+       mutex_lock(&__ip_vs_mutex);
+       kd = ipvs->est_kt_arr[0];
+       mutex_unlock(&__ip_vs_mutex);
+       s = kd ? kd->calc_stats : NULL;
+       if (!s)
+               goto out;
+       hlist_add_head(&s->est.list, &chain);
+
+       loops = 1;
+       /* Get best result from many tests */
+       for (ntest = 0; ntest < 12; ntest++) {
+               if (!(ntest & 3)) {
+                       /* Wait for cpufreq frequency transition */
+                       wait_event_idle_timeout(wq, kthread_should_stop(),
+                                               HZ / 50);
+                       if (!ipvs->enable || kthread_should_stop())
+                               goto stop;
+               }
+
+               local_bh_disable();
+               rcu_read_lock();
+
+               /* Put stats in cache */
+               ip_vs_chain_estimation(&chain);
+
+               t1 = ktime_get();
+               for (i = loops * cache_factor; i > 0; i--)
+                       ip_vs_chain_estimation(&chain);
+               t2 = ktime_get();
+
+               rcu_read_unlock();
+               local_bh_enable();
+
+               if (!ipvs->enable || kthread_should_stop())
+                       goto stop;
+               cond_resched();
+
+               diff = ktime_to_ns(ktime_sub(t2, t1));
+               if (diff <= 1 * NSEC_PER_USEC) {
+                       /* Do more loops on low time resolution */
+                       loops *= 2;
+                       continue;
+               }
+               if (diff >= NSEC_PER_SEC)
+                       continue;
+               val = diff;
+               do_div(val, loops);
+               if (!min_est || val < min_est) {
+                       min_est = val;
+                       /* goal: 95usec per chain */
+                       val = 95 * NSEC_PER_USEC;
+                       if (val >= min_est) {
+                               do_div(val, min_est);
+                               max = (int)val;
+                       } else {
+                               max = 1;
+                       }
+               }
+       }
+
+out:
+       if (s)
+               hlist_del_init(&s->est.list);
+       *chain_max = max;
+       return ret;
+
+stop:
+       ret = 0;
+       goto out;
+}
+
+/* Calculate the parameters and apply them in context of kt #0
+ * ECP: est_calc_phase
+ * ECM: est_chain_max
+ * ECP ECM     Insert Chain    enable  Description
+ * ---------------------------------------------------------------------------
+ * 0   0       est_temp_list   0       create kt #0 context
+ * 0   0       est_temp_list   0->1    service added, start kthread #0 task
+ * 0->1        0       est_temp_list   1       kt task #0 started, enters calc phase
+ * 1   0       est_temp_list   1       kt #0: determine est_chain_max,
+ *                                     stop tasks, move ests to est_temp_list
+ *                                     and free kd for kthreads 1..last
+ * 1->0        0->N    kt chains       1       ests can go to kthreads
+ * 0   N       kt chains       1       drain est_temp_list, create new kthread
+ *                                     contexts, start tasks, estimate
+ */
+static void ip_vs_est_calc_phase(struct netns_ipvs *ipvs)
+{
+       int genid = atomic_read(&ipvs->est_genid);
+       struct ip_vs_est_tick_data *td;
+       struct ip_vs_est_kt_data *kd;
+       struct ip_vs_estimator *est;
+       struct ip_vs_stats *stats;
+       int id, row, cid, delay;
+       bool last, last_td;
+       int chain_max;
+       int step;
+
+       if (!ip_vs_est_calc_limits(ipvs, &chain_max))
+               return;
+
+       mutex_lock(&__ip_vs_mutex);
+
+       /* Stop all other tasks, so that we can immediately move the
+        * estimators to est_temp_list without an RCU grace period
+        */
+       mutex_lock(&ipvs->est_mutex);
+       for (id = 1; id < ipvs->est_kt_count; id++) {
+               /* netns cleanup started, abort */
+               if (!ipvs->enable)
+                       goto unlock2;
+               kd = ipvs->est_kt_arr[id];
+               if (!kd)
+                       continue;
+               ip_vs_est_kthread_stop(kd);
+       }
+       mutex_unlock(&ipvs->est_mutex);
+
+       /* Move all estimators to est_temp_list, but carefully: all
+        * estimators and kthread data can be released while we
+        * reschedule, even for kthread #0.
+        */
+       step = 0;
+
+       /* Order entries in est_temp_list by ascending delay, so now
+        * walk delay (desc), id (desc), cid (asc)
+        */
+       delay = IPVS_EST_NTICKS;
+
+next_delay:
+       delay--;
+       if (delay < 0)
+               goto end_dequeue;
+
+last_kt:
+       /* Destroy contexts backwards */
+       id = ipvs->est_kt_count;
+
+next_kt:
+       if (!ipvs->enable || kthread_should_stop())
+               goto unlock;
+       id--;
+       if (id < 0)
+               goto next_delay;
+       kd = ipvs->est_kt_arr[id];
+       if (!kd)
+               goto next_kt;
+       /* kt 0 can exist with empty chains */
+       if (!id && kd->est_count <= 1)
+               goto next_delay;
+
+       row = kd->est_row + delay;
+       if (row >= IPVS_EST_NTICKS)
+               row -= IPVS_EST_NTICKS;
+       td = rcu_dereference_protected(kd->ticks[row], 1);
+       if (!td)
+               goto next_kt;
+
+       cid = 0;
+
+walk_chain:
+       if (kthread_should_stop())
+               goto unlock;
+       step++;
+       if (!(step & 63)) {
+               /* Give estimators a chance to be added (to est_temp_list)
+                * and deleted (releasing kthread contexts)
+                */
+               mutex_unlock(&__ip_vs_mutex);
+               cond_resched();
+               mutex_lock(&__ip_vs_mutex);
+
+               /* Current kt released? */
+               if (id >= ipvs->est_kt_count)
+                       goto last_kt;
+               if (kd != ipvs->est_kt_arr[id])
+                       goto next_kt;
+               /* Current td released? */
+               if (td != rcu_dereference_protected(kd->ticks[row], 1))
+                       goto next_kt;
+               /* No fatal changes to the current kd and td */
+       }
+       est = hlist_entry_safe(td->chains[cid].first, struct ip_vs_estimator,
+                              list);
+       if (!est) {
+               cid++;
+               if (cid >= IPVS_EST_TICK_CHAINS)
+                       goto next_kt;
+               goto walk_chain;
+       }
+       /* We can cheat and increase est_count to protect kt 0 context
+        * from release, but we prefer to keep the last estimator
+        */
+       last = kd->est_count <= 1;
+       /* Do not free kt #0 data */
+       if (!id && last)
+               goto next_delay;
+       last_td = kd->tick_len[row] <= 1;
+       stats = container_of(est, struct ip_vs_stats, est);
+       ip_vs_stop_estimator(ipvs, stats);
+       /* Tasks are stopped, so move without an RCU grace period */
+       est->ktid = -1;
+       est->ktrow = row - kd->est_row;
+       if (est->ktrow < 0)
+               est->ktrow += IPVS_EST_NTICKS;
+       hlist_add_head(&est->list, &ipvs->est_temp_list);
+       /* kd freed? */
+       if (last)
+               goto next_kt;
+       /* td freed? */
+       if (last_td)
+               goto next_kt;
+       goto walk_chain;
+
+end_dequeue:
+       /* All estimators removed while calculating? */
+       if (!ipvs->est_kt_count)
+               goto unlock;
+       kd = ipvs->est_kt_arr[0];
+       if (!kd)
+               goto unlock;
+       kd->add_row = kd->est_row;
+       ipvs->est_chain_max = chain_max;
+       ip_vs_est_set_params(ipvs, kd);
+
+       pr_info("using max %d ests per chain, %d per kthread\n",
+               kd->chain_max, kd->est_max_count);
+
+       /* Try to keep tot_stats in kt0, enqueue it early */
+       if (ipvs->tot_stats && !hlist_unhashed(&ipvs->tot_stats->s.est.list) &&
+           ipvs->tot_stats->s.est.ktid == -1) {
+               hlist_del(&ipvs->tot_stats->s.est.list);
+               hlist_add_head(&ipvs->tot_stats->s.est.list,
+                              &ipvs->est_temp_list);
+       }
+
+       mutex_lock(&ipvs->est_mutex);
+
+       /* We completed the calc phase and no new calc phase was requested */
+       if (genid == atomic_read(&ipvs->est_genid))
+               ipvs->est_calc_phase = 0;
+
+unlock2:
+       mutex_unlock(&ipvs->est_mutex);
+
+unlock:
+       mutex_unlock(&__ip_vs_mutex);
 }
 
 void ip_vs_zero_estimator(struct ip_vs_stats *stats)
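A minimal sketch of the unlock/resched/revalidate pattern that ip_vs_est_calc_phase()
applies every 64 steps above: drop the mutex, yield, retake it, then re-check every
pointer derived before the unlock. All names here (my_mutex, walk_with_resched,
table, table_len) are hypothetical, not part of the patch.

    #include <linux/mutex.h>
    #include <linux/sched.h>

    static DEFINE_MUTEX(my_mutex);

    static void walk_with_resched(void **table, int *table_len)
    {
            int i = 0, step = 0;

            mutex_lock(&my_mutex);
            while (i < *table_len) {
                    void *slot = table[i];

                    if (!(++step & 63)) {
                            /* Let writers run; entries may change meanwhile. */
                            mutex_unlock(&my_mutex);
                            cond_resched();
                            mutex_lock(&my_mutex);

                            /* Revalidate everything derived before unlocking. */
                            if (i >= *table_len)
                                    break;          /* entries were removed */
                            if (slot != table[i])
                                    continue;       /* slot replaced, retry index */
                    }
                    /* ... process slot under the mutex ... */
                    i++;
            }
            mutex_unlock(&my_mutex);
    }

The calc phase additionally restarts from last_kt/next_kt when a whole kthread
context disappears, which is the same idea applied one level up.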
@@ -191,14 +926,25 @@ void ip_vs_read_estimator(struct ip_vs_kstats *dst, struct ip_vs_stats *stats)
 
 int __net_init ip_vs_estimator_net_init(struct netns_ipvs *ipvs)
 {
-       INIT_LIST_HEAD(&ipvs->est_list);
-       spin_lock_init(&ipvs->est_lock);
-       timer_setup(&ipvs->est_timer, estimation_timer, 0);
-       mod_timer(&ipvs->est_timer, jiffies + 2 * HZ);
+       INIT_HLIST_HEAD(&ipvs->est_temp_list);
+       ipvs->est_kt_arr = NULL;
+       ipvs->est_max_threads = 0;
+       ipvs->est_calc_phase = 0;
+       ipvs->est_chain_max = 0;
+       ipvs->est_kt_count = 0;
+       ipvs->est_add_ktid = 0;
+       atomic_set(&ipvs->est_genid, 0);
+       atomic_set(&ipvs->est_genid_done, 0);
+       __mutex_init(&ipvs->est_mutex, "ipvs->est_mutex", &__ipvs_est_key);
        return 0;
 }
 
 void __net_exit ip_vs_estimator_net_cleanup(struct netns_ipvs *ipvs)
 {
-       del_timer_sync(&ipvs->est_timer);
+       int i;
+
+       for (i = 0; i < ipvs->est_kt_count; i++)
+               ip_vs_est_kthread_destroy(ipvs->est_kt_arr[i]);
+       kfree(ipvs->est_kt_arr);
+       mutex_destroy(&ipvs->est_mutex);
 }
index b96338b..5c3cf08 100644 (file)
@@ -887,7 +887,7 @@ nf_conntrack_hash_check_insert(struct nf_conn *ct)
        zone = nf_ct_zone(ct);
 
        if (!nf_ct_ext_valid_pre(ct->ext)) {
-               NF_CT_STAT_INC(net, insert_failed);
+               NF_CT_STAT_INC_ATOMIC(net, insert_failed);
                return -ETIMEDOUT;
        }
 
@@ -934,7 +934,7 @@ nf_conntrack_hash_check_insert(struct nf_conn *ct)
 
        if (!nf_ct_ext_valid_post(ct->ext)) {
                nf_ct_kill(ct);
-               NF_CT_STAT_INC(net, drop);
+               NF_CT_STAT_INC_ATOMIC(net, drop);
                return -ETIMEDOUT;
        }
 
@@ -1271,7 +1271,7 @@ chaintoolong:
         */
        if (!nf_ct_ext_valid_post(ct->ext)) {
                nf_ct_kill(ct);
-               NF_CT_STAT_INC(net, drop);
+               NF_CT_STAT_INC_ATOMIC(net, drop);
                return NF_DROP;
        }
 
index d71150a..1286ae7 100644 (file)
@@ -328,8 +328,13 @@ nla_put_failure:
 }
 
 #ifdef CONFIG_NF_CONNTRACK_MARK
-static int ctnetlink_dump_mark(struct sk_buff *skb, u32 mark)
+static int ctnetlink_dump_mark(struct sk_buff *skb, const struct nf_conn *ct)
 {
+       u32 mark = READ_ONCE(ct->mark);
+
+       if (!mark)
+               return 0;
+
        if (nla_put_be32(skb, CTA_MARK, htonl(mark)))
                goto nla_put_failure;
        return 0;
@@ -543,7 +548,7 @@ static int ctnetlink_dump_extinfo(struct sk_buff *skb,
 static int ctnetlink_dump_info(struct sk_buff *skb, struct nf_conn *ct)
 {
        if (ctnetlink_dump_status(skb, ct) < 0 ||
-           ctnetlink_dump_mark(skb, READ_ONCE(ct->mark)) < 0 ||
+           ctnetlink_dump_mark(skb, ct) < 0 ||
            ctnetlink_dump_secctx(skb, ct) < 0 ||
            ctnetlink_dump_id(skb, ct) < 0 ||
            ctnetlink_dump_use(skb, ct) < 0 ||
@@ -722,7 +727,6 @@ ctnetlink_conntrack_event(unsigned int events, const struct nf_ct_event *item)
        struct sk_buff *skb;
        unsigned int type;
        unsigned int flags = 0, group;
-       u32 mark;
        int err;
 
        if (events & (1 << IPCT_DESTROY)) {
@@ -827,9 +831,8 @@ ctnetlink_conntrack_event(unsigned int events, const struct nf_ct_event *item)
        }
 
 #ifdef CONFIG_NF_CONNTRACK_MARK
-       mark = READ_ONCE(ct->mark);
-       if ((events & (1 << IPCT_MARK) || mark) &&
-           ctnetlink_dump_mark(skb, mark) < 0)
+       if (events & (1 << IPCT_MARK) &&
+           ctnetlink_dump_mark(skb, ct) < 0)
                goto nla_put_failure;
 #endif
        nlmsg_end(skb, nlh);
@@ -2671,7 +2674,6 @@ static int __ctnetlink_glue_build(struct sk_buff *skb, struct nf_conn *ct)
 {
        const struct nf_conntrack_zone *zone;
        struct nlattr *nest_parms;
-       u32 mark;
 
        zone = nf_ct_zone(ct);
 
@@ -2733,8 +2735,7 @@ static int __ctnetlink_glue_build(struct sk_buff *skb, struct nf_conn *ct)
                goto nla_put_failure;
 
 #ifdef CONFIG_NF_CONNTRACK_MARK
-       mark = READ_ONCE(ct->mark);
-       if (mark && ctnetlink_dump_mark(skb, mark) < 0)
+       if (ctnetlink_dump_mark(skb, ct) < 0)
                goto nla_put_failure;
 #endif
        if (ctnetlink_dump_labels(skb, ct) < 0)
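The refactor above exists because ct->mark can be written concurrently (e.g. by
connmark updates). With two independent reads, the "is it set?" test and the dumped
value could disagree, emitting CTA_MARK 0 even though zero marks are meant to be
skipped. A hedged sketch of the single-snapshot idiom; dump_mark_attr is a
hypothetical stand-in for ctnetlink_dump_mark():

    /* One READ_ONCE() snapshot keeps the test and the emitted value
     * coherent even if ct->mark changes under us.
     */
    static int dump_mark_attr(struct sk_buff *skb, const struct nf_conn *ct)
    {
            u32 mark = READ_ONCE(ct->mark);

            if (!mark)              /* zero marks are not dumped */
                    return 0;
            return nla_put_be32(skb, CTA_MARK, htonl(mark)) ? -EMSGSIZE : 0;
    }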
index 895b09c..99323fb 100644 (file)
@@ -121,17 +121,61 @@ const struct nf_conntrack_l4proto *nf_ct_l4proto_find(u8 l4proto)
 };
 EXPORT_SYMBOL_GPL(nf_ct_l4proto_find);
 
-unsigned int nf_confirm(struct sk_buff *skb, unsigned int protoff,
-                       struct nf_conn *ct, enum ip_conntrack_info ctinfo)
+static bool in_vrf_postrouting(const struct nf_hook_state *state)
+{
+#if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV)
+       if (state->hook == NF_INET_POST_ROUTING &&
+           netif_is_l3_master(state->out))
+               return true;
+#endif
+       return false;
+}
+
+unsigned int nf_confirm(void *priv,
+                       struct sk_buff *skb,
+                       const struct nf_hook_state *state)
 {
        const struct nf_conn_help *help;
+       enum ip_conntrack_info ctinfo;
+       unsigned int protoff;
+       struct nf_conn *ct;
+       bool seqadj_needed;
+       __be16 frag_off;
+       u8 pnum;
+
+       ct = nf_ct_get(skb, &ctinfo);
+       if (!ct || in_vrf_postrouting(state))
+               return NF_ACCEPT;
 
        help = nfct_help(ct);
+
+       seqadj_needed = test_bit(IPS_SEQ_ADJUST_BIT, &ct->status) && !nf_is_loopback_packet(skb);
+       if (!help && !seqadj_needed)
+               return nf_conntrack_confirm(skb);
+
+       /* helper->help() does not expect ICMP packets */
+       if (ctinfo == IP_CT_RELATED_REPLY)
+               return nf_conntrack_confirm(skb);
+
+       switch (nf_ct_l3num(ct)) {
+       case NFPROTO_IPV4:
+               protoff = skb_network_offset(skb) + ip_hdrlen(skb);
+               break;
+       case NFPROTO_IPV6:
+               pnum = ipv6_hdr(skb)->nexthdr;
+               protoff = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &pnum, &frag_off);
+               if (protoff < 0 || (frag_off & htons(~0x7)) != 0)
+                       return nf_conntrack_confirm(skb);
+               break;
+       default:
+               return nf_conntrack_confirm(skb);
+       }
+
        if (help) {
                const struct nf_conntrack_helper *helper;
                int ret;
 
-               /* rcu_read_lock()ed by nf_hook_thresh */
+               /* rcu_read_lock()ed by nf_hook */
                helper = rcu_dereference(help->helper);
                if (helper) {
                        ret = helper->help(skb,
@@ -142,12 +186,10 @@ unsigned int nf_confirm(struct sk_buff *skb, unsigned int protoff,
                }
        }
 
-       if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status) &&
-           !nf_is_loopback_packet(skb)) {
-               if (!nf_ct_seq_adjust(skb, ct, ctinfo, protoff)) {
-                       NF_CT_STAT_INC_ATOMIC(nf_ct_net(ct), drop);
-                       return NF_DROP;
-               }
+       if (seqadj_needed &&
+           !nf_ct_seq_adjust(skb, ct, ctinfo, protoff)) {
+               NF_CT_STAT_INC_ATOMIC(nf_ct_net(ct), drop);
+               return NF_DROP;
        }
 
        /* We've seen it coming out the other side: confirm it */
@@ -155,35 +197,6 @@ unsigned int nf_confirm(struct sk_buff *skb, unsigned int protoff,
 }
 EXPORT_SYMBOL_GPL(nf_confirm);
 
-static bool in_vrf_postrouting(const struct nf_hook_state *state)
-{
-#if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV)
-       if (state->hook == NF_INET_POST_ROUTING &&
-           netif_is_l3_master(state->out))
-               return true;
-#endif
-       return false;
-}
-
-static unsigned int ipv4_confirm(void *priv,
-                                struct sk_buff *skb,
-                                const struct nf_hook_state *state)
-{
-       enum ip_conntrack_info ctinfo;
-       struct nf_conn *ct;
-
-       ct = nf_ct_get(skb, &ctinfo);
-       if (!ct || ctinfo == IP_CT_RELATED_REPLY)
-               return nf_conntrack_confirm(skb);
-
-       if (in_vrf_postrouting(state))
-               return NF_ACCEPT;
-
-       return nf_confirm(skb,
-                         skb_network_offset(skb) + ip_hdrlen(skb),
-                         ct, ctinfo);
-}
-
 static unsigned int ipv4_conntrack_in(void *priv,
                                      struct sk_buff *skb,
                                      const struct nf_hook_state *state)
@@ -230,13 +243,13 @@ static const struct nf_hook_ops ipv4_conntrack_ops[] = {
                .priority       = NF_IP_PRI_CONNTRACK,
        },
        {
-               .hook           = ipv4_confirm,
+               .hook           = nf_confirm,
                .pf             = NFPROTO_IPV4,
                .hooknum        = NF_INET_POST_ROUTING,
                .priority       = NF_IP_PRI_CONNTRACK_CONFIRM,
        },
        {
-               .hook           = ipv4_confirm,
+               .hook           = nf_confirm,
                .pf             = NFPROTO_IPV4,
                .hooknum        = NF_INET_LOCAL_IN,
                .priority       = NF_IP_PRI_CONNTRACK_CONFIRM,
@@ -373,33 +386,6 @@ static struct nf_sockopt_ops so_getorigdst6 = {
        .owner          = THIS_MODULE,
 };
 
-static unsigned int ipv6_confirm(void *priv,
-                                struct sk_buff *skb,
-                                const struct nf_hook_state *state)
-{
-       struct nf_conn *ct;
-       enum ip_conntrack_info ctinfo;
-       unsigned char pnum = ipv6_hdr(skb)->nexthdr;
-       __be16 frag_off;
-       int protoff;
-
-       ct = nf_ct_get(skb, &ctinfo);
-       if (!ct || ctinfo == IP_CT_RELATED_REPLY)
-               return nf_conntrack_confirm(skb);
-
-       if (in_vrf_postrouting(state))
-               return NF_ACCEPT;
-
-       protoff = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &pnum,
-                                  &frag_off);
-       if (protoff < 0 || (frag_off & htons(~0x7)) != 0) {
-               pr_debug("proto header not found\n");
-               return nf_conntrack_confirm(skb);
-       }
-
-       return nf_confirm(skb, protoff, ct, ctinfo);
-}
-
 static unsigned int ipv6_conntrack_in(void *priv,
                                      struct sk_buff *skb,
                                      const struct nf_hook_state *state)
@@ -428,13 +414,13 @@ static const struct nf_hook_ops ipv6_conntrack_ops[] = {
                .priority       = NF_IP6_PRI_CONNTRACK,
        },
        {
-               .hook           = ipv6_confirm,
+               .hook           = nf_confirm,
                .pf             = NFPROTO_IPV6,
                .hooknum        = NF_INET_POST_ROUTING,
                .priority       = NF_IP6_PRI_LAST,
        },
        {
-               .hook           = ipv6_confirm,
+               .hook           = nf_confirm,
                .pf             = NFPROTO_IPV6,
                .hooknum        = NF_INET_LOCAL_IN,
                .priority       = NF_IP6_PRI_LAST - 1,
index 61e3b05..1020d67 100644 (file)
@@ -129,6 +129,56 @@ static void icmpv6_error_log(const struct sk_buff *skb,
        nf_l4proto_log_invalid(skb, state, IPPROTO_ICMPV6, "%s", msg);
 }
 
+static noinline_for_stack int
+nf_conntrack_icmpv6_redirect(struct nf_conn *tmpl, struct sk_buff *skb,
+                            unsigned int dataoff,
+                            const struct nf_hook_state *state)
+{
+       u8 hl = ipv6_hdr(skb)->hop_limit;
+       union nf_inet_addr outer_daddr;
+       union {
+               struct nd_opt_hdr nd_opt;
+               struct rd_msg rd_msg;
+       } tmp;
+       const struct nd_opt_hdr *nd_opt;
+       const struct rd_msg *rd_msg;
+
+       rd_msg = skb_header_pointer(skb, dataoff, sizeof(*rd_msg), &tmp.rd_msg);
+       if (!rd_msg) {
+               icmpv6_error_log(skb, state, "short redirect");
+               return -NF_ACCEPT;
+       }
+
+       if (rd_msg->icmph.icmp6_code != 0)
+               return NF_ACCEPT;
+
+       if (hl != 255 || !(ipv6_addr_type(&ipv6_hdr(skb)->saddr) & IPV6_ADDR_LINKLOCAL)) {
+               icmpv6_error_log(skb, state, "invalid saddr or hoplimit for redirect");
+               return -NF_ACCEPT;
+       }
+
+       dataoff += sizeof(*rd_msg);
+
+       /* warning: rd_msg no longer usable after this call */
+       nd_opt = skb_header_pointer(skb, dataoff, sizeof(*nd_opt), &tmp.nd_opt);
+       if (!nd_opt || nd_opt->nd_opt_len == 0) {
+               icmpv6_error_log(skb, state, "redirect without options");
+               return -NF_ACCEPT;
+       }
+
+       /* We could call ndisc_parse_options(), but it would need
+        * skb_linearize() and a bit more work.
+        */
+       if (nd_opt->nd_opt_type != ND_OPT_REDIRECT_HDR)
+               return NF_ACCEPT;
+
+       memcpy(&outer_daddr.ip6, &ipv6_hdr(skb)->daddr,
+              sizeof(outer_daddr.ip6));
+       dataoff += 8;
+       return nf_conntrack_inet_error(tmpl, skb, dataoff, state,
+                                      IPPROTO_ICMPV6, &outer_daddr);
+}
+
 int nf_conntrack_icmpv6_error(struct nf_conn *tmpl,
                              struct sk_buff *skb,
                              unsigned int dataoff,
@@ -159,6 +209,9 @@ int nf_conntrack_icmpv6_error(struct nf_conn *tmpl,
                return NF_ACCEPT;
        }
 
+       if (icmp6h->icmp6_type == NDISC_REDIRECT)
+               return nf_conntrack_icmpv6_redirect(tmpl, skb, dataoff, state);
+
        /* Not an error message? */
        if (icmp6h->icmp6_type >= 128)
                return NF_ACCEPT;
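The redirect parser above leans on the skb_header_pointer() contract: the returned
pointer is either into the skb's linear area or into the caller-supplied scratch
buffer, so reusing one union as scratch space invalidates pointers from earlier
calls (hence the "rd_msg no longer usable" warning). A hedged sketch with
hypothetical header types hdr_a/hdr_b standing in for rd_msg/nd_opt:

    #include <linux/skbuff.h>

    struct hdr_a { __u8 type; __u8 len; };      /* stands in for rd_msg */
    struct hdr_b { __u8 code; __u8 pad; };      /* stands in for nd_opt */

    static int parse_two_headers(struct sk_buff *skb, unsigned int off)
    {
            union { struct hdr_a a; struct hdr_b b; } tmp;
            const struct hdr_a *a;
            const struct hdr_b *b;

            a = skb_header_pointer(skb, off, sizeof(*a), &tmp.a);
            if (!a)
                    return -EINVAL;         /* packet too short */
            off += sizeof(*a);

            /* May overwrite tmp.a: 'a' must not be dereferenced below. */
            b = skb_header_pointer(skb, off, sizeof(*b), &tmp.b);
            if (!b)
                    return -EINVAL;
            return b->code;
    }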
index 5a93633..d88b92a 100644 (file)
@@ -60,6 +60,7 @@ static const unsigned int sctp_timeouts[SCTP_CONNTRACK_MAX] = {
        [SCTP_CONNTRACK_SHUTDOWN_ACK_SENT]      = 3 SECS,
        [SCTP_CONNTRACK_HEARTBEAT_SENT]         = 30 SECS,
        [SCTP_CONNTRACK_HEARTBEAT_ACKED]        = 210 SECS,
+       [SCTP_CONNTRACK_DATA_SENT]              = 30 SECS,
 };
 
 #define        SCTP_FLAG_HEARTBEAT_VTAG_FAILED 1
@@ -74,6 +75,7 @@ static const unsigned int sctp_timeouts[SCTP_CONNTRACK_MAX] = {
 #define        sSA SCTP_CONNTRACK_SHUTDOWN_ACK_SENT
 #define        sHS SCTP_CONNTRACK_HEARTBEAT_SENT
 #define        sHA SCTP_CONNTRACK_HEARTBEAT_ACKED
+#define        sDS SCTP_CONNTRACK_DATA_SENT
 #define        sIV SCTP_CONNTRACK_MAX
 
 /*
@@ -90,15 +92,16 @@ COOKIE WAIT       - We have seen an INIT chunk in the original direction, or als
 COOKIE ECHOED     - We have seen a COOKIE_ECHO chunk in the original direction.
 ESTABLISHED       - We have seen a COOKIE_ACK in the reply direction.
 SHUTDOWN_SENT     - We have seen a SHUTDOWN chunk in the original direction.
-SHUTDOWN_RECD     - We have seen a SHUTDOWN chunk in the reply directoin.
+SHUTDOWN_RECD     - We have seen a SHUTDOWN chunk in the reply direction.
 SHUTDOWN_ACK_SENT - We have seen a SHUTDOWN_ACK chunk in the direction opposite
                    to that of the SHUTDOWN chunk.
 CLOSED            - We have seen a SHUTDOWN_COMPLETE chunk in the direction of
                    the SHUTDOWN chunk. Connection is closed.
 HEARTBEAT_SENT    - We have seen a HEARTBEAT in a new flow.
-HEARTBEAT_ACKED   - We have seen a HEARTBEAT-ACK in the direction opposite to
-                   that of the HEARTBEAT chunk. Secondary connection is
-                   established.
+HEARTBEAT_ACKED   - We have seen a HEARTBEAT-ACK/DATA/SACK in the direction
+                   opposite to that of the HEARTBEAT/DATA chunk. Secondary connection
+                   is established.
+DATA_SENT         - We have seen a DATA/SACK in a new flow.
 */
 
 /* TODO
@@ -112,36 +115,38 @@ cookie echoed to closed.
 */
 
 /* SCTP conntrack state transitions */
-static const u8 sctp_conntracks[2][11][SCTP_CONNTRACK_MAX] = {
+static const u8 sctp_conntracks[2][12][SCTP_CONNTRACK_MAX] = {
        {
 /*     ORIGINAL        */
-/*                  sNO, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS, sHA */
-/* init         */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sCW, sHA},
-/* init_ack     */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sCL, sHA},
-/* abort        */ {sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL},
-/* shutdown     */ {sCL, sCL, sCW, sCE, sSS, sSS, sSR, sSA, sCL, sSS},
-/* shutdown_ack */ {sSA, sCL, sCW, sCE, sES, sSA, sSA, sSA, sSA, sHA},
-/* error        */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sCL, sHA},/* Can't have Stale cookie*/
-/* cookie_echo  */ {sCL, sCL, sCE, sCE, sES, sSS, sSR, sSA, sCL, sHA},/* 5.2.4 - Big TODO */
-/* cookie_ack   */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sCL, sHA},/* Can't come in orig dir */
-/* shutdown_comp*/ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sCL, sCL, sHA},
-/* heartbeat    */ {sHS, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS, sHA},
-/* heartbeat_ack*/ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS, sHA}
+/*                  sNO, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS, sHA, sDS */
+/* init         */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sCW, sHA, sCW},
+/* init_ack     */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sCL, sHA, sCL},
+/* abort        */ {sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL},
+/* shutdown     */ {sCL, sCL, sCW, sCE, sSS, sSS, sSR, sSA, sCL, sSS, sCL},
+/* shutdown_ack */ {sSA, sCL, sCW, sCE, sES, sSA, sSA, sSA, sSA, sHA, sSA},
+/* error        */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sCL, sHA, sCL},/* Can't have Stale cookie*/
+/* cookie_echo  */ {sCL, sCL, sCE, sCE, sES, sSS, sSR, sSA, sCL, sHA, sCL},/* 5.2.4 - Big TODO */
+/* cookie_ack   */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sCL, sHA, sCL},/* Can't come in orig dir */
+/* shutdown_comp*/ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sCL, sCL, sHA, sCL},
+/* heartbeat    */ {sHS, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS, sHA, sDS},
+/* heartbeat_ack*/ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS, sHA, sDS},
+/* data/sack    */ {sDS, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS, sHA, sDS}
        },
        {
 /*     REPLY   */
-/*                  sNO, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS, sHA */
-/* init         */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA, sIV, sHA},/* INIT in sCL Big TODO */
-/* init_ack     */ {sIV, sCW, sCW, sCE, sES, sSS, sSR, sSA, sIV, sHA},
-/* abort        */ {sIV, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sIV, sCL},
-/* shutdown     */ {sIV, sCL, sCW, sCE, sSR, sSS, sSR, sSA, sIV, sSR},
-/* shutdown_ack */ {sIV, sCL, sCW, sCE, sES, sSA, sSA, sSA, sIV, sHA},
-/* error        */ {sIV, sCL, sCW, sCL, sES, sSS, sSR, sSA, sIV, sHA},
-/* cookie_echo  */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA, sIV, sHA},/* Can't come in reply dir */
-/* cookie_ack   */ {sIV, sCL, sCW, sES, sES, sSS, sSR, sSA, sIV, sHA},
-/* shutdown_comp*/ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sCL, sIV, sHA},
-/* heartbeat    */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS, sHA},
-/* heartbeat_ack*/ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHA, sHA}
+/*                  sNO, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS, sHA, sDS */
+/* init         */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA, sIV, sHA, sIV},/* INIT in sCL Big TODO */
+/* init_ack     */ {sIV, sCW, sCW, sCE, sES, sSS, sSR, sSA, sIV, sHA, sIV},
+/* abort        */ {sIV, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sIV, sCL, sIV},
+/* shutdown     */ {sIV, sCL, sCW, sCE, sSR, sSS, sSR, sSA, sIV, sSR, sIV},
+/* shutdown_ack */ {sIV, sCL, sCW, sCE, sES, sSA, sSA, sSA, sIV, sHA, sIV},
+/* error        */ {sIV, sCL, sCW, sCL, sES, sSS, sSR, sSA, sIV, sHA, sIV},
+/* cookie_echo  */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA, sIV, sHA, sIV},/* Can't come in reply dir */
+/* cookie_ack   */ {sIV, sCL, sCW, sES, sES, sSS, sSR, sSA, sIV, sHA, sIV},
+/* shutdown_comp*/ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sCL, sIV, sHA, sIV},
+/* heartbeat    */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS, sHA, sHA},
+/* heartbeat_ack*/ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHA, sHA, sHA},
+/* data/sack    */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHA, sHA, sHA},
        }
 };
 
@@ -253,6 +258,11 @@ static int sctp_new_state(enum ip_conntrack_dir dir,
                pr_debug("SCTP_CID_HEARTBEAT_ACK");
                i = 10;
                break;
+       case SCTP_CID_DATA:
+       case SCTP_CID_SACK:
+               pr_debug("SCTP_CID_DATA/SACK");
+               i = 11;
+               break;
        default:
                /* Other chunks do not change the state */
                pr_debug("Unknown chunk type, Will stay in %s\n",
@@ -306,7 +316,9 @@ sctp_new(struct nf_conn *ct, const struct sk_buff *skb,
                                 ih->init_tag);
 
                        ct->proto.sctp.vtag[IP_CT_DIR_REPLY] = ih->init_tag;
-               } else if (sch->type == SCTP_CID_HEARTBEAT) {
+               } else if (sch->type == SCTP_CID_HEARTBEAT ||
+                          sch->type == SCTP_CID_DATA ||
+                          sch->type == SCTP_CID_SACK) {
                        pr_debug("Setting vtag %x for secondary conntrack\n",
                                 sh->vtag);
                        ct->proto.sctp.vtag[IP_CT_DIR_ORIGINAL] = sh->vtag;
@@ -392,19 +404,19 @@ int nf_conntrack_sctp_packet(struct nf_conn *ct,
 
                if (!sctp_new(ct, skb, sh, dataoff))
                        return -NF_ACCEPT;
-       }
-
-       /* Check the verification tag (Sec 8.5) */
-       if (!test_bit(SCTP_CID_INIT, map) &&
-           !test_bit(SCTP_CID_SHUTDOWN_COMPLETE, map) &&
-           !test_bit(SCTP_CID_COOKIE_ECHO, map) &&
-           !test_bit(SCTP_CID_ABORT, map) &&
-           !test_bit(SCTP_CID_SHUTDOWN_ACK, map) &&
-           !test_bit(SCTP_CID_HEARTBEAT, map) &&
-           !test_bit(SCTP_CID_HEARTBEAT_ACK, map) &&
-           sh->vtag != ct->proto.sctp.vtag[dir]) {
-               pr_debug("Verification tag check failed\n");
-               goto out;
+       } else {
+               /* Check the verification tag (Sec 8.5) */
+               if (!test_bit(SCTP_CID_INIT, map) &&
+                   !test_bit(SCTP_CID_SHUTDOWN_COMPLETE, map) &&
+                   !test_bit(SCTP_CID_COOKIE_ECHO, map) &&
+                   !test_bit(SCTP_CID_ABORT, map) &&
+                   !test_bit(SCTP_CID_SHUTDOWN_ACK, map) &&
+                   !test_bit(SCTP_CID_HEARTBEAT, map) &&
+                   !test_bit(SCTP_CID_HEARTBEAT_ACK, map) &&
+                   sh->vtag != ct->proto.sctp.vtag[dir]) {
+                       pr_debug("Verification tag check failed\n");
+                       goto out;
+               }
        }
 
        old_state = new_state = SCTP_CONNTRACK_NONE;
@@ -464,6 +476,11 @@ int nf_conntrack_sctp_packet(struct nf_conn *ct,
                        } else if (ct->proto.sctp.flags & SCTP_FLAG_HEARTBEAT_VTAG_FAILED) {
                                ct->proto.sctp.flags &= ~SCTP_FLAG_HEARTBEAT_VTAG_FAILED;
                        }
+               } else if (sch->type == SCTP_CID_DATA || sch->type == SCTP_CID_SACK) {
+                       if (ct->proto.sctp.vtag[dir] == 0) {
+                               pr_debug("Setting vtag %x for dir %d\n", sh->vtag, dir);
+                               ct->proto.sctp.vtag[dir] = sh->vtag;
+                       }
                }
 
                old_state = ct->proto.sctp.state;
@@ -684,6 +701,7 @@ sctp_timeout_nla_policy[CTA_TIMEOUT_SCTP_MAX+1] = {
        [CTA_TIMEOUT_SCTP_SHUTDOWN_ACK_SENT]    = { .type = NLA_U32 },
        [CTA_TIMEOUT_SCTP_HEARTBEAT_SENT]       = { .type = NLA_U32 },
        [CTA_TIMEOUT_SCTP_HEARTBEAT_ACKED]      = { .type = NLA_U32 },
+       [CTA_TIMEOUT_SCTP_DATA_SENT]            = { .type = NLA_U32 },
 };
 #endif /* CONFIG_NF_CONNTRACK_TIMEOUT */
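For reference, the transition table above is indexed as
sctp_conntracks[dir][chunk][state]: direction 0 for ORIGINAL and 1 for REPLY, the
chunk row chosen by sctp_new_state() (11 is the new DATA/SACK row), and the current
conntrack state as the column. A hedged sketch of the lookup; next_state() is a
hypothetical wrapper, while sctp_conntracks and SCTP_CONNTRACK_MAX are as defined in
nf_conntrack_proto_sctp.c:

    static u8 next_state(int dir, int chunk_row, u8 cur_state)
    {
            if (chunk_row < 0 || cur_state >= SCTP_CONNTRACK_MAX)
                    return SCTP_CONNTRACK_MAX;      /* sIV: invalid */
            return sctp_conntracks[dir][chunk_row][cur_state];
    }

So a DATA chunk seen in a new original-direction flow maps to
sctp_conntracks[0][11][sNO] == sDS, which is what starts the new DATA_SENT timeout.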
 
index bca839a..0250725 100644 (file)
@@ -602,6 +602,7 @@ enum nf_ct_sysctl_index {
        NF_SYSCTL_CT_PROTO_TIMEOUT_SCTP_SHUTDOWN_ACK_SENT,
        NF_SYSCTL_CT_PROTO_TIMEOUT_SCTP_HEARTBEAT_SENT,
        NF_SYSCTL_CT_PROTO_TIMEOUT_SCTP_HEARTBEAT_ACKED,
+       NF_SYSCTL_CT_PROTO_TIMEOUT_SCTP_DATA_SENT,
 #endif
 #ifdef CONFIG_NF_CT_PROTO_DCCP
        NF_SYSCTL_CT_PROTO_TIMEOUT_DCCP_REQUEST,
@@ -892,6 +893,12 @@ static struct ctl_table nf_ct_sysctl_table[] = {
                .mode           = 0644,
                .proc_handler   = proc_dointvec_jiffies,
        },
+       [NF_SYSCTL_CT_PROTO_TIMEOUT_SCTP_DATA_SENT] = {
+               .procname       = "nf_conntrack_sctp_timeout_data_sent",
+               .maxlen         = sizeof(unsigned int),
+               .mode           = 0644,
+               .proc_handler   = proc_dointvec_jiffies,
+       },
 #endif
 #ifdef CONFIG_NF_CT_PROTO_DCCP
        [NF_SYSCTL_CT_PROTO_TIMEOUT_DCCP_REQUEST] = {
@@ -1036,6 +1043,7 @@ static void nf_conntrack_standalone_init_sctp_sysctl(struct net *net,
        XASSIGN(SHUTDOWN_ACK_SENT, sn);
        XASSIGN(HEARTBEAT_SENT, sn);
        XASSIGN(HEARTBEAT_ACKED, sn);
+       XASSIGN(DATA_SENT, sn);
 #undef XASSIGN
 #endif
 }
index b350fe9..19efba1 100644 (file)
@@ -421,6 +421,10 @@ nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb,
                if (ret == NF_DROP)
                        flow_offload_teardown(flow);
                break;
+       default:
+               WARN_ON_ONCE(1);
+               ret = NF_DROP;
+               break;
        }
 
        return ret;
@@ -682,6 +686,10 @@ nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb,
                if (ret == NF_DROP)
                        flow_offload_teardown(flow);
                break;
+       default:
+               WARN_ON_ONCE(1);
+               ret = NF_DROP;
+               break;
        }
 
        return ret;
index 00b5228..0fdcdb2 100644 (file)
@@ -997,13 +997,13 @@ static void flow_offload_queue_work(struct flow_offload_work *offload)
        struct net *net = read_pnet(&offload->flowtable->net);
 
        if (offload->cmd == FLOW_CLS_REPLACE) {
-               NF_FLOW_TABLE_STAT_INC(net, count_wq_add);
+               NF_FLOW_TABLE_STAT_INC_ATOMIC(net, count_wq_add);
                queue_work(nf_flow_offload_add_wq, &offload->work);
        } else if (offload->cmd == FLOW_CLS_DESTROY) {
-               NF_FLOW_TABLE_STAT_INC(net, count_wq_del);
+               NF_FLOW_TABLE_STAT_INC_ATOMIC(net, count_wq_del);
                queue_work(nf_flow_offload_del_wq, &offload->work);
        } else {
-               NF_FLOW_TABLE_STAT_INC(net, count_wq_stats);
+               NF_FLOW_TABLE_STAT_INC_ATOMIC(net, count_wq_stats);
                queue_work(nf_flow_offload_stats_wq, &offload->work);
        }
 }
diff --git a/net/netfilter/nf_nat_ovs.c b/net/netfilter/nf_nat_ovs.c
new file mode 100644 (file)
index 0000000..551abd2
--- /dev/null
@@ -0,0 +1,135 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* NAT support functions for openvswitch, shared by OVS and TC conntrack. */
+
+#include <net/netfilter/nf_nat.h>
+
+/* Modelled after nf_nat_ipv[46]_fn().
+ * range is only used for new, uninitialized NAT state.
+ * Returns either NF_ACCEPT or NF_DROP.
+ */
+static int nf_ct_nat_execute(struct sk_buff *skb, struct nf_conn *ct,
+                            enum ip_conntrack_info ctinfo, int *action,
+                            const struct nf_nat_range2 *range,
+                            enum nf_nat_manip_type maniptype)
+{
+       __be16 proto = skb_protocol(skb, true);
+       int hooknum, err = NF_ACCEPT;
+
+       /* See HOOK2MANIP(). */
+       if (maniptype == NF_NAT_MANIP_SRC)
+               hooknum = NF_INET_LOCAL_IN; /* Source NAT */
+       else
+               hooknum = NF_INET_LOCAL_OUT; /* Destination NAT */
+
+       switch (ctinfo) {
+       case IP_CT_RELATED:
+       case IP_CT_RELATED_REPLY:
+               if (proto == htons(ETH_P_IP) &&
+                   ip_hdr(skb)->protocol == IPPROTO_ICMP) {
+                       if (!nf_nat_icmp_reply_translation(skb, ct, ctinfo,
+                                                          hooknum))
+                               err = NF_DROP;
+                       goto out;
+               } else if (IS_ENABLED(CONFIG_IPV6) && proto == htons(ETH_P_IPV6)) {
+                       __be16 frag_off;
+                       u8 nexthdr = ipv6_hdr(skb)->nexthdr;
+                       int hdrlen = ipv6_skip_exthdr(skb,
+                                                     sizeof(struct ipv6hdr),
+                                                     &nexthdr, &frag_off);
+
+                       if (hdrlen >= 0 && nexthdr == IPPROTO_ICMPV6) {
+                               if (!nf_nat_icmpv6_reply_translation(skb, ct,
+                                                                    ctinfo,
+                                                                    hooknum,
+                                                                    hdrlen))
+                                       err = NF_DROP;
+                               goto out;
+                       }
+               }
+               /* Non-ICMP, fall thru to initialize if needed. */
+               fallthrough;
+       case IP_CT_NEW:
+               /* Seen it before?  This can happen for loopback, retrans,
+                * or local packets.
+                */
+               if (!nf_nat_initialized(ct, maniptype)) {
+                       /* Initialize according to the NAT action. */
+                       err = (range && range->flags & NF_NAT_RANGE_MAP_IPS)
+                               /* Action is set up to establish a new
+                                * mapping.
+                                */
+                               ? nf_nat_setup_info(ct, range, maniptype)
+                               : nf_nat_alloc_null_binding(ct, hooknum);
+                       if (err != NF_ACCEPT)
+                               goto out;
+               }
+               break;
+
+       case IP_CT_ESTABLISHED:
+       case IP_CT_ESTABLISHED_REPLY:
+               break;
+
+       default:
+               err = NF_DROP;
+               goto out;
+       }
+
+       err = nf_nat_packet(ct, ctinfo, hooknum, skb);
+       if (err == NF_ACCEPT)
+               *action |= BIT(maniptype);
+out:
+       return err;
+}
+
+int nf_ct_nat(struct sk_buff *skb, struct nf_conn *ct,
+             enum ip_conntrack_info ctinfo, int *action,
+             const struct nf_nat_range2 *range, bool commit)
+{
+       enum nf_nat_manip_type maniptype;
+       int err, ct_action = *action;
+
+       *action = 0;
+
+       /* Add NAT extension if not confirmed yet. */
+       if (!nf_ct_is_confirmed(ct) && !nf_ct_nat_ext_add(ct))
+               return NF_DROP;   /* Can't NAT. */
+
+       if (ctinfo != IP_CT_NEW && (ct->status & IPS_NAT_MASK) &&
+           (ctinfo != IP_CT_RELATED || commit)) {
+               /* NAT an established or related connection like before. */
+               if (CTINFO2DIR(ctinfo) == IP_CT_DIR_REPLY)
+                       /* This is the REPLY direction for a connection
+                        * for which NAT was applied in the forward
+                        * direction.  Do the reverse NAT.
+                        */
+                       maniptype = ct->status & IPS_SRC_NAT
+                               ? NF_NAT_MANIP_DST : NF_NAT_MANIP_SRC;
+               else
+                       maniptype = ct->status & IPS_SRC_NAT
+                               ? NF_NAT_MANIP_SRC : NF_NAT_MANIP_DST;
+       } else if (ct_action & BIT(NF_NAT_MANIP_SRC)) {
+               maniptype = NF_NAT_MANIP_SRC;
+       } else if (ct_action & BIT(NF_NAT_MANIP_DST)) {
+               maniptype = NF_NAT_MANIP_DST;
+       } else {
+               return NF_ACCEPT;
+       }
+
+       err = nf_ct_nat_execute(skb, ct, ctinfo, action, range, maniptype);
+       if (err == NF_ACCEPT && ct->status & IPS_DST_NAT) {
+               if (ct->status & IPS_SRC_NAT) {
+                       if (maniptype == NF_NAT_MANIP_SRC)
+                               maniptype = NF_NAT_MANIP_DST;
+                       else
+                               maniptype = NF_NAT_MANIP_SRC;
+
+                       err = nf_ct_nat_execute(skb, ct, ctinfo, action, range,
+                                               maniptype);
+               } else if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL) {
+                       err = nf_ct_nat_execute(skb, ct, ctinfo, action, NULL,
+                                               NF_NAT_MANIP_SRC);
+               }
+       }
+       return err;
+}
+EXPORT_SYMBOL_GPL(nf_ct_nat);
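A hedged sketch of how a caller is expected to drive nf_ct_nat(): the 'action' word
carries the requested manip bits in and the manips that were actually applied out,
so the caller can fix up its own flow state afterwards (compare the ovs_ct_nat()
rewrite below). do_src/do_dst and the update_key() comments are placeholders, not
part of the API:

    #include <net/netfilter/nf_nat.h>

    static int do_nat(struct sk_buff *skb, struct nf_conn *ct,
                      enum ip_conntrack_info ctinfo,
                      const struct nf_nat_range2 *range,
                      bool do_src, bool do_dst, bool commit)
    {
            int err, action = 0;

            if (do_src)
                    action |= BIT(NF_NAT_MANIP_SRC);
            if (do_dst)
                    action |= BIT(NF_NAT_MANIP_DST);

            err = nf_ct_nat(skb, ct, ctinfo, &action, range, commit);

            /* On return, 'action' holds the manips actually performed. */
            if (action & BIT(NF_NAT_MANIP_SRC))
                    ;       /* e.g. update_key(..., NF_NAT_MANIP_SRC); */
            if (action & BIT(NF_NAT_MANIP_DST))
                    ;       /* e.g. update_key(..., NF_NAT_MANIP_DST); */
            return err;
    }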
index 6269b0d..832b881 100644 (file)
@@ -2873,8 +2873,8 @@ int nft_expr_inner_parse(const struct nft_ctx *ctx, const struct nlattr *nla,
                return -EINVAL;
 
        type = __nft_expr_type_get(ctx->family, tb[NFTA_EXPR_NAME]);
-       if (IS_ERR(type))
-               return PTR_ERR(type);
+       if (!type)
+               return -ENOENT;
 
        if (!type->inner_ops)
                return -EOPNOTSUPP;
index 4f9299b..06d46d1 100644 (file)
@@ -1162,6 +1162,7 @@ static int nft_pipapo_insert(const struct net *net, const struct nft_set *set,
        struct nft_pipapo_match *m = priv->clone;
        u8 genmask = nft_genmask_next(net);
        struct nft_pipapo_field *f;
+       const u8 *start_p, *end_p;
        int i, bsize_max, err = 0;
 
        if (nft_set_ext_exists(ext, NFT_SET_EXT_KEY_END))
@@ -1202,9 +1203,9 @@ static int nft_pipapo_insert(const struct net *net, const struct nft_set *set,
        }
 
        /* Validate */
+       start_p = start;
+       end_p = end;
        nft_pipapo_for_each_field(f, i, m) {
-               const u8 *start_p = start, *end_p = end;
-
                if (f->rules >= (unsigned long)NFT_PIPAPO_RULE0_MAX)
                        return -ENOSPC;
 
index 282c510..994a0a1 100644 (file)
@@ -240,6 +240,8 @@ static int nci_add_new_protocol(struct nci_dev *ndev,
                target->sens_res = nfca_poll->sens_res;
                target->sel_res = nfca_poll->sel_res;
                target->nfcid1_len = nfca_poll->nfcid1_len;
+               if (target->nfcid1_len > ARRAY_SIZE(target->nfcid1))
+                       return -EPROTO;
                if (target->nfcid1_len > 0) {
                        memcpy(target->nfcid1, nfca_poll->nfcid1,
                               target->nfcid1_len);
@@ -248,6 +250,8 @@ static int nci_add_new_protocol(struct nci_dev *ndev,
                nfcb_poll = (struct rf_tech_specific_params_nfcb_poll *)params;
 
                target->sensb_res_len = nfcb_poll->sensb_res_len;
+               if (target->sensb_res_len > ARRAY_SIZE(target->sensb_res))
+                       return -EPROTO;
                if (target->sensb_res_len > 0) {
                        memcpy(target->sensb_res, nfcb_poll->sensb_res,
                               target->sensb_res_len);
@@ -256,6 +260,8 @@ static int nci_add_new_protocol(struct nci_dev *ndev,
                nfcf_poll = (struct rf_tech_specific_params_nfcf_poll *)params;
 
                target->sensf_res_len = nfcf_poll->sensf_res_len;
+               if (target->sensf_res_len > ARRAY_SIZE(target->sensf_res))
+                       return -EPROTO;
                if (target->sensf_res_len > 0) {
                        memcpy(target->sensf_res, nfcf_poll->sensf_res,
                               target->sensf_res_len);
index 15bd287..747d537 100644 (file)
@@ -15,6 +15,7 @@ config OPENVSWITCH
        select NET_MPLS_GSO
        select DST_CACHE
        select NET_NSH
+       select NF_NAT_OVS if NF_NAT
        help
          Open vSwitch is a multilayer Ethernet switch targeted at virtualized
          environments.  In addition to supporting a variety of features
index d78f0fc..c8b1376 100644 (file)
@@ -726,147 +726,27 @@ static void ovs_nat_update_key(struct sw_flow_key *key,
        }
 }
 
-/* Modelled after nf_nat_ipv[46]_fn().
- * range is only used for new, uninitialized NAT state.
- * Returns either NF_ACCEPT or NF_DROP.
- */
-static int ovs_ct_nat_execute(struct sk_buff *skb, struct nf_conn *ct,
-                             enum ip_conntrack_info ctinfo,
-                             const struct nf_nat_range2 *range,
-                             enum nf_nat_manip_type maniptype, struct sw_flow_key *key)
-{
-       int hooknum, nh_off, err = NF_ACCEPT;
-
-       nh_off = skb_network_offset(skb);
-       skb_pull_rcsum(skb, nh_off);
-
-       /* See HOOK2MANIP(). */
-       if (maniptype == NF_NAT_MANIP_SRC)
-               hooknum = NF_INET_LOCAL_IN; /* Source NAT */
-       else
-               hooknum = NF_INET_LOCAL_OUT; /* Destination NAT */
-
-       switch (ctinfo) {
-       case IP_CT_RELATED:
-       case IP_CT_RELATED_REPLY:
-               if (IS_ENABLED(CONFIG_NF_NAT) &&
-                   skb->protocol == htons(ETH_P_IP) &&
-                   ip_hdr(skb)->protocol == IPPROTO_ICMP) {
-                       if (!nf_nat_icmp_reply_translation(skb, ct, ctinfo,
-                                                          hooknum))
-                               err = NF_DROP;
-                       goto push;
-               } else if (IS_ENABLED(CONFIG_IPV6) &&
-                          skb->protocol == htons(ETH_P_IPV6)) {
-                       __be16 frag_off;
-                       u8 nexthdr = ipv6_hdr(skb)->nexthdr;
-                       int hdrlen = ipv6_skip_exthdr(skb,
-                                                     sizeof(struct ipv6hdr),
-                                                     &nexthdr, &frag_off);
-
-                       if (hdrlen >= 0 && nexthdr == IPPROTO_ICMPV6) {
-                               if (!nf_nat_icmpv6_reply_translation(skb, ct,
-                                                                    ctinfo,
-                                                                    hooknum,
-                                                                    hdrlen))
-                                       err = NF_DROP;
-                               goto push;
-                       }
-               }
-               /* Non-ICMP, fall thru to initialize if needed. */
-               fallthrough;
-       case IP_CT_NEW:
-               /* Seen it before?  This can happen for loopback, retrans,
-                * or local packets.
-                */
-               if (!nf_nat_initialized(ct, maniptype)) {
-                       /* Initialize according to the NAT action. */
-                       err = (range && range->flags & NF_NAT_RANGE_MAP_IPS)
-                               /* Action is set up to establish a new
-                                * mapping.
-                                */
-                               ? nf_nat_setup_info(ct, range, maniptype)
-                               : nf_nat_alloc_null_binding(ct, hooknum);
-                       if (err != NF_ACCEPT)
-                               goto push;
-               }
-               break;
-
-       case IP_CT_ESTABLISHED:
-       case IP_CT_ESTABLISHED_REPLY:
-               break;
-
-       default:
-               err = NF_DROP;
-               goto push;
-       }
-
-       err = nf_nat_packet(ct, ctinfo, hooknum, skb);
-push:
-       skb_push_rcsum(skb, nh_off);
-
-       /* Update the flow key if NAT successful. */
-       if (err == NF_ACCEPT)
-               ovs_nat_update_key(key, skb, maniptype);
-
-       return err;
-}
-
 /* Returns NF_DROP if the packet should be dropped, NF_ACCEPT otherwise. */
 static int ovs_ct_nat(struct net *net, struct sw_flow_key *key,
                      const struct ovs_conntrack_info *info,
                      struct sk_buff *skb, struct nf_conn *ct,
                      enum ip_conntrack_info ctinfo)
 {
-       enum nf_nat_manip_type maniptype;
-       int err;
+       int err, action = 0;
 
-       /* Add NAT extension if not confirmed yet. */
-       if (!nf_ct_is_confirmed(ct) && !nf_ct_nat_ext_add(ct))
-               return NF_ACCEPT;   /* Can't NAT. */
+       if (!(info->nat & OVS_CT_NAT))
+               return NF_ACCEPT;
+       if (info->nat & OVS_CT_SRC_NAT)
+               action |= BIT(NF_NAT_MANIP_SRC);
+       if (info->nat & OVS_CT_DST_NAT)
+               action |= BIT(NF_NAT_MANIP_DST);
 
-       /* Determine NAT type.
-        * Check if the NAT type can be deduced from the tracked connection.
-        * Make sure new expected connections (IP_CT_RELATED) are NATted only
-        * when committing.
-        */
-       if (info->nat & OVS_CT_NAT && ctinfo != IP_CT_NEW &&
-           ct->status & IPS_NAT_MASK &&
-           (ctinfo != IP_CT_RELATED || info->commit)) {
-               /* NAT an established or related connection like before. */
-               if (CTINFO2DIR(ctinfo) == IP_CT_DIR_REPLY)
-                       /* This is the REPLY direction for a connection
-                        * for which NAT was applied in the forward
-                        * direction.  Do the reverse NAT.
-                        */
-                       maniptype = ct->status & IPS_SRC_NAT
-                               ? NF_NAT_MANIP_DST : NF_NAT_MANIP_SRC;
-               else
-                       maniptype = ct->status & IPS_SRC_NAT
-                               ? NF_NAT_MANIP_SRC : NF_NAT_MANIP_DST;
-       } else if (info->nat & OVS_CT_SRC_NAT) {
-               maniptype = NF_NAT_MANIP_SRC;
-       } else if (info->nat & OVS_CT_DST_NAT) {
-               maniptype = NF_NAT_MANIP_DST;
-       } else {
-               return NF_ACCEPT; /* Connection is not NATed. */
-       }
-       err = ovs_ct_nat_execute(skb, ct, ctinfo, &info->range, maniptype, key);
-
-       if (err == NF_ACCEPT && ct->status & IPS_DST_NAT) {
-               if (ct->status & IPS_SRC_NAT) {
-                       if (maniptype == NF_NAT_MANIP_SRC)
-                               maniptype = NF_NAT_MANIP_DST;
-                       else
-                               maniptype = NF_NAT_MANIP_SRC;
-
-                       err = ovs_ct_nat_execute(skb, ct, ctinfo, &info->range,
-                                                maniptype, key);
-               } else if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL) {
-                       err = ovs_ct_nat_execute(skb, ct, ctinfo, NULL,
-                                                NF_NAT_MANIP_SRC, key);
-               }
-       }
+       err = nf_ct_nat(skb, ct, ctinfo, &action, &info->range, info->commit);
+
+       if (action & BIT(NF_NAT_MANIP_SRC))
+               ovs_nat_update_key(key, skb, NF_NAT_MANIP_SRC);
+       if (action & BIT(NF_NAT_MANIP_DST))
+               ovs_nat_update_key(key, skb, NF_NAT_MANIP_DST);
 
        return err;
 }
index 861dfb8..932bcf7 100644 (file)
@@ -209,6 +209,26 @@ static struct vport *new_vport(const struct vport_parms *parms)
        return vport;
 }
 
+static void ovs_vport_update_upcall_stats(struct sk_buff *skb,
+                                         const struct dp_upcall_info *upcall_info,
+                                         bool upcall_result)
+{
+       struct vport *p = OVS_CB(skb)->input_vport;
+       struct vport_upcall_stats_percpu *stats;
+
+       if (upcall_info->cmd != OVS_PACKET_CMD_MISS &&
+           upcall_info->cmd != OVS_PACKET_CMD_ACTION)
+               return;
+
+       stats = this_cpu_ptr(p->upcall_stats);
+       u64_stats_update_begin(&stats->syncp);
+       if (upcall_result)
+               u64_stats_inc(&stats->n_success);
+       else
+               u64_stats_inc(&stats->n_fail);
+       u64_stats_update_end(&stats->syncp);
+}
+
 void ovs_dp_detach_port(struct vport *p)
 {
        ASSERT_OVSL();
@@ -216,6 +236,9 @@ void ovs_dp_detach_port(struct vport *p)
        /* First drop references to device. */
        hlist_del_rcu(&p->dp_hash_node);
 
+       /* Free percpu memory */
+       free_percpu(p->upcall_stats);
+
        /* Then destroy it. */
        ovs_vport_del(p);
 }
@@ -305,6 +328,8 @@ int ovs_dp_upcall(struct datapath *dp, struct sk_buff *skb,
                err = queue_userspace_packet(dp, skb, key, upcall_info, cutlen);
        else
                err = queue_gso_packets(dp, skb, key, upcall_info, cutlen);
+
+       ovs_vport_update_upcall_stats(skb, upcall_info, !err);
        if (err)
                goto err;
 
@@ -1826,6 +1851,12 @@ static int ovs_dp_cmd_new(struct sk_buff *skb, struct genl_info *info)
                goto err_destroy_portids;
        }
 
+       vport->upcall_stats = netdev_alloc_pcpu_stats(struct vport_upcall_stats_percpu);
+       if (!vport->upcall_stats) {
+               err = -ENOMEM;
+               goto err_destroy_portids;
+       }
+
        err = ovs_dp_cmd_fill_info(dp, reply, info->snd_portid,
                                   info->snd_seq, 0, OVS_DP_CMD_NEW);
        BUG_ON(err < 0);
@@ -2098,6 +2129,9 @@ static int ovs_vport_cmd_fill_info(struct vport *vport, struct sk_buff *skb,
                          OVS_VPORT_ATTR_PAD))
                goto nla_put_failure;
 
+       if (ovs_vport_get_upcall_stats(vport, skb))
+               goto nla_put_failure;
+
        if (ovs_vport_get_upcall_portids(vport, skb))
                goto nla_put_failure;
 
@@ -2279,6 +2313,12 @@ restart:
                goto exit_unlock_free;
        }
 
+       vport->upcall_stats = netdev_alloc_pcpu_stats(struct vport_upcall_stats_percpu);
+       if (!vport->upcall_stats) {
+               err = -ENOMEM;
+               goto exit_unlock_free;
+       }
+
        err = ovs_vport_cmd_fill_info(vport, reply, genl_info_net(info),
                                      info->snd_portid, info->snd_seq, 0,
                                      OVS_VPORT_CMD_NEW, GFP_KERNEL);
@@ -2508,6 +2548,7 @@ static const struct nla_policy vport_policy[OVS_VPORT_ATTR_MAX + 1] = {
        [OVS_VPORT_ATTR_OPTIONS] = { .type = NLA_NESTED },
        [OVS_VPORT_ATTR_IFINDEX] = { .type = NLA_U32 },
        [OVS_VPORT_ATTR_NETNSID] = { .type = NLA_S32 },
+       [OVS_VPORT_ATTR_UPCALL_STATS] = { .type = NLA_NESTED },
 };
 
 static const struct genl_small_ops dp_vport_genl_ops[] = {
index 82a74f9..7e0f5c4 100644 (file)
@@ -285,6 +285,56 @@ void ovs_vport_get_stats(struct vport *vport, struct ovs_vport_stats *stats)
 }
 
 /**
+ *     ovs_vport_get_upcall_stats - retrieve upcall stats
+ *
+ * @vport: vport from which to retrieve the stats.
+ * @skb: sk_buff where upcall stats should be appended.
+ *
+ * Retrieves upcall stats for the given device.
+ *
+ * Must be called with ovs_mutex or rcu_read_lock.
+ */
+int ovs_vport_get_upcall_stats(struct vport *vport, struct sk_buff *skb)
+{
+       struct nlattr *nla;
+       int i;
+
+       __u64 tx_success = 0;
+       __u64 tx_fail = 0;
+
+       for_each_possible_cpu(i) {
+               const struct vport_upcall_stats_percpu *stats;
+               unsigned int start;
+
+               stats = per_cpu_ptr(vport->upcall_stats, i);
+               do {
+                       start = u64_stats_fetch_begin(&stats->syncp);
+                       tx_success += u64_stats_read(&stats->n_success);
+                       tx_fail += u64_stats_read(&stats->n_fail);
+               } while (u64_stats_fetch_retry(&stats->syncp, start));
+       }
+
+       nla = nla_nest_start_noflag(skb, OVS_VPORT_ATTR_UPCALL_STATS);
+       if (!nla)
+               return -EMSGSIZE;
+
+       if (nla_put_u64_64bit(skb, OVS_VPORT_UPCALL_ATTR_SUCCESS, tx_success,
+                             OVS_VPORT_ATTR_PAD)) {
+               nla_nest_cancel(skb, nla);
+               return -EMSGSIZE;
+       }
+
+       if (nla_put_u64_64bit(skb, OVS_VPORT_UPCALL_ATTR_FAIL, tx_fail,
+                             OVS_VPORT_ATTR_PAD)) {
+               nla_nest_cancel(skb, nla);
+               return -EMSGSIZE;
+       }
+       nla_nest_end(skb, nla);
+
+       return 0;
+}
+
+/**
  *     ovs_vport_get_options - retrieve device options
  *
  * @vport: vport from which to retrieve the options.
index 6ff45e8..3e71ca8 100644 (file)
@@ -32,6 +32,8 @@ struct vport *ovs_vport_locate(const struct net *net, const char *name);
 
 void ovs_vport_get_stats(struct vport *, struct ovs_vport_stats *);
 
+int ovs_vport_get_upcall_stats(struct vport *vport, struct sk_buff *skb);
+
 int ovs_vport_set_options(struct vport *, struct nlattr *options);
 int ovs_vport_get_options(const struct vport *, struct sk_buff *);
 
@@ -65,6 +67,7 @@ struct vport_portids {
  * @hash_node: Element in @dev_table hash table in vport.c.
  * @dp_hash_node: Element in @datapath->ports hash table in datapath.c.
  * @ops: Class structure.
+ * @upcall_stats: Per-CPU upcall statistics for this vport.
  * @detach_list: list used for detaching vport in net-exit call.
  * @rcu: RCU callback head for deferred destruction.
  */
@@ -78,6 +81,7 @@ struct vport {
        struct hlist_node hash_node;
        struct hlist_node dp_hash_node;
        const struct vport_ops *ops;
+       struct vport_upcall_stats_percpu __percpu *upcall_stats;
 
        struct list_head detach_list;
        struct rcu_head rcu;
@@ -137,6 +141,18 @@ struct vport_ops {
        struct list_head list;
 };
 
+/**
+ * struct vport_upcall_stats_percpu - per-cpu packet upcall statistics for
+ * a given vport.
+ * @n_success: Number of packets whose upcall to userspace succeeded.
+ * @n_fail:    Number of packets whose upcall to userspace failed.
+ */
+struct vport_upcall_stats_percpu {
+       struct u64_stats_sync syncp;
+       u64_stats_t n_success;
+       u64_stats_t n_fail;
+};
+
 struct vport *ovs_vport_alloc(int priv_size, const struct vport_ops *,
                              const struct vport_parms *);
 void ovs_vport_free(struct vport *);
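The upcall counters above follow the standard u64_stats discipline: each CPU updates
only its own copy inside update_begin/update_end, and readers retry with
fetch_begin/fetch_retry until they see a consistent snapshot, which is what makes
64-bit counters safe on 32-bit hosts. netdev_alloc_pcpu_stats(), used in the
datapath changes, allocates the per-CPU array and initialises each syncp. A minimal
hedged sketch with a hypothetical counter struct:

    #include <linux/percpu.h>
    #include <linux/u64_stats_sync.h>

    struct pcpu_ctr {                       /* hypothetical */
            struct u64_stats_sync syncp;
            u64_stats_t hits;
    };

    static void ctr_inc(struct pcpu_ctr __percpu *ctr)
    {
            struct pcpu_ctr *c = this_cpu_ptr(ctr);

            u64_stats_update_begin(&c->syncp);
            u64_stats_inc(&c->hits);
            u64_stats_update_end(&c->syncp);
    }

    static u64 ctr_read(struct pcpu_ctr __percpu *ctr)
    {
            u64 sum = 0;
            int cpu;

            for_each_possible_cpu(cpu) {
                    struct pcpu_ctr *c = per_cpu_ptr(ctr, cpu);
                    unsigned int start;
                    u64 v;

                    do {
                            start = u64_stats_fetch_begin(&c->syncp);
                            v = u64_stats_read(&c->hits);
                    } while (u64_stats_fetch_retry(&c->syncp, start));
                    sum += v;
            }
            return sum;
    }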
index accd35c..7ae023b 100644 (file)
@@ -58,4 +58,11 @@ config RXKAD
 
          See Documentation/networking/rxrpc.rst.
 
+config RXPERF
+       tristate "RxRPC test service"
+       help
+         Provide an rxperf service tester.  This listens on UDP port 7009 for
+         incoming calls from the rxperf program (an example of which can be
+         found in OpenAFS).
+
 endif
index fdeba48..e76d345 100644 (file)
@@ -16,6 +16,7 @@ rxrpc-y := \
        conn_service.o \
        input.o \
        insecure.o \
+       io_thread.o \
        key.o \
        local_event.o \
        local_object.o \
@@ -36,3 +37,6 @@ rxrpc-y := \
 rxrpc-$(CONFIG_PROC_FS) += proc.o
 rxrpc-$(CONFIG_RXKAD) += rxkad.o
 rxrpc-$(CONFIG_SYSCTL) += sysctl.o
+
+obj-$(CONFIG_RXPERF) += rxperf.o
index aacdd96..7ea576f 100644 (file)
@@ -194,8 +194,8 @@ static int rxrpc_bind(struct socket *sock, struct sockaddr *saddr, int len)
 
 service_in_use:
        write_unlock(&local->services_lock);
-       rxrpc_unuse_local(local);
-       rxrpc_put_local(local);
+       rxrpc_unuse_local(local, rxrpc_local_unuse_bind);
+       rxrpc_put_local(local, rxrpc_local_put_bind);
        ret = -EADDRINUSE;
 error_unlock:
        release_sock(&rx->sk);
@@ -328,7 +328,7 @@ struct rxrpc_call *rxrpc_kernel_begin_call(struct socket *sock,
                mutex_unlock(&call->user_mutex);
        }
 
-       rxrpc_put_peer(cp.peer);
+       rxrpc_put_peer(cp.peer, rxrpc_peer_put_discard_tmp);
        _leave(" = %p", call);
        return call;
 }
@@ -359,9 +359,9 @@ void rxrpc_kernel_end_call(struct socket *sock, struct rxrpc_call *call)
 
        /* Make sure we're not going to call back into a kernel service */
        if (call->notify_rx) {
-               spin_lock_bh(&call->notify_lock);
+               spin_lock(&call->notify_lock);
                call->notify_rx = rxrpc_dummy_notify_rx;
-               spin_unlock_bh(&call->notify_lock);
+               spin_unlock(&call->notify_lock);
        }
 
        mutex_unlock(&call->user_mutex);
@@ -812,14 +812,12 @@ static int rxrpc_shutdown(struct socket *sock, int flags)
 
        lock_sock(sk);
 
-       spin_lock_bh(&sk->sk_receive_queue.lock);
        if (sk->sk_state < RXRPC_CLOSE) {
                sk->sk_state = RXRPC_CLOSE;
                sk->sk_shutdown = SHUTDOWN_MASK;
        } else {
                ret = -ESHUTDOWN;
        }
-       spin_unlock_bh(&sk->sk_receive_queue.lock);
 
        rxrpc_discard_prealloc(rx);
 
@@ -872,9 +870,7 @@ static int rxrpc_release_sock(struct sock *sk)
                break;
        }
 
-       spin_lock_bh(&sk->sk_receive_queue.lock);
        sk->sk_state = RXRPC_CLOSE;
-       spin_unlock_bh(&sk->sk_receive_queue.lock);
 
        if (rx->local && rcu_access_pointer(rx->local->service) == rx) {
                write_lock(&rx->local->services_lock);
@@ -888,8 +884,8 @@ static int rxrpc_release_sock(struct sock *sk)
        flush_workqueue(rxrpc_workqueue);
        rxrpc_purge_queue(&sk->sk_receive_queue);
 
-       rxrpc_unuse_local(rx->local);
-       rxrpc_put_local(rx->local);
+       rxrpc_unuse_local(rx->local, rxrpc_local_unuse_release_sock);
+       rxrpc_put_local(rx->local, rxrpc_local_put_release_sock);
        rx->local = NULL;
        key_put(rx->key);
        rx->key = NULL;
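The rxrpc_put_local()/rxrpc_unuse_local() calls above now take an enum naming why
the reference is dropped, so tracepoints can attribute every get/put to a call site.
A hedged sketch of the pattern with hypothetical names (my_obj, my_put_reason, and
the trace_my_put comment):

    #include <linux/refcount.h>
    #include <linux/slab.h>

    enum my_put_reason {                    /* hypothetical */
            my_put_bind,
            my_put_release_sock,
    };

    struct my_obj { refcount_t ref; };

    static void my_put(struct my_obj *obj, enum my_put_reason why)
    {
            int r = refcount_read(&obj->ref);

            /* trace_my_put(obj, r, why): record who dropped the ref */
            (void)r; (void)why;
            if (refcount_dec_and_test(&obj->ref))
                    kfree(obj);
    }

Threading the reason through costs nothing on the fast path and makes refcount
leaks diagnosable from a trace alone, which appears to be the motivation for the
churn in this file.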
index f5c538c..e7dccab 100644 (file)
@@ -36,6 +36,8 @@ struct rxrpc_txbuf;
  * to pass supplementary information.
  */
 enum rxrpc_skb_mark {
+       RXRPC_SKB_MARK_PACKET,          /* Received packet */
+       RXRPC_SKB_MARK_ERROR,           /* Error notification */
        RXRPC_SKB_MARK_REJECT_BUSY,     /* Reject with BUSY */
        RXRPC_SKB_MARK_REJECT_ABORT,    /* Reject with ABORT (code in skb->priority) */
 };
@@ -76,7 +78,7 @@ struct rxrpc_net {
        bool                    kill_all_client_conns;
        atomic_t                nr_client_conns;
        spinlock_t              client_conn_cache_lock; /* Lock for ->*_client_conns */
-       spinlock_t              client_conn_discard_lock; /* Prevent multiple discarders */
+       struct mutex            client_conn_discard_lock; /* Prevent multiple discarders */
        struct list_head        idle_client_conns;
        struct work_struct      client_conn_reaper;
        struct timer_list       client_conn_reap_timer;
@@ -99,6 +101,9 @@ struct rxrpc_net {
        atomic_t                stat_tx_data_retrans;
        atomic_t                stat_tx_data_send;
        atomic_t                stat_tx_data_send_frag;
+       atomic_t                stat_tx_data_send_fail;
+       atomic_t                stat_tx_data_underflow;
+       atomic_t                stat_tx_data_cwnd_reset;
        atomic_t                stat_rx_data;
        atomic_t                stat_rx_data_reqack;
        atomic_t                stat_rx_data_jumbo;
@@ -110,6 +115,8 @@ struct rxrpc_net {
        atomic_t                stat_rx_acks[256];
 
        atomic_t                stat_why_req_ack[8];
+
+       atomic_t                stat_io_loop;
 };
 
 /*
@@ -279,13 +286,11 @@ struct rxrpc_local {
        struct rxrpc_net        *rxnet;         /* The network ns in which this resides */
        struct hlist_node       link;
        struct socket           *socket;        /* my UDP socket */
-       struct work_struct      processor;
-       struct list_head        ack_tx_queue;   /* List of ACKs that need sending */
-       spinlock_t              ack_tx_lock;    /* ACK list lock */
+       struct task_struct      *io_thread;
        struct rxrpc_sock __rcu *service;       /* Service(s) listening on this endpoint */
        struct rw_semaphore     defrag_sem;     /* control re-enablement of IP DF bit */
-       struct sk_buff_head     reject_queue;   /* packets awaiting rejection */
-       struct sk_buff_head     event_queue;    /* endpoint event packets awaiting processing */
+       struct sk_buff_head     rx_queue;       /* Received packets */
+       struct list_head        call_attend_q;  /* Calls requiring immediate attention */
        struct rb_root          client_bundles; /* Client connection bundles by socket params */
        spinlock_t              client_bundles_lock; /* Lock for client_bundles */
        spinlock_t              lock;           /* access lock */
@@ -403,12 +408,18 @@ enum rxrpc_conn_proto_state {
  * RxRPC client connection bundle.
  */
 struct rxrpc_bundle {
-       struct rxrpc_conn_parameters params;
+       struct rxrpc_local      *local;         /* Representation of local endpoint */
+       struct rxrpc_peer       *peer;          /* Remote endpoint */
+       struct key              *key;           /* Security details */
        refcount_t              ref;
        atomic_t                active;         /* Number of active users */
        unsigned int            debug_id;
+       u32                     security_level; /* Security level selected */
+       u16                     service_id;     /* Service ID for this connection */
        bool                    try_upgrade;    /* True if the bundle is attempting upgrade */
        bool                    alloc_conn;     /* True if someone's getting a conn */
+       bool                    exclusive;      /* T if conn is exclusive */
+       bool                    upgrade;        /* T if service ID can be upgraded */
        short                   alloc_error;    /* Error from last conn allocation */
        spinlock_t              channel_lock;
        struct rb_node          local_node;     /* Node in local->client_conns */
@@ -424,9 +435,13 @@ struct rxrpc_bundle {
  */
 struct rxrpc_connection {
        struct rxrpc_conn_proto proto;
-       struct rxrpc_conn_parameters params;
+       struct rxrpc_local      *local;         /* Representation of local endpoint */
+       struct rxrpc_peer       *peer;          /* Remote endpoint */
+       struct rxrpc_net        *rxnet;         /* Network namespace to which the connection belongs */
+       struct key              *key;           /* Security details */
 
        refcount_t              ref;
+       atomic_t                active;         /* Active count for service conns */
        struct rcu_head         rcu;
        struct list_head        cache_link;
 
@@ -447,6 +462,7 @@ struct rxrpc_connection {
 
        struct timer_list       timer;          /* Conn event timer */
        struct work_struct      processor;      /* connection event processor */
+       struct work_struct      destructor;     /* In-process-context destroyer */
        struct rxrpc_bundle     *bundle;        /* Client connection bundle */
        struct rb_node          service_node;   /* Node in peer->service_conns */
        struct list_head        proc_link;      /* link in procfs list */
@@ -471,9 +487,13 @@ struct rxrpc_connection {
        atomic_t                serial;         /* packet serial number counter */
        unsigned int            hi_serial;      /* highest serial number received */
        u32                     service_id;     /* Service ID, possibly upgraded */
+       u32                     security_level; /* Security level selected */
        u8                      security_ix;    /* security type */
        u8                      out_clientflag; /* RXRPC_CLIENT_INITIATED if we are client */
        u8                      bundle_shift;   /* Index into bundle->avail_chans */
+       bool                    exclusive;      /* T if conn is exclusive */
+       bool                    upgrade;        /* T if service ID can be upgraded */
+       u16                     orig_service_id; /* Originally requested service ID */
        short                   error;          /* Local error code */
 };
 
@@ -502,22 +522,19 @@ enum rxrpc_call_flag {
        RXRPC_CALL_RETRANS_TIMEOUT,     /* Retransmission due to timeout occurred */
        RXRPC_CALL_BEGAN_RX_TIMER,      /* We began the expect_rx_by timer */
        RXRPC_CALL_RX_HEARD,            /* The peer responded at least once to this call */
-       RXRPC_CALL_RX_UNDERRUN,         /* Got data underrun */
        RXRPC_CALL_DISCONNECTED,        /* The call has been disconnected */
        RXRPC_CALL_KERNEL,              /* The call was made by the kernel */
        RXRPC_CALL_UPGRADE,             /* Service upgrade was requested for the call */
-       RXRPC_CALL_DELAY_ACK_PENDING,   /* DELAY ACK generation is pending */
-       RXRPC_CALL_IDLE_ACK_PENDING,    /* IDLE ACK generation is pending */
+       RXRPC_CALL_EXCLUSIVE,           /* The call uses a once-only connection */
+       RXRPC_CALL_RX_IS_IDLE,          /* Reception is idle - send an ACK */
 };
 
 /*
  * Events that can be raised on a call.
  */
 enum rxrpc_call_event {
-       RXRPC_CALL_EV_ABORT,            /* need to generate abort */
-       RXRPC_CALL_EV_RESEND,           /* Tx resend required */
-       RXRPC_CALL_EV_EXPIRED,          /* Expiry occurred */
        RXRPC_CALL_EV_ACK_LOST,         /* ACK may be lost, send ping */
+       RXRPC_CALL_EV_INITIAL_PING,     /* Send initial ping for a new service call */
 };
 
 /*
@@ -570,10 +587,13 @@ struct rxrpc_call {
        struct rcu_head         rcu;
        struct rxrpc_connection *conn;          /* connection carrying call */
        struct rxrpc_peer       *peer;          /* Peer record for remote address */
+       struct rxrpc_local      *local;         /* Representation of local endpoint */
        struct rxrpc_sock __rcu *socket;        /* socket responsible */
        struct rxrpc_net        *rxnet;         /* Network namespace to which call belongs */
+       struct key              *key;           /* Security details */
        const struct rxrpc_security *security;  /* applied security module */
        struct mutex            user_mutex;     /* User access mutex */
+       struct sockaddr_rxrpc   dest_srx;       /* Destination address */
        unsigned long           delay_ack_at;   /* When DELAY ACK needs to happen */
        unsigned long           ack_lost_at;    /* When ACK is figured as lost */
        unsigned long           resend_at;      /* When next resend needs to happen */
@@ -585,7 +605,7 @@ struct rxrpc_call {
        u32                     next_rx_timo;   /* Timeout for next Rx packet (jif) */
        u32                     next_req_timo;  /* Timeout for next Rx request packet (jif) */
        struct timer_list       timer;          /* Combined event timer */
-       struct work_struct      processor;      /* Event processor */
+       struct work_struct      destroyer;      /* In-process-context destroyer */
        rxrpc_notify_rx_t       notify_rx;      /* kernel service Rx notification function */
        struct list_head        link;           /* link in master call list */
        struct list_head        chan_wait_link; /* Link in conn->bundle->waiting_calls */
@@ -594,6 +614,7 @@ struct rxrpc_call {
        struct list_head        recvmsg_link;   /* Link in rx->recvmsg_q */
        struct list_head        sock_link;      /* Link in rx->sock_calls */
        struct rb_node          sock_node;      /* Node in rx->calls */
+       struct list_head        attend_link;    /* Link in local->call_attend_q */
        struct rxrpc_txbuf      *tx_pending;    /* Tx buffer being filled */
        wait_queue_head_t       waitq;          /* Wait queue for channel or Tx */
        s64                     tx_total_len;   /* Total length left to be transmitted (or -1) */
@@ -607,20 +628,22 @@ struct rxrpc_call {
        enum rxrpc_call_state   state;          /* current state of call */
        enum rxrpc_call_completion completion;  /* Call completion condition */
        refcount_t              ref;
-       u16                     service_id;     /* service ID */
        u8                      security_ix;    /* Security type */
        enum rxrpc_interruptibility interruptibility; /* At what point call may be interrupted */
        u32                     call_id;        /* call ID on connection  */
        u32                     cid;            /* connection ID plus channel index */
+       u32                     security_level; /* Security level selected */
        int                     debug_id;       /* debug ID for printks */
        unsigned short          rx_pkt_offset;  /* Current recvmsg packet offset */
        unsigned short          rx_pkt_len;     /* Current recvmsg packet len */
 
        /* Transmitted data tracking. */
        spinlock_t              tx_lock;        /* Transmit queue lock */
+       struct list_head        tx_sendmsg;     /* Sendmsg prepared packets */
        struct list_head        tx_buffer;      /* Buffer of transmissible packets */
        rxrpc_seq_t             tx_bottom;      /* First packet in buffer */
        rxrpc_seq_t             tx_transmitted; /* Highest packet transmitted */
+       rxrpc_seq_t             tx_prepared;    /* Highest Tx slot prepared. */
        rxrpc_seq_t             tx_top;         /* Highest Tx slot allocated. */
        u16                     tx_backoff;     /* Delay to insert due to Tx failure */
        u8                      tx_winsize;     /* Maximum size of Tx window */
@@ -635,13 +658,13 @@ struct rxrpc_call {
        rxrpc_seq_t             rx_consumed;    /* Highest packet consumed */
        rxrpc_serial_t          rx_serial;      /* Highest serial received for this call */
        u8                      rx_winsize;     /* Size of Rx window */
-       spinlock_t              input_lock;     /* Lock for packet input to this call */
 
        /* TCP-style slow-start congestion control [RFC5681].  Since the SMSS
         * is fixed, we keep these numbers in terms of segments (ie. DATA
         * packets) rather than bytes.
         */
 #define RXRPC_TX_SMSS          RXRPC_JUMBO_DATALEN
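+/* Minimum cwnd: 2, 3 or 4 packets, chosen by SMSS per the RFC 5681 initial-window rules. */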
+#define RXRPC_MIN_CWND         (RXRPC_TX_SMSS > 2190 ? 2 : RXRPC_TX_SMSS > 1095 ? 3 : 4)
        u8                      cong_cwnd;      /* Congestion window size */
        u8                      cong_extra;     /* Extra to send for congestion management */
        u8                      cong_ssthresh;  /* Slow-start threshold */
@@ -676,11 +699,7 @@ struct rxrpc_call {
        rxrpc_seq_t             acks_prev_seq;  /* Highest previousPacket received */
        rxrpc_seq_t             acks_hard_ack;  /* Latest hard-ack point */
        rxrpc_seq_t             acks_lowest_nak; /* Lowest NACK in the buffer (or ==tx_hard_ack) */
-       rxrpc_seq_t             acks_lost_top;  /* tx_top at the time lost-ack ping sent */
-       rxrpc_serial_t          acks_lost_ping; /* Serial number of probe ACK */
        rxrpc_serial_t          acks_highest_serial; /* Highest serial number ACK'd */
-       struct sk_buff          *acks_soft_tbl; /* The last ACK packet with NAKs in it */
-       spinlock_t              acks_ack_lock;  /* Access to ->acks_last_ack */
 };
 
 /*
@@ -739,9 +758,8 @@ struct rxrpc_send_params {
  */
 struct rxrpc_txbuf {
        struct rcu_head         rcu;
-       struct list_head        call_link;      /* Link in call->tx_queue */
+       struct list_head        call_link;      /* Link in call->tx_sendmsg/tx_buffer */
        struct list_head        tx_link;        /* Link in live Enc queue or Tx queue */
-       struct rxrpc_call       *call;          /* Call to which belongs */
        ktime_t                 last_sent;      /* Time at which last transmitted */
        refcount_t              ref;
        rxrpc_seq_t             seq;            /* Sequence number of this packet */
@@ -793,9 +811,9 @@ extern struct workqueue_struct *rxrpc_workqueue;
  */
 int rxrpc_service_prealloc(struct rxrpc_sock *, gfp_t);
 void rxrpc_discard_prealloc(struct rxrpc_sock *);
-struct rxrpc_call *rxrpc_new_incoming_call(struct rxrpc_local *,
-                                          struct rxrpc_sock *,
-                                          struct sk_buff *);
+bool rxrpc_new_incoming_call(struct rxrpc_local *, struct rxrpc_peer *,
+                            struct rxrpc_connection *, struct sockaddr_rxrpc *,
+                            struct sk_buff *);
 void rxrpc_accept_incoming_calls(struct rxrpc_local *);
 int rxrpc_user_charge_accept(struct rxrpc_sock *, unsigned long);
 
@@ -808,14 +826,14 @@ void rxrpc_send_ACK(struct rxrpc_call *, u8, rxrpc_serial_t, enum rxrpc_propose_
 void rxrpc_propose_delay_ACK(struct rxrpc_call *, rxrpc_serial_t,
                             enum rxrpc_propose_ack_trace);
 void rxrpc_shrink_call_tx_buffer(struct rxrpc_call *);
-void rxrpc_process_call(struct work_struct *);
+void rxrpc_resend(struct rxrpc_call *call, struct sk_buff *ack_skb);
 
 void rxrpc_reduce_call_timer(struct rxrpc_call *call,
                             unsigned long expire_at,
                             unsigned long now,
                             enum rxrpc_timer_trace why);
 
-void rxrpc_delete_call_timer(struct rxrpc_call *call);
+void rxrpc_input_call_event(struct rxrpc_call *call, struct sk_buff *skb);
 
 /*
  * call_object.c
@@ -824,6 +842,7 @@ extern const char *const rxrpc_call_states[];
 extern const char *const rxrpc_call_completions[];
 extern struct kmem_cache *rxrpc_call_jar;
 
+void rxrpc_poke_call(struct rxrpc_call *call, enum rxrpc_call_poke_trace what);
 struct rxrpc_call *rxrpc_find_call_by_user_ID(struct rxrpc_sock *, unsigned long);
 struct rxrpc_call *rxrpc_alloc_call(struct rxrpc_sock *, gfp_t, unsigned int);
 struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *,
@@ -835,10 +854,8 @@ void rxrpc_incoming_call(struct rxrpc_sock *, struct rxrpc_call *,
                         struct sk_buff *);
 void rxrpc_release_call(struct rxrpc_sock *, struct rxrpc_call *);
 void rxrpc_release_calls_on_socket(struct rxrpc_sock *);
-bool __rxrpc_queue_call(struct rxrpc_call *);
-bool rxrpc_queue_call(struct rxrpc_call *);
-void rxrpc_see_call(struct rxrpc_call *);
-bool rxrpc_try_get_call(struct rxrpc_call *call, enum rxrpc_call_trace op);
+void rxrpc_see_call(struct rxrpc_call *, enum rxrpc_call_trace);
+struct rxrpc_call *rxrpc_try_get_call(struct rxrpc_call *, enum rxrpc_call_trace);
 void rxrpc_get_call(struct rxrpc_call *, enum rxrpc_call_trace);
 void rxrpc_put_call(struct rxrpc_call *, enum rxrpc_call_trace);
 void rxrpc_cleanup_call(struct rxrpc_call *);
@@ -863,14 +880,14 @@ extern unsigned long rxrpc_conn_idle_client_fast_expiry;
 extern struct idr rxrpc_client_conn_ids;
 
 void rxrpc_destroy_client_conn_ids(void);
-struct rxrpc_bundle *rxrpc_get_bundle(struct rxrpc_bundle *);
-void rxrpc_put_bundle(struct rxrpc_bundle *);
+struct rxrpc_bundle *rxrpc_get_bundle(struct rxrpc_bundle *, enum rxrpc_bundle_trace);
+void rxrpc_put_bundle(struct rxrpc_bundle *, enum rxrpc_bundle_trace);
 int rxrpc_connect_call(struct rxrpc_sock *, struct rxrpc_call *,
                       struct rxrpc_conn_parameters *, struct sockaddr_rxrpc *,
                       gfp_t);
 void rxrpc_expose_client_call(struct rxrpc_call *);
 void rxrpc_disconnect_client_call(struct rxrpc_bundle *, struct rxrpc_call *);
-void rxrpc_put_client_conn(struct rxrpc_connection *);
+void rxrpc_put_client_conn(struct rxrpc_connection *, enum rxrpc_conn_trace);
 void rxrpc_discard_expired_client_conns(struct work_struct *);
 void rxrpc_destroy_all_client_connections(struct rxrpc_net *);
 void rxrpc_clean_up_local_conns(struct rxrpc_local *);
@@ -880,6 +897,7 @@ void rxrpc_clean_up_local_conns(struct rxrpc_local *);
  */
 void rxrpc_process_connection(struct work_struct *);
 void rxrpc_process_delayed_final_acks(struct rxrpc_connection *, bool);
+int rxrpc_input_conn_packet(struct rxrpc_connection *conn, struct sk_buff *skb);
 
 /*
  * conn_object.c
@@ -887,18 +905,20 @@ void rxrpc_process_delayed_final_acks(struct rxrpc_connection *, bool);
 extern unsigned int rxrpc_connection_expiry;
 extern unsigned int rxrpc_closed_conn_expiry;
 
-struct rxrpc_connection *rxrpc_alloc_connection(gfp_t);
-struct rxrpc_connection *rxrpc_find_connection_rcu(struct rxrpc_local *,
-                                                  struct sk_buff *,
-                                                  struct rxrpc_peer **);
+struct rxrpc_connection *rxrpc_alloc_connection(struct rxrpc_net *, gfp_t);
+struct rxrpc_connection *rxrpc_find_client_connection_rcu(struct rxrpc_local *,
+                                                         struct sockaddr_rxrpc *,
+                                                         struct sk_buff *);
 void __rxrpc_disconnect_call(struct rxrpc_connection *, struct rxrpc_call *);
 void rxrpc_disconnect_call(struct rxrpc_call *);
-void rxrpc_kill_connection(struct rxrpc_connection *);
-bool rxrpc_queue_conn(struct rxrpc_connection *);
-void rxrpc_see_connection(struct rxrpc_connection *);
-struct rxrpc_connection *rxrpc_get_connection(struct rxrpc_connection *);
-struct rxrpc_connection *rxrpc_get_connection_maybe(struct rxrpc_connection *);
-void rxrpc_put_service_conn(struct rxrpc_connection *);
+void rxrpc_kill_client_conn(struct rxrpc_connection *);
+void rxrpc_queue_conn(struct rxrpc_connection *, enum rxrpc_conn_trace);
+void rxrpc_see_connection(struct rxrpc_connection *, enum rxrpc_conn_trace);
+struct rxrpc_connection *rxrpc_get_connection(struct rxrpc_connection *,
+                                             enum rxrpc_conn_trace);
+struct rxrpc_connection *rxrpc_get_connection_maybe(struct rxrpc_connection *,
+                                                   enum rxrpc_conn_trace);
+void rxrpc_put_connection(struct rxrpc_connection *, enum rxrpc_conn_trace);
 void rxrpc_service_connection_reaper(struct work_struct *);
 void rxrpc_destroy_all_connections(struct rxrpc_net *);
 
@@ -912,17 +932,6 @@ static inline bool rxrpc_conn_is_service(const struct rxrpc_connection *conn)
        return !rxrpc_conn_is_client(conn);
 }
 
-static inline void rxrpc_put_connection(struct rxrpc_connection *conn)
-{
-       if (!conn)
-               return;
-
-       if (rxrpc_conn_is_client(conn))
-               rxrpc_put_client_conn(conn);
-       else
-               rxrpc_put_service_conn(conn);
-}
-
 static inline void rxrpc_reduce_conn_timer(struct rxrpc_connection *conn,
                                           unsigned long expire_at)
 {
@@ -942,7 +951,20 @@ void rxrpc_unpublish_service_conn(struct rxrpc_connection *);
 /*
  * input.c
  */
-int rxrpc_input_packet(struct sock *, struct sk_buff *);
+void rxrpc_congestion_degrade(struct rxrpc_call *);
+void rxrpc_input_call_packet(struct rxrpc_call *, struct sk_buff *);
+void rxrpc_implicit_end_call(struct rxrpc_call *, struct sk_buff *);
+
+/*
+ * io_thread.c
+ */
+int rxrpc_encap_rcv(struct sock *, struct sk_buff *);
+void rxrpc_error_report(struct sock *);
+int rxrpc_io_thread(void *data);
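+/* Wake up the I/O thread servicing a local endpoint. */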
+static inline void rxrpc_wake_up_io_thread(struct rxrpc_local *local)
+{
+       wake_up_process(local->io_thread);
+}
 
 /*
  * insecure.c
@@ -961,28 +983,41 @@ int rxrpc_get_server_data_key(struct rxrpc_connection *, const void *, time64_t,
 /*
  * local_event.c
  */
-extern void rxrpc_process_local_events(struct rxrpc_local *);
+void rxrpc_send_version_request(struct rxrpc_local *local,
+                               struct rxrpc_host_header *hdr,
+                               struct sk_buff *skb);
 
 /*
  * local_object.c
  */
 struct rxrpc_local *rxrpc_lookup_local(struct net *, const struct sockaddr_rxrpc *);
-struct rxrpc_local *rxrpc_get_local(struct rxrpc_local *);
-struct rxrpc_local *rxrpc_get_local_maybe(struct rxrpc_local *);
-void rxrpc_put_local(struct rxrpc_local *);
-struct rxrpc_local *rxrpc_use_local(struct rxrpc_local *);
-void rxrpc_unuse_local(struct rxrpc_local *);
-void rxrpc_queue_local(struct rxrpc_local *);
+struct rxrpc_local *rxrpc_get_local(struct rxrpc_local *, enum rxrpc_local_trace);
+struct rxrpc_local *rxrpc_get_local_maybe(struct rxrpc_local *, enum rxrpc_local_trace);
+void rxrpc_put_local(struct rxrpc_local *, enum rxrpc_local_trace);
+struct rxrpc_local *rxrpc_use_local(struct rxrpc_local *, enum rxrpc_local_trace);
+void rxrpc_unuse_local(struct rxrpc_local *, enum rxrpc_local_trace);
+void rxrpc_destroy_local(struct rxrpc_local *local);
 void rxrpc_destroy_all_locals(struct rxrpc_net *);
 
-static inline bool __rxrpc_unuse_local(struct rxrpc_local *local)
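+/* Get a use on a local endpoint, failing if the active-user count has already reached zero. */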
+static inline bool __rxrpc_use_local(struct rxrpc_local *local,
+                                    enum rxrpc_local_trace why)
 {
-       return atomic_dec_return(&local->active_users) == 0;
+       int r, u;
+
+       r = refcount_read(&local->ref);
+       u = atomic_fetch_add_unless(&local->active_users, 1, 0);
+       trace_rxrpc_local(local->debug_id, why, r, u);
+       return u != 0;
 }
 
-static inline bool __rxrpc_use_local(struct rxrpc_local *local)
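+/* Note the current ref and active-user counts of a local endpoint for tracing. */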
+static inline void rxrpc_see_local(struct rxrpc_local *local,
+                                  enum rxrpc_local_trace why)
 {
-       return atomic_fetch_add_unless(&local->active_users, 1, 0) != 0;
+       int r, u;
+
+       r = refcount_read(&local->ref);
+       u = atomic_read(&local->active_users);
+       trace_rxrpc_local(local->debug_id, why, r, u);
 }
 
 /*
@@ -1009,16 +1044,17 @@ static inline struct rxrpc_net *rxrpc_net(struct net *net)
 /*
  * output.c
  */
-void rxrpc_transmit_ack_packets(struct rxrpc_local *);
+int rxrpc_send_ack_packet(struct rxrpc_call *call, struct rxrpc_txbuf *txb);
 int rxrpc_send_abort_packet(struct rxrpc_call *);
 int rxrpc_send_data_packet(struct rxrpc_call *, struct rxrpc_txbuf *);
-void rxrpc_reject_packets(struct rxrpc_local *);
+void rxrpc_reject_packet(struct rxrpc_local *local, struct sk_buff *skb);
 void rxrpc_send_keepalive(struct rxrpc_peer *);
+void rxrpc_transmit_one(struct rxrpc_call *call, struct rxrpc_txbuf *txb);
 
 /*
  * peer_event.c
  */
-void rxrpc_error_report(struct sock *);
+void rxrpc_input_error(struct rxrpc_local *, struct sk_buff *);
 void rxrpc_peer_keepalive_worker(struct work_struct *);
 
 /*
@@ -1028,14 +1064,15 @@ struct rxrpc_peer *rxrpc_lookup_peer_rcu(struct rxrpc_local *,
                                         const struct sockaddr_rxrpc *);
 struct rxrpc_peer *rxrpc_lookup_peer(struct rxrpc_sock *, struct rxrpc_local *,
                                     struct sockaddr_rxrpc *, gfp_t);
-struct rxrpc_peer *rxrpc_alloc_peer(struct rxrpc_local *, gfp_t);
+struct rxrpc_peer *rxrpc_alloc_peer(struct rxrpc_local *, gfp_t,
+                                   enum rxrpc_peer_trace);
 void rxrpc_new_incoming_peer(struct rxrpc_sock *, struct rxrpc_local *,
                             struct rxrpc_peer *);
 void rxrpc_destroy_all_peers(struct rxrpc_net *);
-struct rxrpc_peer *rxrpc_get_peer(struct rxrpc_peer *);
-struct rxrpc_peer *rxrpc_get_peer_maybe(struct rxrpc_peer *);
-void rxrpc_put_peer(struct rxrpc_peer *);
-void rxrpc_put_peer_locked(struct rxrpc_peer *);
+struct rxrpc_peer *rxrpc_get_peer(struct rxrpc_peer *, enum rxrpc_peer_trace);
+struct rxrpc_peer *rxrpc_get_peer_maybe(struct rxrpc_peer *, enum rxrpc_peer_trace);
+void rxrpc_put_peer(struct rxrpc_peer *, enum rxrpc_peer_trace);
+void rxrpc_put_peer_locked(struct rxrpc_peer *, enum rxrpc_peer_trace);
 
 /*
  * proc.c
@@ -1097,6 +1134,7 @@ extern const struct rxrpc_security rxkad;
 int __init rxrpc_init_security(void);
 const struct rxrpc_security *rxrpc_security_lookup(u8);
 void rxrpc_exit_security(void);
+int rxrpc_init_client_call_security(struct rxrpc_call *);
 int rxrpc_init_client_conn_security(struct rxrpc_connection *);
 const struct rxrpc_security *rxrpc_get_incoming_security(struct rxrpc_sock *,
                                                         struct sk_buff *);
@@ -1119,7 +1157,6 @@ int rxrpc_server_keyring(struct rxrpc_sock *, sockptr_t, int);
  * skbuff.c
  */
 void rxrpc_kernel_data_consumed(struct rxrpc_call *, struct sk_buff *);
-void rxrpc_packet_destructor(struct sk_buff *);
 void rxrpc_new_skb(struct sk_buff *, enum rxrpc_skb_trace);
 void rxrpc_see_skb(struct sk_buff *, enum rxrpc_skb_trace);
 void rxrpc_eaten_skb(struct sk_buff *, enum rxrpc_skb_trace);
@@ -1190,23 +1227,17 @@ extern unsigned int rxrpc_debug;
 #define kenter(FMT,...)        dbgprintk("==> %s("FMT")",__func__ ,##__VA_ARGS__)
 #define kleave(FMT,...)        dbgprintk("<== %s()"FMT"",__func__ ,##__VA_ARGS__)
 #define kdebug(FMT,...)        dbgprintk("    "FMT ,##__VA_ARGS__)
-#define kproto(FMT,...)        dbgprintk("### "FMT ,##__VA_ARGS__)
-#define knet(FMT,...)  dbgprintk("@@@ "FMT ,##__VA_ARGS__)
 
 
 #if defined(__KDEBUG)
 #define _enter(FMT,...)        kenter(FMT,##__VA_ARGS__)
 #define _leave(FMT,...)        kleave(FMT,##__VA_ARGS__)
 #define _debug(FMT,...)        kdebug(FMT,##__VA_ARGS__)
-#define _proto(FMT,...)        kproto(FMT,##__VA_ARGS__)
-#define _net(FMT,...)  knet(FMT,##__VA_ARGS__)
 
 #elif defined(CONFIG_AF_RXRPC_DEBUG)
 #define RXRPC_DEBUG_KENTER     0x01
 #define RXRPC_DEBUG_KLEAVE     0x02
 #define RXRPC_DEBUG_KDEBUG     0x04
-#define RXRPC_DEBUG_KPROTO     0x08
-#define RXRPC_DEBUG_KNET       0x10
 
 #define _enter(FMT,...)                                        \
 do {                                                   \
@@ -1226,24 +1257,10 @@ do {                                                    \
                kdebug(FMT,##__VA_ARGS__);              \
 } while (0)
 
-#define _proto(FMT,...)                                        \
-do {                                                   \
-       if (unlikely(rxrpc_debug & RXRPC_DEBUG_KPROTO)) \
-               kproto(FMT,##__VA_ARGS__);              \
-} while (0)
-
-#define _net(FMT,...)                                  \
-do {                                                   \
-       if (unlikely(rxrpc_debug & RXRPC_DEBUG_KNET))   \
-               knet(FMT,##__VA_ARGS__);                \
-} while (0)
-
 #else
 #define _enter(FMT,...)        no_printk("==> %s("FMT")",__func__ ,##__VA_ARGS__)
 #define _leave(FMT,...)        no_printk("<== %s()"FMT"",__func__ ,##__VA_ARGS__)
 #define _debug(FMT,...)        no_printk("    "FMT ,##__VA_ARGS__)
-#define _proto(FMT,...)        no_printk("### "FMT ,##__VA_ARGS__)
-#define _net(FMT,...)  no_printk("@@@ "FMT ,##__VA_ARGS__)
 #endif
 
 /*
index 48790ee..d185086 100644 (file)
@@ -38,7 +38,6 @@ static int rxrpc_service_prealloc_one(struct rxrpc_sock *rx,
                                      unsigned long user_call_ID, gfp_t gfp,
                                      unsigned int debug_id)
 {
-       const void *here = __builtin_return_address(0);
        struct rxrpc_call *call, *xcall;
        struct rxrpc_net *rxnet = rxrpc_net(sock_net(&rx->sk));
        struct rb_node *parent, **pp;
@@ -70,7 +69,9 @@ static int rxrpc_service_prealloc_one(struct rxrpc_sock *rx,
        head = b->peer_backlog_head;
        tail = READ_ONCE(b->peer_backlog_tail);
        if (CIRC_CNT(head, tail, size) < max) {
-               struct rxrpc_peer *peer = rxrpc_alloc_peer(rx->local, gfp);
+               struct rxrpc_peer *peer;
+
+               peer = rxrpc_alloc_peer(rx->local, gfp, rxrpc_peer_new_prealloc);
                if (!peer)
                        return -ENOMEM;
                b->peer_backlog[head] = peer;
@@ -89,9 +90,6 @@ static int rxrpc_service_prealloc_one(struct rxrpc_sock *rx,
                b->conn_backlog[head] = conn;
                smp_store_release(&b->conn_backlog_head,
                                  (head + 1) & (size - 1));
-
-               trace_rxrpc_conn(conn->debug_id, rxrpc_conn_new_service,
-                                refcount_read(&conn->ref), here);
        }
 
        /* Now it gets complicated, because calls get registered with the
@@ -102,10 +100,10 @@ static int rxrpc_service_prealloc_one(struct rxrpc_sock *rx,
                return -ENOMEM;
        call->flags |= (1 << RXRPC_CALL_IS_SERVICE);
        call->state = RXRPC_CALL_SERVER_PREALLOC;
+       __set_bit(RXRPC_CALL_EV_INITIAL_PING, &call->events);
 
-       trace_rxrpc_call(call->debug_id, rxrpc_call_new_service,
-                        refcount_read(&call->ref),
-                        here, (const void *)user_call_ID);
+       trace_rxrpc_call(call->debug_id, refcount_read(&call->ref),
+                        user_call_ID, rxrpc_call_new_prealloc_service);
 
        write_lock(&rx->call_lock);
 
@@ -126,11 +124,11 @@ static int rxrpc_service_prealloc_one(struct rxrpc_sock *rx,
        call->user_call_ID = user_call_ID;
        call->notify_rx = notify_rx;
        if (user_attach_call) {
-               rxrpc_get_call(call, rxrpc_call_got_kernel);
+               rxrpc_get_call(call, rxrpc_call_get_kernel_service);
                user_attach_call(call, user_call_ID);
        }
 
-       rxrpc_get_call(call, rxrpc_call_got_userid);
+       rxrpc_get_call(call, rxrpc_call_get_userid);
        rb_link_node(&call->sock_node, parent, pp);
        rb_insert_color(&call->sock_node, &rx->calls);
        set_bit(RXRPC_CALL_HAS_USERID, &call->flags);
@@ -140,9 +138,9 @@ static int rxrpc_service_prealloc_one(struct rxrpc_sock *rx,
        write_unlock(&rx->call_lock);
 
        rxnet = call->rxnet;
-       spin_lock_bh(&rxnet->call_lock);
+       spin_lock(&rxnet->call_lock);
        list_add_tail_rcu(&call->link, &rxnet->calls);
-       spin_unlock_bh(&rxnet->call_lock);
+       spin_unlock(&rxnet->call_lock);
 
        b->call_backlog[call_head] = call;
        smp_store_release(&b->call_backlog_head, (call_head + 1) & (size - 1));
@@ -190,14 +188,14 @@ void rxrpc_discard_prealloc(struct rxrpc_sock *rx)
        /* Make sure that there aren't any incoming calls in progress before we
         * clear the preallocation buffers.
         */
-       spin_lock_bh(&rx->incoming_lock);
-       spin_unlock_bh(&rx->incoming_lock);
+       spin_lock(&rx->incoming_lock);
+       spin_unlock(&rx->incoming_lock);
 
        head = b->peer_backlog_head;
        tail = b->peer_backlog_tail;
        while (CIRC_CNT(head, tail, size) > 0) {
                struct rxrpc_peer *peer = b->peer_backlog[tail];
-               rxrpc_put_local(peer->local);
+               rxrpc_put_local(peer->local, rxrpc_local_put_prealloc_conn);
                kfree(peer);
                tail = (tail + 1) & (size - 1);
        }
@@ -230,7 +228,7 @@ void rxrpc_discard_prealloc(struct rxrpc_sock *rx)
                }
                rxrpc_call_completed(call);
                rxrpc_release_call(rx, call);
-               rxrpc_put_call(call, rxrpc_call_put);
+               rxrpc_put_call(call, rxrpc_call_put_discard_prealloc);
                tail = (tail + 1) & (size - 1);
        }
 
@@ -238,21 +236,6 @@ void rxrpc_discard_prealloc(struct rxrpc_sock *rx)
 }
 
 /*
- * Ping the other end to fill our RTT cache and to retrieve the rwind
- * and MTU parameters.
- */
-static void rxrpc_send_ping(struct rxrpc_call *call, struct sk_buff *skb)
-{
-       struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
-       ktime_t now = skb->tstamp;
-
-       if (call->peer->rtt_count < 3 ||
-           ktime_before(ktime_add_ms(call->peer->rtt_last_req, 1000), now))
-               rxrpc_send_ACK(call, RXRPC_ACK_PING, sp->hdr.serial,
-                              rxrpc_propose_ack_ping_for_params);
-}
-
-/*
  * Allocate a new incoming call from the prealloc pool, along with a connection
  * and a peer as necessary.
  */
@@ -261,6 +244,7 @@ static struct rxrpc_call *rxrpc_alloc_incoming_call(struct rxrpc_sock *rx,
                                                    struct rxrpc_peer *peer,
                                                    struct rxrpc_connection *conn,
                                                    const struct rxrpc_security *sec,
+                                                   struct sockaddr_rxrpc *peer_srx,
                                                    struct sk_buff *skb)
 {
        struct rxrpc_backlog *b = rx->backlog;
@@ -286,12 +270,11 @@ static struct rxrpc_call *rxrpc_alloc_incoming_call(struct rxrpc_sock *rx,
                return NULL;
 
        if (!conn) {
-               if (peer && !rxrpc_get_peer_maybe(peer))
+               if (peer && !rxrpc_get_peer_maybe(peer, rxrpc_peer_get_service_conn))
                        peer = NULL;
                if (!peer) {
                        peer = b->peer_backlog[peer_tail];
-                       if (rxrpc_extract_addr_from_skb(&peer->srx, skb) < 0)
-                               return NULL;
+                       peer->srx = *peer_srx;
                        b->peer_backlog[peer_tail] = NULL;
                        smp_store_release(&b->peer_backlog_tail,
                                          (peer_tail + 1) &
@@ -305,12 +288,13 @@ static struct rxrpc_call *rxrpc_alloc_incoming_call(struct rxrpc_sock *rx,
                b->conn_backlog[conn_tail] = NULL;
                smp_store_release(&b->conn_backlog_tail,
                                  (conn_tail + 1) & (RXRPC_BACKLOG_MAX - 1));
-               conn->params.local = rxrpc_get_local(local);
-               conn->params.peer = peer;
-               rxrpc_see_connection(conn);
+               conn->local = rxrpc_get_local(local, rxrpc_local_get_prealloc_conn);
+               conn->peer = peer;
+               rxrpc_see_connection(conn, rxrpc_conn_see_new_service_conn);
                rxrpc_new_incoming_connection(rx, conn, sec, skb);
        } else {
-               rxrpc_get_connection(conn);
+               rxrpc_get_connection(conn, rxrpc_conn_get_service_conn);
+               atomic_inc(&conn->active);
        }
 
        /* And now we can allocate and set up a new call */
@@ -319,43 +303,69 @@ static struct rxrpc_call *rxrpc_alloc_incoming_call(struct rxrpc_sock *rx,
        smp_store_release(&b->call_backlog_tail,
                          (call_tail + 1) & (RXRPC_BACKLOG_MAX - 1));
 
-       rxrpc_see_call(call);
+       rxrpc_see_call(call, rxrpc_call_see_accept);
+       call->local = rxrpc_get_local(conn->local, rxrpc_local_get_call);
        call->conn = conn;
        call->security = conn->security;
        call->security_ix = conn->security_ix;
-       call->peer = rxrpc_get_peer(conn->params.peer);
+       call->peer = rxrpc_get_peer(conn->peer, rxrpc_peer_get_accept);
+       call->dest_srx = peer->srx;
        call->cong_ssthresh = call->peer->cong_ssthresh;
        call->tx_last_sent = ktime_get_real();
        return call;
 }
 
 /*
- * Set up a new incoming call.  Called in BH context with the RCU read lock
- * held.
+ * Set up a new incoming call.  Called from the I/O thread.
  *
  * If this is for a kernel service, when we allocate the call, it will have
  * three refs on it: (1) the kernel service, (2) the user_call_ID tree, (3) the
  * retainer ref obtained from the backlog buffer.  Prealloc calls for userspace
- * services only have the ref from the backlog buffer.  We want to pass this
- * ref to non-BH context to dispose of.
+ * services only have the ref from the backlog buffer.
  *
  * If we want to report an error, we mark the skb with the packet type and
- * abort code and return NULL.
- *
- * The call is returned with the user access mutex held.
+ * abort code and return false.
  */
-struct rxrpc_call *rxrpc_new_incoming_call(struct rxrpc_local *local,
-                                          struct rxrpc_sock *rx,
-                                          struct sk_buff *skb)
+bool rxrpc_new_incoming_call(struct rxrpc_local *local,
+                            struct rxrpc_peer *peer,
+                            struct rxrpc_connection *conn,
+                            struct sockaddr_rxrpc *peer_srx,
+                            struct sk_buff *skb)
 {
-       struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
        const struct rxrpc_security *sec = NULL;
-       struct rxrpc_connection *conn;
-       struct rxrpc_peer *peer = NULL;
+       struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
        struct rxrpc_call *call = NULL;
+       struct rxrpc_sock *rx;
 
        _enter("");
 
+       /* Don't set up a call for anything other than the first DATA packet. */
+       if (sp->hdr.seq != 1 ||
+           sp->hdr.type != RXRPC_PACKET_TYPE_DATA)
+               return true; /* Just discard */
+
+       rcu_read_lock();
+
+       /* Weed out packets to services we're not offering.  Packets that would
+        * begin a call are explicitly rejected and the rest are just
+        * discarded.
+        */
+       rx = rcu_dereference(local->service);
+       if (!rx || (sp->hdr.serviceId != rx->srx.srx_service &&
+                   sp->hdr.serviceId != rx->second_service)) {
+               if (sp->hdr.type == RXRPC_PACKET_TYPE_DATA &&
+                   sp->hdr.seq == 1)
+                       goto unsupported_service;
+               goto discard;
+       }
+
+       if (!conn) {
+               sec = rxrpc_get_incoming_security(rx, skb);
+               if (!sec)
+                       goto reject;
+       }
+
        spin_lock(&rx->incoming_lock);
        if (rx->sk.sk_state == RXRPC_SERVER_LISTEN_DISABLED ||
            rx->sk.sk_state == RXRPC_CLOSE) {
@@ -366,20 +376,8 @@ struct rxrpc_call *rxrpc_new_incoming_call(struct rxrpc_local *local,
                goto no_call;
        }
 
-       /* The peer, connection and call may all have sprung into existence due
-        * to a duplicate packet being handled on another CPU in parallel, so
-        * we have to recheck the routing.  However, we're now holding
-        * rx->incoming_lock, so the values should remain stable.
-        */
-       conn = rxrpc_find_connection_rcu(local, skb, &peer);
-
-       if (!conn) {
-               sec = rxrpc_get_incoming_security(rx, skb);
-               if (!sec)
-                       goto no_call;
-       }
-
-       call = rxrpc_alloc_incoming_call(rx, local, peer, conn, sec, skb);
+       call = rxrpc_alloc_incoming_call(rx, local, peer, conn, sec, peer_srx,
+                                        skb);
        if (!call) {
                skb->mark = RXRPC_SKB_MARK_REJECT_BUSY;
                goto no_call;
@@ -396,50 +394,41 @@ struct rxrpc_call *rxrpc_new_incoming_call(struct rxrpc_local *local,
                rx->notify_new_call(&rx->sk, call, call->user_call_ID);
 
        spin_lock(&conn->state_lock);
-       switch (conn->state) {
-       case RXRPC_CONN_SERVICE_UNSECURED:
+       if (conn->state == RXRPC_CONN_SERVICE_UNSECURED) {
                conn->state = RXRPC_CONN_SERVICE_CHALLENGING;
                set_bit(RXRPC_CONN_EV_CHALLENGE, &call->conn->events);
-               rxrpc_queue_conn(call->conn);
-               break;
-
-       case RXRPC_CONN_SERVICE:
-               write_lock(&call->state_lock);
-               if (call->state < RXRPC_CALL_COMPLETE)
-                       call->state = RXRPC_CALL_SERVER_RECV_REQUEST;
-               write_unlock(&call->state_lock);
-               break;
-
-       case RXRPC_CONN_REMOTELY_ABORTED:
-               rxrpc_set_call_completion(call, RXRPC_CALL_REMOTELY_ABORTED,
-                                         conn->abort_code, conn->error);
-               break;
-       case RXRPC_CONN_LOCALLY_ABORTED:
-               rxrpc_abort_call("CON", call, sp->hdr.seq,
-                                conn->abort_code, conn->error);
-               break;
-       default:
-               BUG();
+               rxrpc_queue_conn(call->conn, rxrpc_conn_queue_challenge);
        }
        spin_unlock(&conn->state_lock);
-       spin_unlock(&rx->incoming_lock);
 
-       rxrpc_send_ping(call, skb);
+       spin_unlock(&rx->incoming_lock);
+       rcu_read_unlock();
 
-       /* We have to discard the prealloc queue's ref here and rely on a
-        * combination of the RCU read lock and refs held either by the socket
-        * (recvmsg queue, to-be-accepted queue or user ID tree) or the kernel
-        * service to prevent the call from being deallocated too early.
-        */
-       rxrpc_put_call(call, rxrpc_call_put);
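+       /* Add the call to the peer's error-distribution list if it isn't already on it. */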
+       if (hlist_unhashed(&call->error_link)) {
+               spin_lock(&call->peer->lock);
+               hlist_add_head(&call->error_link, &call->peer->error_targets);
+               spin_unlock(&call->peer->lock);
+       }
 
        _leave(" = %p{%d}", call, call->debug_id);
-       return call;
-
+       rxrpc_input_call_event(call, skb);
+       rxrpc_put_call(call, rxrpc_call_put_input);
+       return true;
+
+unsupported_service:
+       trace_rxrpc_abort(0, "INV", sp->hdr.cid, sp->hdr.callNumber, sp->hdr.seq,
+                         RX_INVALID_OPERATION, EOPNOTSUPP);
+       skb->priority = RX_INVALID_OPERATION;
+       goto reject;
 no_call:
        spin_unlock(&rx->incoming_lock);
-       _leave(" = NULL [%u]", skb->mark);
-       return NULL;
+reject:
+       rcu_read_unlock();
+       _leave(" = f [%u]", skb->mark);
+       return false;
+discard:
+       rcu_read_unlock();
+       return true;
 }
 
 /*
index 1e21a70..b2cf448 100644 (file)
@@ -69,21 +69,15 @@ void rxrpc_propose_delay_ACK(struct rxrpc_call *call, rxrpc_serial_t serial,
 void rxrpc_send_ACK(struct rxrpc_call *call, u8 ack_reason,
                    rxrpc_serial_t serial, enum rxrpc_propose_ack_trace why)
 {
-       struct rxrpc_local *local = call->conn->params.local;
        struct rxrpc_txbuf *txb;
 
        if (test_bit(RXRPC_CALL_DISCONNECTED, &call->flags))
                return;
-       if (ack_reason == RXRPC_ACK_DELAY &&
-           test_and_set_bit(RXRPC_CALL_DELAY_ACK_PENDING, &call->flags)) {
-               trace_rxrpc_drop_ack(call, why, ack_reason, serial, false);
-               return;
-       }
 
        rxrpc_inc_stat(call->rxnet, stat_tx_acks[ack_reason]);
 
        txb = rxrpc_alloc_txbuf(call, RXRPC_PACKET_TYPE_ACK,
-                               in_softirq() ? GFP_ATOMIC | __GFP_NOWARN : GFP_NOFS);
+                               rcu_read_lock_held() ? GFP_ATOMIC | __GFP_NOWARN : GFP_NOFS);
        if (!txb) {
                kleave(" = -ENOMEM");
                return;
@@ -101,22 +95,9 @@ void rxrpc_send_ACK(struct rxrpc_call *call, u8 ack_reason,
        txb->ack.reason         = ack_reason;
        txb->ack.nAcks          = 0;
 
-       if (!rxrpc_try_get_call(call, rxrpc_call_got)) {
-               rxrpc_put_txbuf(txb, rxrpc_txbuf_put_nomem);
-               return;
-       }
-
-       spin_lock_bh(&local->ack_tx_lock);
-       list_add_tail(&txb->tx_link, &local->ack_tx_queue);
-       spin_unlock_bh(&local->ack_tx_lock);
        trace_rxrpc_send_ack(call, why, ack_reason, serial);
-
-       if (in_task()) {
-               rxrpc_transmit_ack_packets(call->peer->local);
-       } else {
-               rxrpc_get_local(local);
-               rxrpc_queue_local(local);
-       }
+       rxrpc_send_ack_packet(call, txb);
+       rxrpc_put_txbuf(txb, rxrpc_txbuf_put_ack_tx);
 }
 
 /*
@@ -130,11 +111,10 @@ static void rxrpc_congestion_timeout(struct rxrpc_call *call)
 /*
  * Perform retransmission of NAK'd and unack'd packets.
  */
-static void rxrpc_resend(struct rxrpc_call *call, unsigned long now_j)
+void rxrpc_resend(struct rxrpc_call *call, struct sk_buff *ack_skb)
 {
        struct rxrpc_ackpacket *ack = NULL;
        struct rxrpc_txbuf *txb;
-       struct sk_buff *ack_skb = NULL;
        unsigned long resend_at;
        rxrpc_seq_t transmitted = READ_ONCE(call->tx_transmitted);
        ktime_t now, max_age, oldest, ack_ts;
@@ -148,32 +128,21 @@ static void rxrpc_resend(struct rxrpc_call *call, unsigned long now_j)
        max_age = ktime_sub_us(now, jiffies_to_usecs(call->peer->rto_j));
        oldest = now;
 
-       /* See if there's an ACK saved with a soft-ACK table in it. */
-       if (call->acks_soft_tbl) {
-               spin_lock_bh(&call->acks_ack_lock);
-               ack_skb = call->acks_soft_tbl;
-               if (ack_skb) {
-                       rxrpc_get_skb(ack_skb, rxrpc_skb_ack);
-                       ack = (void *)ack_skb->data + sizeof(struct rxrpc_wire_header);
-               }
-               spin_unlock_bh(&call->acks_ack_lock);
-       }
-
        if (list_empty(&call->tx_buffer))
                goto no_resend;
 
-       spin_lock(&call->tx_lock);
-
        if (list_empty(&call->tx_buffer))
                goto no_further_resend;
 
-       trace_rxrpc_resend(call);
+       trace_rxrpc_resend(call, ack_skb);
        txb = list_first_entry(&call->tx_buffer, struct rxrpc_txbuf, call_link);
 
        /* Scan the soft ACK table without dropping the lock and resend any
         * explicitly NAK'd packets.
         */
-       if (ack) {
+       if (ack_skb) {
+               ack = (void *)ack_skb->data + sizeof(struct rxrpc_wire_header);
+
                for (i = 0; i < ack->nAcks; i++) {
                        rxrpc_seq_t seq;
 
@@ -197,8 +166,6 @@ static void rxrpc_resend(struct rxrpc_call *call, unsigned long now_j)
                        rxrpc_see_txbuf(txb, rxrpc_txbuf_see_unacked);
 
                        if (list_empty(&txb->tx_link)) {
-                               rxrpc_get_txbuf(txb, rxrpc_txbuf_get_retrans);
-                               rxrpc_get_call(call, rxrpc_call_got_tx);
                                list_add_tail(&txb->tx_link, &retrans_queue);
                                set_bit(RXRPC_TXBUF_RESENT, &txb->flags);
                        }
@@ -242,7 +209,6 @@ static void rxrpc_resend(struct rxrpc_call *call, unsigned long now_j)
        do_resend:
                unacked = true;
                if (list_empty(&txb->tx_link)) {
-                       rxrpc_get_txbuf(txb, rxrpc_txbuf_get_retrans);
                        list_add_tail(&txb->tx_link, &retrans_queue);
                        set_bit(RXRPC_TXBUF_RESENT, &txb->flags);
                        rxrpc_inc_stat(call->rxnet, stat_tx_data_retrans);
@@ -250,10 +216,7 @@ static void rxrpc_resend(struct rxrpc_call *call, unsigned long now_j)
        }
 
 no_further_resend:
-       spin_unlock(&call->tx_lock);
 no_resend:
-       rxrpc_free_skb(ack_skb, rxrpc_skb_freed);
-
        resend_at = nsecs_to_jiffies(ktime_to_ns(ktime_sub(now, oldest)));
        resend_at += jiffies + rxrpc_get_rto_backoff(call->peer,
                                                     !list_empty(&retrans_queue));
@@ -267,7 +230,7 @@ no_resend:
         * retransmitting data.
         */
        if (list_empty(&retrans_queue)) {
-               rxrpc_reduce_call_timer(call, resend_at, now_j,
+               rxrpc_reduce_call_timer(call, resend_at, jiffies,
                                        rxrpc_timer_set_for_resend);
                ack_ts = ktime_sub(now, call->acks_latest_ts);
                if (ktime_to_us(ack_ts) < (call->peer->srtt_us >> 3))
@@ -277,76 +240,134 @@ no_resend:
                goto out;
        }
 
+       /* Retransmit the queue */
        while ((txb = list_first_entry_or_null(&retrans_queue,
                                               struct rxrpc_txbuf, tx_link))) {
                list_del_init(&txb->tx_link);
-               rxrpc_send_data_packet(call, txb);
-               rxrpc_put_txbuf(txb, rxrpc_txbuf_put_trans);
-
-               trace_rxrpc_retransmit(call, txb->seq,
-                                      ktime_to_ns(ktime_sub(txb->last_sent,
-                                                            max_age)));
+               rxrpc_transmit_one(call, txb);
        }
 
 out:
        _leave("");
 }
 
+/* See if the Tx window has space for more packets to be queued for transmission. */
+static bool rxrpc_tx_window_has_space(struct rxrpc_call *call)
+{
+       unsigned int winsize = min_t(unsigned int, call->tx_winsize,
+                                    call->cong_cwnd + call->cong_extra);
+       rxrpc_seq_t window = call->acks_hard_ack, wtop = window + winsize;
+       rxrpc_seq_t tx_top = call->tx_top;
+       int space;
+
+       space = wtop - tx_top;
+       return space > 0;
+}
+
+/*
+ * Decant some of the sendmsg prepared queue into the transmission buffer.
+ */
+static void rxrpc_decant_prepared_tx(struct rxrpc_call *call)
+{
+       struct rxrpc_txbuf *txb;
+
+       if (rxrpc_is_client_call(call) &&
+           !test_bit(RXRPC_CALL_EXPOSED, &call->flags))
+               rxrpc_expose_client_call(call);
+
+       while ((txb = list_first_entry_or_null(&call->tx_sendmsg,
+                                              struct rxrpc_txbuf, call_link))) {
+               spin_lock(&call->tx_lock);
+               list_del(&txb->call_link);
+               spin_unlock(&call->tx_lock);
+
+               call->tx_top = txb->seq;
+               list_add_tail(&txb->call_link, &call->tx_buffer);
+
+               rxrpc_transmit_one(call, txb);
+
+               if (!rxrpc_tx_window_has_space(call))
+                       break;
+       }
+}
+
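+/* Transmit pending sendmsg-prepared packets, as call state and Tx-window space allow. */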
+static void rxrpc_transmit_some_data(struct rxrpc_call *call)
+{
+       switch (call->state) {
+       case RXRPC_CALL_SERVER_ACK_REQUEST:
+               if (list_empty(&call->tx_sendmsg))
+                       return;
+               fallthrough;
+
+       case RXRPC_CALL_SERVER_SEND_REPLY:
+       case RXRPC_CALL_SERVER_AWAIT_ACK:
+       case RXRPC_CALL_CLIENT_SEND_REQUEST:
+       case RXRPC_CALL_CLIENT_AWAIT_REPLY:
+               if (!rxrpc_tx_window_has_space(call))
+                       return;
+               if (list_empty(&call->tx_sendmsg)) {
+                       rxrpc_inc_stat(call->rxnet, stat_tx_data_underflow);
+                       return;
+               }
+               rxrpc_decant_prepared_tx(call);
+               break;
+       default:
+               return;
+       }
+}
+
+/*
+ * Ping the other end to fill our RTT cache and to retrieve the rwind
+ * and MTU parameters.
+ */
+static void rxrpc_send_initial_ping(struct rxrpc_call *call)
+{
+       if (call->peer->rtt_count < 3 ||
+           ktime_before(ktime_add_ms(call->peer->rtt_last_req, 1000),
+                        ktime_get_real()))
+               rxrpc_send_ACK(call, RXRPC_ACK_PING, 0,
+                              rxrpc_propose_ack_ping_for_params);
+}
+
 /*
  * Handle retransmission and deferred ACK/abort generation.
  */
-void rxrpc_process_call(struct work_struct *work)
+void rxrpc_input_call_event(struct rxrpc_call *call, struct sk_buff *skb)
 {
-       struct rxrpc_call *call =
-               container_of(work, struct rxrpc_call, processor);
        unsigned long now, next, t;
-       unsigned int iterations = 0;
        rxrpc_serial_t ackr_serial;
+       bool resend = false, expired = false;
 
-       rxrpc_see_call(call);
+       rxrpc_see_call(call, rxrpc_call_see_input);
 
        //printk("\n--------------------\n");
        _enter("{%d,%s,%lx}",
               call->debug_id, rxrpc_call_states[call->state], call->events);
 
-recheck_state:
-       /* Limit the number of times we do this before returning to the manager */
-       iterations++;
-       if (iterations > 5)
-               goto requeue;
-
-       if (test_and_clear_bit(RXRPC_CALL_EV_ABORT, &call->events)) {
-               rxrpc_send_abort_packet(call);
-               goto recheck_state;
-       }
-
-       if (READ_ONCE(call->acks_hard_ack) != call->tx_bottom)
-               rxrpc_shrink_call_tx_buffer(call);
+       if (call->state == RXRPC_CALL_COMPLETE)
+               goto out;
 
-       if (call->state == RXRPC_CALL_COMPLETE) {
-               rxrpc_delete_call_timer(call);
-               goto out_put;
-       }
+       if (skb && skb->mark == RXRPC_SKB_MARK_ERROR)
+               goto out;
 
-       /* Work out if any timeouts tripped */
+       /* If we see our async-event poke, check for timeout trippage. */
        now = jiffies;
        t = READ_ONCE(call->expect_rx_by);
        if (time_after_eq(now, t)) {
                trace_rxrpc_timer(call, rxrpc_timer_exp_normal, now);
-               set_bit(RXRPC_CALL_EV_EXPIRED, &call->events);
+               expired = true;
        }
 
        t = READ_ONCE(call->expect_req_by);
        if (call->state == RXRPC_CALL_SERVER_RECV_REQUEST &&
            time_after_eq(now, t)) {
                trace_rxrpc_timer(call, rxrpc_timer_exp_idle, now);
-               set_bit(RXRPC_CALL_EV_EXPIRED, &call->events);
+               expired = true;
        }
 
        t = READ_ONCE(call->expect_term_by);
        if (time_after_eq(now, t)) {
                trace_rxrpc_timer(call, rxrpc_timer_exp_hard, now);
-               set_bit(RXRPC_CALL_EV_EXPIRED, &call->events);
+               expired = true;
        }
 
        t = READ_ONCE(call->delay_ack_at);
@@ -385,11 +406,26 @@ recheck_state:
        if (time_after_eq(now, t)) {
                trace_rxrpc_timer(call, rxrpc_timer_exp_resend, now);
                cmpxchg(&call->resend_at, t, now + MAX_JIFFY_OFFSET);
-               set_bit(RXRPC_CALL_EV_RESEND, &call->events);
+               resend = true;
        }
 
+       if (skb)
+               rxrpc_input_call_packet(call, skb);
+
+       rxrpc_transmit_some_data(call);
+
+       if (skb) {
+               struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
+
+               if (sp->hdr.type == RXRPC_PACKET_TYPE_ACK)
+                       rxrpc_congestion_degrade(call);
+       }
+
+       if (test_and_clear_bit(RXRPC_CALL_EV_INITIAL_PING, &call->events))
+               rxrpc_send_initial_ping(call);
+
        /* Process events */
-       if (test_and_clear_bit(RXRPC_CALL_EV_EXPIRED, &call->events)) {
+       if (expired) {
                if (test_bit(RXRPC_CALL_RX_HEARD, &call->flags) &&
                    (int)call->conn->hi_serial - (int)call->rx_serial > 0) {
                        trace_rxrpc_call_reset(call);
@@ -397,52 +433,50 @@ recheck_state:
                } else {
                        rxrpc_abort_call("EXP", call, 0, RX_CALL_TIMEOUT, -ETIME);
                }
-               set_bit(RXRPC_CALL_EV_ABORT, &call->events);
-               goto recheck_state;
+               rxrpc_send_abort_packet(call);
+               goto out;
        }
 
-       if (test_and_clear_bit(RXRPC_CALL_EV_ACK_LOST, &call->events)) {
-               call->acks_lost_top = call->tx_top;
+       if (test_and_clear_bit(RXRPC_CALL_EV_ACK_LOST, &call->events))
                rxrpc_send_ACK(call, RXRPC_ACK_PING, 0,
                               rxrpc_propose_ack_ping_for_lost_ack);
-       }
 
-       if (test_and_clear_bit(RXRPC_CALL_EV_RESEND, &call->events) &&
-           call->state != RXRPC_CALL_CLIENT_RECV_REPLY) {
-               rxrpc_resend(call, now);
-               goto recheck_state;
-       }
+       if (resend && call->state != RXRPC_CALL_CLIENT_RECV_REPLY)
+               rxrpc_resend(call, NULL);
+
+       if (test_and_clear_bit(RXRPC_CALL_RX_IS_IDLE, &call->flags))
+               rxrpc_send_ACK(call, RXRPC_ACK_IDLE, 0,
+                              rxrpc_propose_ack_rx_idle);
+
+       if (atomic_read(&call->ackr_nr_unacked) > 2)
+               rxrpc_send_ACK(call, RXRPC_ACK_IDLE, 0,
+                              rxrpc_propose_ack_input_data);
 
        /* Make sure the timer is restarted */
-       next = call->expect_rx_by;
+       if (call->state != RXRPC_CALL_COMPLETE) {
+               next = call->expect_rx_by;
 
 #define set(T) { t = READ_ONCE(T); if (time_before(t, next)) next = t; }
 
-       set(call->expect_req_by);
-       set(call->expect_term_by);
-       set(call->delay_ack_at);
-       set(call->ack_lost_at);
-       set(call->resend_at);
-       set(call->keepalive_at);
-       set(call->ping_at);
-
-       now = jiffies;
-       if (time_after_eq(now, next))
-               goto recheck_state;
+               set(call->expect_req_by);
+               set(call->expect_term_by);
+               set(call->delay_ack_at);
+               set(call->ack_lost_at);
+               set(call->resend_at);
+               set(call->keepalive_at);
+               set(call->ping_at);
 
-       rxrpc_reduce_call_timer(call, next, now, rxrpc_timer_restart);
+               now = jiffies;
+               if (time_after_eq(now, next))
+                       rxrpc_poke_call(call, rxrpc_call_poke_timer_now);
 
-       /* other events may have been raised since we started checking */
-       if (call->events && call->state < RXRPC_CALL_COMPLETE)
-               goto requeue;
+               rxrpc_reduce_call_timer(call, next, now, rxrpc_timer_restart);
+       }
 
-out_put:
-       rxrpc_put_call(call, rxrpc_call_put);
 out:
+       if (call->state == RXRPC_CALL_COMPLETE)
+               del_timer_sync(&call->timer);
+       if (call->acks_hard_ack != call->tx_bottom)
+               rxrpc_shrink_call_tx_buffer(call);
        _leave("");
-       return;
-
-requeue:
-       __rxrpc_queue_call(call);
-       goto out;
 }
index 1befe22..be5eb8c 100644
@@ -45,6 +45,24 @@ static struct semaphore rxrpc_call_limiter =
 static struct semaphore rxrpc_kernel_call_limiter =
        __SEMAPHORE_INITIALIZER(rxrpc_kernel_call_limiter, 1000);
 
+void rxrpc_poke_call(struct rxrpc_call *call, enum rxrpc_call_poke_trace what)
+{
+       struct rxrpc_local *local = call->local;
+       bool busy;
+
+       if (call->state < RXRPC_CALL_COMPLETE) {
+               spin_lock_bh(&local->lock);
+               busy = !list_empty(&call->attend_link);
+               trace_rxrpc_poke_call(call, busy, what);
+               if (!busy) {
+                       rxrpc_get_call(call, rxrpc_call_get_poke);
+                       list_add_tail(&call->attend_link, &local->call_attend_q);
+               }
+               spin_unlock_bh(&local->lock);
+               rxrpc_wake_up_io_thread(local);
+       }
+}
+
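
rxrpc_poke_call() above queues a call on its local endpoint's attend list and wakes the I/O thread, taking a reference only on the empty-to-queued transition so repeated pokes cannot leak refs or double-queue. A minimal userspace model of that idiom, using pthreads (names are illustrative; this is a sketch of the pattern, not the rxrpc implementation):

	#include <pthread.h>
	#include <stdatomic.h>
	#include <stdbool.h>

	struct mcall {
		atomic_int ref;
		struct mcall *next;
		bool queued;            /* stands in for list_empty(&attend_link) */
	};

	struct io_thread {
		pthread_mutex_t lock;
		pthread_cond_t wake;
		struct mcall *head, **tail;
	};

	/* Enqueue the call for the worker at most once; the reference taken
	 * here travels with the queue and is dropped after servicing. */
	static void poke(struct io_thread *io, struct mcall *c)
	{
		pthread_mutex_lock(&io->lock);
		if (!c->queued) {
			atomic_fetch_add(&c->ref, 1);
			c->queued = true;
			c->next = NULL;
			*io->tail = c;
			io->tail = &c->next;
		}
		pthread_mutex_unlock(&io->lock);
		pthread_cond_signal(&io->wake); /* like rxrpc_wake_up_io_thread() */
	}
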
 static void rxrpc_call_timer_expired(struct timer_list *t)
 {
        struct rxrpc_call *call = from_timer(call, t, timer);
@@ -53,9 +71,7 @@ static void rxrpc_call_timer_expired(struct timer_list *t)
 
        if (call->state < RXRPC_CALL_COMPLETE) {
                trace_rxrpc_timer_expired(call, jiffies);
-               __rxrpc_queue_call(call);
-       } else {
-               rxrpc_put_call(call, rxrpc_call_put);
+               rxrpc_poke_call(call, rxrpc_call_poke_timer);
        }
 }
 
@@ -64,21 +80,14 @@ void rxrpc_reduce_call_timer(struct rxrpc_call *call,
                             unsigned long now,
                             enum rxrpc_timer_trace why)
 {
-       if (rxrpc_try_get_call(call, rxrpc_call_got_timer)) {
-               trace_rxrpc_timer(call, why, now);
-               if (timer_reduce(&call->timer, expire_at))
-                       rxrpc_put_call(call, rxrpc_call_put_notimer);
-       }
-}
-
-void rxrpc_delete_call_timer(struct rxrpc_call *call)
-{
-       if (del_timer_sync(&call->timer))
-               rxrpc_put_call(call, rxrpc_call_put_timer);
+       trace_rxrpc_timer(call, why, now);
+       timer_reduce(&call->timer, expire_at);
 }
 
 static struct lock_class_key rxrpc_call_user_mutex_lock_class_key;
 
+static void rxrpc_destroy_call(struct work_struct *);
+
 /*
  * find an extant server call
  * - called in process context with IRQs enabled
@@ -110,7 +119,7 @@ struct rxrpc_call *rxrpc_find_call_by_user_ID(struct rxrpc_sock *rx,
        return NULL;
 
 found_extant_call:
-       rxrpc_get_call(call, rxrpc_call_got);
+       rxrpc_get_call(call, rxrpc_call_get_sendmsg);
        read_unlock(&rx->call_lock);
        _leave(" = %p [%d]", call, refcount_read(&call->ref));
        return call;
@@ -139,20 +148,20 @@ struct rxrpc_call *rxrpc_alloc_call(struct rxrpc_sock *rx, gfp_t gfp,
                                  &rxrpc_call_user_mutex_lock_class_key);
 
        timer_setup(&call->timer, rxrpc_call_timer_expired, 0);
-       INIT_WORK(&call->processor, &rxrpc_process_call);
+       INIT_WORK(&call->destroyer, rxrpc_destroy_call);
        INIT_LIST_HEAD(&call->link);
        INIT_LIST_HEAD(&call->chan_wait_link);
        INIT_LIST_HEAD(&call->accept_link);
        INIT_LIST_HEAD(&call->recvmsg_link);
        INIT_LIST_HEAD(&call->sock_link);
+       INIT_LIST_HEAD(&call->attend_link);
+       INIT_LIST_HEAD(&call->tx_sendmsg);
        INIT_LIST_HEAD(&call->tx_buffer);
        skb_queue_head_init(&call->recvmsg_queue);
        skb_queue_head_init(&call->rx_oos_queue);
        init_waitqueue_head(&call->waitq);
        spin_lock_init(&call->notify_lock);
        spin_lock_init(&call->tx_lock);
-       spin_lock_init(&call->input_lock);
-       spin_lock_init(&call->acks_ack_lock);
        rwlock_init(&call->state_lock);
        refcount_set(&call->ref, 1);
        call->debug_id = debug_id;
@@ -185,22 +194,45 @@ struct rxrpc_call *rxrpc_alloc_call(struct rxrpc_sock *rx, gfp_t gfp,
  */
 static struct rxrpc_call *rxrpc_alloc_client_call(struct rxrpc_sock *rx,
                                                  struct sockaddr_rxrpc *srx,
+                                                 struct rxrpc_conn_parameters *cp,
+                                                 struct rxrpc_call_params *p,
                                                  gfp_t gfp,
                                                  unsigned int debug_id)
 {
        struct rxrpc_call *call;
        ktime_t now;
+       int ret;
 
        _enter("");
 
        call = rxrpc_alloc_call(rx, gfp, debug_id);
        if (!call)
                return ERR_PTR(-ENOMEM);
-       call->state = RXRPC_CALL_CLIENT_AWAIT_CONN;
-       call->service_id = srx->srx_service;
        now = ktime_get_real();
-       call->acks_latest_ts = now;
-       call->cong_tstamp = now;
+       call->acks_latest_ts    = now;
+       call->cong_tstamp       = now;
+       call->state             = RXRPC_CALL_CLIENT_AWAIT_CONN;
+       call->dest_srx          = *srx;
+       call->interruptibility  = p->interruptibility;
+       call->tx_total_len      = p->tx_total_len;
+       call->key               = key_get(cp->key);
+       call->local             = rxrpc_get_local(cp->local, rxrpc_local_get_call);
+       if (p->kernel)
+               __set_bit(RXRPC_CALL_KERNEL, &call->flags);
+       if (cp->upgrade)
+               __set_bit(RXRPC_CALL_UPGRADE, &call->flags);
+       if (cp->exclusive)
+               __set_bit(RXRPC_CALL_EXCLUSIVE, &call->flags);
+
+       ret = rxrpc_init_client_call_security(call);
+       if (ret < 0) {
+               __rxrpc_set_call_completion(call, RXRPC_CALL_LOCAL_ERROR, 0, ret);
+               rxrpc_put_call(call, rxrpc_call_put_discard_error);
+               return ERR_PTR(ret);
+       }
+
+       trace_rxrpc_call(call->debug_id, refcount_read(&call->ref),
+                        p->user_call_ID, rxrpc_call_new_client);
 
        _leave(" = %p", call);
        return call;
@@ -218,6 +250,7 @@ static void rxrpc_start_call_timer(struct rxrpc_call *call)
        call->ack_lost_at = j;
        call->resend_at = j;
        call->ping_at = j;
+       call->keepalive_at = j;
        call->expect_rx_by = j;
        call->expect_req_by = j;
        call->expect_term_by = j;
@@ -270,7 +303,6 @@ struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *rx,
        struct rxrpc_net *rxnet;
        struct semaphore *limiter;
        struct rb_node *parent, **pp;
-       const void *here = __builtin_return_address(0);
        int ret;
 
        _enter("%p,%lx", rx, p->user_call_ID);
@@ -281,7 +313,7 @@ struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *rx,
                return ERR_PTR(-ERESTARTSYS);
        }
 
-       call = rxrpc_alloc_client_call(rx, srx, gfp, debug_id);
+       call = rxrpc_alloc_client_call(rx, srx, cp, p, gfp, debug_id);
        if (IS_ERR(call)) {
                release_sock(&rx->sk);
                up(limiter);
@@ -289,14 +321,6 @@ struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *rx,
                return call;
        }
 
-       call->interruptibility = p->interruptibility;
-       call->tx_total_len = p->tx_total_len;
-       trace_rxrpc_call(call->debug_id, rxrpc_call_new_client,
-                        refcount_read(&call->ref),
-                        here, (const void *)p->user_call_ID);
-       if (p->kernel)
-               __set_bit(RXRPC_CALL_KERNEL, &call->flags);
-
        /* We need to protect a partially set up call against the user as we
         * will be acting outside the socket lock.
         */
@@ -322,7 +346,7 @@ struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *rx,
        rcu_assign_pointer(call->socket, rx);
        call->user_call_ID = p->user_call_ID;
        __set_bit(RXRPC_CALL_HAS_USERID, &call->flags);
-       rxrpc_get_call(call, rxrpc_call_got_userid);
+       rxrpc_get_call(call, rxrpc_call_get_userid);
        rb_link_node(&call->sock_node, parent, pp);
        rb_insert_color(&call->sock_node, &rx->calls);
        list_add(&call->sock_link, &rx->sock_calls);
@@ -330,9 +354,9 @@ struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *rx,
        write_unlock(&rx->call_lock);
 
        rxnet = call->rxnet;
-       spin_lock_bh(&rxnet->call_lock);
+       spin_lock(&rxnet->call_lock);
        list_add_tail_rcu(&call->link, &rxnet->calls);
-       spin_unlock_bh(&rxnet->call_lock);
+       spin_unlock(&rxnet->call_lock);
 
        /* From this point on, the call is protected by its own lock. */
        release_sock(&rx->sk);
@@ -344,13 +368,10 @@ struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *rx,
        if (ret < 0)
                goto error_attached_to_socket;
 
-       trace_rxrpc_call(call->debug_id, rxrpc_call_connected,
-                        refcount_read(&call->ref), here, NULL);
+       rxrpc_see_call(call, rxrpc_call_see_connected);
 
        rxrpc_start_call_timer(call);
 
-       _net("CALL new %d on CONN %d", call->debug_id, call->conn->debug_id);
-
        _leave(" = %p [new]", call);
        return call;
 
@@ -364,11 +385,11 @@ error_dup_user_ID:
        release_sock(&rx->sk);
        __rxrpc_set_call_completion(call, RXRPC_CALL_LOCAL_ERROR,
                                    RX_CALL_DEAD, -EEXIST);
-       trace_rxrpc_call(call->debug_id, rxrpc_call_error,
-                        refcount_read(&call->ref), here, ERR_PTR(-EEXIST));
+       trace_rxrpc_call(call->debug_id, refcount_read(&call->ref), 0,
+                        rxrpc_call_see_userid_exists);
        rxrpc_release_call(rx, call);
        mutex_unlock(&call->user_mutex);
-       rxrpc_put_call(call, rxrpc_call_put);
+       rxrpc_put_call(call, rxrpc_call_put_userid_exists);
        _leave(" = -EEXIST");
        return ERR_PTR(-EEXIST);
 
@@ -378,8 +399,8 @@ error_dup_user_ID:
         * leave the error to recvmsg() to deal with.
         */
 error_attached_to_socket:
-       trace_rxrpc_call(call->debug_id, rxrpc_call_error,
-                        refcount_read(&call->ref), here, ERR_PTR(ret));
+       trace_rxrpc_call(call->debug_id, refcount_read(&call->ref), ret,
+                        rxrpc_call_see_connect_failed);
        set_bit(RXRPC_CALL_DISCONNECTED, &call->flags);
        __rxrpc_set_call_completion(call, RXRPC_CALL_LOCAL_ERROR,
                                    RX_CALL_DEAD, ret);
@@ -403,11 +424,34 @@ void rxrpc_incoming_call(struct rxrpc_sock *rx,
 
        rcu_assign_pointer(call->socket, rx);
        call->call_id           = sp->hdr.callNumber;
-       call->service_id        = sp->hdr.serviceId;
+       call->dest_srx.srx_service = sp->hdr.serviceId;
        call->cid               = sp->hdr.cid;
        call->state             = RXRPC_CALL_SERVER_SECURING;
        call->cong_tstamp       = skb->tstamp;
 
+       spin_lock(&conn->state_lock);
+
+       switch (conn->state) {
+       case RXRPC_CONN_SERVICE_UNSECURED:
+       case RXRPC_CONN_SERVICE_CHALLENGING:
+               call->state = RXRPC_CALL_SERVER_SECURING;
+               break;
+       case RXRPC_CONN_SERVICE:
+               call->state = RXRPC_CALL_SERVER_RECV_REQUEST;
+               break;
+
+       case RXRPC_CONN_REMOTELY_ABORTED:
+               __rxrpc_set_call_completion(call, RXRPC_CALL_REMOTELY_ABORTED,
+                                           conn->abort_code, conn->error);
+               break;
+       case RXRPC_CONN_LOCALLY_ABORTED:
+               __rxrpc_abort_call("CON", call, 1,
+                                  conn->abort_code, conn->error);
+               break;
+       default:
+               BUG();
+       }
+
        /* Set the channel for this call.  We don't get channel_lock as we're
         * only defending against the data_ready handler (which we're called
         * from) and the RESPONSE packet parser (which is only really
@@ -418,86 +462,48 @@ void rxrpc_incoming_call(struct rxrpc_sock *rx,
        conn->channels[chan].call_counter = call->call_id;
        conn->channels[chan].call_id = call->call_id;
        rcu_assign_pointer(conn->channels[chan].call, call);
+       spin_unlock(&conn->state_lock);
 
-       spin_lock(&conn->params.peer->lock);
-       hlist_add_head_rcu(&call->error_link, &conn->params.peer->error_targets);
-       spin_unlock(&conn->params.peer->lock);
-
-       _net("CALL incoming %d on CONN %d", call->debug_id, call->conn->debug_id);
+       spin_lock(&conn->peer->lock);
+       hlist_add_head(&call->error_link, &conn->peer->error_targets);
+       spin_unlock(&conn->peer->lock);
 
        rxrpc_start_call_timer(call);
        _leave("");
 }
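
The new switch in rxrpc_incoming_call() derives the call's starting state from the parent connection's state while holding conn->state_lock, so a concurrent connection abort cannot slip in between the check and the channel assignment. The mapping itself, as a standalone sketch with illustrative enum names:

	#include <assert.h>

	enum conn_state_m { CONN_UNSECURED, CONN_CHALLENGING, CONN_SECURE,
			    CONN_REMOTELY_ABORTED, CONN_LOCALLY_ABORTED };
	enum call_state_m { CALL_SECURING, CALL_RECV_REQUEST, CALL_ABORTED };

	static enum call_state_m initial_call_state(enum conn_state_m cs)
	{
		switch (cs) {
		case CONN_UNSECURED:
		case CONN_CHALLENGING:
			return CALL_SECURING;     /* security exchange pending */
		case CONN_SECURE:
			return CALL_RECV_REQUEST; /* data may flow immediately */
		case CONN_REMOTELY_ABORTED:
		case CONN_LOCALLY_ABORTED:
			return CALL_ABORTED;      /* complete the call at once */
		}
		assert(0);
		return CALL_ABORTED;
	}
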
 
 /*
- * Queue a call's work processor, getting a ref to pass to the work queue.
- */
-bool rxrpc_queue_call(struct rxrpc_call *call)
-{
-       const void *here = __builtin_return_address(0);
-       int n;
-
-       if (!__refcount_inc_not_zero(&call->ref, &n))
-               return false;
-       if (rxrpc_queue_work(&call->processor))
-               trace_rxrpc_call(call->debug_id, rxrpc_call_queued, n + 1,
-                                here, NULL);
-       else
-               rxrpc_put_call(call, rxrpc_call_put_noqueue);
-       return true;
-}
-
-/*
- * Queue a call's work processor, passing the callers ref to the work queue.
- */
-bool __rxrpc_queue_call(struct rxrpc_call *call)
-{
-       const void *here = __builtin_return_address(0);
-       int n = refcount_read(&call->ref);
-       ASSERTCMP(n, >=, 1);
-       if (rxrpc_queue_work(&call->processor))
-               trace_rxrpc_call(call->debug_id, rxrpc_call_queued_ref, n,
-                                here, NULL);
-       else
-               rxrpc_put_call(call, rxrpc_call_put_noqueue);
-       return true;
-}
-
-/*
  * Note the re-emergence of a call.
  */
-void rxrpc_see_call(struct rxrpc_call *call)
+void rxrpc_see_call(struct rxrpc_call *call, enum rxrpc_call_trace why)
 {
-       const void *here = __builtin_return_address(0);
        if (call) {
-               int n = refcount_read(&call->ref);
+               int r = refcount_read(&call->ref);
 
-               trace_rxrpc_call(call->debug_id, rxrpc_call_seen, n,
-                                here, NULL);
+               trace_rxrpc_call(call->debug_id, r, 0, why);
        }
 }
 
-bool rxrpc_try_get_call(struct rxrpc_call *call, enum rxrpc_call_trace op)
+struct rxrpc_call *rxrpc_try_get_call(struct rxrpc_call *call,
+                                     enum rxrpc_call_trace why)
 {
-       const void *here = __builtin_return_address(0);
-       int n;
+       int r;
 
-       if (!__refcount_inc_not_zero(&call->ref, &n))
-               return false;
-       trace_rxrpc_call(call->debug_id, op, n + 1, here, NULL);
-       return true;
+       if (!call || !__refcount_inc_not_zero(&call->ref, &r))
+               return NULL;
+       trace_rxrpc_call(call->debug_id, r + 1, 0, why);
+       return call;
 }
 
 /*
  * Note the addition of a ref on a call.
  */
-void rxrpc_get_call(struct rxrpc_call *call, enum rxrpc_call_trace op)
+void rxrpc_get_call(struct rxrpc_call *call, enum rxrpc_call_trace why)
 {
-       const void *here = __builtin_return_address(0);
-       int n;
+       int r;
 
-       __refcount_inc(&call->ref, &n);
-       trace_rxrpc_call(call->debug_id, op, n + 1, here, NULL);
+       __refcount_inc(&call->ref, &r);
+       trace_rxrpc_call(call->debug_id, r + 1, 0, why);
 }
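
Throughout these hunks the tracepoints switch from recording __builtin_return_address(0) to an explicit enum "why" code, and the __refcount_*() helpers report the pre-operation count so the trace can log the post-operation value. A compilable model of that traced-refcount idiom, using C11 atomics in place of the kernel's refcount_t (illustrative, not the kernel API):

	#include <stdatomic.h>
	#include <stdbool.h>
	#include <stdio.h>

	enum why_m { GET_USERID, PUT_USERID, SEE_ZAP };

	struct obj {
		atomic_int ref;
		unsigned int debug_id;
	};

	static void obj_get(struct obj *o, enum why_m why)
	{
		int r = atomic_fetch_add(&o->ref, 1);    /* r is the old count */
		printf("obj=%08x ref=%d why=%d\n", o->debug_id, r + 1, why);
	}

	static bool obj_put(struct obj *o, enum why_m why)
	{
		int r = atomic_fetch_sub(&o->ref, 1);
		printf("obj=%08x ref=%d why=%d\n", o->debug_id, r - 1, why);
		return r - 1 == 0;                       /* caller frees on zero */
	}
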
 
 /*
@@ -514,15 +520,13 @@ static void rxrpc_cleanup_ring(struct rxrpc_call *call)
  */
 void rxrpc_release_call(struct rxrpc_sock *rx, struct rxrpc_call *call)
 {
-       const void *here = __builtin_return_address(0);
        struct rxrpc_connection *conn = call->conn;
        bool put = false;
 
        _enter("{%d,%d}", call->debug_id, refcount_read(&call->ref));
 
-       trace_rxrpc_call(call->debug_id, rxrpc_call_release,
-                        refcount_read(&call->ref),
-                        here, (const void *)call->flags);
+       trace_rxrpc_call(call->debug_id, refcount_read(&call->ref),
+                        call->flags, rxrpc_call_see_release);
 
        ASSERTCMP(call->state, ==, RXRPC_CALL_COMPLETE);
 
@@ -530,10 +534,10 @@ void rxrpc_release_call(struct rxrpc_sock *rx, struct rxrpc_call *call)
                BUG();
 
        rxrpc_put_call_slot(call);
-       rxrpc_delete_call_timer(call);
+       del_timer_sync(&call->timer);
 
        /* Make sure we don't get any more notifications */
-       write_lock_bh(&rx->recvmsg_lock);
+       write_lock(&rx->recvmsg_lock);
 
        if (!list_empty(&call->recvmsg_link)) {
                _debug("unlinking once-pending call %p { e=%lx f=%lx }",
@@ -546,16 +550,16 @@ void rxrpc_release_call(struct rxrpc_sock *rx, struct rxrpc_call *call)
        call->recvmsg_link.next = NULL;
        call->recvmsg_link.prev = NULL;
 
-       write_unlock_bh(&rx->recvmsg_lock);
+       write_unlock(&rx->recvmsg_lock);
        if (put)
-               rxrpc_put_call(call, rxrpc_call_put);
+               rxrpc_put_call(call, rxrpc_call_put_unnotify);
 
        write_lock(&rx->call_lock);
 
        if (test_and_clear_bit(RXRPC_CALL_HAS_USERID, &call->flags)) {
                rb_erase(&call->sock_node, &rx->calls);
                memset(&call->sock_node, 0xdd, sizeof(call->sock_node));
-               rxrpc_put_call(call, rxrpc_call_put_userid);
+               rxrpc_put_call(call, rxrpc_call_put_userid_exists);
        }
 
        list_del(&call->sock_link);
@@ -584,17 +588,17 @@ void rxrpc_release_calls_on_socket(struct rxrpc_sock *rx)
                                  struct rxrpc_call, accept_link);
                list_del(&call->accept_link);
                rxrpc_abort_call("SKR", call, 0, RX_CALL_DEAD, -ECONNRESET);
-               rxrpc_put_call(call, rxrpc_call_put);
+               rxrpc_put_call(call, rxrpc_call_put_release_sock_tba);
        }
 
        while (!list_empty(&rx->sock_calls)) {
                call = list_entry(rx->sock_calls.next,
                                  struct rxrpc_call, sock_link);
-               rxrpc_get_call(call, rxrpc_call_got);
+               rxrpc_get_call(call, rxrpc_call_get_release_sock);
                rxrpc_abort_call("SKT", call, 0, RX_CALL_DEAD, -ECONNRESET);
                rxrpc_send_abort_packet(call);
                rxrpc_release_call(rx, call);
-               rxrpc_put_call(call, rxrpc_call_put);
+               rxrpc_put_call(call, rxrpc_call_put_release_sock);
        }
 
        _leave("");
@@ -603,26 +607,24 @@ void rxrpc_release_calls_on_socket(struct rxrpc_sock *rx)
 /*
  * release a call
  */
-void rxrpc_put_call(struct rxrpc_call *call, enum rxrpc_call_trace op)
+void rxrpc_put_call(struct rxrpc_call *call, enum rxrpc_call_trace why)
 {
        struct rxrpc_net *rxnet = call->rxnet;
-       const void *here = __builtin_return_address(0);
        unsigned int debug_id = call->debug_id;
        bool dead;
-       int n;
+       int r;
 
        ASSERT(call != NULL);
 
-       dead = __refcount_dec_and_test(&call->ref, &n);
-       trace_rxrpc_call(debug_id, op, n, here, NULL);
+       dead = __refcount_dec_and_test(&call->ref, &r);
+       trace_rxrpc_call(debug_id, r - 1, 0, why);
        if (dead) {
-               _debug("call %d dead", call->debug_id);
                ASSERTCMP(call->state, ==, RXRPC_CALL_COMPLETE);
 
                if (!list_empty(&call->link)) {
-                       spin_lock_bh(&rxnet->call_lock);
+                       spin_lock(&rxnet->call_lock);
                        list_del_init(&call->link);
-                       spin_unlock_bh(&rxnet->call_lock);
+                       spin_unlock(&rxnet->call_lock);
                }
 
                rxrpc_cleanup_call(call);
@@ -630,36 +632,45 @@ void rxrpc_put_call(struct rxrpc_call *call, enum rxrpc_call_trace op)
 }
 
 /*
- * Final call destruction - but must be done in process context.
+ * Free up the call under RCU.
  */
-static void rxrpc_destroy_call(struct work_struct *work)
+static void rxrpc_rcu_free_call(struct rcu_head *rcu)
 {
-       struct rxrpc_call *call = container_of(work, struct rxrpc_call, processor);
-       struct rxrpc_net *rxnet = call->rxnet;
-
-       rxrpc_delete_call_timer(call);
+       struct rxrpc_call *call = container_of(rcu, struct rxrpc_call, rcu);
+       struct rxrpc_net *rxnet = READ_ONCE(call->rxnet);
 
-       rxrpc_put_connection(call->conn);
-       rxrpc_put_peer(call->peer);
        kmem_cache_free(rxrpc_call_jar, call);
        if (atomic_dec_and_test(&rxnet->nr_calls))
                wake_up_var(&rxnet->nr_calls);
 }
 
 /*
- * Final call destruction under RCU.
+ * Final call destruction - but must be done in process context.
  */
-static void rxrpc_rcu_destroy_call(struct rcu_head *rcu)
+static void rxrpc_destroy_call(struct work_struct *work)
 {
-       struct rxrpc_call *call = container_of(rcu, struct rxrpc_call, rcu);
+       struct rxrpc_call *call = container_of(work, struct rxrpc_call, destroyer);
+       struct rxrpc_txbuf *txb;
 
-       if (in_softirq()) {
-               INIT_WORK(&call->processor, rxrpc_destroy_call);
-               if (!rxrpc_queue_work(&call->processor))
-                       BUG();
-       } else {
-               rxrpc_destroy_call(&call->processor);
+       del_timer_sync(&call->timer);
+
+       rxrpc_cleanup_ring(call);
+       while ((txb = list_first_entry_or_null(&call->tx_sendmsg,
+                                              struct rxrpc_txbuf, call_link))) {
+               list_del(&txb->call_link);
+               rxrpc_put_txbuf(txb, rxrpc_txbuf_put_cleaned);
        }
+       while ((txb = list_first_entry_or_null(&call->tx_buffer,
+                                              struct rxrpc_txbuf, call_link))) {
+               list_del(&txb->call_link);
+               rxrpc_put_txbuf(txb, rxrpc_txbuf_put_cleaned);
+       }
+
+       rxrpc_put_txbuf(call->tx_pending, rxrpc_txbuf_put_cleaned);
+       rxrpc_put_connection(call->conn, rxrpc_conn_put_call);
+       rxrpc_put_peer(call->peer, rxrpc_peer_put_call);
+       rxrpc_put_local(call->local, rxrpc_local_put_call);
+       call_rcu(&call->rcu, rxrpc_rcu_free_call);
 }
 
 /*
@@ -667,25 +678,20 @@ static void rxrpc_rcu_destroy_call(struct rcu_head *rcu)
  */
 void rxrpc_cleanup_call(struct rxrpc_call *call)
 {
-       struct rxrpc_txbuf *txb;
-
-       _net("DESTROY CALL %d", call->debug_id);
-
        memset(&call->sock_node, 0xcd, sizeof(call->sock_node));
 
        ASSERTCMP(call->state, ==, RXRPC_CALL_COMPLETE);
        ASSERT(test_bit(RXRPC_CALL_RELEASED, &call->flags));
 
-       rxrpc_cleanup_ring(call);
-       while ((txb = list_first_entry_or_null(&call->tx_buffer,
-                                              struct rxrpc_txbuf, call_link))) {
-               list_del(&txb->call_link);
-               rxrpc_put_txbuf(txb, rxrpc_txbuf_put_cleaned);
-       }
-       rxrpc_put_txbuf(call->tx_pending, rxrpc_txbuf_put_cleaned);
-       rxrpc_free_skb(call->acks_soft_tbl, rxrpc_skb_cleaned);
+       del_timer(&call->timer);
 
-       call_rcu(&call->rcu, rxrpc_rcu_destroy_call);
+       if (rcu_read_lock_held())
+               /* Can't use the rxrpc workqueue as we need to cancel/flush
+                * something that may be running/waiting there.
+                */
+               schedule_work(&call->destroyer);
+       else
+               rxrpc_destroy_call(&call->destroyer);
 }
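
The reworked destroyer above drains both tx_sendmsg and tx_buffer with list_first_entry_or_null() so every queued buffer's reference is dropped exactly once before the call is freed under RCU. The drain loop's shape, as a minimal standalone sketch (illustrative types):

	struct txbuf_m { struct txbuf_m *next; };

	static struct txbuf_m *pop(struct txbuf_m **head)
	{
		struct txbuf_m *t = *head;

		if (t)
			*head = t->next;
		return t;
	}

	/* Release every buffer still queued on a send list exactly once, as
	 * the destroyer does for tx_sendmsg and tx_buffer above. */
	static void drain(struct txbuf_m **head, void (*put)(struct txbuf_m *))
	{
		struct txbuf_m *txb;

		while ((txb = pop(head)))
			put(txb);
	}
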
 
 /*
@@ -700,14 +706,14 @@ void rxrpc_destroy_all_calls(struct rxrpc_net *rxnet)
        _enter("");
 
        if (!list_empty(&rxnet->calls)) {
-               spin_lock_bh(&rxnet->call_lock);
+               spin_lock(&rxnet->call_lock);
 
                while (!list_empty(&rxnet->calls)) {
                        call = list_entry(rxnet->calls.next,
                                          struct rxrpc_call, link);
                        _debug("Zapping call %p", call);
 
-                       rxrpc_see_call(call);
+                       rxrpc_see_call(call, rxrpc_call_see_zap);
                        list_del_init(&call->link);
 
                        pr_err("Call %p still in use (%d,%s,%lx,%lx)!\n",
@@ -715,12 +721,12 @@ void rxrpc_destroy_all_calls(struct rxrpc_net *rxnet)
                               rxrpc_call_states[call->state],
                               call->flags, call->events);
 
-                       spin_unlock_bh(&rxnet->call_lock);
+                       spin_unlock(&rxnet->call_lock);
                        cond_resched();
-                       spin_lock_bh(&rxnet->call_lock);
+                       spin_lock(&rxnet->call_lock);
                }
 
-               spin_unlock_bh(&rxnet->call_lock);
+               spin_unlock(&rxnet->call_lock);
        }
 
        atomic_dec(&rxnet->nr_calls);
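
rxrpc_destroy_all_calls() keeps its drop-lock/cond_resched()/re-lock cadence while zapping the list, now with plain spin_lock() since it no longer runs against softirq context. A userspace sketch of that "don't monopolise the lock" walk, with pthreads standing in for the kernel primitives (illustrative only):

	#include <pthread.h>
	#include <sched.h>
	#include <stdio.h>

	struct node_m { struct node_m *next; };

	static void zap_all(pthread_mutex_t *lock, struct node_m **head)
	{
		pthread_mutex_lock(lock);
		while (*head) {
			struct node_m *n = *head;

			*head = n->next;        /* detach before unlocking */
			pthread_mutex_unlock(lock);
			printf("zapping %p\n", (void *)n);
			sched_yield();          /* stands in for cond_resched() */
			pthread_mutex_lock(lock);
		}
		pthread_mutex_unlock(lock);
	}
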
index f11c97e..a08e33c 100644
@@ -51,7 +51,7 @@ static void rxrpc_deactivate_bundle(struct rxrpc_bundle *bundle);
 static int rxrpc_get_client_connection_id(struct rxrpc_connection *conn,
                                          gfp_t gfp)
 {
-       struct rxrpc_net *rxnet = conn->params.local->rxnet;
+       struct rxrpc_net *rxnet = conn->rxnet;
        int id;
 
        _enter("");
@@ -122,37 +122,47 @@ static struct rxrpc_bundle *rxrpc_alloc_bundle(struct rxrpc_conn_parameters *cp,
 
        bundle = kzalloc(sizeof(*bundle), gfp);
        if (bundle) {
-               bundle->params = *cp;
-               rxrpc_get_peer(bundle->params.peer);
+               bundle->local           = cp->local;
+               bundle->peer            = rxrpc_get_peer(cp->peer, rxrpc_peer_get_bundle);
+               bundle->key             = cp->key;
+               bundle->exclusive       = cp->exclusive;
+               bundle->upgrade         = cp->upgrade;
+               bundle->service_id      = cp->service_id;
+               bundle->security_level  = cp->security_level;
                refcount_set(&bundle->ref, 1);
                atomic_set(&bundle->active, 1);
                spin_lock_init(&bundle->channel_lock);
                INIT_LIST_HEAD(&bundle->waiting_calls);
+               trace_rxrpc_bundle(bundle->debug_id, 1, rxrpc_bundle_new);
        }
        return bundle;
 }
 
-struct rxrpc_bundle *rxrpc_get_bundle(struct rxrpc_bundle *bundle)
+struct rxrpc_bundle *rxrpc_get_bundle(struct rxrpc_bundle *bundle,
+                                     enum rxrpc_bundle_trace why)
 {
-       refcount_inc(&bundle->ref);
+       int r;
+
+       __refcount_inc(&bundle->ref, &r);
+       trace_rxrpc_bundle(bundle->debug_id, r + 1, why);
        return bundle;
 }
 
 static void rxrpc_free_bundle(struct rxrpc_bundle *bundle)
 {
-       rxrpc_put_peer(bundle->params.peer);
+       trace_rxrpc_bundle(bundle->debug_id, 1, rxrpc_bundle_free);
+       rxrpc_put_peer(bundle->peer, rxrpc_peer_put_bundle);
        kfree(bundle);
 }
 
-void rxrpc_put_bundle(struct rxrpc_bundle *bundle)
+void rxrpc_put_bundle(struct rxrpc_bundle *bundle, enum rxrpc_bundle_trace why)
 {
-       unsigned int d = bundle->debug_id;
+       unsigned int id = bundle->debug_id;
        bool dead;
        int r;
 
        dead = __refcount_dec_and_test(&bundle->ref, &r);
-
-       _debug("PUT B=%x %d", d, r - 1);
+       trace_rxrpc_bundle(id, r - 1, why);
        if (dead)
                rxrpc_free_bundle(bundle);
 }
@@ -164,12 +174,12 @@ static struct rxrpc_connection *
 rxrpc_alloc_client_connection(struct rxrpc_bundle *bundle, gfp_t gfp)
 {
        struct rxrpc_connection *conn;
-       struct rxrpc_net *rxnet = bundle->params.local->rxnet;
+       struct rxrpc_net *rxnet = bundle->local->rxnet;
        int ret;
 
        _enter("");
 
-       conn = rxrpc_alloc_connection(gfp);
+       conn = rxrpc_alloc_connection(rxnet, gfp);
        if (!conn) {
                _leave(" = -ENOMEM");
                return ERR_PTR(-ENOMEM);
@@ -177,10 +187,16 @@ rxrpc_alloc_client_connection(struct rxrpc_bundle *bundle, gfp_t gfp)
 
        refcount_set(&conn->ref, 1);
        conn->bundle            = bundle;
-       conn->params            = bundle->params;
+       conn->local             = bundle->local;
+       conn->peer              = bundle->peer;
+       conn->key               = bundle->key;
+       conn->exclusive         = bundle->exclusive;
+       conn->upgrade           = bundle->upgrade;
+       conn->orig_service_id   = bundle->service_id;
+       conn->security_level    = bundle->security_level;
        conn->out_clientflag    = RXRPC_CLIENT_INITIATED;
        conn->state             = RXRPC_CONN_CLIENT;
-       conn->service_id        = conn->params.service_id;
+       conn->service_id        = conn->orig_service_id;
 
        ret = rxrpc_get_client_connection_id(conn, gfp);
        if (ret < 0)
@@ -195,14 +211,13 @@ rxrpc_alloc_client_connection(struct rxrpc_bundle *bundle, gfp_t gfp)
        list_add_tail(&conn->proc_link, &rxnet->conn_proc_list);
        write_unlock(&rxnet->conn_lock);
 
-       rxrpc_get_bundle(bundle);
-       rxrpc_get_peer(conn->params.peer);
-       rxrpc_get_local(conn->params.local);
-       key_get(conn->params.key);
+       rxrpc_get_bundle(bundle, rxrpc_bundle_get_client_conn);
+       rxrpc_get_peer(conn->peer, rxrpc_peer_get_client_conn);
+       rxrpc_get_local(conn->local, rxrpc_local_get_client_conn);
+       key_get(conn->key);
 
-       trace_rxrpc_conn(conn->debug_id, rxrpc_conn_new_client,
-                        refcount_read(&conn->ref),
-                        __builtin_return_address(0));
+       trace_rxrpc_conn(conn->debug_id, refcount_read(&conn->ref),
+                        rxrpc_conn_new_client);
 
        atomic_inc(&rxnet->nr_client_conns);
        trace_rxrpc_client(conn, -1, rxrpc_client_alloc);
@@ -228,7 +243,7 @@ static bool rxrpc_may_reuse_conn(struct rxrpc_connection *conn)
        if (!conn)
                goto dont_reuse;
 
-       rxnet = conn->params.local->rxnet;
+       rxnet = conn->rxnet;
        if (test_bit(RXRPC_CONN_DONT_REUSE, &conn->flags))
                goto dont_reuse;
 
@@ -285,7 +300,7 @@ static struct rxrpc_bundle *rxrpc_look_up_bundle(struct rxrpc_conn_parameters *c
        while (p) {
                bundle = rb_entry(p, struct rxrpc_bundle, local_node);
 
-#define cmp(X) ((long)bundle->params.X - (long)cp->X)
+#define cmp(X) ((long)bundle->X - (long)cp->X)
                diff = (cmp(peer) ?:
                        cmp(key) ?:
                        cmp(security_level) ?:
@@ -314,7 +329,7 @@ static struct rxrpc_bundle *rxrpc_look_up_bundle(struct rxrpc_conn_parameters *c
                parent = *pp;
                bundle = rb_entry(parent, struct rxrpc_bundle, local_node);
 
-#define cmp(X) ((long)bundle->params.X - (long)cp->X)
+#define cmp(X) ((long)bundle->X - (long)cp->X)
                diff = (cmp(peer) ?:
                        cmp(key) ?:
                        cmp(security_level) ?:
@@ -332,7 +347,7 @@ static struct rxrpc_bundle *rxrpc_look_up_bundle(struct rxrpc_conn_parameters *c
        candidate->debug_id = atomic_inc_return(&rxrpc_bundle_id);
        rb_link_node(&candidate->local_node, parent, pp);
        rb_insert_color(&candidate->local_node, &local->client_bundles);
-       rxrpc_get_bundle(candidate);
+       rxrpc_get_bundle(candidate, rxrpc_bundle_get_client_call);
        spin_unlock(&local->client_bundles_lock);
        _leave(" = %u [new]", candidate->debug_id);
        return candidate;
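
The cmp(X) macro change above only redirects field access from bundle->params.X to bundle->X; the comparator logic is unchanged. For reference, the chained comparison relies on the GNU C a ?: b operator yielding the first nonzero operand, so the first differing key field decides the rbtree direction. A standalone sketch with illustrative field names:

	struct bundle_key_m {
		long peer, key, security_level, upgrade;
	};

	/* First nonzero field difference decides left/right in the tree. */
	static long bundle_cmp(const struct bundle_key_m *a,
			       const struct bundle_key_m *b)
	{
		return (a->peer - b->peer) ?:
		       (a->key - b->key) ?:
		       (a->security_level - b->security_level) ?:
		       (a->upgrade - b->upgrade);
	}
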
@@ -340,7 +355,7 @@ static struct rxrpc_bundle *rxrpc_look_up_bundle(struct rxrpc_conn_parameters *c
 found_bundle_free:
        rxrpc_free_bundle(candidate);
 found_bundle:
-       rxrpc_get_bundle(bundle);
+       rxrpc_get_bundle(bundle, rxrpc_bundle_get_client_call);
        atomic_inc(&bundle->active);
        spin_unlock(&local->client_bundles_lock);
        _leave(" = %u [found]", bundle->debug_id);
@@ -456,10 +471,10 @@ static void rxrpc_add_conn_to_bundle(struct rxrpc_bundle *bundle, gfp_t gfp)
        if (candidate) {
                _debug("discard C=%x", candidate->debug_id);
                trace_rxrpc_client(candidate, -1, rxrpc_client_duplicate);
-               rxrpc_put_connection(candidate);
+               rxrpc_put_connection(candidate, rxrpc_conn_put_discard);
        }
 
-       rxrpc_put_connection(old);
+       rxrpc_put_connection(old, rxrpc_conn_put_noreuse);
        _leave("");
 }
 
@@ -530,23 +545,21 @@ static void rxrpc_activate_one_channel(struct rxrpc_connection *conn,
        clear_bit(RXRPC_CONN_FINAL_ACK_0 + channel, &conn->flags);
        clear_bit(conn->bundle_shift + channel, &bundle->avail_chans);
 
-       rxrpc_see_call(call);
+       rxrpc_see_call(call, rxrpc_call_see_activate_client);
        list_del_init(&call->chan_wait_link);
-       call->peer      = rxrpc_get_peer(conn->params.peer);
-       call->conn      = rxrpc_get_connection(conn);
+       call->peer      = rxrpc_get_peer(conn->peer, rxrpc_peer_get_activate_call);
+       call->conn      = rxrpc_get_connection(conn, rxrpc_conn_get_activate_call);
        call->cid       = conn->proto.cid | channel;
        call->call_id   = call_id;
        call->security  = conn->security;
        call->security_ix = conn->security_ix;
-       call->service_id = conn->service_id;
+       call->dest_srx.srx_service = conn->service_id;
 
        trace_rxrpc_connect_call(call);
-       _net("CONNECT call %08x:%08x as call %d on conn %d",
-            call->cid, call->call_id, call->debug_id, conn->debug_id);
 
-       write_lock_bh(&call->state_lock);
+       write_lock(&call->state_lock);
        call->state = RXRPC_CALL_CLIENT_SEND_REQUEST;
-       write_unlock_bh(&call->state_lock);
+       write_unlock(&call->state_lock);
 
        /* Paired with the read barrier in rxrpc_connect_call().  This orders
         * cid and epoch in the connection wrt to call_id without the need to
@@ -571,7 +584,7 @@ static void rxrpc_activate_one_channel(struct rxrpc_connection *conn,
  */
 static void rxrpc_unidle_conn(struct rxrpc_bundle *bundle, struct rxrpc_connection *conn)
 {
-       struct rxrpc_net *rxnet = bundle->params.local->rxnet;
+       struct rxrpc_net *rxnet = bundle->local->rxnet;
        bool drop_ref;
 
        if (!list_empty(&conn->cache_link)) {
@@ -583,7 +596,7 @@ static void rxrpc_unidle_conn(struct rxrpc_bundle *bundle, struct rxrpc_connecti
                }
                spin_unlock(&rxnet->client_conn_cache_lock);
                if (drop_ref)
-                       rxrpc_put_connection(conn);
+                       rxrpc_put_connection(conn, rxrpc_conn_put_unidle);
        }
 }
 
@@ -732,7 +745,7 @@ granted_channel:
 
 out_put_bundle:
        rxrpc_deactivate_bundle(bundle);
-       rxrpc_put_bundle(bundle);
+       rxrpc_put_bundle(bundle, rxrpc_bundle_get_client_call);
 out:
        _leave(" = %d", ret);
        return ret;
@@ -773,6 +786,10 @@ void rxrpc_expose_client_call(struct rxrpc_call *call)
                if (chan->call_counter >= INT_MAX)
                        set_bit(RXRPC_CONN_DONT_REUSE, &conn->flags);
                trace_rxrpc_client(conn, channel, rxrpc_client_exposed);
+
+               spin_lock(&call->peer->lock);
+               hlist_add_head(&call->error_link, &call->peer->error_targets);
+               spin_unlock(&call->peer->lock);
        }
 }
 
@@ -797,7 +814,7 @@ void rxrpc_disconnect_client_call(struct rxrpc_bundle *bundle, struct rxrpc_call
 {
        struct rxrpc_connection *conn;
        struct rxrpc_channel *chan = NULL;
-       struct rxrpc_net *rxnet = bundle->params.local->rxnet;
+       struct rxrpc_net *rxnet = bundle->local->rxnet;
        unsigned int channel;
        bool may_reuse;
        u32 cid;
@@ -887,7 +904,7 @@ void rxrpc_disconnect_client_call(struct rxrpc_bundle *bundle, struct rxrpc_call
                trace_rxrpc_client(conn, channel, rxrpc_client_to_idle);
                conn->idle_timestamp = jiffies;
 
-               rxrpc_get_connection(conn);
+               rxrpc_get_connection(conn, rxrpc_conn_get_idle);
                spin_lock(&rxnet->client_conn_cache_lock);
                list_move_tail(&conn->cache_link, &rxnet->idle_client_conns);
                spin_unlock(&rxnet->client_conn_cache_lock);
@@ -929,7 +946,7 @@ static void rxrpc_unbundle_conn(struct rxrpc_connection *conn)
 
        if (need_drop) {
                rxrpc_deactivate_bundle(bundle);
-               rxrpc_put_connection(conn);
+               rxrpc_put_connection(conn, rxrpc_conn_put_unbundle);
        }
 }
 
@@ -938,11 +955,11 @@ static void rxrpc_unbundle_conn(struct rxrpc_connection *conn)
  */
 static void rxrpc_deactivate_bundle(struct rxrpc_bundle *bundle)
 {
-       struct rxrpc_local *local = bundle->params.local;
+       struct rxrpc_local *local = bundle->local;
        bool need_put = false;
 
        if (atomic_dec_and_lock(&bundle->active, &local->client_bundles_lock)) {
-               if (!bundle->params.exclusive) {
+               if (!bundle->exclusive) {
                        _debug("erase bundle");
                        rb_erase(&bundle->local_node, &local->client_bundles);
                        need_put = true;
@@ -950,16 +967,16 @@ static void rxrpc_deactivate_bundle(struct rxrpc_bundle *bundle)
 
                spin_unlock(&local->client_bundles_lock);
                if (need_put)
-                       rxrpc_put_bundle(bundle);
+                       rxrpc_put_bundle(bundle, rxrpc_bundle_put_discard);
        }
 }
 
 /*
  * Clean up a dead client connection.
  */
-static void rxrpc_kill_client_conn(struct rxrpc_connection *conn)
+void rxrpc_kill_client_conn(struct rxrpc_connection *conn)
 {
-       struct rxrpc_local *local = conn->params.local;
+       struct rxrpc_local *local = conn->local;
        struct rxrpc_net *rxnet = local->rxnet;
 
        _enter("C=%x", conn->debug_id);
@@ -968,23 +985,6 @@ static void rxrpc_kill_client_conn(struct rxrpc_connection *conn)
        atomic_dec(&rxnet->nr_client_conns);
 
        rxrpc_put_client_connection_id(conn);
-       rxrpc_kill_connection(conn);
-}
-
-/*
- * Clean up a dead client connections.
- */
-void rxrpc_put_client_conn(struct rxrpc_connection *conn)
-{
-       const void *here = __builtin_return_address(0);
-       unsigned int debug_id = conn->debug_id;
-       bool dead;
-       int r;
-
-       dead = __refcount_dec_and_test(&conn->ref, &r);
-       trace_rxrpc_conn(debug_id, rxrpc_conn_put_client, r - 1, here);
-       if (dead)
-               rxrpc_kill_client_conn(conn);
 }
 
 /*
@@ -1010,7 +1010,7 @@ void rxrpc_discard_expired_client_conns(struct work_struct *work)
        }
 
        /* Don't double up on the discarding */
-       if (!spin_trylock(&rxnet->client_conn_discard_lock)) {
+       if (!mutex_trylock(&rxnet->client_conn_discard_lock)) {
                _leave(" [already]");
                return;
        }
@@ -1038,7 +1038,7 @@ next:
                expiry = rxrpc_conn_idle_client_expiry;
                if (nr_conns > rxrpc_reap_client_connections)
                        expiry = rxrpc_conn_idle_client_fast_expiry;
-               if (conn->params.local->service_closed)
+               if (conn->local->service_closed)
                        expiry = rxrpc_closed_conn_expiry * HZ;
 
                conn_expires_at = conn->idle_timestamp + expiry;
@@ -1048,13 +1048,15 @@ next:
                        goto not_yet_expired;
        }
 
+       atomic_dec(&conn->active);
        trace_rxrpc_client(conn, -1, rxrpc_client_discard);
        list_del_init(&conn->cache_link);
 
        spin_unlock(&rxnet->client_conn_cache_lock);
 
        rxrpc_unbundle_conn(conn);
-       rxrpc_put_connection(conn); /* Drop the ->cache_link ref */
+       /* Drop the ->cache_link ref */
+       rxrpc_put_connection(conn, rxrpc_conn_put_discard_idle);
 
        nr_conns--;
        goto next;
@@ -1073,7 +1075,7 @@ not_yet_expired:
 
 out:
        spin_unlock(&rxnet->client_conn_cache_lock);
-       spin_unlock(&rxnet->client_conn_discard_lock);
+       mutex_unlock(&rxnet->client_conn_discard_lock);
        _leave("");
 }
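
Switching client_conn_discard_lock from a spinlock to a mutex fits the reaper's new home in sleepable context; the trylock guard still ensures at most one reaper runs while a second caller bails out rather than blocking. The guard pattern, as a compilable pthread sketch (illustrative names):

	#include <pthread.h>

	static pthread_mutex_t discard_lock_m = PTHREAD_MUTEX_INITIALIZER;

	static void reap_idle_conns(void)
	{
		if (pthread_mutex_trylock(&discard_lock_m) != 0)
			return;         /* a reap is already in progress */

		/* ... walk the idle list, expiring connections ... */

		pthread_mutex_unlock(&discard_lock_m);
	}
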
 
@@ -1112,7 +1114,8 @@ void rxrpc_clean_up_local_conns(struct rxrpc_local *local)
 
        list_for_each_entry_safe(conn, tmp, &rxnet->idle_client_conns,
                                 cache_link) {
-               if (conn->params.local == local) {
+               if (conn->local == local) {
+                       atomic_dec(&conn->active);
                        trace_rxrpc_client(conn, -1, rxrpc_client_discard);
                        list_move(&conn->cache_link, &graveyard);
                }
@@ -1125,7 +1128,7 @@ void rxrpc_clean_up_local_conns(struct rxrpc_local *local)
                                  struct rxrpc_connection, cache_link);
                list_del_init(&conn->cache_link);
                rxrpc_unbundle_conn(conn);
-               rxrpc_put_connection(conn);
+               rxrpc_put_connection(conn, rxrpc_conn_put_local_dead);
        }
 
        _leave(" [culled]");
index aab0697..480364b 100644
@@ -52,8 +52,8 @@ static void rxrpc_conn_retransmit_call(struct rxrpc_connection *conn,
        if (skb && call_id != sp->hdr.callNumber)
                return;
 
-       msg.msg_name    = &conn->params.peer->srx.transport;
-       msg.msg_namelen = conn->params.peer->srx.transport_len;
+       msg.msg_name    = &conn->peer->srx.transport;
+       msg.msg_namelen = conn->peer->srx.transport_len;
        msg.msg_control = NULL;
        msg.msg_controllen = 0;
        msg.msg_flags   = 0;
@@ -86,8 +86,8 @@ static void rxrpc_conn_retransmit_call(struct rxrpc_connection *conn,
                break;
 
        case RXRPC_PACKET_TYPE_ACK:
-               mtu = conn->params.peer->if_mtu;
-               mtu -= conn->params.peer->hdrsize;
+               mtu = conn->peer->if_mtu;
+               mtu -= conn->peer->hdrsize;
                pkt.ack.bufferSpace     = 0;
                pkt.ack.maxSkew         = htons(skb ? skb->priority : 0);
                pkt.ack.firstPacket     = htonl(chan->last_seq + 1);
@@ -122,19 +122,17 @@ static void rxrpc_conn_retransmit_call(struct rxrpc_connection *conn,
 
        switch (chan->last_type) {
        case RXRPC_PACKET_TYPE_ABORT:
-               _proto("Tx ABORT %%%u { %d } [re]", serial, conn->abort_code);
                break;
        case RXRPC_PACKET_TYPE_ACK:
                trace_rxrpc_tx_ack(chan->call_debug_id, serial,
                                   ntohl(pkt.ack.firstPacket),
                                   ntohl(pkt.ack.serial),
                                   pkt.ack.reason, 0);
-               _proto("Tx ACK %%%u [re]", serial);
                break;
        }
 
-       ret = kernel_sendmsg(conn->params.local->socket, &msg, iov, ioc, len);
-       conn->params.peer->last_tx_at = ktime_get_seconds();
+       ret = kernel_sendmsg(conn->local->socket, &msg, iov, ioc, len);
+       conn->peer->last_tx_at = ktime_get_seconds();
        if (ret < 0)
                trace_rxrpc_tx_fail(chan->call_debug_id, serial, ret,
                                    rxrpc_tx_point_call_final_resend);
@@ -200,9 +198,9 @@ static int rxrpc_abort_connection(struct rxrpc_connection *conn,
        _enter("%d,,%u,%u", conn->debug_id, error, abort_code);
 
        /* generate a connection-level abort */
-       spin_lock_bh(&conn->state_lock);
+       spin_lock(&conn->state_lock);
        if (conn->state >= RXRPC_CONN_REMOTELY_ABORTED) {
-               spin_unlock_bh(&conn->state_lock);
+               spin_unlock(&conn->state_lock);
                _leave(" = 0 [already dead]");
                return 0;
        }
@@ -211,10 +209,10 @@ static int rxrpc_abort_connection(struct rxrpc_connection *conn,
        conn->abort_code = abort_code;
        conn->state = RXRPC_CONN_LOCALLY_ABORTED;
        set_bit(RXRPC_CONN_DONT_REUSE, &conn->flags);
-       spin_unlock_bh(&conn->state_lock);
+       spin_unlock(&conn->state_lock);
 
-       msg.msg_name    = &conn->params.peer->srx.transport;
-       msg.msg_namelen = conn->params.peer->srx.transport_len;
+       msg.msg_name    = &conn->peer->srx.transport;
+       msg.msg_namelen = conn->peer->srx.transport_len;
        msg.msg_control = NULL;
        msg.msg_controllen = 0;
        msg.msg_flags   = 0;
@@ -242,9 +240,8 @@ static int rxrpc_abort_connection(struct rxrpc_connection *conn,
        serial = atomic_inc_return(&conn->serial);
        rxrpc_abort_calls(conn, RXRPC_CALL_LOCALLY_ABORTED, serial);
        whdr.serial = htonl(serial);
-       _proto("Tx CONN ABORT %%%u { %d }", serial, conn->abort_code);
 
-       ret = kernel_sendmsg(conn->params.local->socket, &msg, iov, 2, len);
+       ret = kernel_sendmsg(conn->local->socket, &msg, iov, 2, len);
        if (ret < 0) {
                trace_rxrpc_tx_fail(conn->debug_id, serial, ret,
                                    rxrpc_tx_point_conn_abort);
@@ -254,7 +251,7 @@ static int rxrpc_abort_connection(struct rxrpc_connection *conn,
 
        trace_rxrpc_tx_packet(conn->debug_id, &whdr, rxrpc_tx_point_conn_abort);
 
-       conn->params.peer->last_tx_at = ktime_get_seconds();
+       conn->peer->last_tx_at = ktime_get_seconds();
 
        _leave(" = 0");
        return 0;
@@ -268,12 +265,12 @@ static void rxrpc_call_is_secure(struct rxrpc_call *call)
 {
        _enter("%p", call);
        if (call) {
-               write_lock_bh(&call->state_lock);
+               write_lock(&call->state_lock);
                if (call->state == RXRPC_CALL_SERVER_SECURING) {
                        call->state = RXRPC_CALL_SERVER_RECV_REQUEST;
                        rxrpc_notify_socket(call);
                }
-               write_unlock_bh(&call->state_lock);
+               write_unlock(&call->state_lock);
        }
 }
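
rxrpc_call_is_secure() above now takes the state lock without disabling bottom halves, but the shape stays the same: flip the state under the write lock and notify only on an actual transition, so a call that already advanced is not re-notified. A standalone model of that check-and-flip (illustrative names, pthread rwlock in place of the kernel's rwlock_t):

	#include <pthread.h>
	#include <stdbool.h>

	enum sstate_m { SERVER_SECURING, SERVER_RECV_REQUEST };

	struct scall_m {
		pthread_rwlock_t state_lock;
		enum sstate_m state;
	};

	static bool call_is_secure_model(struct scall_m *c)
	{
		bool changed = false;

		pthread_rwlock_wrlock(&c->state_lock);
		if (c->state == SERVER_SECURING) {
			c->state = SERVER_RECV_REQUEST;
			changed = true; /* caller does rxrpc_notify_socket() */
		}
		pthread_rwlock_unlock(&c->state_lock);
		return changed;
	}
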
 
@@ -285,8 +282,6 @@ static int rxrpc_process_event(struct rxrpc_connection *conn,
                               u32 *_abort_code)
 {
        struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
-       __be32 wtmp;
-       u32 abort_code;
        int loop, ret;
 
        if (conn->state >= RXRPC_CONN_REMOTELY_ABORTED) {
@@ -308,17 +303,8 @@ static int rxrpc_process_event(struct rxrpc_connection *conn,
                return 0;
 
        case RXRPC_PACKET_TYPE_ABORT:
-               if (skb_copy_bits(skb, sizeof(struct rxrpc_wire_header),
-                                 &wtmp, sizeof(wtmp)) < 0) {
-                       trace_rxrpc_rx_eproto(NULL, sp->hdr.serial,
-                                             tracepoint_string("bad_abort"));
-                       return -EPROTO;
-               }
-               abort_code = ntohl(wtmp);
-               _proto("Rx ABORT %%%u { ac=%d }", sp->hdr.serial, abort_code);
-
                conn->error = -ECONNABORTED;
-               conn->abort_code = abort_code;
+               conn->abort_code = skb->priority;
                conn->state = RXRPC_CONN_REMOTELY_ABORTED;
                set_bit(RXRPC_CONN_DONT_REUSE, &conn->flags);
                rxrpc_abort_calls(conn, RXRPC_CALL_REMOTELY_ABORTED, sp->hdr.serial);
@@ -334,23 +320,23 @@ static int rxrpc_process_event(struct rxrpc_connection *conn,
                        return ret;
 
                ret = conn->security->init_connection_security(
-                       conn, conn->params.key->payload.data[0]);
+                       conn, conn->key->payload.data[0]);
                if (ret < 0)
                        return ret;
 
                spin_lock(&conn->bundle->channel_lock);
-               spin_lock_bh(&conn->state_lock);
+               spin_lock(&conn->state_lock);
 
                if (conn->state == RXRPC_CONN_SERVICE_CHALLENGING) {
                        conn->state = RXRPC_CONN_SERVICE;
-                       spin_unlock_bh(&conn->state_lock);
+                       spin_unlock(&conn->state_lock);
                        for (loop = 0; loop < RXRPC_MAXCALLS; loop++)
                                rxrpc_call_is_secure(
                                        rcu_dereference_protected(
                                                conn->channels[loop].call,
                                                lockdep_is_held(&conn->bundle->channel_lock)));
                } else {
-                       spin_unlock_bh(&conn->state_lock);
+                       spin_unlock(&conn->state_lock);
                }
 
                spin_unlock(&conn->bundle->channel_lock);
@@ -451,7 +437,7 @@ static void rxrpc_do_process_connection(struct rxrpc_connection *conn)
        /* go through the conn-level event packets, releasing the ref on this
         * connection that each one has when we've finished with it */
        while ((skb = skb_dequeue(&conn->rx_queue))) {
-               rxrpc_see_skb(skb, rxrpc_skb_seen);
+               rxrpc_see_skb(skb, rxrpc_skb_see_conn_work);
                ret = rxrpc_process_event(conn, skb, &abort_code);
                switch (ret) {
                case -EPROTO:
@@ -463,7 +449,7 @@ static void rxrpc_do_process_connection(struct rxrpc_connection *conn)
                        goto requeue_and_leave;
                case -ECONNABORTED:
                default:
-                       rxrpc_free_skb(skb, rxrpc_skb_freed);
+                       rxrpc_free_skb(skb, rxrpc_skb_put_conn_work);
                        break;
                }
        }
@@ -477,7 +463,7 @@ requeue_and_leave:
 protocol_error:
        if (rxrpc_abort_connection(conn, ret, abort_code) < 0)
                goto requeue_and_leave;
-       rxrpc_free_skb(skb, rxrpc_skb_freed);
+       rxrpc_free_skb(skb, rxrpc_skb_put_conn_work);
        return;
 }
 
@@ -486,14 +472,70 @@ void rxrpc_process_connection(struct work_struct *work)
        struct rxrpc_connection *conn =
                container_of(work, struct rxrpc_connection, processor);
 
-       rxrpc_see_connection(conn);
+       rxrpc_see_connection(conn, rxrpc_conn_see_work);
 
-       if (__rxrpc_use_local(conn->params.local)) {
+       if (__rxrpc_use_local(conn->local, rxrpc_local_use_conn_work)) {
                rxrpc_do_process_connection(conn);
-               rxrpc_unuse_local(conn->params.local);
+               rxrpc_unuse_local(conn->local, rxrpc_local_unuse_conn_work);
        }
+}
 
-       rxrpc_put_connection(conn);
-       _leave("");
-       return;
+/*
+ * post connection-level events to the connection
+ * - this includes challenges, responses, some aborts and call terminal packet
+ *   retransmission.
+ */
+static void rxrpc_post_packet_to_conn(struct rxrpc_connection *conn,
+                                     struct sk_buff *skb)
+{
+       _enter("%p,%p", conn, skb);
+
+       rxrpc_get_skb(skb, rxrpc_skb_get_conn_work);
+       skb_queue_tail(&conn->rx_queue, skb);
+       rxrpc_queue_conn(conn, rxrpc_conn_queue_rx_work);
+}
+
+/*
+ * Input a connection-level packet.
+ */
+int rxrpc_input_conn_packet(struct rxrpc_connection *conn, struct sk_buff *skb)
+{
+       struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
+
+       if (conn->state >= RXRPC_CONN_REMOTELY_ABORTED) {
+               _leave(" = -ECONNABORTED [%u]", conn->state);
+               return -ECONNABORTED;
+       }
+
+       _enter("{%d},{%u,%%%u},", conn->debug_id, sp->hdr.type, sp->hdr.serial);
+
+       switch (sp->hdr.type) {
+       case RXRPC_PACKET_TYPE_DATA:
+       case RXRPC_PACKET_TYPE_ACK:
+               rxrpc_conn_retransmit_call(conn, skb,
+                                          sp->hdr.cid & RXRPC_CHANNELMASK);
+               return 0;
+
+       case RXRPC_PACKET_TYPE_BUSY:
+               /* Just ignore BUSY packets for now. */
+               return 0;
+
+       case RXRPC_PACKET_TYPE_ABORT:
+               conn->error = -ECONNABORTED;
+               conn->abort_code = skb->priority;
+               conn->state = RXRPC_CONN_REMOTELY_ABORTED;
+               set_bit(RXRPC_CONN_DONT_REUSE, &conn->flags);
+               rxrpc_abort_calls(conn, RXRPC_CALL_REMOTELY_ABORTED, sp->hdr.serial);
+               return -ECONNABORTED;
+
+       case RXRPC_PACKET_TYPE_CHALLENGE:
+       case RXRPC_PACKET_TYPE_RESPONSE:
+               rxrpc_post_packet_to_conn(conn, skb);
+               return 0;
+
+       default:
+               trace_rxrpc_rx_eproto(NULL, sp->hdr.serial,
+                                     tracepoint_string("bad_conn_pkt"));
+               return -EPROTO;
+       }
 }
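
The new rxrpc_input_conn_packet() gives the I/O thread a single dispatch point: DATA/ACK on a dead channel trigger terminal-packet retransmission, BUSY is ignored, ABORT kills the connection, and the crypto exchange packets are deferred to process context. The decision table, reduced to a standalone sketch (illustrative enum values, not the on-the-wire rx packet numbers):

	#include <errno.h>

	enum pkt_m { PKT_DATA, PKT_ACK, PKT_BUSY, PKT_ABORT,
		     PKT_CHALLENGE, PKT_RESPONSE };

	static int dispatch(enum pkt_m type)
	{
		switch (type) {
		case PKT_DATA:
		case PKT_ACK:
			return 0;             /* retransmit final call packet */
		case PKT_BUSY:
			return 0;             /* ignored for now */
		case PKT_ABORT:
			return -ECONNABORTED; /* connection is dead */
		case PKT_CHALLENGE:
		case PKT_RESPONSE:
			return 0;             /* queued for process context */
		default:
			return -EPROTO;       /* unknown conn-level packet */
		}
	}
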
index 156bd26..3c8f83d 100644
 unsigned int __read_mostly rxrpc_connection_expiry = 10 * 60;
 unsigned int __read_mostly rxrpc_closed_conn_expiry = 10;
 
-static void rxrpc_destroy_connection(struct rcu_head *);
+static void rxrpc_clean_up_connection(struct work_struct *work);
+static void rxrpc_set_service_reap_timer(struct rxrpc_net *rxnet,
+                                        unsigned long reap_at);
 
 static void rxrpc_connection_timer(struct timer_list *timer)
 {
        struct rxrpc_connection *conn =
                container_of(timer, struct rxrpc_connection, timer);
 
-       rxrpc_queue_conn(conn);
+       rxrpc_queue_conn(conn, rxrpc_conn_queue_timer);
 }
 
 /*
  * allocate a new connection
  */
-struct rxrpc_connection *rxrpc_alloc_connection(gfp_t gfp)
+struct rxrpc_connection *rxrpc_alloc_connection(struct rxrpc_net *rxnet,
+                                               gfp_t gfp)
 {
        struct rxrpc_connection *conn;
 
@@ -42,10 +45,12 @@ struct rxrpc_connection *rxrpc_alloc_connection(gfp_t gfp)
        if (conn) {
                INIT_LIST_HEAD(&conn->cache_link);
                timer_setup(&conn->timer, &rxrpc_connection_timer, 0);
-               INIT_WORK(&conn->processor, &rxrpc_process_connection);
+               INIT_WORK(&conn->processor, rxrpc_process_connection);
+               INIT_WORK(&conn->destructor, rxrpc_clean_up_connection);
                INIT_LIST_HEAD(&conn->proc_link);
                INIT_LIST_HEAD(&conn->link);
                skb_queue_head_init(&conn->rx_queue);
+               conn->rxnet = rxnet;
                conn->security = &rxrpc_no_security;
                spin_lock_init(&conn->state_lock);
                conn->debug_id = atomic_inc_return(&rxrpc_debug_id);
@@ -67,89 +72,55 @@ struct rxrpc_connection *rxrpc_alloc_connection(gfp_t gfp)
  *
  * The caller must be holding the RCU read lock.
  */
-struct rxrpc_connection *rxrpc_find_connection_rcu(struct rxrpc_local *local,
-                                                  struct sk_buff *skb,
-                                                  struct rxrpc_peer **_peer)
+struct rxrpc_connection *rxrpc_find_client_connection_rcu(struct rxrpc_local *local,
+                                                         struct sockaddr_rxrpc *srx,
+                                                         struct sk_buff *skb)
 {
        struct rxrpc_connection *conn;
-       struct rxrpc_conn_proto k;
        struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
-       struct sockaddr_rxrpc srx;
        struct rxrpc_peer *peer;
 
        _enter(",%x", sp->hdr.cid & RXRPC_CIDMASK);
 
-       if (rxrpc_extract_addr_from_skb(&srx, skb) < 0)
-               goto not_found;
-
-       if (srx.transport.family != local->srx.transport.family &&
-           (srx.transport.family == AF_INET &&
-            local->srx.transport.family != AF_INET6)) {
-               pr_warn_ratelimited("AF_RXRPC: Protocol mismatch %u not %u\n",
-                                   srx.transport.family,
-                                   local->srx.transport.family);
+       /* Look up client connections by connection ID alone as their IDs are
+        * unique for this machine.
+        */
+       conn = idr_find(&rxrpc_client_conn_ids, sp->hdr.cid >> RXRPC_CIDSHIFT);
+       if (!conn || refcount_read(&conn->ref) == 0) {
+               _debug("no conn");
                goto not_found;
        }
 
-       k.epoch = sp->hdr.epoch;
-       k.cid   = sp->hdr.cid & RXRPC_CIDMASK;
-
-       if (rxrpc_to_server(sp)) {
-               /* We need to look up service connections by the full protocol
-                * parameter set.  We look up the peer first as an intermediate
-                * step and then the connection from the peer's tree.
-                */
-               peer = rxrpc_lookup_peer_rcu(local, &srx);
-               if (!peer)
-                       goto not_found;
-               *_peer = peer;
-               conn = rxrpc_find_service_conn_rcu(peer, skb);
-               if (!conn || refcount_read(&conn->ref) == 0)
-                       goto not_found;
-               _leave(" = %p", conn);
-               return conn;
-       } else {
-               /* Look up client connections by connection ID alone as their
-                * IDs are unique for this machine.
-                */
-               conn = idr_find(&rxrpc_client_conn_ids,
-                               sp->hdr.cid >> RXRPC_CIDSHIFT);
-               if (!conn || refcount_read(&conn->ref) == 0) {
-                       _debug("no conn");
-                       goto not_found;
-               }
+       if (conn->proto.epoch != sp->hdr.epoch ||
+           conn->local != local)
+               goto not_found;
 
-               if (conn->proto.epoch != k.epoch ||
-                   conn->params.local != local)
+       peer = conn->peer;
+       switch (srx->transport.family) {
+       case AF_INET:
+               if (peer->srx.transport.sin.sin_port !=
+                   srx->transport.sin.sin_port ||
+                   peer->srx.transport.sin.sin_addr.s_addr !=
+                   srx->transport.sin.sin_addr.s_addr)
                        goto not_found;
-
-               peer = conn->params.peer;
-               switch (srx.transport.family) {
-               case AF_INET:
-                       if (peer->srx.transport.sin.sin_port !=
-                           srx.transport.sin.sin_port ||
-                           peer->srx.transport.sin.sin_addr.s_addr !=
-                           srx.transport.sin.sin_addr.s_addr)
-                               goto not_found;
-                       break;
+               break;
 #ifdef CONFIG_AF_RXRPC_IPV6
-               case AF_INET6:
-                       if (peer->srx.transport.sin6.sin6_port !=
-                           srx.transport.sin6.sin6_port ||
-                           memcmp(&peer->srx.transport.sin6.sin6_addr,
-                                  &srx.transport.sin6.sin6_addr,
-                                  sizeof(struct in6_addr)) != 0)
-                               goto not_found;
-                       break;
+       case AF_INET6:
+               if (peer->srx.transport.sin6.sin6_port !=
+                   srx->transport.sin6.sin6_port ||
+                   memcmp(&peer->srx.transport.sin6.sin6_addr,
+                          &srx->transport.sin6.sin6_addr,
+                          sizeof(struct in6_addr)) != 0)
+                       goto not_found;
+               break;
 #endif
-               default:
-                       BUG();
-               }
-
-               _leave(" = %p", conn);
-               return conn;
+       default:
+               BUG();
        }
 
+       _leave(" = %p", conn);
+       return conn;
+
 not_found:
        _leave(" = NULL");
        return NULL;
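
The lookup above trusts the connection ID only as an index: epoch, local endpoint and peer address are all re-verified before the connection is returned, and a zero refcount means the object is already dying and must be treated as absent. A rough userspace analogue of that index-first, verify-second pattern (all names and constants here are illustrative, not taken from rxrpc):

    #include <stdbool.h>
    #include <stdint.h>

    #define CID_SHIFT  8            /* illustrative, not RXRPC_CIDSHIFT itself */
    #define TABLE_SIZE 1024

    struct conn {
            uint32_t epoch;
            int      refs;
    };

    static struct conn *conn_table[TABLE_SIZE]; /* stand-in for the client-conn IDR */

    /* Index by connection ID first, then verify the rest of the key. */
    static struct conn *find_client_conn(uint32_t cid, uint32_t epoch)
    {
            struct conn *c = conn_table[(cid >> CID_SHIFT) % TABLE_SIZE];

            if (!c || c->refs == 0)         /* already dying: treat as absent */
                    return NULL;
            if (c->epoch != epoch)          /* ID matched but the key didn't */
                    return NULL;
            return c;
    }
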
@@ -210,9 +181,9 @@ void rxrpc_disconnect_call(struct rxrpc_call *call)
        call->peer->cong_ssthresh = call->cong_ssthresh;
 
        if (!hlist_unhashed(&call->error_link)) {
-               spin_lock_bh(&call->peer->lock);
-               hlist_del_rcu(&call->error_link);
-               spin_unlock_bh(&call->peer->lock);
+               spin_lock(&call->peer->lock);
+               hlist_del_init(&call->error_link);
+               spin_unlock(&call->peer->lock);
        }
 
        if (rxrpc_is_client_call(call))
@@ -224,79 +195,45 @@ void rxrpc_disconnect_call(struct rxrpc_call *call)
 
        set_bit(RXRPC_CALL_DISCONNECTED, &call->flags);
        conn->idle_timestamp = jiffies;
-}
-
-/*
- * Kill off a connection.
- */
-void rxrpc_kill_connection(struct rxrpc_connection *conn)
-{
-       struct rxrpc_net *rxnet = conn->params.local->rxnet;
-
-       ASSERT(!rcu_access_pointer(conn->channels[0].call) &&
-              !rcu_access_pointer(conn->channels[1].call) &&
-              !rcu_access_pointer(conn->channels[2].call) &&
-              !rcu_access_pointer(conn->channels[3].call));
-       ASSERT(list_empty(&conn->cache_link));
-
-       write_lock(&rxnet->conn_lock);
-       list_del_init(&conn->proc_link);
-       write_unlock(&rxnet->conn_lock);
-
-       /* Drain the Rx queue.  Note that even though we've unpublished, an
-        * incoming packet could still be being added to our Rx queue, so we
-        * will need to drain it again in the RCU cleanup handler.
-        */
-       rxrpc_purge_queue(&conn->rx_queue);
-
-       /* Leave final destruction to RCU.  The connection processor work item
-        * must carry a ref on the connection to prevent us getting here whilst
-        * it is queued or running.
-        */
-       call_rcu(&conn->rcu, rxrpc_destroy_connection);
+       if (atomic_dec_and_test(&conn->active))
+               rxrpc_set_service_reap_timer(conn->rxnet,
+                                            jiffies + rxrpc_connection_expiry);
 }
 
 /*
  * Queue a connection's work processor so long as the connection is still
  * active; the work item no longer carries a ref of its own.
  */
-bool rxrpc_queue_conn(struct rxrpc_connection *conn)
+void rxrpc_queue_conn(struct rxrpc_connection *conn, enum rxrpc_conn_trace why)
 {
-       const void *here = __builtin_return_address(0);
-       int r;
-
-       if (!__refcount_inc_not_zero(&conn->ref, &r))
-               return false;
-       if (rxrpc_queue_work(&conn->processor))
-               trace_rxrpc_conn(conn->debug_id, rxrpc_conn_queued, r + 1, here);
-       else
-               rxrpc_put_connection(conn);
-       return true;
+       if (atomic_read(&conn->active) >= 0 &&
+           rxrpc_queue_work(&conn->processor))
+               rxrpc_see_connection(conn, why);
 }
 
 /*
  * Note the re-emergence of a connection.
  */
-void rxrpc_see_connection(struct rxrpc_connection *conn)
+void rxrpc_see_connection(struct rxrpc_connection *conn,
+                         enum rxrpc_conn_trace why)
 {
-       const void *here = __builtin_return_address(0);
        if (conn) {
-               int n = refcount_read(&conn->ref);
+               int r = refcount_read(&conn->ref);
 
-               trace_rxrpc_conn(conn->debug_id, rxrpc_conn_seen, n, here);
+               trace_rxrpc_conn(conn->debug_id, r, why);
        }
 }
 
 /*
  * Get a ref on a connection.
  */
-struct rxrpc_connection *rxrpc_get_connection(struct rxrpc_connection *conn)
+struct rxrpc_connection *rxrpc_get_connection(struct rxrpc_connection *conn,
+                                             enum rxrpc_conn_trace why)
 {
-       const void *here = __builtin_return_address(0);
        int r;
 
        __refcount_inc(&conn->ref, &r);
-       trace_rxrpc_conn(conn->debug_id, rxrpc_conn_got, r, here);
+       trace_rxrpc_conn(conn->debug_id, r + 1, why);
        return conn;
 }
 
@@ -304,14 +241,14 @@ struct rxrpc_connection *rxrpc_get_connection(struct rxrpc_connection *conn)
  * Try to get a ref on a connection.
  */
 struct rxrpc_connection *
-rxrpc_get_connection_maybe(struct rxrpc_connection *conn)
+rxrpc_get_connection_maybe(struct rxrpc_connection *conn,
+                          enum rxrpc_conn_trace why)
 {
-       const void *here = __builtin_return_address(0);
        int r;
 
        if (conn) {
                if (__refcount_inc_not_zero(&conn->ref, &r))
-                       trace_rxrpc_conn(conn->debug_id, rxrpc_conn_got, r + 1, here);
+                       trace_rxrpc_conn(conn->debug_id, r + 1, why);
                else
                        conn = NULL;
        }
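
The difference between the two helpers above is the classic split between an unconditional refcount bump (legal only while the caller already holds a reference) and inc-not-zero (safe to attempt on an object that may be concurrently dying). A minimal userspace sketch of the same pair, assuming C11 atomics; the printf stands in for trace_rxrpc_conn():

    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdio.h>

    struct obj {
            atomic_int ref;
            int        debug_id;
    };

    /* Unconditional get: only legal when the caller already holds a ref. */
    static void obj_get(struct obj *o, const char *why)
    {
            int r = atomic_fetch_add(&o->ref, 1);

            printf("obj %d ref %d -> %d (%s)\n", o->debug_id, r, r + 1, why);
    }

    /* Conditional get: refuses to revive an object whose count hit zero. */
    static bool obj_get_maybe(struct obj *o, const char *why)
    {
            int r = atomic_load(&o->ref);

            do {
                    if (r == 0)
                            return false;
            } while (!atomic_compare_exchange_weak(&o->ref, &r, r + 1));

            printf("obj %d ref %d -> %d (%s)\n", o->debug_id, r, r + 1, why);
            return true;
    }
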
@@ -329,49 +266,95 @@ static void rxrpc_set_service_reap_timer(struct rxrpc_net *rxnet,
 }
 
 /*
- * Release a service connection
+ * Free a virtual connection after an RCU grace period.
  */
-void rxrpc_put_service_conn(struct rxrpc_connection *conn)
+static void rxrpc_rcu_free_connection(struct rcu_head *rcu)
 {
-       const void *here = __builtin_return_address(0);
-       unsigned int debug_id = conn->debug_id;
-       int r;
+       struct rxrpc_connection *conn =
+               container_of(rcu, struct rxrpc_connection, rcu);
+       struct rxrpc_net *rxnet = conn->rxnet;
 
-       __refcount_dec(&conn->ref, &r);
-       trace_rxrpc_conn(debug_id, rxrpc_conn_put_service, r - 1, here);
-       if (r - 1 == 1)
-               rxrpc_set_service_reap_timer(conn->params.local->rxnet,
-                                            jiffies + rxrpc_connection_expiry);
+       _enter("{%d,u=%d}", conn->debug_id, refcount_read(&conn->ref));
+
+       trace_rxrpc_conn(conn->debug_id, refcount_read(&conn->ref),
+                        rxrpc_conn_free);
+       kfree(conn);
+
+       if (atomic_dec_and_test(&rxnet->nr_conns))
+               wake_up_var(&rxnet->nr_conns);
 }
 
 /*
- * destroy a virtual connection
+ * Clean up a dead connection.
  */
-static void rxrpc_destroy_connection(struct rcu_head *rcu)
+static void rxrpc_clean_up_connection(struct work_struct *work)
 {
        struct rxrpc_connection *conn =
-               container_of(rcu, struct rxrpc_connection, rcu);
+               container_of(work, struct rxrpc_connection, destructor);
+       struct rxrpc_net *rxnet = conn->rxnet;
 
-       _enter("{%d,u=%d}", conn->debug_id, refcount_read(&conn->ref));
+       ASSERT(!rcu_access_pointer(conn->channels[0].call) &&
+              !rcu_access_pointer(conn->channels[1].call) &&
+              !rcu_access_pointer(conn->channels[2].call) &&
+              !rcu_access_pointer(conn->channels[3].call));
+       ASSERT(list_empty(&conn->cache_link));
 
-       ASSERTCMP(refcount_read(&conn->ref), ==, 0);
+       del_timer_sync(&conn->timer);
+       cancel_work_sync(&conn->processor); /* Processing may restart the timer */
+       del_timer_sync(&conn->timer);
 
-       _net("DESTROY CONN %d", conn->debug_id);
+       write_lock(&rxnet->conn_lock);
+       list_del_init(&conn->proc_link);
+       write_unlock(&rxnet->conn_lock);
 
-       del_timer_sync(&conn->timer);
        rxrpc_purge_queue(&conn->rx_queue);
 
+       rxrpc_kill_client_conn(conn);
+
        conn->security->clear(conn);
-       key_put(conn->params.key);
-       rxrpc_put_bundle(conn->bundle);
-       rxrpc_put_peer(conn->params.peer);
+       key_put(conn->key);
+       rxrpc_put_bundle(conn->bundle, rxrpc_bundle_put_conn);
+       rxrpc_put_peer(conn->peer, rxrpc_peer_put_conn);
+       rxrpc_put_local(conn->local, rxrpc_local_put_kill_conn);
+
+       /* Drain the Rx queue again.  Even though we unpublished the
+        * connection above, an incoming packet may still have been in the
+        * process of being added to the queue when we drained it the first
+        * time.
+        */
+       rxrpc_purge_queue(&conn->rx_queue);
 
-       if (atomic_dec_and_test(&conn->params.local->rxnet->nr_conns))
-               wake_up_var(&conn->params.local->rxnet->nr_conns);
-       rxrpc_put_local(conn->params.local);
+       call_rcu(&conn->rcu, rxrpc_rcu_free_connection);
+}
 
-       kfree(conn);
-       _leave("");
+/*
+ * Drop a ref on a connection.
+ */
+void rxrpc_put_connection(struct rxrpc_connection *conn,
+                         enum rxrpc_conn_trace why)
+{
+       unsigned int debug_id;
+       bool dead;
+       int r;
+
+       if (!conn)
+               return;
+
+       debug_id = conn->debug_id;
+       dead = __refcount_dec_and_test(&conn->ref, &r);
+       trace_rxrpc_conn(debug_id, r - 1, why);
+       if (dead) {
+               del_timer(&conn->timer);
+               cancel_work(&conn->processor);
+
+               if (in_softirq() || work_busy(&conn->processor) ||
+                   timer_pending(&conn->timer))
+                       /* Can't use the rxrpc workqueue as we need to cancel/flush
+                        * something that may be running/waiting there.
+                        */
+                       schedule_work(&conn->destructor);
+               else
+                       rxrpc_clean_up_connection(&conn->destructor);
+       }
 }
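
The last-put path cannot always tear the connection down synchronously: in softirq context it may not sleep, and if the processor work item or the timer is live, flushing them from here could deadlock, hence the handover to schedule_work(). The refcounting core underneath is plain dec-and-test; a hedged userspace sketch of just that part (C11 atomics, names invented):

    #include <stdatomic.h>
    #include <stddef.h>

    struct obj {
            atomic_int ref;
            void (*destructor)(struct obj *);
    };

    /* Drop a ref; the thread that takes the count to zero owns teardown. */
    static void obj_put(struct obj *o)
    {
            if (o && atomic_fetch_sub(&o->ref, 1) == 1)
                    o->destructor(o);       /* last ref gone: tear down */
    }
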
 
 /*
@@ -383,6 +366,7 @@ void rxrpc_service_connection_reaper(struct work_struct *work)
        struct rxrpc_net *rxnet =
                container_of(work, struct rxrpc_net, service_conn_reaper);
        unsigned long expire_at, earliest, idle_timestamp, now;
+       int active;
 
        LIST_HEAD(graveyard);
 
@@ -393,20 +377,20 @@ void rxrpc_service_connection_reaper(struct work_struct *work)
 
        write_lock(&rxnet->conn_lock);
        list_for_each_entry_safe(conn, _p, &rxnet->service_conns, link) {
-               ASSERTCMP(refcount_read(&conn->ref), >, 0);
-               if (likely(refcount_read(&conn->ref) > 1))
+               ASSERTCMP(atomic_read(&conn->active), >=, 0);
+               if (likely(atomic_read(&conn->active) > 0))
                        continue;
                if (conn->state == RXRPC_CONN_SERVICE_PREALLOC)
                        continue;
 
-               if (rxnet->live && !conn->params.local->dead) {
+               if (rxnet->live && !conn->local->dead) {
                        idle_timestamp = READ_ONCE(conn->idle_timestamp);
                        expire_at = idle_timestamp + rxrpc_connection_expiry * HZ;
-                       if (conn->params.local->service_closed)
+                       if (conn->local->service_closed)
                                expire_at = idle_timestamp + rxrpc_closed_conn_expiry * HZ;
 
-                       _debug("reap CONN %d { u=%d,t=%ld }",
-                              conn->debug_id, refcount_read(&conn->ref),
+                       _debug("reap CONN %d { a=%d,t=%ld }",
+                              conn->debug_id, atomic_read(&conn->active),
                               (long)expire_at - (long)now);
 
                        if (time_before(now, expire_at)) {
@@ -416,12 +400,13 @@ void rxrpc_service_connection_reaper(struct work_struct *work)
                        }
                }
 
-               /* The usage count sits at 1 whilst the object is unused on the
-                * list; we reduce that to 0 to make the object unavailable.
+               /* The activity count sits at 0 whilst the conn is unused on
+                * the list; we reduce that to -1 to make the conn unavailable.
                 */
-               if (!refcount_dec_if_one(&conn->ref))
+               active = 0;
+               if (!atomic_try_cmpxchg(&conn->active, &active, -1))
                        continue;
-               trace_rxrpc_conn(conn->debug_id, rxrpc_conn_reap_service, 0, NULL);
+               rxrpc_see_connection(conn, rxrpc_conn_see_reap_service);
 
                if (rxrpc_conn_is_client(conn))
                        BUG();
@@ -443,8 +428,8 @@ void rxrpc_service_connection_reaper(struct work_struct *work)
                                  link);
                list_del_init(&conn->link);
 
-               ASSERTCMP(refcount_read(&conn->ref), ==, 0);
-               rxrpc_kill_connection(conn);
+               ASSERTCMP(atomic_read(&conn->active), ==, -1);
+               rxrpc_put_connection(conn, rxrpc_conn_put_service_reaped);
        }
 
        _leave("");
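
The reaper claims an idle connection by flipping conn->active from 0 to -1 with a compare-and-swap, so a concurrent user that bumps 0 to 1 first wins the race and the connection survives this pass. The claim in isolation, as a userspace sketch (illustrative only):

    #include <stdatomic.h>
    #include <stdbool.h>

    /* Claim an idle object for reaping: 0 (unused) -> -1 (unavailable).
     * A concurrent user moving 0 -> 1 makes the claim fail and the
     * object survives.
     */
    static bool reap_claim(atomic_int *active)
    {
            int idle = 0;

            return atomic_compare_exchange_strong(active, &idle, -1);
    }
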
index 6e6aa02..2a55a88 100644 (file)
@@ -73,7 +73,7 @@ static void rxrpc_publish_service_conn(struct rxrpc_peer *peer,
        struct rxrpc_conn_proto k = conn->proto;
        struct rb_node **pp, *parent;
 
-       write_seqlock_bh(&peer->service_conn_lock);
+       write_seqlock(&peer->service_conn_lock);
 
        pp = &peer->service_conns.rb_node;
        parent = NULL;
@@ -94,14 +94,14 @@ static void rxrpc_publish_service_conn(struct rxrpc_peer *peer,
        rb_insert_color(&conn->service_node, &peer->service_conns);
 conn_published:
        set_bit(RXRPC_CONN_IN_SERVICE_CONNS, &conn->flags);
-       write_sequnlock_bh(&peer->service_conn_lock);
+       write_sequnlock(&peer->service_conn_lock);
        _leave(" = %d [new]", conn->debug_id);
        return;
 
 found_extant_conn:
        if (refcount_read(&cursor->ref) == 0)
                goto replace_old_connection;
-       write_sequnlock_bh(&peer->service_conn_lock);
+       write_sequnlock(&peer->service_conn_lock);
        /* We should not be able to get here.  rxrpc_incoming_connection() is
         * called in a non-reentrant context, so there can't be a race to
         * insert a new connection.
@@ -125,7 +125,7 @@ replace_old_connection:
 struct rxrpc_connection *rxrpc_prealloc_service_connection(struct rxrpc_net *rxnet,
                                                           gfp_t gfp)
 {
-       struct rxrpc_connection *conn = rxrpc_alloc_connection(gfp);
+       struct rxrpc_connection *conn = rxrpc_alloc_connection(rxnet, gfp);
 
        if (conn) {
                /* We maintain an extra ref on the connection whilst it is on
@@ -133,7 +133,8 @@ struct rxrpc_connection *rxrpc_prealloc_service_connection(struct rxrpc_net *rxn
                 */
                conn->state = RXRPC_CONN_SERVICE_PREALLOC;
                refcount_set(&conn->ref, 2);
-               conn->bundle = rxrpc_get_bundle(&rxrpc_service_dummy_bundle);
+               conn->bundle = rxrpc_get_bundle(&rxrpc_service_dummy_bundle,
+                                               rxrpc_bundle_get_service_conn);
 
                atomic_inc(&rxnet->nr_conns);
                write_lock(&rxnet->conn_lock);
@@ -141,9 +142,7 @@ struct rxrpc_connection *rxrpc_prealloc_service_connection(struct rxrpc_net *rxn
                list_add_tail(&conn->proc_link, &rxnet->conn_proc_list);
                write_unlock(&rxnet->conn_lock);
 
-               trace_rxrpc_conn(conn->debug_id, rxrpc_conn_new_service,
-                                refcount_read(&conn->ref),
-                                __builtin_return_address(0));
+               rxrpc_see_connection(conn, rxrpc_conn_new_service);
        }
 
        return conn;
@@ -164,7 +163,7 @@ void rxrpc_new_incoming_connection(struct rxrpc_sock *rx,
 
        conn->proto.epoch       = sp->hdr.epoch;
        conn->proto.cid         = sp->hdr.cid & RXRPC_CIDMASK;
-       conn->params.service_id = sp->hdr.serviceId;
+       conn->orig_service_id   = sp->hdr.serviceId;
        conn->service_id        = sp->hdr.serviceId;
        conn->security_ix       = sp->hdr.securityIndex;
        conn->out_clientflag    = 0;
@@ -182,10 +181,10 @@ void rxrpc_new_incoming_connection(struct rxrpc_sock *rx,
            conn->service_id == rx->service_upgrade.from)
                conn->service_id = rx->service_upgrade.to;
 
-       /* Make the connection a target for incoming packets. */
-       rxrpc_publish_service_conn(conn->params.peer, conn);
+       atomic_set(&conn->active, 1);
 
-       _net("CONNECTION new %d {%x}", conn->debug_id, conn->proto.cid);
+       /* Make the connection a target for incoming packets. */
+       rxrpc_publish_service_conn(conn->peer, conn);
 }
 
 /*
@@ -194,10 +193,10 @@ void rxrpc_new_incoming_connection(struct rxrpc_sock *rx,
  */
 void rxrpc_unpublish_service_conn(struct rxrpc_connection *conn)
 {
-       struct rxrpc_peer *peer = conn->params.peer;
+       struct rxrpc_peer *peer = conn->peer;
 
-       write_seqlock_bh(&peer->service_conn_lock);
+       write_seqlock(&peer->service_conn_lock);
        if (test_and_clear_bit(RXRPC_CONN_IN_SERVICE_CONNS, &conn->flags))
                rb_erase(&conn->service_node, &peer->service_conns);
-       write_sequnlock_bh(&peer->service_conn_lock);
+       write_sequnlock(&peer->service_conn_lock);
 }
index bdf70b8..d0e20e9 100644 (file)
@@ -1,7 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/* RxRPC packet reception
+/* Processing of received RxRPC packets
  *
- * Copyright (C) 2007, 2016 Red Hat, Inc. All Rights Reserved.
+ * Copyright (C) 2020 Red Hat, Inc. All Rights Reserved.
  * Written by David Howells (dhowells@redhat.com)
  */
 
 static void rxrpc_proto_abort(const char *why,
                              struct rxrpc_call *call, rxrpc_seq_t seq)
 {
-       if (rxrpc_abort_call(why, call, seq, RX_PROTOCOL_ERROR, -EBADMSG)) {
-               set_bit(RXRPC_CALL_EV_ABORT, &call->events);
-               rxrpc_queue_call(call);
-       }
+       if (rxrpc_abort_call(why, call, seq, RX_PROTOCOL_ERROR, -EBADMSG))
+               rxrpc_send_abort_packet(call);
 }
 
 /*
@@ -58,25 +56,6 @@ static void rxrpc_congestion_management(struct rxrpc_call *call,
        summary->cumulative_acks = cumulative_acks;
        summary->dup_acks = call->cong_dup_acks;
 
-       /* If we haven't transmitted anything for >1RTT, we should reset the
-        * congestion management state.
-        */
-       if ((call->cong_mode == RXRPC_CALL_SLOW_START ||
-            call->cong_mode == RXRPC_CALL_CONGEST_AVOIDANCE) &&
-           ktime_before(ktime_add_us(call->tx_last_sent,
-                                     call->peer->srtt_us >> 3),
-                        ktime_get_real())
-           ) {
-               change = rxrpc_cong_idle_reset;
-               summary->mode = RXRPC_CALL_SLOW_START;
-               if (RXRPC_TX_SMSS > 2190)
-                       summary->cwnd = 2;
-               else if (RXRPC_TX_SMSS > 1095)
-                       summary->cwnd = 3;
-               else
-                       summary->cwnd = 4;
-       }
-
        switch (call->cong_mode) {
        case RXRPC_CALL_SLOW_START:
                if (summary->saw_nacks)
@@ -174,8 +153,8 @@ out_no_clear_ca:
        call->cong_cwnd = cwnd;
        call->cong_cumul_acks = cumulative_acks;
        trace_rxrpc_congest(call, summary, acked_serial, change);
-       if (resend && !test_and_set_bit(RXRPC_CALL_EV_RESEND, &call->events))
-               rxrpc_queue_call(call);
+       if (resend)
+               rxrpc_resend(call, skb);
        return;
 
 packet_loss_detected:
@@ -197,6 +176,33 @@ send_extra_data:
 }
 
 /*
+ * Degrade the congestion window if we haven't transmitted a packet for >1RTT.
+ */
+void rxrpc_congestion_degrade(struct rxrpc_call *call)
+{
+       ktime_t rtt, now;
+
+       if (call->cong_mode != RXRPC_CALL_SLOW_START &&
+           call->cong_mode != RXRPC_CALL_CONGEST_AVOIDANCE)
+               return;
+       if (call->state == RXRPC_CALL_CLIENT_AWAIT_REPLY)
+               return;
+
+       rtt = ns_to_ktime(call->peer->srtt_us * (1000 / 8));
+       now = ktime_get_real();
+       if (!ktime_before(ktime_add(call->tx_last_sent, rtt), now))
+               return;
+
+       trace_rxrpc_reset_cwnd(call, now);
+       rxrpc_inc_stat(call->rxnet, stat_tx_data_cwnd_reset);
+       call->tx_last_sent = now;
+       call->cong_mode = RXRPC_CALL_SLOW_START;
+       call->cong_ssthresh = max_t(unsigned int, call->cong_ssthresh,
+                                   call->cong_cwnd * 3 / 4);
+       call->cong_cwnd = max_t(unsigned int, call->cong_cwnd / 2, RXRPC_MIN_CWND);
+}
+
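
Note that srtt_us is stored as eight times the smoothed RTT in microseconds (hence the `srtt_us >> 3` in the removed code and the `* (1000 / 8)` conversion to nanoseconds here). The reset itself halves cwnd, floored at RXRPC_MIN_CWND, and raises ssthresh to at least three quarters of the old window; with cwnd=16 and ssthresh=8, for instance, it leaves cwnd=8 and ssthresh=12. Just the arithmetic, with an assumed stand-in for the real constant:

    #define MIN_CWND 4      /* assumed stand-in for RXRPC_MIN_CWND */

    struct cong_state {
            unsigned int cwnd;
            unsigned int ssthresh;
    };

    /* e.g. cwnd=16, ssthresh=8  ->  cwnd=8, ssthresh=12 */
    static void cong_idle_degrade(struct cong_state *c)
    {
            unsigned int floor = c->cwnd * 3 / 4;

            if (c->ssthresh < floor)
                    c->ssthresh = floor;
            c->cwnd = c->cwnd / 2 > MIN_CWND ? c->cwnd / 2 : MIN_CWND;
    }
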
+/*
  * Apply a hard ACK by advancing the Tx window.
  */
 static bool rxrpc_rotate_tx_window(struct rxrpc_call *call, rxrpc_seq_t to,
@@ -338,7 +344,8 @@ static void rxrpc_input_queue_data(struct rxrpc_call *call, struct sk_buff *skb,
 /*
  * Process a DATA packet.
  */
-static void rxrpc_input_data_one(struct rxrpc_call *call, struct sk_buff *skb)
+static void rxrpc_input_data_one(struct rxrpc_call *call, struct sk_buff *skb,
+                                bool *_notify)
 {
        struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
        struct sk_buff *oos;
@@ -361,7 +368,7 @@ static void rxrpc_input_data_one(struct rxrpc_call *call, struct sk_buff *skb)
                if (test_and_set_bit(RXRPC_CALL_RX_LAST, &call->flags) &&
                    seq + 1 != wtop) {
                        rxrpc_proto_abort("LSN", call, seq);
-                       goto err_free;
+                       return;
                }
        } else {
                if (test_bit(RXRPC_CALL_RX_LAST, &call->flags) &&
@@ -369,7 +376,7 @@ static void rxrpc_input_data_one(struct rxrpc_call *call, struct sk_buff *skb)
                        pr_warn("Packet beyond last: c=%x q=%x window=%x-%x wlimit=%x\n",
                                call->debug_id, seq, window, wtop, wlimit);
                        rxrpc_proto_abort("LSA", call, seq);
-                       goto err_free;
+                       return;
                }
        }
 
@@ -397,14 +404,18 @@ static void rxrpc_input_data_one(struct rxrpc_call *call, struct sk_buff *skb)
                /* Send an immediate ACK if we fill in a hole */
                else if (!skb_queue_empty(&call->rx_oos_queue))
                        ack_reason = RXRPC_ACK_DELAY;
+               else
+                       atomic_inc_return(&call->ackr_nr_unacked);
 
                window++;
                if (after(window, wtop))
                        wtop = window;
 
+               rxrpc_get_skb(skb, rxrpc_skb_get_to_recvmsg);
+
                spin_lock(&call->recvmsg_queue.lock);
                rxrpc_input_queue_data(call, skb, window, wtop, rxrpc_receive_queue);
-               skb = NULL;
+               *_notify = true;
 
                while ((oos = skb_peek(&call->rx_oos_queue))) {
                        struct rxrpc_skb_priv *osp = rxrpc_skb(oos);
@@ -456,36 +467,26 @@ static void rxrpc_input_data_one(struct rxrpc_call *call, struct sk_buff *skb)
                        struct rxrpc_skb_priv *osp = rxrpc_skb(oos);
 
                        if (after(osp->hdr.seq, seq)) {
+                               rxrpc_get_skb(skb, rxrpc_skb_get_to_recvmsg_oos);
                                __skb_queue_before(&call->rx_oos_queue, oos, skb);
                                goto oos_queued;
                        }
                }
 
+               rxrpc_get_skb(skb, rxrpc_skb_get_to_recvmsg_oos);
                __skb_queue_tail(&call->rx_oos_queue, skb);
        oos_queued:
                trace_rxrpc_receive(call, last ? rxrpc_receive_oos_last : rxrpc_receive_oos,
                                    sp->hdr.serial, sp->hdr.seq);
-               skb = NULL;
        }
 
 send_ack:
-       if (ack_reason < 0 &&
-           atomic_inc_return(&call->ackr_nr_unacked) > 2 &&
-           test_and_set_bit(RXRPC_CALL_IDLE_ACK_PENDING, &call->flags)) {
-               ack_reason = RXRPC_ACK_IDLE;
-       } else if (ack_reason >= 0) {
-               set_bit(RXRPC_CALL_IDLE_ACK_PENDING, &call->flags);
-       }
-
        if (ack_reason >= 0)
                rxrpc_send_ACK(call, ack_reason, serial,
                               rxrpc_propose_ack_input_data);
        else
                rxrpc_propose_delay_ACK(call, serial,
                                        rxrpc_propose_ack_input_data);
-
-err_free:
-       rxrpc_free_skb(skb, rxrpc_skb_freed);
 }
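
Out-of-sequence DATA packets above are kept on rx_oos_queue sorted by sequence number: the code scans for the first queued packet whose seq is after the new one and inserts in front of it. The same idea over a plain linked list, using the wrap-safe comparison that after() performs (sketch, not rxrpc code; duplicate seqs are assumed to be filtered earlier):

    #include <stdint.h>

    struct pkt {
            uint32_t    seq;
            struct pkt *next;
    };

    /* Keep the out-of-order queue sorted by sequence number: insert
     * before the first entry whose seq is after ours.
     */
    static void oos_insert(struct pkt **head, struct pkt *p)
    {
            struct pkt **pp = head;

            /* serial-number arithmetic, like the kernel's after() */
            while (*pp && (int32_t)((*pp)->seq - p->seq) <= 0)
                    pp = &(*pp)->next;
            p->next = *pp;
            *pp = p;
    }
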
 
 /*
@@ -498,6 +499,7 @@ static bool rxrpc_input_split_jumbo(struct rxrpc_call *call, struct sk_buff *skb
        struct sk_buff *jskb;
        unsigned int offset = sizeof(struct rxrpc_wire_header);
        unsigned int len = skb->len - offset;
+       bool notify = false;
 
        while (sp->hdr.flags & RXRPC_JUMBO_PACKET) {
                if (len < RXRPC_JUMBO_SUBPKTLEN)
@@ -508,16 +510,17 @@ static bool rxrpc_input_split_jumbo(struct rxrpc_call *call, struct sk_buff *skb
                                  &jhdr, sizeof(jhdr)) < 0)
                        goto protocol_error;
 
-               jskb = skb_clone(skb, GFP_ATOMIC);
+               jskb = skb_clone(skb, GFP_NOFS);
                if (!jskb) {
                        kdebug("couldn't clone");
                        return false;
                }
-               rxrpc_new_skb(jskb, rxrpc_skb_cloned_jumbo);
+               rxrpc_new_skb(jskb, rxrpc_skb_new_jumbo_subpacket);
                jsp = rxrpc_skb(jskb);
                jsp->offset = offset;
                jsp->len = RXRPC_JUMBO_DATALEN;
-               rxrpc_input_data_one(call, jskb);
+               rxrpc_input_data_one(call, jskb, &notify);
+               rxrpc_free_skb(jskb, rxrpc_skb_put_jumbo_subpacket);
 
                sp->hdr.flags = jhdr.flags;
                sp->hdr._rsvd = ntohs(jhdr._rsvd);
@@ -529,7 +532,11 @@ static bool rxrpc_input_split_jumbo(struct rxrpc_call *call, struct sk_buff *skb
 
        sp->offset = offset;
        sp->len    = len;
-       rxrpc_input_data_one(call, skb);
+       rxrpc_input_data_one(call, skb, &notify);
+       if (notify) {
+               trace_rxrpc_notify_socket(call->debug_id, sp->hdr.serial);
+               rxrpc_notify_socket(call);
+       }
        return true;
 
 protocol_error:
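
A jumbo packet is a train of fixed-size subpackets, each followed by a small secondary header whose flags say whether another subpacket follows; the loop above clones the skb once per subpacket and advances the offset. A self-contained sketch of the same walk over a flat buffer; the sizes and the flag test are assumptions for illustration, not the protocol definition:

    #include <stddef.h>
    #include <stdint.h>

    #define JUMBO_FLAG     0x20     /* assumed: "another subpacket follows" */
    #define JUMBO_DATALEN  1412     /* assumed per-subpacket payload */
    #define JUMBO_HDRLEN   4        /* assumed secondary header size */

    /* Split a jumbo payload into subpackets, calling fn() on each.
     * Returns 0 on success, -1 if a subpacket is truncated.
     */
    static int split_jumbo(uint8_t flags, const uint8_t *p, size_t len,
                           void (*fn)(const uint8_t *, size_t))
    {
            while (flags & JUMBO_FLAG) {
                    if (len < JUMBO_DATALEN + JUMBO_HDRLEN)
                            return -1;              /* truncated subpacket */
                    fn(p, JUMBO_DATALEN);
                    flags = p[JUMBO_DATALEN];       /* next secondary header */
                    p   += JUMBO_DATALEN + JUMBO_HDRLEN;
                    len -= JUMBO_DATALEN + JUMBO_HDRLEN;
            }
            fn(p, len);             /* final, possibly short, subpacket */
            return 0;
    }
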
@@ -551,32 +558,9 @@ static void rxrpc_input_data(struct rxrpc_call *call, struct sk_buff *skb)
               atomic64_read(&call->ackr_window), call->rx_highest_seq,
               skb->len, seq0);
 
-       _proto("Rx DATA %%%u { #%u f=%02x }",
-              sp->hdr.serial, seq0, sp->hdr.flags);
-
        state = READ_ONCE(call->state);
-       if (state >= RXRPC_CALL_COMPLETE) {
-               rxrpc_free_skb(skb, rxrpc_skb_freed);
+       if (state >= RXRPC_CALL_COMPLETE)
                return;
-       }
-
-       /* Unshare the packet so that it can be modified for in-place
-        * decryption.
-        */
-       if (sp->hdr.securityIndex != 0) {
-               struct sk_buff *nskb = skb_unshare(skb, GFP_ATOMIC);
-               if (!nskb) {
-                       rxrpc_eaten_skb(skb, rxrpc_skb_unshared_nomem);
-                       return;
-               }
-
-               if (nskb != skb) {
-                       rxrpc_eaten_skb(skb, rxrpc_skb_received);
-                       skb = nskb;
-                       rxrpc_new_skb(skb, rxrpc_skb_unshared);
-                       sp = rxrpc_skb(skb);
-               }
-       }
 
        if (state == RXRPC_CALL_SERVER_RECV_REQUEST) {
                unsigned long timo = READ_ONCE(call->next_req_timo);
@@ -591,28 +575,23 @@ static void rxrpc_input_data(struct rxrpc_call *call, struct sk_buff *skb)
                }
        }
 
-       spin_lock(&call->input_lock);
-
        /* Received data implicitly ACKs all of the request packets we sent
         * when we're acting as a client.
         */
        if ((state == RXRPC_CALL_CLIENT_SEND_REQUEST ||
             state == RXRPC_CALL_CLIENT_AWAIT_REPLY) &&
            !rxrpc_receiving_reply(call))
-               goto out;
+               goto out_notify;
 
        if (!rxrpc_input_split_jumbo(call, skb)) {
                rxrpc_proto_abort("VLD", call, sp->hdr.seq);
-               goto out;
+               goto out_notify;
        }
        skb = NULL;
 
-out:
+out_notify:
        trace_rxrpc_notify_socket(call->debug_id, serial);
        rxrpc_notify_socket(call);
-
-       spin_unlock(&call->input_lock);
-       rxrpc_free_skb(skb, rxrpc_skb_freed);
        _leave(" [queued]");
 }
 
@@ -671,32 +650,6 @@ static void rxrpc_complete_rtt_probe(struct rxrpc_call *call,
 }
 
 /*
- * Process the response to a ping that we sent to find out if we lost an ACK.
- *
- * If we got back a ping response that indicates a lower tx_top than what we
- * had at the time of the ping transmission, we adjudge all the DATA packets
- * sent between the response tx_top and the ping-time tx_top to have been lost.
- */
-static void rxrpc_input_check_for_lost_ack(struct rxrpc_call *call)
-{
-       if (after(call->acks_lost_top, call->acks_prev_seq) &&
-           !test_and_set_bit(RXRPC_CALL_EV_RESEND, &call->events))
-               rxrpc_queue_call(call);
-}
-
-/*
- * Process a ping response.
- */
-static void rxrpc_input_ping_response(struct rxrpc_call *call,
-                                     ktime_t resp_time,
-                                     rxrpc_serial_t acked_serial,
-                                     rxrpc_serial_t ack_serial)
-{
-       if (acked_serial == call->acks_lost_ping)
-               rxrpc_input_check_for_lost_ack(call);
-}
-
-/*
  * Process the extra information that may be appended to an ACK packet
  */
 static void rxrpc_input_ackinfo(struct rxrpc_call *call, struct sk_buff *skb,
@@ -708,11 +661,6 @@ static void rxrpc_input_ackinfo(struct rxrpc_call *call, struct sk_buff *skb,
        bool wake = false;
        u32 rwind = ntohl(ackinfo->rwind);
 
-       _proto("Rx ACK %%%u Info { rx=%u max=%u rwin=%u jm=%u }",
-              sp->hdr.serial,
-              ntohl(ackinfo->rxMTU), ntohl(ackinfo->maxMTU),
-              rwind, ntohl(ackinfo->jumbo_max));
-
        if (rwind > RXRPC_TX_MAX_WINDOW)
                rwind = RXRPC_TX_MAX_WINDOW;
        if (call->tx_winsize != rwind) {
@@ -729,11 +677,10 @@ static void rxrpc_input_ackinfo(struct rxrpc_call *call, struct sk_buff *skb,
 
        peer = call->peer;
        if (mtu < peer->maxdata) {
-               spin_lock_bh(&peer->lock);
+               spin_lock(&peer->lock);
                peer->maxdata = mtu;
                peer->mtu = mtu + peer->hdrsize;
-               spin_unlock_bh(&peer->lock);
-               _net("Net MTU %u (maxdata %u)", peer->mtu, peer->maxdata);
+               spin_unlock(&peer->lock);
        }
 
        if (wake)
@@ -810,7 +757,6 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb)
        struct rxrpc_ackpacket ack;
        struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
        struct rxrpc_ackinfo info;
-       struct sk_buff *skb_old = NULL, *skb_put = skb;
        rxrpc_serial_t ack_serial, acked_serial;
        rxrpc_seq_t first_soft_ack, hard_ack, prev_pkt;
        int nr_acks, offset, ioffset;
@@ -818,10 +764,8 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb)
        _enter("");
 
        offset = sizeof(struct rxrpc_wire_header);
-       if (skb_copy_bits(skb, offset, &ack, sizeof(ack)) < 0) {
-               rxrpc_proto_abort("XAK", call, 0);
-               goto out_not_locked;
-       }
+       if (skb_copy_bits(skb, offset, &ack, sizeof(ack)) < 0)
+               return rxrpc_proto_abort("XAK", call, 0);
        offset += sizeof(ack);
 
        ack_serial = sp->hdr.serial;
@@ -855,7 +799,6 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb)
        }
 
        if (ack.reason == RXRPC_ACK_PING) {
-               _proto("Rx ACK %%%u PING Request", ack_serial);
                rxrpc_send_ACK(call, RXRPC_ACK_PING_RESPONSE, ack_serial,
                               rxrpc_propose_ack_respond_to_ping);
        } else if (sp->hdr.flags & RXRPC_REQUEST_ACK) {
@@ -895,41 +838,25 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb)
                trace_rxrpc_rx_discard_ack(call->debug_id, ack_serial,
                                           first_soft_ack, call->acks_first_seq,
                                           prev_pkt, call->acks_prev_seq);
-               goto out_not_locked;
+               return;
        }
 
        info.rxMTU = 0;
        ioffset = offset + nr_acks + 3;
        if (skb->len >= ioffset + sizeof(info) &&
-           skb_copy_bits(skb, ioffset, &info, sizeof(info)) < 0) {
-               rxrpc_proto_abort("XAI", call, 0);
-               goto out_not_locked;
-       }
+           skb_copy_bits(skb, ioffset, &info, sizeof(info)) < 0)
+               return rxrpc_proto_abort("XAI", call, 0);
 
        if (nr_acks > 0)
                skb_condense(skb);
 
-       spin_lock(&call->input_lock);
-
-       /* Discard any out-of-order or duplicate ACKs (inside lock). */
-       if (!rxrpc_is_ack_valid(call, first_soft_ack, prev_pkt)) {
-               trace_rxrpc_rx_discard_ack(call->debug_id, ack_serial,
-                                          first_soft_ack, call->acks_first_seq,
-                                          prev_pkt, call->acks_prev_seq);
-               goto out;
-       }
        call->acks_latest_ts = skb->tstamp;
-
        call->acks_first_seq = first_soft_ack;
        call->acks_prev_seq = prev_pkt;
 
        switch (ack.reason) {
        case RXRPC_ACK_PING:
                break;
-       case RXRPC_ACK_PING_RESPONSE:
-               rxrpc_input_ping_response(call, skb->tstamp, acked_serial,
-                                         ack_serial);
-               fallthrough;
        default:
                if (after(acked_serial, call->acks_highest_serial))
                        call->acks_highest_serial = acked_serial;
@@ -940,10 +867,8 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb)
        if (info.rxMTU)
                rxrpc_input_ackinfo(call, skb, &info);
 
-       if (first_soft_ack == 0) {
-               rxrpc_proto_abort("AK0", call, 0);
-               goto out;
-       }
+       if (first_soft_ack == 0)
+               return rxrpc_proto_abort("AK0", call, 0);
 
        /* Ignore ACKs unless we are or have just been transmitting. */
        switch (READ_ONCE(call->state)) {
@@ -953,45 +878,27 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb)
        case RXRPC_CALL_SERVER_AWAIT_ACK:
                break;
        default:
-               goto out;
+               return;
        }
 
        if (before(hard_ack, call->acks_hard_ack) ||
-           after(hard_ack, call->tx_top)) {
-               rxrpc_proto_abort("AKW", call, 0);
-               goto out;
-       }
-       if (nr_acks > call->tx_top - hard_ack) {
-               rxrpc_proto_abort("AKN", call, 0);
-               goto out;
-       }
+           after(hard_ack, call->tx_top))
+               return rxrpc_proto_abort("AKW", call, 0);
+       if (nr_acks > call->tx_top - hard_ack)
+               return rxrpc_proto_abort("AKN", call, 0);
 
        if (after(hard_ack, call->acks_hard_ack)) {
                if (rxrpc_rotate_tx_window(call, hard_ack, &summary)) {
                        rxrpc_end_tx_phase(call, false, "ETA");
-                       goto out;
+                       return;
                }
        }
 
        if (nr_acks > 0) {
-               if (offset > (int)skb->len - nr_acks) {
-                       rxrpc_proto_abort("XSA", call, 0);
-                       goto out;
-               }
-
-               spin_lock(&call->acks_ack_lock);
-               skb_old = call->acks_soft_tbl;
-               call->acks_soft_tbl = skb;
-               spin_unlock(&call->acks_ack_lock);
-
+               if (offset > (int)skb->len - nr_acks)
+                       return rxrpc_proto_abort("XSA", call, 0);
                rxrpc_input_soft_acks(call, skb->data + offset, first_soft_ack,
                                      nr_acks, &summary);
-               skb_put = NULL;
-       } else if (call->acks_soft_tbl) {
-               spin_lock(&call->acks_ack_lock);
-               skb_old = call->acks_soft_tbl;
-               call->acks_soft_tbl = NULL;
-               spin_unlock(&call->acks_ack_lock);
        }
 
        if (test_bit(RXRPC_CALL_TX_LAST, &call->flags) &&
@@ -1001,11 +908,6 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb)
                                   rxrpc_propose_ack_ping_for_lost_reply);
 
        rxrpc_congestion_management(call, skb, &summary, acked_serial);
-out:
-       spin_unlock(&call->input_lock);
-out_not_locked:
-       rxrpc_free_skb(skb_put, rxrpc_skb_freed);
-       rxrpc_free_skb(skb_old, rxrpc_skb_freed);
 }
 
 /*
@@ -1014,16 +916,9 @@ out_not_locked:
 static void rxrpc_input_ackall(struct rxrpc_call *call, struct sk_buff *skb)
 {
        struct rxrpc_ack_summary summary = { 0 };
-       struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
-
-       _proto("Rx ACKALL %%%u", sp->hdr.serial);
-
-       spin_lock(&call->input_lock);
 
        if (rxrpc_rotate_tx_window(call, call->tx_top, &summary))
                rxrpc_end_tx_phase(call, false, "ETL");
-
-       spin_unlock(&call->input_lock);
 }
 
 /*
@@ -1032,35 +927,30 @@ static void rxrpc_input_ackall(struct rxrpc_call *call, struct sk_buff *skb)
 static void rxrpc_input_abort(struct rxrpc_call *call, struct sk_buff *skb)
 {
        struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
-       __be32 wtmp;
-       u32 abort_code = RX_CALL_DEAD;
-
-       _enter("");
-
-       if (skb->len >= 4 &&
-           skb_copy_bits(skb, sizeof(struct rxrpc_wire_header),
-                         &wtmp, sizeof(wtmp)) >= 0)
-               abort_code = ntohl(wtmp);
 
-       trace_rxrpc_rx_abort(call, sp->hdr.serial, abort_code);
-
-       _proto("Rx ABORT %%%u { %x }", sp->hdr.serial, abort_code);
+       trace_rxrpc_rx_abort(call, sp->hdr.serial, skb->priority);
 
        rxrpc_set_call_completion(call, RXRPC_CALL_REMOTELY_ABORTED,
-                                 abort_code, -ECONNABORTED);
+                                 skb->priority, -ECONNABORTED);
 }
 
 /*
  * Process an incoming call packet.
  */
-static void rxrpc_input_call_packet(struct rxrpc_call *call,
-                                   struct sk_buff *skb)
+void rxrpc_input_call_packet(struct rxrpc_call *call, struct sk_buff *skb)
 {
        struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
        unsigned long timo;
 
        _enter("%p,%p", call, skb);
 
+       if (sp->hdr.serviceId != call->dest_srx.srx_service)
+               call->dest_srx.srx_service = sp->hdr.serviceId;
+       if ((int)sp->hdr.serial - (int)call->rx_serial > 0)
+               call->rx_serial = sp->hdr.serial;
+       if (!test_bit(RXRPC_CALL_RX_HEARD, &call->flags))
+               set_bit(RXRPC_CALL_RX_HEARD, &call->flags);
+
        timo = READ_ONCE(call->next_rx_timo);
        if (timo) {
                unsigned long now = jiffies, expect_rx_by;
@@ -1074,15 +964,13 @@ static void rxrpc_input_call_packet(struct rxrpc_call *call,
        switch (sp->hdr.type) {
        case RXRPC_PACKET_TYPE_DATA:
                rxrpc_input_data(call, skb);
-               goto no_free;
+               break;
 
        case RXRPC_PACKET_TYPE_ACK:
                rxrpc_input_ack(call, skb);
-               goto no_free;
+               break;
 
        case RXRPC_PACKET_TYPE_BUSY:
-               _proto("Rx BUSY %%%u", sp->hdr.serial);
-
                /* Just ignore BUSY packets from the server; the retry and
                 * lifespan timers will take care of business.  BUSY packets
                 * from the client don't make sense.
@@ -1100,10 +988,6 @@ static void rxrpc_input_call_packet(struct rxrpc_call *call,
        default:
                break;
        }
-
-       rxrpc_free_skb(skb, rxrpc_skb_freed);
-no_free:
-       _leave("");
 }
 
 /*
@@ -1112,10 +996,10 @@ no_free:
  *
  * TODO: If callNumber > call_id + 1, renegotiate security.
  */
-static void rxrpc_input_implicit_end_call(struct rxrpc_sock *rx,
-                                         struct rxrpc_connection *conn,
-                                         struct rxrpc_call *call)
+void rxrpc_implicit_end_call(struct rxrpc_call *call, struct sk_buff *skb)
 {
+       struct rxrpc_connection *conn = call->conn;
+
        switch (READ_ONCE(call->state)) {
        case RXRPC_CALL_SERVER_AWAIT_ACK:
                rxrpc_call_completed(call);
@@ -1123,360 +1007,15 @@ static void rxrpc_input_implicit_end_call(struct rxrpc_sock *rx,
        case RXRPC_CALL_COMPLETE:
                break;
        default:
-               if (rxrpc_abort_call("IMP", call, 0, RX_CALL_DEAD, -ESHUTDOWN)) {
-                       set_bit(RXRPC_CALL_EV_ABORT, &call->events);
-                       rxrpc_queue_call(call);
-               }
+               if (rxrpc_abort_call("IMP", call, 0, RX_CALL_DEAD, -ESHUTDOWN))
+                       rxrpc_send_abort_packet(call);
                trace_rxrpc_improper_term(call);
                break;
        }
 
-       spin_lock(&rx->incoming_lock);
-       __rxrpc_disconnect_call(conn, call);
-       spin_unlock(&rx->incoming_lock);
-}
-
-/*
- * post connection-level events to the connection
- * - this includes challenges, responses, some aborts and call terminal packet
- *   retransmission.
- */
-static void rxrpc_post_packet_to_conn(struct rxrpc_connection *conn,
-                                     struct sk_buff *skb)
-{
-       _enter("%p,%p", conn, skb);
-
-       skb_queue_tail(&conn->rx_queue, skb);
-       rxrpc_queue_conn(conn);
-}
-
-/*
- * post endpoint-level events to the local endpoint
- * - this includes debug and version messages
- */
-static void rxrpc_post_packet_to_local(struct rxrpc_local *local,
-                                      struct sk_buff *skb)
-{
-       _enter("%p,%p", local, skb);
-
-       if (rxrpc_get_local_maybe(local)) {
-               skb_queue_tail(&local->event_queue, skb);
-               rxrpc_queue_local(local);
-       } else {
-               rxrpc_free_skb(skb, rxrpc_skb_freed);
-       }
-}
-
-/*
- * put a packet up for transport-level abort
- */
-static void rxrpc_reject_packet(struct rxrpc_local *local, struct sk_buff *skb)
-{
-       if (rxrpc_get_local_maybe(local)) {
-               skb_queue_tail(&local->reject_queue, skb);
-               rxrpc_queue_local(local);
-       } else {
-               rxrpc_free_skb(skb, rxrpc_skb_freed);
-       }
-}
-
-/*
- * Extract the wire header from a packet and translate the byte order.
- */
-static noinline
-int rxrpc_extract_header(struct rxrpc_skb_priv *sp, struct sk_buff *skb)
-{
-       struct rxrpc_wire_header whdr;
-
-       /* dig out the RxRPC connection details */
-       if (skb_copy_bits(skb, 0, &whdr, sizeof(whdr)) < 0) {
-               trace_rxrpc_rx_eproto(NULL, sp->hdr.serial,
-                                     tracepoint_string("bad_hdr"));
-               return -EBADMSG;
-       }
-
-       memset(sp, 0, sizeof(*sp));
-       sp->hdr.epoch           = ntohl(whdr.epoch);
-       sp->hdr.cid             = ntohl(whdr.cid);
-       sp->hdr.callNumber      = ntohl(whdr.callNumber);
-       sp->hdr.seq             = ntohl(whdr.seq);
-       sp->hdr.serial          = ntohl(whdr.serial);
-       sp->hdr.flags           = whdr.flags;
-       sp->hdr.type            = whdr.type;
-       sp->hdr.userStatus      = whdr.userStatus;
-       sp->hdr.securityIndex   = whdr.securityIndex;
-       sp->hdr._rsvd           = ntohs(whdr._rsvd);
-       sp->hdr.serviceId       = ntohs(whdr.serviceId);
-       return 0;
-}
-
-/*
- * handle data received on the local endpoint
- * - may be called in interrupt context
- *
- * [!] Note that as this is called from the encap_rcv hook, the socket is not
- * held locked by the caller and nothing prevents sk_user_data on the UDP from
- * being cleared in the middle of processing this function.
- *
- * Called with the RCU read lock held from the IP layer via UDP.
- */
-int rxrpc_input_packet(struct sock *udp_sk, struct sk_buff *skb)
-{
-       struct rxrpc_local *local = rcu_dereference_sk_user_data(udp_sk);
-       struct rxrpc_connection *conn;
-       struct rxrpc_channel *chan;
-       struct rxrpc_call *call = NULL;
-       struct rxrpc_skb_priv *sp;
-       struct rxrpc_peer *peer = NULL;
-       struct rxrpc_sock *rx = NULL;
-       unsigned int channel;
-
-       _enter("%p", udp_sk);
-
-       if (unlikely(!local)) {
-               kfree_skb(skb);
-               return 0;
-       }
-       if (skb->tstamp == 0)
-               skb->tstamp = ktime_get_real();
-
-       rxrpc_new_skb(skb, rxrpc_skb_received);
-
-       skb_pull(skb, sizeof(struct udphdr));
-
-       /* The UDP protocol already released all skb resources;
-        * we are free to add our own data there.
-        */
-       sp = rxrpc_skb(skb);
-
-       /* dig out the RxRPC connection details */
-       if (rxrpc_extract_header(sp, skb) < 0)
-               goto bad_message;
-
-       if (IS_ENABLED(CONFIG_AF_RXRPC_INJECT_LOSS)) {
-               static int lose;
-               if ((lose++ & 7) == 7) {
-                       trace_rxrpc_rx_lose(sp);
-                       rxrpc_free_skb(skb, rxrpc_skb_lost);
-                       return 0;
-               }
-       }
-
-       if (skb->tstamp == 0)
-               skb->tstamp = ktime_get_real();
-       trace_rxrpc_rx_packet(sp);
-
-       switch (sp->hdr.type) {
-       case RXRPC_PACKET_TYPE_VERSION:
-               if (rxrpc_to_client(sp))
-                       goto discard;
-               rxrpc_post_packet_to_local(local, skb);
-               goto out;
-
-       case RXRPC_PACKET_TYPE_BUSY:
-               if (rxrpc_to_server(sp))
-                       goto discard;
-               fallthrough;
-       case RXRPC_PACKET_TYPE_ACK:
-       case RXRPC_PACKET_TYPE_ACKALL:
-               if (sp->hdr.callNumber == 0)
-                       goto bad_message;
-               fallthrough;
-       case RXRPC_PACKET_TYPE_ABORT:
-               break;
-
-       case RXRPC_PACKET_TYPE_DATA:
-               if (sp->hdr.callNumber == 0 ||
-                   sp->hdr.seq == 0)
-                       goto bad_message;
-
-               /* Unshare the packet so that it can be modified for in-place
-                * decryption.
-                */
-               if (sp->hdr.securityIndex != 0) {
-                       struct sk_buff *nskb = skb_unshare(skb, GFP_ATOMIC);
-                       if (!nskb) {
-                               rxrpc_eaten_skb(skb, rxrpc_skb_unshared_nomem);
-                               goto out;
-                       }
-
-                       if (nskb != skb) {
-                               rxrpc_eaten_skb(skb, rxrpc_skb_received);
-                               skb = nskb;
-                               rxrpc_new_skb(skb, rxrpc_skb_unshared);
-                               sp = rxrpc_skb(skb);
-                       }
-               }
-               break;
-
-       case RXRPC_PACKET_TYPE_CHALLENGE:
-               if (rxrpc_to_server(sp))
-                       goto discard;
-               break;
-       case RXRPC_PACKET_TYPE_RESPONSE:
-               if (rxrpc_to_client(sp))
-                       goto discard;
-               break;
-
-               /* Packet types 9-11 should just be ignored. */
-       case RXRPC_PACKET_TYPE_PARAMS:
-       case RXRPC_PACKET_TYPE_10:
-       case RXRPC_PACKET_TYPE_11:
-               goto discard;
-
-       default:
-               _proto("Rx Bad Packet Type %u", sp->hdr.type);
-               goto bad_message;
-       }
-
-       if (sp->hdr.serviceId == 0)
-               goto bad_message;
-
-       if (rxrpc_to_server(sp)) {
-               /* Weed out packets to services we're not offering.  Packets
-                * that would begin a call are explicitly rejected and the rest
-                * are just discarded.
-                */
-               rx = rcu_dereference(local->service);
-               if (!rx || (sp->hdr.serviceId != rx->srx.srx_service &&
-                           sp->hdr.serviceId != rx->second_service)) {
-                       if (sp->hdr.type == RXRPC_PACKET_TYPE_DATA &&
-                           sp->hdr.seq == 1)
-                               goto unsupported_service;
-                       goto discard;
-               }
-       }
-
-       conn = rxrpc_find_connection_rcu(local, skb, &peer);
-       if (conn) {
-               if (sp->hdr.securityIndex != conn->security_ix)
-                       goto wrong_security;
-
-               if (sp->hdr.serviceId != conn->service_id) {
-                       int old_id;
-
-                       if (!test_bit(RXRPC_CONN_PROBING_FOR_UPGRADE, &conn->flags))
-                               goto reupgrade;
-                       old_id = cmpxchg(&conn->service_id, conn->params.service_id,
-                                        sp->hdr.serviceId);
-
-                       if (old_id != conn->params.service_id &&
-                           old_id != sp->hdr.serviceId)
-                               goto reupgrade;
-               }
-
-               if (sp->hdr.callNumber == 0) {
-                       /* Connection-level packet */
-                       _debug("CONN %p {%d}", conn, conn->debug_id);
-                       rxrpc_post_packet_to_conn(conn, skb);
-                       goto out;
-               }
-
-               if ((int)sp->hdr.serial - (int)conn->hi_serial > 0)
-                       conn->hi_serial = sp->hdr.serial;
-
-               /* Call-bound packets are routed by connection channel. */
-               channel = sp->hdr.cid & RXRPC_CHANNELMASK;
-               chan = &conn->channels[channel];
-
-               /* Ignore really old calls */
-               if (sp->hdr.callNumber < chan->last_call)
-                       goto discard;
-
-               if (sp->hdr.callNumber == chan->last_call) {
-                       if (chan->call ||
-                           sp->hdr.type == RXRPC_PACKET_TYPE_ABORT)
-                               goto discard;
-
-                       /* For the previous service call, if completed
-                        * successfully, we discard all further packets.
-                        */
-                       if (rxrpc_conn_is_service(conn) &&
-                           chan->last_type == RXRPC_PACKET_TYPE_ACK)
-                               goto discard;
-
-                       /* But otherwise we need to retransmit the final packet
-                        * from data cached in the connection record.
-                        */
-                       if (sp->hdr.type == RXRPC_PACKET_TYPE_DATA)
-                               trace_rxrpc_rx_data(chan->call_debug_id,
-                                                   sp->hdr.seq,
-                                                   sp->hdr.serial,
-                                                   sp->hdr.flags);
-                       rxrpc_post_packet_to_conn(conn, skb);
-                       goto out;
-               }
-
-               call = rcu_dereference(chan->call);
-
-               if (sp->hdr.callNumber > chan->call_id) {
-                       if (rxrpc_to_client(sp))
-                               goto reject_packet;
-                       if (call)
-                               rxrpc_input_implicit_end_call(rx, conn, call);
-                       call = NULL;
-               }
-
-               if (call) {
-                       if (sp->hdr.serviceId != call->service_id)
-                               call->service_id = sp->hdr.serviceId;
-                       if ((int)sp->hdr.serial - (int)call->rx_serial > 0)
-                               call->rx_serial = sp->hdr.serial;
-                       if (!test_bit(RXRPC_CALL_RX_HEARD, &call->flags))
-                               set_bit(RXRPC_CALL_RX_HEARD, &call->flags);
-               }
-       }
-
-       if (!call || refcount_read(&call->ref) == 0) {
-               if (rxrpc_to_client(sp) ||
-                   sp->hdr.type != RXRPC_PACKET_TYPE_DATA)
-                       goto bad_message;
-               if (sp->hdr.seq != 1)
-                       goto discard;
-               call = rxrpc_new_incoming_call(local, rx, skb);
-               if (!call)
-                       goto reject_packet;
-       }
-
-       /* Process a call packet; this either discards or passes on the ref
-        * elsewhere.
-        */
-       rxrpc_input_call_packet(call, skb);
-       goto out;
+       rxrpc_input_call_event(call, skb);
 
-discard:
-       rxrpc_free_skb(skb, rxrpc_skb_freed);
-out:
-       trace_rxrpc_rx_done(0, 0);
-       return 0;
-
-wrong_security:
-       trace_rxrpc_abort(0, "SEC", sp->hdr.cid, sp->hdr.callNumber, sp->hdr.seq,
-                         RXKADINCONSISTENCY, EBADMSG);
-       skb->priority = RXKADINCONSISTENCY;
-       goto post_abort;
-
-unsupported_service:
-       trace_rxrpc_abort(0, "INV", sp->hdr.cid, sp->hdr.callNumber, sp->hdr.seq,
-                         RX_INVALID_OPERATION, EOPNOTSUPP);
-       skb->priority = RX_INVALID_OPERATION;
-       goto post_abort;
-
-reupgrade:
-       trace_rxrpc_abort(0, "UPG", sp->hdr.cid, sp->hdr.callNumber, sp->hdr.seq,
-                         RX_PROTOCOL_ERROR, EBADMSG);
-       goto protocol_error;
-
-bad_message:
-       trace_rxrpc_abort(0, "BAD", sp->hdr.cid, sp->hdr.callNumber, sp->hdr.seq,
-                         RX_PROTOCOL_ERROR, EBADMSG);
-protocol_error:
-       skb->priority = RX_PROTOCOL_ERROR;
-post_abort:
-       skb->mark = RXRPC_SKB_MARK_REJECT_ABORT;
-reject_packet:
-       trace_rxrpc_rx_done(skb->mark, skb->priority);
-       rxrpc_reject_packet(local, skb);
-       _leave(" [badmsg]");
-       return 0;
+       spin_lock(&conn->bundle->channel_lock);
+       __rxrpc_disconnect_call(conn, call);
+       spin_unlock(&conn->bundle->channel_lock);
 }
diff --git a/net/rxrpc/io_thread.c b/net/rxrpc/io_thread.c
new file mode 100644 (file)
index 0000000..d83ae31
--- /dev/null
@@ -0,0 +1,496 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/* RxRPC packet reception
+ *
+ * Copyright (C) 2007, 2016, 2022 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include "ar-internal.h"
+
+static int rxrpc_input_packet_on_conn(struct rxrpc_connection *conn,
+                                     struct sockaddr_rxrpc *peer_srx,
+                                     struct sk_buff *skb);
+
+/*
+ * handle data received on the local endpoint
+ * - may be called in interrupt context
+ *
+ * [!] Note that as this is called from the encap_rcv hook, the socket is not
+ * held locked by the caller and nothing prevents sk_user_data on the UDP from
+ * being cleared in the middle of processing this function.
+ *
+ * Called with the RCU read lock held from the IP layer via UDP.
+ */
+int rxrpc_encap_rcv(struct sock *udp_sk, struct sk_buff *skb)
+{
+       struct rxrpc_local *local = rcu_dereference_sk_user_data(udp_sk);
+
+       if (unlikely(!local)) {
+               kfree_skb(skb);
+               return 0;
+       }
+       if (skb->tstamp == 0)
+               skb->tstamp = ktime_get_real();
+
+       skb->mark = RXRPC_SKB_MARK_PACKET;
+       rxrpc_new_skb(skb, rxrpc_skb_new_encap_rcv);
+       skb_queue_tail(&local->rx_queue, skb);
+       rxrpc_wake_up_io_thread(local);
+       return 0;
+}
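
rxrpc_encap_rcv() deliberately does no parsing: whatever context UDP delivers the packet in, it just stamps, marks and queues the skb and wakes the I/O thread, which then does everything serially. A hedged userspace analogue of that producer/consumer split (pthreads; process_packet() and the LIFO list are stand-ins, not the upstream io_thread loop):

    #include <pthread.h>
    #include <stdbool.h>
    #include <stdio.h>

    struct pkt { struct pkt *next; };

    struct rx_queue {
            pthread_mutex_t lock;
            pthread_cond_t  wake;
            struct pkt     *head;
            bool            stop;
    };

    /* Stand-in for the real parsing done by the I/O thread. */
    static void process_packet(struct pkt *p)
    {
            printf("packet %p\n", (void *)p);
    }

    /* Producer side: cheap enqueue + wake, like rxrpc_encap_rcv(). */
    static void rxq_push(struct rx_queue *q, struct pkt *p)
    {
            pthread_mutex_lock(&q->lock);
            p->next = q->head;      /* simplified to a LIFO for brevity */
            q->head = p;
            pthread_cond_signal(&q->wake);
            pthread_mutex_unlock(&q->lock);
    }

    /* Consumer: one thread owns all packet processing. */
    static void *io_thread(void *arg)
    {
            struct rx_queue *q = arg;

            pthread_mutex_lock(&q->lock);
            while (!q->stop) {
                    while (q->head) {
                            struct pkt *p = q->head;

                            q->head = p->next;
                            pthread_mutex_unlock(&q->lock);
                            process_packet(p);
                            pthread_mutex_lock(&q->lock);
                    }
                    pthread_cond_wait(&q->wake, &q->lock);
            }
            pthread_mutex_unlock(&q->lock);
            return NULL;
    }
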
+
+/*
+ * Handle an error received on the local endpoint.
+ */
+void rxrpc_error_report(struct sock *sk)
+{
+       struct rxrpc_local *local;
+       struct sk_buff *skb;
+
+       rcu_read_lock();
+       local = rcu_dereference_sk_user_data(sk);
+       if (unlikely(!local)) {
+               rcu_read_unlock();
+               return;
+       }
+
+       while ((skb = skb_dequeue(&sk->sk_error_queue))) {
+               skb->mark = RXRPC_SKB_MARK_ERROR;
+               rxrpc_new_skb(skb, rxrpc_skb_new_error_report);
+               skb_queue_tail(&local->rx_queue, skb);
+       }
+
+       rxrpc_wake_up_io_thread(local);
+       rcu_read_unlock();
+}
+
+/*
+ * Process event packets targeted at a local endpoint.
+ */
+static void rxrpc_input_version(struct rxrpc_local *local, struct sk_buff *skb)
+{
+       struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
+       char v;
+
+       _enter("");
+
+       rxrpc_see_skb(skb, rxrpc_skb_see_version);
+       if (skb_copy_bits(skb, sizeof(struct rxrpc_wire_header), &v, 1) >= 0) {
+               if (v == 0)
+                       rxrpc_send_version_request(local, &sp->hdr, skb);
+       }
+}
+
+/*
+ * Extract the wire header from a packet and translate the byte order.
+ */
+static noinline
+int rxrpc_extract_header(struct rxrpc_skb_priv *sp, struct sk_buff *skb)
+{
+       struct rxrpc_wire_header whdr;
+
+       /* dig out the RxRPC connection details */
+       if (skb_copy_bits(skb, 0, &whdr, sizeof(whdr)) < 0) {
+               trace_rxrpc_rx_eproto(NULL, sp->hdr.serial,
+                                     tracepoint_string("bad_hdr"));
+               return -EBADMSG;
+       }
+
+       memset(sp, 0, sizeof(*sp));
+       sp->hdr.epoch           = ntohl(whdr.epoch);
+       sp->hdr.cid             = ntohl(whdr.cid);
+       sp->hdr.callNumber      = ntohl(whdr.callNumber);
+       sp->hdr.seq             = ntohl(whdr.seq);
+       sp->hdr.serial          = ntohl(whdr.serial);
+       sp->hdr.flags           = whdr.flags;
+       sp->hdr.type            = whdr.type;
+       sp->hdr.userStatus      = whdr.userStatus;
+       sp->hdr.securityIndex   = whdr.securityIndex;
+       sp->hdr._rsvd           = ntohs(whdr._rsvd);
+       sp->hdr.serviceId       = ntohs(whdr.serviceId);
+       return 0;
+}
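
[For illustration, a self-contained userspace version of the byte-order
translation rxrpc_extract_header() does above. The struct layouts are
assumptions that mirror the rxrpc wire format, not the kernel definitions.]

#include <arpa/inet.h>
#include <stdint.h>
#include <string.h>

struct wire_header {                    /* big-endian on the wire */
	uint32_t epoch;
	uint32_t cid;
	uint32_t call_number;
	uint32_t seq;
	uint32_t serial;
	uint8_t  type;
	uint8_t  flags;
	uint8_t  user_status;
	uint8_t  security_index;
	uint16_t _rsvd;
	uint16_t service_id;
};

struct host_header {                    /* native byte order */
	uint32_t epoch, cid, call_number, seq, serial;
	uint16_t _rsvd, service_id;
	uint8_t  type, flags, user_status, security_index;
};

static int extract_header(struct host_header *hp,
			  const unsigned char *pkt, size_t len)
{
	struct wire_header whdr;

	if (len < sizeof(whdr))
		return -1;              /* truncated: treat as a bad message */
	memcpy(&whdr, pkt, sizeof(whdr));

	hp->epoch          = ntohl(whdr.epoch);
	hp->cid            = ntohl(whdr.cid);
	hp->call_number    = ntohl(whdr.call_number);
	hp->seq            = ntohl(whdr.seq);
	hp->serial         = ntohl(whdr.serial);
	hp->type           = whdr.type;          /* single bytes: no swap */
	hp->flags          = whdr.flags;
	hp->user_status    = whdr.user_status;
	hp->security_index = whdr.security_index;
	hp->_rsvd          = ntohs(whdr._rsvd);
	hp->service_id     = ntohs(whdr.service_id);
	return 0;
}
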
+
+/*
+ * Extract the abort code from an ABORT packet and stash it in skb->priority.
+ */
+static bool rxrpc_extract_abort(struct sk_buff *skb)
+{
+       __be32 wtmp;
+
+       if (skb_copy_bits(skb, sizeof(struct rxrpc_wire_header),
+                         &wtmp, sizeof(wtmp)) < 0)
+               return false;
+       skb->priority = ntohl(wtmp);
+       return true;
+}
+
+/*
+ * Process packets received on the local endpoint
+ */
+static int rxrpc_input_packet(struct rxrpc_local *local, struct sk_buff **_skb)
+{
+       struct rxrpc_connection *conn;
+       struct sockaddr_rxrpc peer_srx;
+       struct rxrpc_skb_priv *sp;
+       struct rxrpc_peer *peer = NULL;
+       struct sk_buff *skb = *_skb;
+       int ret = 0;
+
+       skb_pull(skb, sizeof(struct udphdr));
+
+       sp = rxrpc_skb(skb);
+
+       /* dig out the RxRPC connection details */
+       if (rxrpc_extract_header(sp, skb) < 0)
+               goto bad_message;
+
+       if (IS_ENABLED(CONFIG_AF_RXRPC_INJECT_LOSS)) {
+               static int lose;
+               if ((lose++ & 7) == 7) {
+                       trace_rxrpc_rx_lose(sp);
+                       return 0;
+               }
+       }
+
+       trace_rxrpc_rx_packet(sp);
+
+       switch (sp->hdr.type) {
+       case RXRPC_PACKET_TYPE_VERSION:
+               if (rxrpc_to_client(sp))
+                       return 0;
+               rxrpc_input_version(local, skb);
+               return 0;
+
+       case RXRPC_PACKET_TYPE_BUSY:
+               if (rxrpc_to_server(sp))
+                       return 0;
+               fallthrough;
+       case RXRPC_PACKET_TYPE_ACK:
+       case RXRPC_PACKET_TYPE_ACKALL:
+               if (sp->hdr.callNumber == 0)
+                       goto bad_message;
+               break;
+       case RXRPC_PACKET_TYPE_ABORT:
+               if (!rxrpc_extract_abort(skb))
+                       return 0; /* Just discard if malformed */
+               break;
+
+       case RXRPC_PACKET_TYPE_DATA:
+               if (sp->hdr.callNumber == 0 ||
+                   sp->hdr.seq == 0)
+                       goto bad_message;
+
+               /* Unshare the packet so that it can be modified for in-place
+                * decryption.
+                */
+               if (sp->hdr.securityIndex != 0) {
+                       skb = skb_unshare(skb, GFP_ATOMIC);
+                       if (!skb) {
+                               rxrpc_eaten_skb(*_skb, rxrpc_skb_eaten_by_unshare_nomem);
+                               *_skb = NULL;
+                               return 0;
+                       }
+
+                       if (skb != *_skb) {
+                               rxrpc_eaten_skb(*_skb, rxrpc_skb_eaten_by_unshare);
+                               *_skb = skb;
+                               rxrpc_new_skb(skb, rxrpc_skb_new_unshared);
+                               sp = rxrpc_skb(skb);
+                       }
+               }
+               break;
+
+       case RXRPC_PACKET_TYPE_CHALLENGE:
+               if (rxrpc_to_server(sp))
+                       return 0;
+               break;
+       case RXRPC_PACKET_TYPE_RESPONSE:
+               if (rxrpc_to_client(sp))
+                       return 0;
+               break;
+
+               /* Packet types 9-11 should just be ignored. */
+       case RXRPC_PACKET_TYPE_PARAMS:
+       case RXRPC_PACKET_TYPE_10:
+       case RXRPC_PACKET_TYPE_11:
+               return 0;
+
+       default:
+               goto bad_message;
+       }
+
+       if (sp->hdr.serviceId == 0)
+               goto bad_message;
+
+       if (WARN_ON_ONCE(rxrpc_extract_addr_from_skb(&peer_srx, skb) < 0))
+               return true; /* Unsupported address type - discard. */
+
+       if (peer_srx.transport.family != local->srx.transport.family &&
+           (peer_srx.transport.family == AF_INET &&
+            local->srx.transport.family != AF_INET6)) {
+               pr_warn_ratelimited("AF_RXRPC: Protocol mismatch %u not %u\n",
+                                   peer_srx.transport.family,
+                                   local->srx.transport.family);
+               return true; /* Wrong address type - discard. */
+       }
+
+       if (rxrpc_to_client(sp)) {
+               rcu_read_lock();
+               conn = rxrpc_find_client_connection_rcu(local, &peer_srx, skb);
+               conn = rxrpc_get_connection_maybe(conn, rxrpc_conn_get_call_input);
+               rcu_read_unlock();
+               if (!conn) {
+                       trace_rxrpc_abort(0, "NCC", sp->hdr.cid,
+                                         sp->hdr.callNumber, sp->hdr.seq,
+                                         RXKADINCONSISTENCY, EBADMSG);
+                       goto protocol_error;
+               }
+
+               ret = rxrpc_input_packet_on_conn(conn, &peer_srx, skb);
+               rxrpc_put_connection(conn, rxrpc_conn_put_call_input);
+               return ret;
+       }
+
+       /* We need to look up service connections by the full protocol
+        * parameter set.  We look up the peer first as an intermediate step
+        * and then the connection from the peer's tree.
+        */
+       rcu_read_lock();
+
+       peer = rxrpc_lookup_peer_rcu(local, &peer_srx);
+       if (!peer) {
+               rcu_read_unlock();
+               return rxrpc_new_incoming_call(local, NULL, NULL, &peer_srx, skb);
+       }
+
+       conn = rxrpc_find_service_conn_rcu(peer, skb);
+       conn = rxrpc_get_connection_maybe(conn, rxrpc_conn_get_call_input);
+       if (conn) {
+               rcu_read_unlock();
+               ret = rxrpc_input_packet_on_conn(conn, &peer_srx, skb);
+               rxrpc_put_connection(conn, rxrpc_conn_put_call_input);
+               return ret;
+       }
+
+       peer = rxrpc_get_peer_maybe(peer, rxrpc_peer_get_input);
+       rcu_read_unlock();
+
+       ret = rxrpc_new_incoming_call(local, peer, NULL, &peer_srx, skb);
+       rxrpc_put_peer(peer, rxrpc_peer_put_input);
+       if (ret < 0)
+               goto reject_packet;
+       return 0;
+
+bad_message:
+       trace_rxrpc_abort(0, "BAD", sp->hdr.cid, sp->hdr.callNumber, sp->hdr.seq,
+                         RX_PROTOCOL_ERROR, EBADMSG);
+protocol_error:
+       skb->priority = RX_PROTOCOL_ERROR;
+       skb->mark = RXRPC_SKB_MARK_REJECT_ABORT;
+reject_packet:
+       rxrpc_reject_packet(local, skb);
+       return ret;
+}
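
[The CONFIG_AF_RXRPC_INJECT_LOSS branch in rxrpc_input_packet() above drops
every eighth packet via a masked static counter; a sketch of that arithmetic:]

static int should_lose(void)
{
	static int lose;

	return (lose++ & 7) == 7;       /* true on every eighth call */
}
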
+
+/*
+ * Deal with a packet that's associated with an extant connection.
+ */
+static int rxrpc_input_packet_on_conn(struct rxrpc_connection *conn,
+                                     struct sockaddr_rxrpc *peer_srx,
+                                     struct sk_buff *skb)
+{
+       struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
+       struct rxrpc_channel *chan;
+       struct rxrpc_call *call = NULL;
+       unsigned int channel;
+
+       if (sp->hdr.securityIndex != conn->security_ix)
+               goto wrong_security;
+
+       if (sp->hdr.serviceId != conn->service_id) {
+               int old_id;
+
+               if (!test_bit(RXRPC_CONN_PROBING_FOR_UPGRADE, &conn->flags))
+                       goto reupgrade;
+               old_id = cmpxchg(&conn->service_id, conn->orig_service_id,
+                                sp->hdr.serviceId);
+
+               if (old_id != conn->orig_service_id &&
+                   old_id != sp->hdr.serviceId)
+                       goto reupgrade;
+       }
+
+       if (after(sp->hdr.serial, conn->hi_serial))
+               conn->hi_serial = sp->hdr.serial;
+
+       /* It's a connection-level packet if the call number is 0. */
+       if (sp->hdr.callNumber == 0)
+               return rxrpc_input_conn_packet(conn, skb);
+
+       /* Call-bound packets are routed by connection channel. */
+       channel = sp->hdr.cid & RXRPC_CHANNELMASK;
+       chan = &conn->channels[channel];
+
+       /* Ignore really old calls */
+       if (sp->hdr.callNumber < chan->last_call)
+               return 0;
+
+       if (sp->hdr.callNumber == chan->last_call) {
+               if (chan->call ||
+                   sp->hdr.type == RXRPC_PACKET_TYPE_ABORT)
+                       return 0;
+
+               /* For the previous service call, if completed successfully, we
+                * discard all further packets.
+                */
+               if (rxrpc_conn_is_service(conn) &&
+                   chan->last_type == RXRPC_PACKET_TYPE_ACK)
+                       return 0;
+
+               /* But otherwise we need to retransmit the final packet from
+                * data cached in the connection record.
+                */
+               if (sp->hdr.type == RXRPC_PACKET_TYPE_DATA)
+                       trace_rxrpc_rx_data(chan->call_debug_id,
+                                           sp->hdr.seq,
+                                           sp->hdr.serial,
+                                           sp->hdr.flags);
+               rxrpc_input_conn_packet(conn, skb);
+               return 0;
+       }
+
+       rcu_read_lock();
+       call = rxrpc_try_get_call(rcu_dereference(chan->call),
+                                 rxrpc_call_get_input);
+       rcu_read_unlock();
+
+       if (sp->hdr.callNumber > chan->call_id) {
+               if (rxrpc_to_client(sp)) {
+                       rxrpc_put_call(call, rxrpc_call_put_input);
+                       goto reject_packet;
+               }
+
+               if (call) {
+                       rxrpc_implicit_end_call(call, skb);
+                       rxrpc_put_call(call, rxrpc_call_put_input);
+                       call = NULL;
+               }
+       }
+
+       if (!call) {
+               if (rxrpc_to_client(sp))
+                       goto bad_message;
+               if (rxrpc_new_incoming_call(conn->local, conn->peer, conn,
+                                           peer_srx, skb))
+                       return 0;
+               goto reject_packet;
+       }
+
+       rxrpc_input_call_event(call, skb);
+       rxrpc_put_call(call, rxrpc_call_put_input);
+       return 0;
+
+wrong_security:
+       trace_rxrpc_abort(0, "SEC", sp->hdr.cid, sp->hdr.callNumber, sp->hdr.seq,
+                         RXKADINCONSISTENCY, EBADMSG);
+       skb->priority = RXKADINCONSISTENCY;
+       goto post_abort;
+
+reupgrade:
+       trace_rxrpc_abort(0, "UPG", sp->hdr.cid, sp->hdr.callNumber, sp->hdr.seq,
+                         RX_PROTOCOL_ERROR, EBADMSG);
+       goto protocol_error;
+
+bad_message:
+       trace_rxrpc_abort(0, "BAD", sp->hdr.cid, sp->hdr.callNumber, sp->hdr.seq,
+                         RX_PROTOCOL_ERROR, EBADMSG);
+protocol_error:
+       skb->priority = RX_PROTOCOL_ERROR;
+post_abort:
+       skb->mark = RXRPC_SKB_MARK_REJECT_ABORT;
+reject_packet:
+       rxrpc_reject_packet(conn->local, skb);
+       return 0;
+}
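
[A sketch of the routing logic in rxrpc_input_packet_on_conn() above: an
rxrpc connection multiplexes four calls, so the bottom bits of the cid pick
the channel and the call number classifies the packet. The mask value 3 is
an assumption standing in for RXRPC_CHANNELMASK.]

#include <stdint.h>

enum verdict { DISCARD_OLD, TERMINAL_LAST_CALL, LIVE_OR_NEW };

static unsigned int cid_to_channel(uint32_t cid)
{
	return cid & 3;                 /* assumption: RXRPC_CHANNELMASK == 3 */
}

static enum verdict classify(uint32_t call_number, uint32_t last_call)
{
	if (call_number < last_call)
		return DISCARD_OLD;     /* really old call: drop silently */
	if (call_number == last_call)
		return TERMINAL_LAST_CALL; /* may retransmit the final packet */
	return LIVE_OR_NEW;             /* route to the live call or set up */
}
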
+
+/*
+ * I/O and event handling thread.
+ */
+int rxrpc_io_thread(void *data)
+{
+       struct sk_buff_head rx_queue;
+       struct rxrpc_local *local = data;
+       struct rxrpc_call *call;
+       struct sk_buff *skb;
+
+       skb_queue_head_init(&rx_queue);
+
+       set_user_nice(current, MIN_NICE);
+
+       for (;;) {
+               rxrpc_inc_stat(local->rxnet, stat_io_loop);
+
+               /* Deal with calls that want immediate attention. */
+               if ((call = list_first_entry_or_null(&local->call_attend_q,
+                                                    struct rxrpc_call,
+                                                    attend_link))) {
+                       spin_lock_bh(&local->lock);
+                       list_del_init(&call->attend_link);
+                       spin_unlock_bh(&local->lock);
+
+                       trace_rxrpc_call_poked(call);
+                       rxrpc_input_call_event(call, NULL);
+                       rxrpc_put_call(call, rxrpc_call_put_poke);
+                       continue;
+               }
+
+               /* Process received packets and errors. */
+               if ((skb = __skb_dequeue(&rx_queue))) {
+                       switch (skb->mark) {
+                       case RXRPC_SKB_MARK_PACKET:
+                               skb->priority = 0;
+                               rxrpc_input_packet(local, &skb);
+                               trace_rxrpc_rx_done(skb->mark, skb->priority);
+                               rxrpc_free_skb(skb, rxrpc_skb_put_input);
+                               break;
+                       case RXRPC_SKB_MARK_ERROR:
+                               rxrpc_input_error(local, skb);
+                               rxrpc_free_skb(skb, rxrpc_skb_put_error_report);
+                               break;
+                       default:
+                               WARN_ON_ONCE(1);
+                               rxrpc_free_skb(skb, rxrpc_skb_put_unknown);
+                               break;
+                       }
+                       continue;
+               }
+
+               if (!skb_queue_empty(&local->rx_queue)) {
+                       spin_lock_irq(&local->rx_queue.lock);
+                       skb_queue_splice_tail_init(&local->rx_queue, &rx_queue);
+                       spin_unlock_irq(&local->rx_queue.lock);
+                       continue;
+               }
+
+               set_current_state(TASK_INTERRUPTIBLE);
+               if (!skb_queue_empty(&local->rx_queue) ||
+                   !list_empty(&local->call_attend_q)) {
+                       __set_current_state(TASK_RUNNING);
+                       continue;
+               }
+
+               if (kthread_should_stop())
+                       break;
+               schedule();
+       }
+
+       __set_current_state(TASK_RUNNING);
+       rxrpc_see_local(local, rxrpc_local_stop);
+       rxrpc_destroy_local(local);
+       local->io_thread = NULL;
+       rxrpc_see_local(local, rxrpc_local_stopped);
+       return 0;
+}
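
[A userspace sketch of the wait pattern at the bottom of rxrpc_io_thread()
above: the queues are re-checked after declaring the intent to sleep so a
wakeup racing with the check is not lost. In the kernel this is done with
set_current_state(TASK_INTERRUPTIBLE) before the final queue tests; here a
mutex/condvar pair plays the same role, with illustrative field names.]

#include <pthread.h>
#include <stdbool.h>

struct io_queues {
	pthread_mutex_t lock;
	pthread_cond_t  wake;
	int  rx_pending;        /* stand-in for local->rx_queue */
	int  calls_pending;     /* stand-in for local->call_attend_q */
	bool stop;              /* stand-in for kthread_should_stop() */
};

static void io_thread_wait(struct io_queues *q)
{
	pthread_mutex_lock(&q->lock);
	while (!q->rx_pending && !q->calls_pending && !q->stop)
		pthread_cond_wait(&q->wake, &q->lock); /* atomic unlock+sleep */
	pthread_mutex_unlock(&q->lock);
}
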
index 8d2073e..8d53ade 100644 (file)
@@ -513,7 +513,7 @@ int rxrpc_get_server_data_key(struct rxrpc_connection *conn,
        if (ret < 0)
                goto error;
 
-       conn->params.key = key;
+       conn->key = key;
        _leave(" = 0 [%d]", key_serial(key));
        return 0;
 
@@ -602,7 +602,8 @@ static long rxrpc_read(const struct key *key,
                }
 
                _debug("token[%u]: toksize=%u", ntoks, toksize);
-               ASSERTCMP(toksize, <=, AFSTOKEN_LENGTH_MAX);
+               if (WARN_ON(toksize > AFSTOKEN_LENGTH_MAX))
+                       return -EIO;
 
                toksizes[ntoks++] = toksize;
                size += toksize + 4; /* each token has a length word */
@@ -679,8 +680,9 @@ static long rxrpc_read(const struct key *key,
                        return -ENOPKG;
                }
 
-               ASSERTCMP((unsigned long)xdr - (unsigned long)oldxdr, ==,
-                         toksize);
+               if (WARN_ON((unsigned long)xdr - (unsigned long)oldxdr !=
+                           toksize))
+                       return -EIO;
        }
 
 #undef ENCODE_STR
@@ -688,8 +690,10 @@ static long rxrpc_read(const struct key *key,
 #undef ENCODE64
 #undef ENCODE
 
-       ASSERTCMP(tok, ==, ntoks);
-       ASSERTCMP((char __user *) xdr - buffer, ==, size);
+       if (WARN_ON(tok != ntoks))
+               return -EIO;
+       if (WARN_ON((unsigned long)xdr - (unsigned long)buffer != size))
+               return -EIO;
        _leave(" = %zu", size);
        return size;
 }
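
[The hunks above replace fatal ASSERTCMP() checks with recoverable WARN_ON()
tests; note that the comparison must be negated in the conversion, since
ASSERTCMP(a, ==, b) fails when a != b. A sketch of the pattern with a
stand-in WARN_ON macro:]

#include <errno.h>
#include <stdio.h>

#define WARN_ON(cond) \
	((cond) ? (fprintf(stderr, "WARN: %s\n", #cond), 1) : 0)

static long check_encoded_size(unsigned long written, unsigned long toksize)
{
	/* was: ASSERTCMP(written, ==, toksize) */
	if (WARN_ON(written != toksize))
		return -EIO;            /* recover instead of oopsing */
	return 0;
}
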
index 19e929c..5e69ea6 100644 (file)
@@ -21,9 +21,9 @@ static const char rxrpc_version_string[65] = "linux-" UTS_RELEASE " AF_RXRPC";
 /*
  * Reply to a version request
  */
-static void rxrpc_send_version_request(struct rxrpc_local *local,
-                                      struct rxrpc_host_header *hdr,
-                                      struct sk_buff *skb)
+void rxrpc_send_version_request(struct rxrpc_local *local,
+                               struct rxrpc_host_header *hdr,
+                               struct sk_buff *skb)
 {
        struct rxrpc_wire_header whdr;
        struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
@@ -63,8 +63,6 @@ static void rxrpc_send_version_request(struct rxrpc_local *local,
 
        len = iov[0].iov_len + iov[1].iov_len;
 
-       _proto("Tx VERSION (reply)");
-
        ret = kernel_sendmsg(local->socket, &msg, iov, 2, len);
        if (ret < 0)
                trace_rxrpc_tx_fail(local->debug_id, 0, ret,
@@ -75,41 +73,3 @@ static void rxrpc_send_version_request(struct rxrpc_local *local,
 
        _leave("");
 }
-
-/*
- * Process event packets targeted at a local endpoint.
- */
-void rxrpc_process_local_events(struct rxrpc_local *local)
-{
-       struct sk_buff *skb;
-       char v;
-
-       _enter("");
-
-       skb = skb_dequeue(&local->event_queue);
-       if (skb) {
-               struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
-
-               rxrpc_see_skb(skb, rxrpc_skb_seen);
-               _debug("{%d},{%u}", local->debug_id, sp->hdr.type);
-
-               switch (sp->hdr.type) {
-               case RXRPC_PACKET_TYPE_VERSION:
-                       if (skb_copy_bits(skb, sizeof(struct rxrpc_wire_header),
-                                         &v, 1) < 0)
-                               return;
-                       _proto("Rx VERSION { %02x }", v);
-                       if (v == 0)
-                               rxrpc_send_version_request(local, &sp->hdr, skb);
-                       break;
-
-               default:
-                       /* Just ignore anything we don't understand */
-                       break;
-               }
-
-               rxrpc_free_skb(skb, rxrpc_skb_freed);
-       }
-
-       _leave("");
-}
index a943fdf..4422292 100644 (file)
@@ -20,7 +20,6 @@
 #include <net/af_rxrpc.h>
 #include "ar-internal.h"
 
-static void rxrpc_local_processor(struct work_struct *);
 static void rxrpc_local_rcu(struct rcu_head *);
 
 /*
@@ -97,12 +96,9 @@ static struct rxrpc_local *rxrpc_alloc_local(struct rxrpc_net *rxnet,
                atomic_set(&local->active_users, 1);
                local->rxnet = rxnet;
                INIT_HLIST_NODE(&local->link);
-               INIT_WORK(&local->processor, rxrpc_local_processor);
-               INIT_LIST_HEAD(&local->ack_tx_queue);
-               spin_lock_init(&local->ack_tx_lock);
                init_rwsem(&local->defrag_sem);
-               skb_queue_head_init(&local->reject_queue);
-               skb_queue_head_init(&local->event_queue);
+               skb_queue_head_init(&local->rx_queue);
+               INIT_LIST_HEAD(&local->call_attend_q);
                local->client_bundles = RB_ROOT;
                spin_lock_init(&local->client_bundles_lock);
                spin_lock_init(&local->lock);
@@ -110,7 +106,7 @@ static struct rxrpc_local *rxrpc_alloc_local(struct rxrpc_net *rxnet,
                local->debug_id = atomic_inc_return(&rxrpc_debug_id);
                memcpy(&local->srx, srx, sizeof(*srx));
                local->srx.srx_service = 0;
-               trace_rxrpc_local(local->debug_id, rxrpc_local_new, 1, NULL);
+               trace_rxrpc_local(local->debug_id, rxrpc_local_new, 1, 1);
        }
 
        _leave(" = %p", local);
@@ -126,6 +122,7 @@ static int rxrpc_open_socket(struct rxrpc_local *local, struct net *net)
        struct udp_tunnel_sock_cfg tuncfg = {NULL};
        struct sockaddr_rxrpc *srx = &local->srx;
        struct udp_port_cfg udp_conf = {0};
+       struct task_struct *io_thread;
        struct sock *usk;
        int ret;
 
@@ -152,7 +149,7 @@ static int rxrpc_open_socket(struct rxrpc_local *local, struct net *net)
        }
 
        tuncfg.encap_type = UDP_ENCAP_RXRPC;
-       tuncfg.encap_rcv = rxrpc_input_packet;
+       tuncfg.encap_rcv = rxrpc_encap_rcv;
        tuncfg.encap_err_rcv = rxrpc_encap_err_rcv;
        tuncfg.sk_user_data = local;
        setup_udp_tunnel_sock(net, local->socket, &tuncfg);
@@ -185,8 +182,23 @@ static int rxrpc_open_socket(struct rxrpc_local *local, struct net *net)
                BUG();
        }
 
+       io_thread = kthread_run(rxrpc_io_thread, local,
+                               "krxrpcio/%u", ntohs(udp_conf.local_udp_port));
+       if (IS_ERR(io_thread)) {
+               ret = PTR_ERR(io_thread);
+               goto error_sock;
+       }
+
+       local->io_thread = io_thread;
        _leave(" = 0");
        return 0;
+
+error_sock:
+       kernel_sock_shutdown(local->socket, SHUT_RDWR);
+       local->socket->sk->sk_user_data = NULL;
+       sock_release(local->socket);
+       local->socket = NULL;
+       return ret;
 }
 
 /*
@@ -198,7 +210,6 @@ struct rxrpc_local *rxrpc_lookup_local(struct net *net,
        struct rxrpc_local *local;
        struct rxrpc_net *rxnet = rxrpc_net(net);
        struct hlist_node *cursor;
-       const char *age;
        long diff;
        int ret;
 
@@ -229,10 +240,9 @@ struct rxrpc_local *rxrpc_lookup_local(struct net *net,
                 * we're attempting to use a local address that the dying
                 * object is still using.
                 */
-               if (!rxrpc_use_local(local))
+               if (!rxrpc_use_local(local, rxrpc_local_use_lookup))
                        break;
 
-               age = "old";
                goto found;
        }
 
@@ -250,14 +260,9 @@ struct rxrpc_local *rxrpc_lookup_local(struct net *net,
        } else {
                hlist_add_head_rcu(&local->link, &rxnet->local_endpoints);
        }
-       age = "new";
 
 found:
        mutex_unlock(&rxnet->local_mutex);
-
-       _net("LOCAL %s %d {%pISp}",
-            age, local->debug_id, &local->srx.transport);
-
        _leave(" = %p", local);
        return local;
 
@@ -279,64 +284,49 @@ addr_in_use:
 /*
  * Get a ref on a local endpoint.
  */
-struct rxrpc_local *rxrpc_get_local(struct rxrpc_local *local)
+struct rxrpc_local *rxrpc_get_local(struct rxrpc_local *local,
+                                   enum rxrpc_local_trace why)
 {
-       const void *here = __builtin_return_address(0);
-       int r;
+       int r, u;
 
+       u = atomic_read(&local->active_users);
        __refcount_inc(&local->ref, &r);
-       trace_rxrpc_local(local->debug_id, rxrpc_local_got, r + 1, here);
+       trace_rxrpc_local(local->debug_id, why, r + 1, u);
        return local;
 }
 
 /*
  * Get a ref on a local endpoint unless its usage has already reached 0.
  */
-struct rxrpc_local *rxrpc_get_local_maybe(struct rxrpc_local *local)
+struct rxrpc_local *rxrpc_get_local_maybe(struct rxrpc_local *local,
+                                         enum rxrpc_local_trace why)
 {
-       const void *here = __builtin_return_address(0);
-       int r;
+       int r, u;
 
-       if (local) {
-               if (__refcount_inc_not_zero(&local->ref, &r))
-                       trace_rxrpc_local(local->debug_id, rxrpc_local_got,
-                                         r + 1, here);
-               else
-                       local = NULL;
+       if (local && __refcount_inc_not_zero(&local->ref, &r)) {
+               u = atomic_read(&local->active_users);
+               trace_rxrpc_local(local->debug_id, why, r + 1, u);
+               return local;
        }
-       return local;
-}
 
-/*
- * Queue a local endpoint and pass the caller's reference to the work item.
- */
-void rxrpc_queue_local(struct rxrpc_local *local)
-{
-       const void *here = __builtin_return_address(0);
-       unsigned int debug_id = local->debug_id;
-       int r = refcount_read(&local->ref);
-
-       if (rxrpc_queue_work(&local->processor))
-               trace_rxrpc_local(debug_id, rxrpc_local_queued, r + 1, here);
-       else
-               rxrpc_put_local(local);
+       return NULL;
 }
 
 /*
  * Drop a ref on a local endpoint.
  */
-void rxrpc_put_local(struct rxrpc_local *local)
+void rxrpc_put_local(struct rxrpc_local *local, enum rxrpc_local_trace why)
 {
-       const void *here = __builtin_return_address(0);
        unsigned int debug_id;
        bool dead;
-       int r;
+       int r, u;
 
        if (local) {
                debug_id = local->debug_id;
 
+               u = atomic_read(&local->active_users);
                dead = __refcount_dec_and_test(&local->ref, &r);
-               trace_rxrpc_local(debug_id, rxrpc_local_put, r, here);
+               trace_rxrpc_local(debug_id, why, r, u);
 
                if (dead)
                        call_rcu(&local->rcu, rxrpc_local_rcu);
@@ -346,14 +336,15 @@ void rxrpc_put_local(struct rxrpc_local *local)
 /*
  * Start using a local endpoint.
  */
-struct rxrpc_local *rxrpc_use_local(struct rxrpc_local *local)
+struct rxrpc_local *rxrpc_use_local(struct rxrpc_local *local,
+                                   enum rxrpc_local_trace why)
 {
-       local = rxrpc_get_local_maybe(local);
+       local = rxrpc_get_local_maybe(local, rxrpc_local_get_for_use);
        if (!local)
                return NULL;
 
-       if (!__rxrpc_use_local(local)) {
-               rxrpc_put_local(local);
+       if (!__rxrpc_use_local(local, why)) {
+               rxrpc_put_local(local, rxrpc_local_put_for_use);
                return NULL;
        }
 
@@ -362,15 +353,19 @@ struct rxrpc_local *rxrpc_use_local(struct rxrpc_local *local)
 
 /*
  * Cease using a local endpoint.  Once the number of active users reaches 0, we
- * start the closure of the transport in the work processor.
+ * start the closure of the transport in the I/O thread.
  */
-void rxrpc_unuse_local(struct rxrpc_local *local)
+void rxrpc_unuse_local(struct rxrpc_local *local, enum rxrpc_local_trace why)
 {
+       unsigned int debug_id = local->debug_id;
+       int r, u;
+
        if (local) {
-               if (__rxrpc_unuse_local(local)) {
-                       rxrpc_get_local(local);
-                       rxrpc_queue_local(local);
-               }
+               r = refcount_read(&local->ref);
+               u = atomic_dec_return(&local->active_users);
+               trace_rxrpc_local(debug_id, why, r, u);
+               if (u == 0)
+                       kthread_stop(local->io_thread);
        }
 }
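
[A sketch of the two-counter lifetime used by rxrpc_unuse_local() above:
'ref' keeps the structure allocated while 'active_users' keeps the transport
open, so the I/O thread is stopped when the last user goes away but the
memory survives until the last ref is dropped. Names are illustrative.]

#include <stdatomic.h>
#include <stdio.h>

struct endpoint {
	atomic_int ref;            /* object lifetime: frees the memory */
	atomic_int active_users;   /* transport lifetime: keeps I/O running */
};

static void endpoint_unuse(struct endpoint *ep)
{
	if (atomic_fetch_sub(&ep->active_users, 1) == 1)
		printf("last user: stop I/O thread\n"); /* kthread_stop() */
}

static void endpoint_put(struct endpoint *ep)
{
	if (atomic_fetch_sub(&ep->ref, 1) == 1)
		printf("last ref: free endpoint\n");    /* call_rcu() + kfree() */
}
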
 
@@ -381,7 +376,7 @@ void rxrpc_unuse_local(struct rxrpc_local *local)
  * Closing the socket cannot be done from bottom half context or RCU callback
  * context because it might sleep.
  */
-static void rxrpc_local_destroyer(struct rxrpc_local *local)
+void rxrpc_destroy_local(struct rxrpc_local *local)
 {
        struct socket *socket = local->socket;
        struct rxrpc_net *rxnet = local->rxnet;
@@ -408,52 +403,7 @@ static void rxrpc_local_destroyer(struct rxrpc_local *local)
        /* At this point, there should be no more packets coming in to the
         * local endpoint.
         */
-       rxrpc_purge_queue(&local->reject_queue);
-       rxrpc_purge_queue(&local->event_queue);
-}
-
-/*
- * Process events on an endpoint.  The work item carries a ref which
- * we must release.
- */
-static void rxrpc_local_processor(struct work_struct *work)
-{
-       struct rxrpc_local *local =
-               container_of(work, struct rxrpc_local, processor);
-       bool again;
-
-       if (local->dead)
-               return;
-
-       trace_rxrpc_local(local->debug_id, rxrpc_local_processing,
-                         refcount_read(&local->ref), NULL);
-
-       do {
-               again = false;
-               if (!__rxrpc_use_local(local)) {
-                       rxrpc_local_destroyer(local);
-                       break;
-               }
-
-               if (!list_empty(&local->ack_tx_queue)) {
-                       rxrpc_transmit_ack_packets(local);
-                       again = true;
-               }
-
-               if (!skb_queue_empty(&local->reject_queue)) {
-                       rxrpc_reject_packets(local);
-                       again = true;
-               }
-
-               if (!skb_queue_empty(&local->event_queue)) {
-                       rxrpc_process_local_events(local);
-                       again = true;
-               }
-
-               __rxrpc_unuse_local(local);
-       } while (again);
-
-       rxrpc_put_local(local);
+       rxrpc_purge_queue(&local->rx_queue);
 }
 
 /*
@@ -463,13 +413,8 @@ static void rxrpc_local_rcu(struct rcu_head *rcu)
 {
        struct rxrpc_local *local = container_of(rcu, struct rxrpc_local, rcu);
 
-       _enter("%d", local->debug_id);
-
-       ASSERT(!work_pending(&local->processor));
-
-       _net("DESTROY LOCAL %d", local->debug_id);
+       rxrpc_see_local(local, rxrpc_local_free);
        kfree(local);
-       _leave("");
 }
 
 /*
index 84242c0..5905530 100644 (file)
@@ -65,7 +65,7 @@ static __net_init int rxrpc_init_net(struct net *net)
        atomic_set(&rxnet->nr_client_conns, 0);
        rxnet->kill_all_client_conns = false;
        spin_lock_init(&rxnet->client_conn_cache_lock);
-       spin_lock_init(&rxnet->client_conn_discard_lock);
+       mutex_init(&rxnet->client_conn_discard_lock);
        INIT_LIST_HEAD(&rxnet->idle_client_conns);
        INIT_WORK(&rxnet->client_conn_reaper,
                  rxrpc_discard_expired_client_conns);
index c5eed0e..3d8c9f8 100644 (file)
@@ -142,8 +142,8 @@ retry:
                txb->ack.reason = RXRPC_ACK_IDLE;
        }
 
-       mtu = conn->params.peer->if_mtu;
-       mtu -= conn->params.peer->hdrsize;
+       mtu = conn->peer->if_mtu;
+       mtu -= conn->peer->hdrsize;
        jmax = rxrpc_rx_jumbo_max;
        qsize = (window - 1) - call->rx_consumed;
        rsize = max_t(int, call->rx_winsize - qsize, 0);
@@ -203,12 +203,11 @@ static void rxrpc_cancel_rtt_probe(struct rxrpc_call *call,
 }
 
 /*
- * Send an ACK call packet.
+ * Transmit an ACK packet.
  */
-static int rxrpc_send_ack_packet(struct rxrpc_local *local, struct rxrpc_txbuf *txb)
+int rxrpc_send_ack_packet(struct rxrpc_call *call, struct rxrpc_txbuf *txb)
 {
        struct rxrpc_connection *conn;
-       struct rxrpc_call *call = txb->call;
        struct msghdr msg;
        struct kvec iov[1];
        rxrpc_serial_t serial;
@@ -229,11 +228,6 @@ static int rxrpc_send_ack_packet(struct rxrpc_local *local, struct rxrpc_txbuf *
        if (txb->ack.reason == RXRPC_ACK_PING)
                txb->wire.flags |= RXRPC_REQUEST_ACK;
 
-       if (txb->ack.reason == RXRPC_ACK_DELAY)
-               clear_bit(RXRPC_CALL_DELAY_ACK_PENDING, &call->flags);
-       if (txb->ack.reason == RXRPC_ACK_IDLE)
-               clear_bit(RXRPC_CALL_IDLE_ACK_PENDING, &call->flags);
-
        n = rxrpc_fill_out_ack(conn, call, txb);
        if (n == 0)
                return 0;
@@ -247,8 +241,6 @@ static int rxrpc_send_ack_packet(struct rxrpc_local *local, struct rxrpc_txbuf *
        trace_rxrpc_tx_ack(call->debug_id, serial,
                           ntohl(txb->ack.firstPacket),
                           ntohl(txb->ack.serial), txb->ack.reason, txb->ack.nAcks);
-       if (txb->ack_why == rxrpc_propose_ack_ping_for_lost_ack)
-               call->acks_lost_ping = serial;
 
        if (txb->ack.reason == RXRPC_ACK_PING)
                rtt_slot = rxrpc_begin_rtt_probe(call, serial, rxrpc_rtt_tx_ping);
@@ -259,7 +251,7 @@ static int rxrpc_send_ack_packet(struct rxrpc_local *local, struct rxrpc_txbuf *
        txb->ack.previousPacket = htonl(call->rx_highest_seq);
 
        iov_iter_kvec(&msg.msg_iter, WRITE, iov, 1, len);
-       ret = do_udp_sendmsg(conn->params.local->socket, &msg, len);
+       ret = do_udp_sendmsg(conn->local->socket, &msg, len);
        call->peer->last_tx_at = ktime_get_seconds();
        if (ret < 0)
                trace_rxrpc_tx_fail(call->debug_id, serial, ret,
@@ -279,44 +271,6 @@ static int rxrpc_send_ack_packet(struct rxrpc_local *local, struct rxrpc_txbuf *
 }
 
 /*
- * ACK transmitter for a local endpoint.  The UDP socket locks around each
- * transmission, so we can only transmit one packet at a time, ACK, DATA or
- * otherwise.
- */
-void rxrpc_transmit_ack_packets(struct rxrpc_local *local)
-{
-       LIST_HEAD(queue);
-       int ret;
-
-       trace_rxrpc_local(local->debug_id, rxrpc_local_tx_ack,
-                         refcount_read(&local->ref), NULL);
-
-       if (list_empty(&local->ack_tx_queue))
-               return;
-
-       spin_lock_bh(&local->ack_tx_lock);
-       list_splice_tail_init(&local->ack_tx_queue, &queue);
-       spin_unlock_bh(&local->ack_tx_lock);
-
-       while (!list_empty(&queue)) {
-               struct rxrpc_txbuf *txb =
-                       list_entry(queue.next, struct rxrpc_txbuf, tx_link);
-
-               ret = rxrpc_send_ack_packet(local, txb);
-               if (ret < 0 && ret != -ECONNRESET) {
-                       spin_lock_bh(&local->ack_tx_lock);
-                       list_splice_init(&queue, &local->ack_tx_queue);
-                       spin_unlock_bh(&local->ack_tx_lock);
-                       break;
-               }
-
-               list_del_init(&txb->tx_link);
-               rxrpc_put_call(txb->call, rxrpc_call_put);
-               rxrpc_put_txbuf(txb, rxrpc_txbuf_put_ack_tx);
-       }
-}
-
-/*
  * Send an ABORT call packet.
  */
 int rxrpc_send_abort_packet(struct rxrpc_call *call)
@@ -358,7 +312,7 @@ int rxrpc_send_abort_packet(struct rxrpc_call *call)
        pkt.whdr.userStatus     = 0;
        pkt.whdr.securityIndex  = call->security_ix;
        pkt.whdr._rsvd          = 0;
-       pkt.whdr.serviceId      = htons(call->service_id);
+       pkt.whdr.serviceId      = htons(call->dest_srx.srx_service);
        pkt.abort_code          = htonl(call->abort_code);
 
        iov[0].iov_base = &pkt;
@@ -368,8 +322,8 @@ int rxrpc_send_abort_packet(struct rxrpc_call *call)
        pkt.whdr.serial = htonl(serial);
 
        iov_iter_kvec(&msg.msg_iter, WRITE, iov, 1, sizeof(pkt));
-       ret = do_udp_sendmsg(conn->params.local->socket, &msg, sizeof(pkt));
-       conn->params.peer->last_tx_at = ktime_get_seconds();
+       ret = do_udp_sendmsg(conn->local->socket, &msg, sizeof(pkt));
+       conn->peer->last_tx_at = ktime_get_seconds();
        if (ret < 0)
                trace_rxrpc_tx_fail(call->debug_id, serial, ret,
                                    rxrpc_tx_point_call_abort);
@@ -395,12 +349,6 @@ int rxrpc_send_data_packet(struct rxrpc_call *call, struct rxrpc_txbuf *txb)
 
        _enter("%x,{%d}", txb->seq, txb->len);
 
-       if (hlist_unhashed(&call->error_link)) {
-               spin_lock_bh(&call->peer->lock);
-               hlist_add_head_rcu(&call->error_link, &call->peer->error_targets);
-               spin_unlock_bh(&call->peer->lock);
-       }
-
        /* Each transmission of a Tx packet needs a new serial number */
        serial = atomic_inc_return(&conn->serial);
        txb->wire.serial = htonl(serial);
@@ -466,6 +414,14 @@ dont_set_request_ack:
 
        trace_rxrpc_tx_data(call, txb->seq, serial, txb->wire.flags,
                            test_bit(RXRPC_TXBUF_RESENT, &txb->flags), false);
+
+       /* Track what we've attempted to transmit at least once so that the
+        * retransmission algorithm doesn't try to resend what we haven't sent
+        * yet.  However, this can race as we can receive an ACK before we get
+        * to this point.  But, OTOH, if we won't get an ACK mentioning this
+        * packet unless the far side received it (though it could have
+        * discarded it anyway and NAK'd it).
+        */
        cmpxchg(&call->tx_transmitted, txb->seq - 1, txb->seq);
 
        /* send the packet with the don't fragment bit set if we currently
@@ -473,7 +429,7 @@ dont_set_request_ack:
        if (txb->len >= call->peer->maxdata)
                goto send_fragmentable;
 
-       down_read(&conn->params.local->defrag_sem);
+       down_read(&conn->local->defrag_sem);
 
        txb->last_sent = ktime_get_real();
        if (txb->wire.flags & RXRPC_REQUEST_ACK)
@@ -486,11 +442,12 @@ dont_set_request_ack:
         *     message and update the peer record
         */
        rxrpc_inc_stat(call->rxnet, stat_tx_data_send);
-       ret = do_udp_sendmsg(conn->params.local->socket, &msg, len);
-       conn->params.peer->last_tx_at = ktime_get_seconds();
+       ret = do_udp_sendmsg(conn->local->socket, &msg, len);
+       conn->peer->last_tx_at = ktime_get_seconds();
 
-       up_read(&conn->params.local->defrag_sem);
+       up_read(&conn->local->defrag_sem);
        if (ret < 0) {
+               rxrpc_inc_stat(call->rxnet, stat_tx_data_send_fail);
                rxrpc_cancel_rtt_probe(call, serial, rtt_slot);
                trace_rxrpc_tx_fail(call->debug_id, serial, ret,
                                    rxrpc_tx_point_call_data_nofrag);
@@ -549,22 +506,22 @@ send_fragmentable:
        /* attempt to send this message with fragmentation enabled */
        _debug("send fragment");
 
-       down_write(&conn->params.local->defrag_sem);
+       down_write(&conn->local->defrag_sem);
 
        txb->last_sent = ktime_get_real();
        if (txb->wire.flags & RXRPC_REQUEST_ACK)
                rtt_slot = rxrpc_begin_rtt_probe(call, serial, rxrpc_rtt_tx_data);
 
-       switch (conn->params.local->srx.transport.family) {
+       switch (conn->local->srx.transport.family) {
        case AF_INET6:
        case AF_INET:
-               ip_sock_set_mtu_discover(conn->params.local->socket->sk,
+               ip_sock_set_mtu_discover(conn->local->socket->sk,
                                         IP_PMTUDISC_DONT);
                rxrpc_inc_stat(call->rxnet, stat_tx_data_send_frag);
-               ret = do_udp_sendmsg(conn->params.local->socket, &msg, len);
-               conn->params.peer->last_tx_at = ktime_get_seconds();
+               ret = do_udp_sendmsg(conn->local->socket, &msg, len);
+               conn->peer->last_tx_at = ktime_get_seconds();
 
-               ip_sock_set_mtu_discover(conn->params.local->socket->sk,
+               ip_sock_set_mtu_discover(conn->local->socket->sk,
                                         IP_PMTUDISC_DO);
                break;
 
@@ -573,6 +530,7 @@ send_fragmentable:
        }
 
        if (ret < 0) {
+               rxrpc_inc_stat(call->rxnet, stat_tx_data_send_fail);
                rxrpc_cancel_rtt_probe(call, serial, rtt_slot);
                trace_rxrpc_tx_fail(call->debug_id, serial, ret,
                                    rxrpc_tx_point_call_data_frag);
@@ -582,26 +540,25 @@ send_fragmentable:
        }
        rxrpc_tx_backoff(call, ret);
 
-       up_write(&conn->params.local->defrag_sem);
+       up_write(&conn->local->defrag_sem);
        goto done;
 }
 
 /*
- * reject packets through the local endpoint
+ * Reject a packet through the local endpoint.
  */
-void rxrpc_reject_packets(struct rxrpc_local *local)
+void rxrpc_reject_packet(struct rxrpc_local *local, struct sk_buff *skb)
 {
-       struct sockaddr_rxrpc srx;
-       struct rxrpc_skb_priv *sp;
        struct rxrpc_wire_header whdr;
-       struct sk_buff *skb;
+       struct sockaddr_rxrpc srx;
+       struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
        struct msghdr msg;
        struct kvec iov[2];
        size_t size;
        __be32 code;
        int ret, ioc;
 
-       _enter("%d", local->debug_id);
+       rxrpc_see_skb(skb, rxrpc_skb_see_reject);
 
        iov[0].iov_base = &whdr;
        iov[0].iov_len = sizeof(whdr);
@@ -615,52 +572,42 @@ void rxrpc_reject_packets(struct rxrpc_local *local)
 
        memset(&whdr, 0, sizeof(whdr));
 
-       while ((skb = skb_dequeue(&local->reject_queue))) {
-               rxrpc_see_skb(skb, rxrpc_skb_seen);
-               sp = rxrpc_skb(skb);
+       switch (skb->mark) {
+       case RXRPC_SKB_MARK_REJECT_BUSY:
+               whdr.type = RXRPC_PACKET_TYPE_BUSY;
+               size = sizeof(whdr);
+               ioc = 1;
+               break;
+       case RXRPC_SKB_MARK_REJECT_ABORT:
+               whdr.type = RXRPC_PACKET_TYPE_ABORT;
+               code = htonl(skb->priority);
+               size = sizeof(whdr) + sizeof(code);
+               ioc = 2;
+               break;
+       default:
+               return;
+       }
 
-               switch (skb->mark) {
-               case RXRPC_SKB_MARK_REJECT_BUSY:
-                       whdr.type = RXRPC_PACKET_TYPE_BUSY;
-                       size = sizeof(whdr);
-                       ioc = 1;
-                       break;
-               case RXRPC_SKB_MARK_REJECT_ABORT:
-                       whdr.type = RXRPC_PACKET_TYPE_ABORT;
-                       code = htonl(skb->priority);
-                       size = sizeof(whdr) + sizeof(code);
-                       ioc = 2;
-                       break;
-               default:
-                       rxrpc_free_skb(skb, rxrpc_skb_freed);
-                       continue;
-               }
+       if (rxrpc_extract_addr_from_skb(&srx, skb) == 0) {
+               msg.msg_namelen = srx.transport_len;
 
-               if (rxrpc_extract_addr_from_skb(&srx, skb) == 0) {
-                       msg.msg_namelen = srx.transport_len;
-
-                       whdr.epoch      = htonl(sp->hdr.epoch);
-                       whdr.cid        = htonl(sp->hdr.cid);
-                       whdr.callNumber = htonl(sp->hdr.callNumber);
-                       whdr.serviceId  = htons(sp->hdr.serviceId);
-                       whdr.flags      = sp->hdr.flags;
-                       whdr.flags      ^= RXRPC_CLIENT_INITIATED;
-                       whdr.flags      &= RXRPC_CLIENT_INITIATED;
-
-                       iov_iter_kvec(&msg.msg_iter, WRITE, iov, ioc, size);
-                       ret = do_udp_sendmsg(local->socket, &msg, size);
-                       if (ret < 0)
-                               trace_rxrpc_tx_fail(local->debug_id, 0, ret,
-                                                   rxrpc_tx_point_reject);
-                       else
-                               trace_rxrpc_tx_packet(local->debug_id, &whdr,
-                                                     rxrpc_tx_point_reject);
-               }
+               whdr.epoch      = htonl(sp->hdr.epoch);
+               whdr.cid        = htonl(sp->hdr.cid);
+               whdr.callNumber = htonl(sp->hdr.callNumber);
+               whdr.serviceId  = htons(sp->hdr.serviceId);
+               whdr.flags      = sp->hdr.flags;
+               whdr.flags      ^= RXRPC_CLIENT_INITIATED;
+               whdr.flags      &= RXRPC_CLIENT_INITIATED;
 
-               rxrpc_free_skb(skb, rxrpc_skb_freed);
+               iov_iter_kvec(&msg.msg_iter, WRITE, iov, ioc, size);
+               ret = do_udp_sendmsg(local->socket, &msg, size);
+               if (ret < 0)
+                       trace_rxrpc_tx_fail(local->debug_id, 0, ret,
+                                           rxrpc_tx_point_reject);
+               else
+                       trace_rxrpc_tx_packet(local->debug_id, &whdr,
+                                             rxrpc_tx_point_reject);
        }
-
-       _leave("");
 }
 
 /*
@@ -701,8 +648,6 @@ void rxrpc_send_keepalive(struct rxrpc_peer *peer)
 
        len = iov[0].iov_len + iov[1].iov_len;
 
-       _proto("Tx VERSION (keepalive)");
-
        iov_iter_kvec(&msg.msg_iter, WRITE, iov, 2, len);
        ret = do_udp_sendmsg(peer->local->socket, &msg, len);
        if (ret < 0)
@@ -715,3 +660,43 @@ void rxrpc_send_keepalive(struct rxrpc_peer *peer)
        peer->last_tx_at = ktime_get_seconds();
        _leave("");
 }
+
+/*
+ * Schedule an instant Tx resend.
+ */
+static inline void rxrpc_instant_resend(struct rxrpc_call *call,
+                                       struct rxrpc_txbuf *txb)
+{
+       if (call->state < RXRPC_CALL_COMPLETE)
+               kdebug("resend");
+}
+
+/*
+ * Transmit one packet.
+ */
+void rxrpc_transmit_one(struct rxrpc_call *call, struct rxrpc_txbuf *txb)
+{
+       int ret;
+
+       ret = rxrpc_send_data_packet(call, txb);
+       if (ret < 0) {
+               switch (ret) {
+               case -ENETUNREACH:
+               case -EHOSTUNREACH:
+               case -ECONNREFUSED:
+                       rxrpc_set_call_completion(call, RXRPC_CALL_LOCAL_ERROR,
+                                                 0, ret);
+                       break;
+               default:
+                       _debug("need instant resend %d", ret);
+                       rxrpc_instant_resend(call, txb);
+               }
+       } else {
+               unsigned long now = jiffies;
+               unsigned long resend_at = now + call->peer->rto_j;
+
+               WRITE_ONCE(call->resend_at, resend_at);
+               rxrpc_reduce_call_timer(call, resend_at, now,
+                                       rxrpc_timer_set_for_send);
+       }
+}
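
[rxrpc_reduce_call_timer(), used above to arm the resend timer, is assumed
to behave like the kernel's timer_reduce(): the deadline only ever moves
earlier, so the soonest requested expiry wins. A sketch of that contract:]

static void reduce_timer(unsigned long *expires, unsigned long want)
{
	if ((long)(want - *expires) < 0)   /* wrap-safe time_before() */
		*expires = want;           /* only ever pull the deadline in */
}
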
index cda3890..6685bf9 100644 (file)
@@ -18,9 +18,9 @@
 #include <net/ip.h>
 #include "ar-internal.h"
 
-static void rxrpc_store_error(struct rxrpc_peer *, struct sock_exterr_skb *);
-static void rxrpc_distribute_error(struct rxrpc_peer *, int,
-                                  enum rxrpc_call_completion);
+static void rxrpc_store_error(struct rxrpc_peer *, struct sk_buff *);
+static void rxrpc_distribute_error(struct rxrpc_peer *, struct sk_buff *,
+                                  enum rxrpc_call_completion, int);
 
 /*
  * Find the peer associated with a local error.
@@ -48,13 +48,11 @@ static struct rxrpc_peer *rxrpc_lookup_peer_local_rcu(struct rxrpc_local *local,
                srx->transport.sin.sin_port = serr->port;
                switch (serr->ee.ee_origin) {
                case SO_EE_ORIGIN_ICMP:
-                       _net("Rx ICMP");
                        memcpy(&srx->transport.sin.sin_addr,
                               skb_network_header(skb) + serr->addr_offset,
                               sizeof(struct in_addr));
                        break;
                case SO_EE_ORIGIN_ICMP6:
-                       _net("Rx ICMP6 on v4 sock");
                        memcpy(&srx->transport.sin.sin_addr,
                               skb_network_header(skb) + serr->addr_offset + 12,
                               sizeof(struct in_addr));
@@ -70,14 +68,12 @@ static struct rxrpc_peer *rxrpc_lookup_peer_local_rcu(struct rxrpc_local *local,
        case AF_INET6:
                switch (serr->ee.ee_origin) {
                case SO_EE_ORIGIN_ICMP6:
-                       _net("Rx ICMP6");
                        srx->transport.sin6.sin6_port = serr->port;
                        memcpy(&srx->transport.sin6.sin6_addr,
                               skb_network_header(skb) + serr->addr_offset,
                               sizeof(struct in6_addr));
                        break;
                case SO_EE_ORIGIN_ICMP:
-                       _net("Rx ICMP on v6 sock");
                        srx->transport_len = sizeof(srx->transport.sin);
                        srx->transport.family = AF_INET;
                        srx->transport.sin.sin_port = serr->port;
@@ -106,13 +102,9 @@ static struct rxrpc_peer *rxrpc_lookup_peer_local_rcu(struct rxrpc_local *local,
  */
 static void rxrpc_adjust_mtu(struct rxrpc_peer *peer, unsigned int mtu)
 {
-       _net("Rx ICMP Fragmentation Needed (%d)", mtu);
-
        /* wind down the local interface MTU */
-       if (mtu > 0 && peer->if_mtu == 65535 && mtu < peer->if_mtu) {
+       if (mtu > 0 && peer->if_mtu == 65535 && mtu < peer->if_mtu)
                peer->if_mtu = mtu;
-               _net("I/F MTU %u", mtu);
-       }
 
        if (mtu == 0) {
                /* they didn't give us a size, estimate one */
@@ -129,63 +121,36 @@ static void rxrpc_adjust_mtu(struct rxrpc_peer *peer, unsigned int mtu)
        }
 
        if (mtu < peer->mtu) {
-               spin_lock_bh(&peer->lock);
+               spin_lock(&peer->lock);
                peer->mtu = mtu;
                peer->maxdata = peer->mtu - peer->hdrsize;
-               spin_unlock_bh(&peer->lock);
-               _net("Net MTU %u (maxdata %u)",
-                    peer->mtu, peer->maxdata);
+               spin_unlock(&peer->lock);
        }
 }
 
 /*
  * Handle an error received on the local endpoint.
  */
-void rxrpc_error_report(struct sock *sk)
+void rxrpc_input_error(struct rxrpc_local *local, struct sk_buff *skb)
 {
-       struct sock_exterr_skb *serr;
+       struct sock_exterr_skb *serr = SKB_EXT_ERR(skb);
        struct sockaddr_rxrpc srx;
-       struct rxrpc_local *local;
        struct rxrpc_peer *peer = NULL;
-       struct sk_buff *skb;
 
-       rcu_read_lock();
-       local = rcu_dereference_sk_user_data(sk);
-       if (unlikely(!local)) {
-               rcu_read_unlock();
-               return;
-       }
-       _enter("%p{%d}", sk, local->debug_id);
-
-       /* Clear the outstanding error value on the socket so that it doesn't
-        * cause kernel_sendmsg() to return it later.
-        */
-       sock_error(sk);
+       _enter("L=%x", local->debug_id);
 
-       skb = sock_dequeue_err_skb(sk);
-       if (!skb) {
-               rcu_read_unlock();
-               _leave("UDP socket errqueue empty");
-               return;
-       }
-       rxrpc_new_skb(skb, rxrpc_skb_received);
-       serr = SKB_EXT_ERR(skb);
        if (!skb->len && serr->ee.ee_origin == SO_EE_ORIGIN_TIMESTAMPING) {
                _leave("UDP empty message");
-               rcu_read_unlock();
-               rxrpc_free_skb(skb, rxrpc_skb_freed);
                return;
        }
 
+       rcu_read_lock();
        peer = rxrpc_lookup_peer_local_rcu(local, skb, &srx);
-       if (peer && !rxrpc_get_peer_maybe(peer))
+       if (peer && !rxrpc_get_peer_maybe(peer, rxrpc_peer_get_input_error))
                peer = NULL;
-       if (!peer) {
-               rcu_read_unlock();
-               rxrpc_free_skb(skb, rxrpc_skb_freed);
-               _leave(" [no peer]");
+       rcu_read_unlock();
+       if (!peer)
                return;
-       }
 
        trace_rxrpc_rx_icmp(peer, &serr->ee, &srx);
 
@@ -196,72 +161,26 @@ void rxrpc_error_report(struct sock *sk)
                goto out;
        }
 
-       rxrpc_store_error(peer, serr);
+       rxrpc_store_error(peer, skb);
 out:
-       rcu_read_unlock();
-       rxrpc_free_skb(skb, rxrpc_skb_freed);
-       rxrpc_put_peer(peer);
-
-       _leave("");
+       rxrpc_put_peer(peer, rxrpc_peer_put_input_error);
 }
 
 /*
  * Map an error report to error codes on the peer record.
  */
-static void rxrpc_store_error(struct rxrpc_peer *peer,
-                             struct sock_exterr_skb *serr)
+static void rxrpc_store_error(struct rxrpc_peer *peer, struct sk_buff *skb)
 {
        enum rxrpc_call_completion compl = RXRPC_CALL_NETWORK_ERROR;
-       struct sock_extended_err *ee;
-       int err;
+       struct sock_exterr_skb *serr = SKB_EXT_ERR(skb);
+       struct sock_extended_err *ee = &serr->ee;
+       int err = ee->ee_errno;
 
        _enter("");
 
-       ee = &serr->ee;
-
-       err = ee->ee_errno;
-
        switch (ee->ee_origin) {
-       case SO_EE_ORIGIN_ICMP:
-               switch (ee->ee_type) {
-               case ICMP_DEST_UNREACH:
-                       switch (ee->ee_code) {
-                       case ICMP_NET_UNREACH:
-                               _net("Rx Received ICMP Network Unreachable");
-                               break;
-                       case ICMP_HOST_UNREACH:
-                               _net("Rx Received ICMP Host Unreachable");
-                               break;
-                       case ICMP_PORT_UNREACH:
-                               _net("Rx Received ICMP Port Unreachable");
-                               break;
-                       case ICMP_NET_UNKNOWN:
-                               _net("Rx Received ICMP Unknown Network");
-                               break;
-                       case ICMP_HOST_UNKNOWN:
-                               _net("Rx Received ICMP Unknown Host");
-                               break;
-                       default:
-                               _net("Rx Received ICMP DestUnreach code=%u",
-                                    ee->ee_code);
-                               break;
-                       }
-                       break;
-
-               case ICMP_TIME_EXCEEDED:
-                       _net("Rx Received ICMP TTL Exceeded");
-                       break;
-
-               default:
-                       _proto("Rx Received ICMP error { type=%u code=%u }",
-                              ee->ee_type, ee->ee_code);
-                       break;
-               }
-               break;
-
        case SO_EE_ORIGIN_NONE:
        case SO_EE_ORIGIN_LOCAL:
-               _proto("Rx Received local error { error=%d }", err);
                compl = RXRPC_CALL_LOCAL_ERROR;
                break;
 
@@ -269,26 +188,40 @@ static void rxrpc_store_error(struct rxrpc_peer *peer,
                if (err == EACCES)
                        err = EHOSTUNREACH;
                fallthrough;
+       case SO_EE_ORIGIN_ICMP:
        default:
-               _proto("Rx Received error report { orig=%u }", ee->ee_origin);
                break;
        }
 
-       rxrpc_distribute_error(peer, err, compl);
+       rxrpc_distribute_error(peer, skb, compl, err);
 }
 
 /*
  * Distribute an error that occurred on a peer.
  */
-static void rxrpc_distribute_error(struct rxrpc_peer *peer, int error,
-                                  enum rxrpc_call_completion compl)
+static void rxrpc_distribute_error(struct rxrpc_peer *peer, struct sk_buff *skb,
+                                  enum rxrpc_call_completion compl, int err)
 {
        struct rxrpc_call *call;
+       HLIST_HEAD(error_targets);
 
-       hlist_for_each_entry_rcu(call, &peer->error_targets, error_link) {
-               rxrpc_see_call(call);
-               rxrpc_set_call_completion(call, compl, 0, -error);
+       spin_lock(&peer->lock);
+       hlist_move_list(&peer->error_targets, &error_targets);
+
+       while (!hlist_empty(&error_targets)) {
+               call = hlist_entry(error_targets.first,
+                                  struct rxrpc_call, error_link);
+               hlist_del_init(&call->error_link);
+               spin_unlock(&peer->lock);
+
+               rxrpc_see_call(call, rxrpc_call_see_distribute_error);
+               rxrpc_set_call_completion(call, compl, 0, -err);
+               rxrpc_input_call_event(call, skb);
+
+               spin_lock(&peer->lock);
        }
+
+       spin_unlock(&peer->lock);
 }
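
[A userspace sketch of the distribution pattern in rxrpc_distribute_error()
above: detach the whole list under the lock, then deliver to each call
without it held. The kernel version re-takes peer->lock around each removal
because calls can be re-added concurrently; this simplified form assumes a
private snapshot suffices. All names are illustrative.]

#include <pthread.h>
#include <stddef.h>

struct node { struct node *next; };

struct peer_sketch {
	pthread_mutex_t lock;
	struct node *targets;           /* stand-in for peer->error_targets */
};

static void distribute(struct peer_sketch *p, void (*deliver)(struct node *))
{
	struct node *list, *n;

	pthread_mutex_lock(&p->lock);
	list = p->targets;              /* detach the whole list under lock */
	p->targets = NULL;
	pthread_mutex_unlock(&p->lock);

	while ((n = list) != NULL) {    /* per-call work runs unlocked */
		list = n->next;
		deliver(n);
	}
}
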
 
 /*
@@ -304,18 +237,18 @@ static void rxrpc_peer_keepalive_dispatch(struct rxrpc_net *rxnet,
        time64_t keepalive_at;
        int slot;
 
-       spin_lock_bh(&rxnet->peer_hash_lock);
+       spin_lock(&rxnet->peer_hash_lock);
 
        while (!list_empty(collector)) {
                peer = list_entry(collector->next,
                                  struct rxrpc_peer, keepalive_link);
 
                list_del_init(&peer->keepalive_link);
-               if (!rxrpc_get_peer_maybe(peer))
+               if (!rxrpc_get_peer_maybe(peer, rxrpc_peer_get_keepalive))
                        continue;
 
-               if (__rxrpc_use_local(peer->local)) {
-                       spin_unlock_bh(&rxnet->peer_hash_lock);
+               if (__rxrpc_use_local(peer->local, rxrpc_local_use_peer_keepalive)) {
+                       spin_unlock(&rxnet->peer_hash_lock);
 
                        keepalive_at = peer->last_tx_at + RXRPC_KEEPALIVE_TIME;
                        slot = keepalive_at - base;
@@ -334,15 +267,15 @@ static void rxrpc_peer_keepalive_dispatch(struct rxrpc_net *rxnet,
                         */
                        slot += cursor;
                        slot &= mask;
-                       spin_lock_bh(&rxnet->peer_hash_lock);
+                       spin_lock(&rxnet->peer_hash_lock);
                        list_add_tail(&peer->keepalive_link,
                                      &rxnet->peer_keepalive[slot & mask]);
-                       rxrpc_unuse_local(peer->local);
+                       rxrpc_unuse_local(peer->local, rxrpc_local_unuse_peer_keepalive);
                }
-               rxrpc_put_peer_locked(peer);
+               rxrpc_put_peer_locked(peer, rxrpc_peer_put_keepalive);
        }
 
-       spin_unlock_bh(&rxnet->peer_hash_lock);
+       spin_unlock(&rxnet->peer_hash_lock);
 }
 
 /*
@@ -372,7 +305,7 @@ void rxrpc_peer_keepalive_worker(struct work_struct *work)
         * second; the bucket at cursor + 1 goes at now + 1s and so
         * on...
         */
-       spin_lock_bh(&rxnet->peer_hash_lock);
+       spin_lock(&rxnet->peer_hash_lock);
        list_splice_init(&rxnet->peer_keepalive_new, &collector);
 
        stop = cursor + ARRAY_SIZE(rxnet->peer_keepalive);
@@ -384,7 +317,7 @@ void rxrpc_peer_keepalive_worker(struct work_struct *work)
        }
 
        base = now;
-       spin_unlock_bh(&rxnet->peer_hash_lock);
+       spin_unlock(&rxnet->peer_hash_lock);
 
        rxnet->peer_keepalive_base = base;
        rxnet->peer_keepalive_cursor = cursor;
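
A note on the slot arithmetic in the keepalive hunks above: each peer's next
keepalive time becomes an offset from the worker's time base, is rotated by the
current cursor, and is masked onto the bucket ring. A standalone sketch of that
calculation, assuming a 32-slot ring and a 20-second keepalive period (both
assumed for illustration, not taken from the excerpt):

    /* Standalone sketch of the keepalive slot maths; 32 slots and a 20s
     * period are assumptions.  The kernel also clamps slots that fall
     * outside the ring, which the excerpted hunks elide.
     */
    #include <stdio.h>

    #define NR_SLOTS       32   /* assumed size of peer_keepalive[] */
    #define KEEPALIVE_TIME 20   /* assumed keepalive period, seconds */

    static unsigned int keepalive_slot(long long base, unsigned int cursor,
                                       long long last_tx_at)
    {
        const unsigned int mask = NR_SLOTS - 1;
        long long keepalive_at = last_tx_at + KEEPALIVE_TIME;
        long long slot = keepalive_at - base;   /* seconds until due */

        if (slot < 0)
            slot = 0;                   /* already due: service now */
        if (slot > mask)
            slot = mask;                /* beyond the ring: use last slot */

        return (slot + cursor) & mask;  /* rotate by the worker's cursor */
    }

    int main(void)
    {
        /* Sent 13s ago with a 20s period: due in 7s, so cursor + 7. */
        printf("slot=%u\n", keepalive_slot(1000, 5, 987));
        return 0;
    }

The mask makes the ring wrap for free; a peer due more than a ring's worth of
seconds away simply parks in the furthest slot until the cursor comes round again.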
diff --git a/net/rxrpc/peer_object.c b/net/rxrpc/peer_object.c
index 041a512..608946d 100644
@@ -138,10 +138,8 @@ struct rxrpc_peer *rxrpc_lookup_peer_rcu(struct rxrpc_local *local,
        unsigned long hash_key = rxrpc_peer_hash_key(local, srx);
 
        peer = __rxrpc_lookup_peer_rcu(local, srx, hash_key);
-       if (peer) {
-               _net("PEER %d {%pISp}", peer->debug_id, &peer->srx.transport);
+       if (peer)
                _leave(" = %p {u=%d}", peer, refcount_read(&peer->ref));
-       }
        return peer;
 }
 
@@ -207,9 +205,9 @@ static void rxrpc_assess_MTU_size(struct rxrpc_sock *rx,
 /*
  * Allocate a peer.
  */
-struct rxrpc_peer *rxrpc_alloc_peer(struct rxrpc_local *local, gfp_t gfp)
+struct rxrpc_peer *rxrpc_alloc_peer(struct rxrpc_local *local, gfp_t gfp,
+                                   enum rxrpc_peer_trace why)
 {
-       const void *here = __builtin_return_address(0);
        struct rxrpc_peer *peer;
 
        _enter("");
@@ -217,7 +215,7 @@ struct rxrpc_peer *rxrpc_alloc_peer(struct rxrpc_local *local, gfp_t gfp)
        peer = kzalloc(sizeof(struct rxrpc_peer), gfp);
        if (peer) {
                refcount_set(&peer->ref, 1);
-               peer->local = rxrpc_get_local(local);
+               peer->local = rxrpc_get_local(local, rxrpc_local_get_peer);
                INIT_HLIST_HEAD(&peer->error_targets);
                peer->service_conns = RB_ROOT;
                seqlock_init(&peer->service_conn_lock);
@@ -228,7 +226,7 @@ struct rxrpc_peer *rxrpc_alloc_peer(struct rxrpc_local *local, gfp_t gfp)
                rxrpc_peer_init_rtt(peer);
 
                peer->cong_ssthresh = RXRPC_TX_MAX_WINDOW;
-               trace_rxrpc_peer(peer->debug_id, rxrpc_peer_new, 1, here);
+               trace_rxrpc_peer(peer->debug_id, 1, why);
        }
 
        _leave(" = %p", peer);
@@ -284,7 +282,7 @@ static struct rxrpc_peer *rxrpc_create_peer(struct rxrpc_sock *rx,
 
        _enter("");
 
-       peer = rxrpc_alloc_peer(local, gfp);
+       peer = rxrpc_alloc_peer(local, gfp, rxrpc_peer_new_client);
        if (peer) {
                memcpy(&peer->srx, srx, sizeof(*srx));
                rxrpc_init_peer(rx, peer, hash_key);
@@ -296,7 +294,8 @@ static struct rxrpc_peer *rxrpc_create_peer(struct rxrpc_sock *rx,
 
 static void rxrpc_free_peer(struct rxrpc_peer *peer)
 {
-       rxrpc_put_local(peer->local);
+       trace_rxrpc_peer(peer->debug_id, 0, rxrpc_peer_free);
+       rxrpc_put_local(peer->local, rxrpc_local_put_peer);
        kfree_rcu(peer, rcu);
 }
 
@@ -336,7 +335,7 @@ struct rxrpc_peer *rxrpc_lookup_peer(struct rxrpc_sock *rx,
        /* search the peer list first */
        rcu_read_lock();
        peer = __rxrpc_lookup_peer_rcu(local, srx, hash_key);
-       if (peer && !rxrpc_get_peer_maybe(peer))
+       if (peer && !rxrpc_get_peer_maybe(peer, rxrpc_peer_get_lookup_client))
                peer = NULL;
        rcu_read_unlock();
 
@@ -350,11 +349,11 @@ struct rxrpc_peer *rxrpc_lookup_peer(struct rxrpc_sock *rx,
                        return NULL;
                }
 
-               spin_lock_bh(&rxnet->peer_hash_lock);
+               spin_lock(&rxnet->peer_hash_lock);
 
                /* Need to check that we aren't racing with someone else */
                peer = __rxrpc_lookup_peer_rcu(local, srx, hash_key);
-               if (peer && !rxrpc_get_peer_maybe(peer))
+               if (peer && !rxrpc_get_peer_maybe(peer, rxrpc_peer_get_lookup_client))
                        peer = NULL;
                if (!peer) {
                        hash_add_rcu(rxnet->peer_hash,
@@ -363,7 +362,7 @@ struct rxrpc_peer *rxrpc_lookup_peer(struct rxrpc_sock *rx,
                                      &rxnet->peer_keepalive_new);
                }
 
-               spin_unlock_bh(&rxnet->peer_hash_lock);
+               spin_unlock(&rxnet->peer_hash_lock);
 
                if (peer)
                        rxrpc_free_peer(candidate);
@@ -371,8 +370,6 @@ struct rxrpc_peer *rxrpc_lookup_peer(struct rxrpc_sock *rx,
                        peer = candidate;
        }
 
-       _net("PEER %d {%pISp}", peer->debug_id, &peer->srx.transport);
-
        _leave(" = %p {u=%d}", peer, refcount_read(&peer->ref));
        return peer;
 }
@@ -380,27 +377,26 @@ struct rxrpc_peer *rxrpc_lookup_peer(struct rxrpc_sock *rx,
 /*
  * Get a ref on a peer record.
  */
-struct rxrpc_peer *rxrpc_get_peer(struct rxrpc_peer *peer)
+struct rxrpc_peer *rxrpc_get_peer(struct rxrpc_peer *peer, enum rxrpc_peer_trace why)
 {
-       const void *here = __builtin_return_address(0);
        int r;
 
        __refcount_inc(&peer->ref, &r);
-       trace_rxrpc_peer(peer->debug_id, rxrpc_peer_got, r + 1, here);
+       trace_rxrpc_peer(peer->debug_id, r + 1, why);
        return peer;
 }
 
 /*
  * Get a ref on a peer record unless its usage has already reached 0.
  */
-struct rxrpc_peer *rxrpc_get_peer_maybe(struct rxrpc_peer *peer)
+struct rxrpc_peer *rxrpc_get_peer_maybe(struct rxrpc_peer *peer,
+                                       enum rxrpc_peer_trace why)
 {
-       const void *here = __builtin_return_address(0);
        int r;
 
        if (peer) {
                if (__refcount_inc_not_zero(&peer->ref, &r))
-                       trace_rxrpc_peer(peer->debug_id, rxrpc_peer_got, r + 1, here);
+                       trace_rxrpc_peer(peer->debug_id, r + 1, why);
                else
                        peer = NULL;
        }
@@ -416,10 +412,10 @@ static void __rxrpc_put_peer(struct rxrpc_peer *peer)
 
        ASSERT(hlist_empty(&peer->error_targets));
 
-       spin_lock_bh(&rxnet->peer_hash_lock);
+       spin_lock(&rxnet->peer_hash_lock);
        hash_del_rcu(&peer->hash_link);
        list_del_init(&peer->keepalive_link);
-       spin_unlock_bh(&rxnet->peer_hash_lock);
+       spin_unlock(&rxnet->peer_hash_lock);
 
        rxrpc_free_peer(peer);
 }
@@ -427,9 +423,8 @@ static void __rxrpc_put_peer(struct rxrpc_peer *peer)
 /*
  * Drop a ref on a peer record.
  */
-void rxrpc_put_peer(struct rxrpc_peer *peer)
+void rxrpc_put_peer(struct rxrpc_peer *peer, enum rxrpc_peer_trace why)
 {
-       const void *here = __builtin_return_address(0);
        unsigned int debug_id;
        bool dead;
        int r;
@@ -437,7 +432,7 @@ void rxrpc_put_peer(struct rxrpc_peer *peer)
        if (peer) {
                debug_id = peer->debug_id;
                dead = __refcount_dec_and_test(&peer->ref, &r);
-               trace_rxrpc_peer(debug_id, rxrpc_peer_put, r - 1, here);
+               trace_rxrpc_peer(debug_id, r - 1, why);
                if (dead)
                        __rxrpc_put_peer(peer);
        }
@@ -447,15 +442,14 @@ void rxrpc_put_peer(struct rxrpc_peer *peer)
  * Drop a ref on a peer record where the caller already holds the
  * peer_hash_lock.
  */
-void rxrpc_put_peer_locked(struct rxrpc_peer *peer)
+void rxrpc_put_peer_locked(struct rxrpc_peer *peer, enum rxrpc_peer_trace why)
 {
-       const void *here = __builtin_return_address(0);
        unsigned int debug_id = peer->debug_id;
        bool dead;
        int r;
 
        dead = __refcount_dec_and_test(&peer->ref, &r);
-       trace_rxrpc_peer(debug_id, rxrpc_peer_put, r - 1, here);
+       trace_rxrpc_peer(debug_id, r - 1, why);
        if (dead) {
                hash_del_rcu(&peer->hash_link);
                list_del_init(&peer->keepalive_link);
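
The shape repeated through the peer hunks above: every get/put site now passes
an explicit `why` enum instead of the old __builtin_return_address(0) cookie,
and the tracepoint receives the post-operation refcount. A minimal userspace
sketch of that pattern (the enum values, types and trace body are illustrative,
not the kernel's):

    /* Userspace sketch of the tracing shape adopted above. */
    #include <stdio.h>
    #include <stdatomic.h>

    enum peer_trace { peer_get_keepalive, peer_put_keepalive };

    static const char *why_name[] = {
        [peer_get_keepalive] = "GET keepalive",
        [peer_put_keepalive] = "PUT keepalive",
    };

    struct peer { unsigned int debug_id; atomic_int ref; };

    static void trace_peer(unsigned int debug_id, int ref, enum peer_trace why)
    {
        printf("P=%08x %-13s r=%d\n", debug_id, why_name[why], ref);
    }

    static struct peer *get_peer(struct peer *p, enum peer_trace why)
    {
        int r = atomic_fetch_add(&p->ref, 1);   /* returns the old count */

        trace_peer(p->debug_id, r + 1, why);
        return p;
    }

    static void put_peer(struct peer *p, enum peer_trace why)
    {
        int r = atomic_fetch_sub(&p->ref, 1);

        trace_peer(p->debug_id, r - 1, why);    /* freeing at zero elided */
    }

    int main(void)
    {
        struct peer p = { .debug_id = 0x2a, .ref = 1 };

        put_peer(get_peer(&p, peer_get_keepalive), peer_put_keepalive);
        return 0;
    }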
diff --git a/net/rxrpc/proc.c b/net/rxrpc/proc.c
index fae22a8..3a59591 100644
@@ -49,8 +49,6 @@ static void rxrpc_call_seq_stop(struct seq_file *seq, void *v)
 static int rxrpc_call_seq_show(struct seq_file *seq, void *v)
 {
        struct rxrpc_local *local;
-       struct rxrpc_sock *rx;
-       struct rxrpc_peer *peer;
        struct rxrpc_call *call;
        struct rxrpc_net *rxnet = rxrpc_net(seq_file_net(seq));
        unsigned long timeout = 0;
@@ -63,28 +61,19 @@ static int rxrpc_call_seq_show(struct seq_file *seq, void *v)
                         "Proto Local                                          "
                         " Remote                                         "
                         " SvID ConnID   CallID   End Use State    Abort   "
-                        " DebugId  TxSeq    TW RxSeq    RW RxSerial RxTimo\n");
+                        " DebugId  TxSeq    TW RxSeq    RW RxSerial CW RxTimo\n");
                return 0;
        }
 
        call = list_entry(v, struct rxrpc_call, link);
 
-       rx = rcu_dereference(call->socket);
-       if (rx) {
-               local = READ_ONCE(rx->local);
-               if (local)
-                       sprintf(lbuff, "%pISpc", &local->srx.transport);
-               else
-                       strcpy(lbuff, "no_local");
-       } else {
-               strcpy(lbuff, "no_socket");
-       }
-
-       peer = call->peer;
-       if (peer)
-               sprintf(rbuff, "%pISpc", &peer->srx.transport);
+       local = call->local;
+       if (local)
+               sprintf(lbuff, "%pISpc", &local->srx.transport);
        else
-               strcpy(rbuff, "no_connection");
+               strcpy(lbuff, "no_local");
+
+       sprintf(rbuff, "%pISpc", &call->dest_srx.transport);
 
        if (call->state != RXRPC_CALL_SERVER_PREALLOC) {
                timeout = READ_ONCE(call->expect_rx_by);
@@ -95,10 +84,10 @@ static int rxrpc_call_seq_show(struct seq_file *seq, void *v)
        wtmp   = atomic64_read_acquire(&call->ackr_window);
        seq_printf(seq,
                   "UDP   %-47.47s %-47.47s %4x %08x %08x %s %3u"
-                  " %-8.8s %08x %08x %08x %02x %08x %02x %08x %06lx\n",
+                  " %-8.8s %08x %08x %08x %02x %08x %02x %08x %02x %06lx\n",
                   lbuff,
                   rbuff,
-                  call->service_id,
+                  call->dest_srx.srx_service,
                   call->cid,
                   call->call_id,
                   rxrpc_is_service_call(call) ? "Svc" : "Clt",
@@ -109,6 +98,7 @@ static int rxrpc_call_seq_show(struct seq_file *seq, void *v)
                   acks_hard_ack, READ_ONCE(call->tx_top) - acks_hard_ack,
                   lower_32_bits(wtmp), upper_32_bits(wtmp) - lower_32_bits(wtmp),
                   call->rx_serial,
+                  call->cong_cwnd,
                   timeout);
 
        return 0;
@@ -159,7 +149,7 @@ static int rxrpc_connection_seq_show(struct seq_file *seq, void *v)
                seq_puts(seq,
                         "Proto Local                                          "
                         " Remote                                         "
-                        " SvID ConnID   End Use State    Key     "
+                        " SvID ConnID   End Ref Act State    Key     "
                         " Serial   ISerial  CallId0  CallId1  CallId2  CallId3\n"
                         );
                return 0;
@@ -172,12 +162,12 @@ static int rxrpc_connection_seq_show(struct seq_file *seq, void *v)
                goto print;
        }
 
-       sprintf(lbuff, "%pISpc", &conn->params.local->srx.transport);
+       sprintf(lbuff, "%pISpc", &conn->local->srx.transport);
 
-       sprintf(rbuff, "%pISpc", &conn->params.peer->srx.transport);
+       sprintf(rbuff, "%pISpc", &conn->peer->srx.transport);
 print:
        seq_printf(seq,
-                  "UDP   %-47.47s %-47.47s %4x %08x %s %3u"
+                  "UDP   %-47.47s %-47.47s %4x %08x %s %3u %3d"
                   " %s %08x %08x %08x %08x %08x %08x %08x\n",
                   lbuff,
                   rbuff,
@@ -185,8 +175,9 @@ print:
                   conn->proto.cid,
                   rxrpc_conn_is_service(conn) ? "Svc" : "Clt",
                   refcount_read(&conn->ref),
+                  atomic_read(&conn->active),
                   rxrpc_conn_states[conn->state],
-                  key_serial(conn->params.key),
+                  key_serial(conn->key),
                   atomic_read(&conn->serial),
                   conn->hi_serial,
                   conn->channels[0].call_id,
@@ -341,7 +332,7 @@ static int rxrpc_local_seq_show(struct seq_file *seq, void *v)
        if (v == SEQ_START_TOKEN) {
                seq_puts(seq,
                         "Proto Local                                          "
-                        " Use Act\n");
+                        " Use Act RxQ\n");
                return 0;
        }
 
@@ -350,10 +341,11 @@ static int rxrpc_local_seq_show(struct seq_file *seq, void *v)
        sprintf(lbuff, "%pISpc", &local->srx.transport);
 
        seq_printf(seq,
-                  "UDP   %-47.47s %3u %3u\n",
+                  "UDP   %-47.47s %3u %3u %3u\n",
                   lbuff,
                   refcount_read(&local->ref),
-                  atomic_read(&local->active_users));
+                  atomic_read(&local->active_users),
+                  local->rx_queue.qlen);
 
        return 0;
 }
@@ -407,13 +399,16 @@ int rxrpc_stats_show(struct seq_file *seq, void *v)
        struct rxrpc_net *rxnet = rxrpc_net(seq_file_single_net(seq));
 
        seq_printf(seq,
-                  "Data     : send=%u sendf=%u\n",
+                  "Data     : send=%u sendf=%u fail=%u\n",
                   atomic_read(&rxnet->stat_tx_data_send),
-                  atomic_read(&rxnet->stat_tx_data_send_frag));
+                  atomic_read(&rxnet->stat_tx_data_send_frag),
+                  atomic_read(&rxnet->stat_tx_data_send_fail));
        seq_printf(seq,
-                  "Data-Tx  : nr=%u retrans=%u\n",
+                  "Data-Tx  : nr=%u retrans=%u uf=%u cwr=%u\n",
                   atomic_read(&rxnet->stat_tx_data),
-                  atomic_read(&rxnet->stat_tx_data_retrans));
+                  atomic_read(&rxnet->stat_tx_data_retrans),
+                  atomic_read(&rxnet->stat_tx_data_underflow),
+                  atomic_read(&rxnet->stat_tx_data_cwnd_reset));
        seq_printf(seq,
                   "Data-Rx  : nr=%u reqack=%u jumbo=%u\n",
                   atomic_read(&rxnet->stat_rx_data),
@@ -462,6 +457,9 @@ int rxrpc_stats_show(struct seq_file *seq, void *v)
                   "Buffers  : txb=%u rxb=%u\n",
                   atomic_read(&rxrpc_nr_txbuf),
                   atomic_read(&rxrpc_n_rx_skbs));
+       seq_printf(seq,
+                  "IO-thread: loops=%u\n",
+                  atomic_read(&rxnet->stat_io_loop));
        return 0;
 }
 
@@ -478,8 +476,11 @@ int rxrpc_stats_clear(struct file *file, char *buf, size_t size)
 
        atomic_set(&rxnet->stat_tx_data, 0);
        atomic_set(&rxnet->stat_tx_data_retrans, 0);
+       atomic_set(&rxnet->stat_tx_data_underflow, 0);
+       atomic_set(&rxnet->stat_tx_data_cwnd_reset, 0);
        atomic_set(&rxnet->stat_tx_data_send, 0);
        atomic_set(&rxnet->stat_tx_data_send_frag, 0);
+       atomic_set(&rxnet->stat_tx_data_send_fail, 0);
        atomic_set(&rxnet->stat_rx_data, 0);
        atomic_set(&rxnet->stat_rx_data_reqack, 0);
        atomic_set(&rxnet->stat_rx_data_jumbo, 0);
@@ -491,5 +492,7 @@ int rxrpc_stats_clear(struct file *file, char *buf, size_t size)
        memset(&rxnet->stat_rx_acks, 0, sizeof(rxnet->stat_rx_acks));
 
        memset(&rxnet->stat_why_req_ack, 0, sizeof(rxnet->stat_why_req_ack));
+
+       atomic_set(&rxnet->stat_io_loop, 0);
        return size;
 }
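
rxrpc_stats_show() above backs a single procfs file, in mainline at
/proc/net/rxrpc/stats (the path is assumed here; it is not visible in this
excerpt), with rxrpc_stats_clear() as the corresponding write handler. A small
reader sketch:

    /* Reader sketch; the path is assumed and error handling is minimal.
     * Writing to the file should invoke rxrpc_stats_clear() and zero the
     * counters.
     */
    #include <stdio.h>

    int main(void)
    {
        char line[256];
        FILE *f = fopen("/proc/net/rxrpc/stats", "r");

        if (!f) {
            perror("/proc/net/rxrpc/stats");
            return 1;
        }
        while (fgets(line, sizeof(line), f))
            fputs(line, stdout);   /* e.g. "Data     : send=... fail=..." */
        fclose(f);
        return 0;
    }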
diff --git a/net/rxrpc/recvmsg.c b/net/rxrpc/recvmsg.c
index efb85f9..36b25d0 100644
@@ -36,16 +36,16 @@ void rxrpc_notify_socket(struct rxrpc_call *call)
        sk = &rx->sk;
        if (rx && sk->sk_state < RXRPC_CLOSE) {
                if (call->notify_rx) {
-                       spin_lock_bh(&call->notify_lock);
+                       spin_lock(&call->notify_lock);
                        call->notify_rx(sk, call, call->user_call_ID);
-                       spin_unlock_bh(&call->notify_lock);
+                       spin_unlock(&call->notify_lock);
                } else {
-                       write_lock_bh(&rx->recvmsg_lock);
+                       write_lock(&rx->recvmsg_lock);
                        if (list_empty(&call->recvmsg_link)) {
-                               rxrpc_get_call(call, rxrpc_call_got);
+                               rxrpc_get_call(call, rxrpc_call_get_notify_socket);
                                list_add_tail(&call->recvmsg_link, &rx->recvmsg_q);
                        }
-                       write_unlock_bh(&rx->recvmsg_lock);
+                       write_unlock(&rx->recvmsg_lock);
 
                        if (!sock_flag(sk, SOCK_DEAD)) {
                                _debug("call %ps", sk->sk_data_ready);
@@ -87,9 +87,9 @@ bool rxrpc_set_call_completion(struct rxrpc_call *call,
        bool ret = false;
 
        if (call->state < RXRPC_CALL_COMPLETE) {
-               write_lock_bh(&call->state_lock);
+               write_lock(&call->state_lock);
                ret = __rxrpc_set_call_completion(call, compl, abort_code, error);
-               write_unlock_bh(&call->state_lock);
+               write_unlock(&call->state_lock);
        }
        return ret;
 }
@@ -107,9 +107,9 @@ bool rxrpc_call_completed(struct rxrpc_call *call)
        bool ret = false;
 
        if (call->state < RXRPC_CALL_COMPLETE) {
-               write_lock_bh(&call->state_lock);
+               write_lock(&call->state_lock);
                ret = __rxrpc_call_completed(call);
-               write_unlock_bh(&call->state_lock);
+               write_unlock(&call->state_lock);
        }
        return ret;
 }
@@ -131,9 +131,9 @@ bool rxrpc_abort_call(const char *why, struct rxrpc_call *call,
 {
        bool ret;
 
-       write_lock_bh(&call->state_lock);
+       write_lock(&call->state_lock);
        ret = __rxrpc_abort_call(why, call, seq, abort_code, error);
-       write_unlock_bh(&call->state_lock);
+       write_unlock(&call->state_lock);
        return ret;
 }
 
@@ -193,23 +193,23 @@ static void rxrpc_end_rx_phase(struct rxrpc_call *call, rxrpc_serial_t serial)
        if (call->state == RXRPC_CALL_CLIENT_RECV_REPLY)
                rxrpc_propose_delay_ACK(call, serial, rxrpc_propose_ack_terminal_ack);
 
-       write_lock_bh(&call->state_lock);
+       write_lock(&call->state_lock);
 
        switch (call->state) {
        case RXRPC_CALL_CLIENT_RECV_REPLY:
                __rxrpc_call_completed(call);
-               write_unlock_bh(&call->state_lock);
+               write_unlock(&call->state_lock);
                break;
 
        case RXRPC_CALL_SERVER_RECV_REQUEST:
                call->state = RXRPC_CALL_SERVER_ACK_REQUEST;
                call->expect_req_by = jiffies + MAX_JIFFY_OFFSET;
-               write_unlock_bh(&call->state_lock);
+               write_unlock(&call->state_lock);
                rxrpc_propose_delay_ACK(call, serial,
                                        rxrpc_propose_ack_processing_op);
                break;
        default:
-               write_unlock_bh(&call->state_lock);
+               write_unlock(&call->state_lock);
                break;
        }
 }
@@ -228,9 +228,8 @@ static void rxrpc_rotate_rx_window(struct rxrpc_call *call)
 
        _enter("%d", call->debug_id);
 
-further_rotation:
        skb = skb_dequeue(&call->recvmsg_queue);
-       rxrpc_see_skb(skb, rxrpc_skb_rotated);
+       rxrpc_see_skb(skb, rxrpc_skb_see_rotate);
 
        sp = rxrpc_skb(skb);
        tseq   = sp->hdr.seq;
@@ -241,7 +240,7 @@ further_rotation:
        if (after(tseq, call->rx_consumed))
                smp_store_release(&call->rx_consumed, tseq);
 
-       rxrpc_free_skb(skb, rxrpc_skb_freed);
+       rxrpc_free_skb(skb, rxrpc_skb_put_rotate);
 
        trace_rxrpc_receive(call, last ? rxrpc_receive_rotate_last : rxrpc_receive_rotate,
                            serial, call->rx_consumed);
@@ -250,26 +249,12 @@ further_rotation:
                return;
        }
 
-       /* The next packet on the queue might entirely overlap with the one we
-        * just consumed; if so, rotate that away also.
-        */
-       skb = skb_peek(&call->recvmsg_queue);
-       if (skb) {
-               sp = rxrpc_skb(skb);
-               if (sp->hdr.seq != call->rx_consumed &&
-                   after_eq(call->rx_consumed, sp->hdr.seq))
-                       goto further_rotation;
-       }
-
        /* Check to see if there's an ACK that needs sending. */
        acked = atomic_add_return(call->rx_consumed - old_consumed,
                                  &call->ackr_nr_consumed);
        if (acked > 2 &&
-           !test_and_set_bit(RXRPC_CALL_IDLE_ACK_PENDING, &call->flags)) {
-               rxrpc_send_ACK(call, RXRPC_ACK_IDLE, serial,
-                              rxrpc_propose_ack_rotate_rx);
-               rxrpc_transmit_ack_packets(call->peer->local);
-       }
+           !test_and_set_bit(RXRPC_CALL_RX_IS_IDLE, &call->flags))
+               rxrpc_poke_call(call, rxrpc_call_poke_idle);
 }
 
 /*
@@ -314,15 +299,10 @@ static int rxrpc_recvmsg_data(struct socket *sock, struct rxrpc_call *call,
         */
        skb = skb_peek(&call->recvmsg_queue);
        while (skb) {
-               rxrpc_see_skb(skb, rxrpc_skb_seen);
+               rxrpc_see_skb(skb, rxrpc_skb_see_recvmsg);
                sp = rxrpc_skb(skb);
                seq = sp->hdr.seq;
 
-               if (after_eq(call->rx_consumed, seq)) {
-                       kdebug("obsolete %x %x", call->rx_consumed, seq);
-                       goto skip_obsolete;
-               }
-
                if (!(flags & MSG_PEEK))
                        trace_rxrpc_receive(call, rxrpc_receive_front,
                                            sp->hdr.serial, seq);
@@ -340,7 +320,6 @@ static int rxrpc_recvmsg_data(struct socket *sock, struct rxrpc_call *call,
                                ret = ret2;
                                goto out;
                        }
-                       rxrpc_transmit_ack_packets(call->peer->local);
                } else {
                        trace_rxrpc_recvdata(call, rxrpc_recvmsg_cont, seq,
                                             rx_pkt_offset, rx_pkt_len, 0);
@@ -373,7 +352,6 @@ static int rxrpc_recvmsg_data(struct socket *sock, struct rxrpc_call *call,
                        break;
                }
 
-       skip_obsolete:
                /* The whole packet has been transferred. */
                if (sp->hdr.flags & RXRPC_LAST_PACKET)
                        ret = 1;
@@ -395,7 +373,7 @@ done:
        trace_rxrpc_recvdata(call, rxrpc_recvmsg_data_return, seq,
                             rx_pkt_offset, rx_pkt_len, ret);
        if (ret == -EAGAIN)
-               set_bit(RXRPC_CALL_RX_UNDERRUN, &call->flags);
+               set_bit(RXRPC_CALL_RX_IS_IDLE, &call->flags);
        return ret;
 }
 
@@ -463,14 +441,14 @@ try_again:
        /* Find the next call and dequeue it if we're not just peeking.  If we
         * do dequeue it, that comes with a ref that we will need to release.
         */
-       write_lock_bh(&rx->recvmsg_lock);
+       write_lock(&rx->recvmsg_lock);
        l = rx->recvmsg_q.next;
        call = list_entry(l, struct rxrpc_call, recvmsg_link);
        if (!(flags & MSG_PEEK))
                list_del_init(&call->recvmsg_link);
        else
-               rxrpc_get_call(call, rxrpc_call_got);
-       write_unlock_bh(&rx->recvmsg_lock);
+               rxrpc_get_call(call, rxrpc_call_get_recvmsg);
+       write_unlock(&rx->recvmsg_lock);
 
        trace_rxrpc_recvmsg(call, rxrpc_recvmsg_dequeue, 0);
 
@@ -508,11 +486,9 @@ try_again:
        }
 
        if (msg->msg_name && call->peer) {
-               struct sockaddr_rxrpc *srx = msg->msg_name;
-               size_t len = sizeof(call->peer->srx);
+               size_t len = sizeof(call->dest_srx);
 
-               memcpy(msg->msg_name, &call->peer->srx, len);
-               srx->srx_service = call->service_id;
+               memcpy(msg->msg_name, &call->dest_srx, len);
                msg->msg_namelen = len;
        }
 
@@ -525,7 +501,6 @@ try_again:
                if (ret == -EAGAIN)
                        ret = 0;
 
-               rxrpc_transmit_ack_packets(call->peer->local);
                if (!skb_queue_empty(&call->recvmsg_queue))
                        rxrpc_notify_socket(call);
                break;
@@ -555,18 +530,18 @@ try_again:
 
 error_unlock_call:
        mutex_unlock(&call->user_mutex);
-       rxrpc_put_call(call, rxrpc_call_put);
+       rxrpc_put_call(call, rxrpc_call_put_recvmsg);
        trace_rxrpc_recvmsg(call, rxrpc_recvmsg_return, ret);
        return ret;
 
 error_requeue_call:
        if (!(flags & MSG_PEEK)) {
-               write_lock_bh(&rx->recvmsg_lock);
+               write_lock(&rx->recvmsg_lock);
                list_add(&call->recvmsg_link, &rx->recvmsg_q);
-               write_unlock_bh(&rx->recvmsg_lock);
+               write_unlock(&rx->recvmsg_lock);
                trace_rxrpc_recvmsg(call, rxrpc_recvmsg_requeue, 0);
        } else {
-               rxrpc_put_call(call, rxrpc_call_put);
+               rxrpc_put_call(call, rxrpc_call_put_recvmsg);
        }
 error_no_call:
        release_sock(&rx->sk);
@@ -655,9 +630,8 @@ int rxrpc_kernel_recv_data(struct socket *sock, struct rxrpc_call *call,
 read_phase_complete:
        ret = 1;
 out:
-       rxrpc_transmit_ack_packets(call->peer->local);
        if (_service)
-               *_service = call->service_id;
+               *_service = call->dest_srx.srx_service;
        mutex_unlock(&call->user_mutex);
        _leave(" = %d [%zu,%d]", ret, iov_iter_count(iter), *_abort);
        return ret;
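
One caller-visible effect of the recvmsg changes above: msg_name is now copied
wholesale from call->dest_srx, which already carries the service ID, rather
than assembled from call->peer->srx plus a separate srx_service fixup. A sketch
of what a userspace caller sees (socket setup and the control messages a real
AF_RXRPC exchange needs are elided):

    /* Sketch only: shows the shape of the address recvmsg() reports,
     * with srx_service filled in by the kernel.
     */
    #include <stdio.h>
    #include <sys/socket.h>
    #include <sys/uio.h>
    #include <linux/rxrpc.h>

    static void show_sender(int rxrpc_fd)
    {
        struct sockaddr_rxrpc srx;
        char buf[2048];
        struct iovec iov = { .iov_base = buf, .iov_len = sizeof(buf) };
        struct msghdr msg = {
            .msg_name    = &srx,
            .msg_namelen = sizeof(srx),
            .msg_iov     = &iov,
            .msg_iovlen  = 1,
        };

        if (recvmsg(rxrpc_fd, &msg, 0) >= 0)
            printf("service=%u transport_len=%u\n",
                   srx.srx_service, srx.transport_len);
    }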
diff --git a/net/rxrpc/rxkad.c b/net/rxrpc/rxkad.c
index 110a555..d123372 100644
@@ -103,7 +103,7 @@ static int rxkad_init_connection_security(struct rxrpc_connection *conn,
        struct crypto_sync_skcipher *ci;
        int ret;
 
-       _enter("{%d},{%x}", conn->debug_id, key_serial(conn->params.key));
+       _enter("{%d},{%x}", conn->debug_id, key_serial(conn->key));
 
        conn->security_ix = token->security_index;
 
@@ -118,7 +118,7 @@ static int rxkad_init_connection_security(struct rxrpc_connection *conn,
                                   sizeof(token->kad->session_key)) < 0)
                BUG();
 
-       switch (conn->params.security_level) {
+       switch (conn->security_level) {
        case RXRPC_SECURITY_PLAIN:
        case RXRPC_SECURITY_AUTH:
        case RXRPC_SECURITY_ENCRYPT:
@@ -150,7 +150,7 @@ static int rxkad_how_much_data(struct rxrpc_call *call, size_t remain,
 {
        size_t shdr, buf_size, chunk;
 
-       switch (call->conn->params.security_level) {
+       switch (call->conn->security_level) {
        default:
                buf_size = chunk = min_t(size_t, remain, RXRPC_JUMBO_DATALEN);
                shdr = 0;
@@ -192,7 +192,7 @@ static int rxkad_prime_packet_security(struct rxrpc_connection *conn,
 
        _enter("");
 
-       if (!conn->params.key)
+       if (!conn->key)
                return 0;
 
        tmpbuf = kmalloc(tmpsize, GFP_KERNEL);
@@ -205,7 +205,7 @@ static int rxkad_prime_packet_security(struct rxrpc_connection *conn,
                return -ENOMEM;
        }
 
-       token = conn->params.key->payload.data[0];
+       token = conn->key->payload.data[0];
        memcpy(&iv, token->kad->session_key, sizeof(iv));
 
        tmpbuf[0] = htonl(conn->proto.epoch);
@@ -317,7 +317,7 @@ static int rxkad_secure_packet_encrypt(const struct rxrpc_call *call,
        }
 
        /* encrypt from the session key */
-       token = call->conn->params.key->payload.data[0];
+       token = call->conn->key->payload.data[0];
        memcpy(&iv, token->kad->session_key, sizeof(iv));
 
        sg_init_one(&sg, txb->data, txb->len);
@@ -344,13 +344,13 @@ static int rxkad_secure_packet(struct rxrpc_call *call, struct rxrpc_txbuf *txb)
        int ret;
 
        _enter("{%d{%x}},{#%u},%u,",
-              call->debug_id, key_serial(call->conn->params.key),
+              call->debug_id, key_serial(call->conn->key),
               txb->seq, txb->len);
 
        if (!call->conn->rxkad.cipher)
                return 0;
 
-       ret = key_validate(call->conn->params.key);
+       ret = key_validate(call->conn->key);
        if (ret < 0)
                return ret;
 
@@ -380,7 +380,7 @@ static int rxkad_secure_packet(struct rxrpc_call *call, struct rxrpc_txbuf *txb)
                y = 1; /* zero checksums are not permitted */
        txb->wire.cksum = htons(y);
 
-       switch (call->conn->params.security_level) {
+       switch (call->conn->security_level) {
        case RXRPC_SECURITY_PLAIN:
                ret = 0;
                break;
@@ -525,7 +525,7 @@ static int rxkad_verify_packet_2(struct rxrpc_call *call, struct sk_buff *skb,
        }
 
        /* decrypt from the session key */
-       token = call->conn->params.key->payload.data[0];
+       token = call->conn->key->payload.data[0];
        memcpy(&iv, token->kad->session_key, sizeof(iv));
 
        skcipher_request_set_sync_tfm(req, call->conn->rxkad.cipher);
@@ -596,7 +596,7 @@ static int rxkad_verify_packet(struct rxrpc_call *call, struct sk_buff *skb)
        u32 x, y;
 
        _enter("{%d{%x}},{#%u}",
-              call->debug_id, key_serial(call->conn->params.key), seq);
+              call->debug_id, key_serial(call->conn->key), seq);
 
        if (!call->conn->rxkad.cipher)
                return 0;
@@ -632,7 +632,7 @@ static int rxkad_verify_packet(struct rxrpc_call *call, struct sk_buff *skb)
                goto protocol_error;
        }
 
-       switch (call->conn->params.security_level) {
+       switch (call->conn->security_level) {
        case RXRPC_SECURITY_PLAIN:
                ret = 0;
                break;
@@ -678,8 +678,8 @@ static int rxkad_issue_challenge(struct rxrpc_connection *conn)
        challenge.min_level     = htonl(0);
        challenge.__padding     = 0;
 
-       msg.msg_name    = &conn->params.peer->srx.transport;
-       msg.msg_namelen = conn->params.peer->srx.transport_len;
+       msg.msg_name    = &conn->peer->srx.transport;
+       msg.msg_namelen = conn->peer->srx.transport_len;
        msg.msg_control = NULL;
        msg.msg_controllen = 0;
        msg.msg_flags   = 0;
@@ -704,16 +704,15 @@ static int rxkad_issue_challenge(struct rxrpc_connection *conn)
 
        serial = atomic_inc_return(&conn->serial);
        whdr.serial = htonl(serial);
-       _proto("Tx CHALLENGE %%%u", serial);
 
-       ret = kernel_sendmsg(conn->params.local->socket, &msg, iov, 2, len);
+       ret = kernel_sendmsg(conn->local->socket, &msg, iov, 2, len);
        if (ret < 0) {
                trace_rxrpc_tx_fail(conn->debug_id, serial, ret,
                                    rxrpc_tx_point_rxkad_challenge);
                return -EAGAIN;
        }
 
-       conn->params.peer->last_tx_at = ktime_get_seconds();
+       conn->peer->last_tx_at = ktime_get_seconds();
        trace_rxrpc_tx_packet(conn->debug_id, &whdr,
                              rxrpc_tx_point_rxkad_challenge);
        _leave(" = 0");
@@ -737,8 +736,8 @@ static int rxkad_send_response(struct rxrpc_connection *conn,
 
        _enter("");
 
-       msg.msg_name    = &conn->params.peer->srx.transport;
-       msg.msg_namelen = conn->params.peer->srx.transport_len;
+       msg.msg_name    = &conn->peer->srx.transport;
+       msg.msg_namelen = conn->peer->srx.transport_len;
        msg.msg_control = NULL;
        msg.msg_controllen = 0;
        msg.msg_flags   = 0;
@@ -762,16 +761,15 @@ static int rxkad_send_response(struct rxrpc_connection *conn,
 
        serial = atomic_inc_return(&conn->serial);
        whdr.serial = htonl(serial);
-       _proto("Tx RESPONSE %%%u", serial);
 
-       ret = kernel_sendmsg(conn->params.local->socket, &msg, iov, 3, len);
+       ret = kernel_sendmsg(conn->local->socket, &msg, iov, 3, len);
        if (ret < 0) {
                trace_rxrpc_tx_fail(conn->debug_id, serial, ret,
                                    rxrpc_tx_point_rxkad_response);
                return -EAGAIN;
        }
 
-       conn->params.peer->last_tx_at = ktime_get_seconds();
+       conn->peer->last_tx_at = ktime_get_seconds();
        _leave(" = 0");
        return 0;
 }
@@ -834,15 +832,15 @@ static int rxkad_respond_to_challenge(struct rxrpc_connection *conn,
        u32 version, nonce, min_level, abort_code;
        int ret;
 
-       _enter("{%d,%x}", conn->debug_id, key_serial(conn->params.key));
+       _enter("{%d,%x}", conn->debug_id, key_serial(conn->key));
 
        eproto = tracepoint_string("chall_no_key");
        abort_code = RX_PROTOCOL_ERROR;
-       if (!conn->params.key)
+       if (!conn->key)
                goto protocol_error;
 
        abort_code = RXKADEXPIRED;
-       ret = key_validate(conn->params.key);
+       ret = key_validate(conn->key);
        if (ret < 0)
                goto other_error;
 
@@ -856,8 +854,7 @@ static int rxkad_respond_to_challenge(struct rxrpc_connection *conn,
        nonce = ntohl(challenge.nonce);
        min_level = ntohl(challenge.min_level);
 
-       _proto("Rx CHALLENGE %%%u { v=%u n=%u ml=%u }",
-              sp->hdr.serial, version, nonce, min_level);
+       trace_rxrpc_rx_challenge(conn, sp->hdr.serial, version, nonce, min_level);
 
        eproto = tracepoint_string("chall_ver");
        abort_code = RXKADINCONSISTENCY;
@@ -866,10 +863,10 @@ static int rxkad_respond_to_challenge(struct rxrpc_connection *conn,
 
        abort_code = RXKADLEVELFAIL;
        ret = -EACCES;
-       if (conn->params.security_level < min_level)
+       if (conn->security_level < min_level)
                goto other_error;
 
-       token = conn->params.key->payload.data[0];
+       token = conn->key->payload.data[0];
 
        /* build the response packet */
        resp = kzalloc(sizeof(struct rxkad_response), GFP_NOFS);
@@ -881,7 +878,7 @@ static int rxkad_respond_to_challenge(struct rxrpc_connection *conn,
        resp->encrypted.cid             = htonl(conn->proto.cid);
        resp->encrypted.securityIndex   = htonl(conn->security_ix);
        resp->encrypted.inc_nonce       = htonl(nonce + 1);
-       resp->encrypted.level           = htonl(conn->params.security_level);
+       resp->encrypted.level           = htonl(conn->security_level);
        resp->kvno                      = htonl(token->kad->kvno);
        resp->ticket_len                = htonl(token->kad->ticket_len);
        resp->encrypted.call_id[0]      = htonl(conn->channels[0].call_counter);
@@ -1139,8 +1136,8 @@ static int rxkad_verify_response(struct rxrpc_connection *conn,
        version = ntohl(response->version);
        ticket_len = ntohl(response->ticket_len);
        kvno = ntohl(response->kvno);
-       _proto("Rx RESPONSE %%%u { v=%u kv=%u tl=%u }",
-              sp->hdr.serial, version, kvno, ticket_len);
+
+       trace_rxrpc_rx_response(conn, sp->hdr.serial, version, kvno, ticket_len);
 
        eproto = tracepoint_string("rxkad_rsp_ver");
        abort_code = RXKADINCONSISTENCY;
@@ -1229,7 +1226,7 @@ static int rxkad_verify_response(struct rxrpc_connection *conn,
        level = ntohl(response->encrypted.level);
        if (level > RXRPC_SECURITY_ENCRYPT)
                goto protocol_error_free;
-       conn->params.security_level = level;
+       conn->security_level = level;
 
        /* create a key to hold the security data and expiration time - after
         * this the connection security can be handled in exactly the same way
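
The change running through every rxkad.c hunk above is mechanical: the nested
conn->params.* block is flattened into struct rxrpc_connection itself, so
conn->params.key becomes conn->key and so on. A shape-only sketch of the before
and after (field types are stand-ins, not the kernel definitions):

    /* Shape-only sketch; field types are illustrative stand-ins. */
    struct conn_before {                 /* old layout */
        struct {
            void *local;
            void *peer;
            void *key;
            unsigned int security_level;
        } params;
    };

    struct conn_after {                  /* new layout */
        void *local;                     /* was conn->params.local */
        void *peer;                      /* was conn->params.peer */
        void *key;                       /* was conn->params.key */
        unsigned int security_level;     /* was conn->params.security_level */
    };

Every conn->params.X access in the hunks above becomes conn->X; no behaviour
changes.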
diff --git a/net/rxrpc/rxperf.c b/net/rxrpc/rxperf.c
new file mode 100644
index 0000000..66f5eea
--- /dev/null
@@ -0,0 +1,619 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/* In-kernel rxperf server for testing purposes.
+ *
+ * Copyright (C) 2022 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ */
+
+#define pr_fmt(fmt) "rxperf: " fmt
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <net/sock.h>
+#include <net/af_rxrpc.h>
+
+MODULE_DESCRIPTION("rxperf test server (afs)");
+MODULE_AUTHOR("Red Hat, Inc.");
+MODULE_LICENSE("GPL");
+
+#define RXPERF_PORT            7009
+#define RX_PERF_SERVICE                147
+#define RX_PERF_VERSION                3
+#define RX_PERF_SEND           0
+#define RX_PERF_RECV           1
+#define RX_PERF_RPC            3
+#define RX_PERF_FILE           4
+#define RX_PERF_MAGIC_COOKIE   0x4711
+
+struct rxperf_proto_params {
+       __be32          version;
+       __be32          type;
+       __be32          rsize;
+       __be32          wsize;
+} __packed;
+
+static const u8 rxperf_magic_cookie[] = { 0x00, 0x00, 0x47, 0x11 };
+static const u8 secret[8] = { 0xa7, 0x83, 0x8a, 0xcb, 0xc7, 0x83, 0xec, 0x94 };
+
+enum rxperf_call_state {
+       RXPERF_CALL_SV_AWAIT_PARAMS,    /* Server: Awaiting parameter block */
+       RXPERF_CALL_SV_AWAIT_REQUEST,   /* Server: Awaiting request data */
+       RXPERF_CALL_SV_REPLYING,        /* Server: Replying */
+       RXPERF_CALL_SV_AWAIT_ACK,       /* Server: Awaiting final ACK */
+       RXPERF_CALL_COMPLETE,           /* Completed or failed */
+};
+
+struct rxperf_call {
+       struct rxrpc_call       *rxcall;
+       struct iov_iter         iter;
+       struct kvec             kvec[1];
+       struct work_struct      work;
+       const char              *type;
+       size_t                  iov_len;
+       size_t                  req_len;        /* Size of request blob */
+       size_t                  reply_len;      /* Size of reply blob */
+       unsigned int            debug_id;
+       unsigned int            operation_id;
+       struct rxperf_proto_params params;
+       __be32                  tmp[2];
+       s32                     abort_code;
+       enum rxperf_call_state  state;
+       short                   error;
+       unsigned short          unmarshal;
+       u16                     service_id;
+       int (*deliver)(struct rxperf_call *call);
+       void (*processor)(struct work_struct *work);
+};
+
+static struct socket *rxperf_socket;
+static struct key *rxperf_sec_keyring; /* Ring of security/crypto keys */
+static struct workqueue_struct *rxperf_workqueue;
+
+static void rxperf_deliver_to_call(struct work_struct *work);
+static int rxperf_deliver_param_block(struct rxperf_call *call);
+static int rxperf_deliver_request(struct rxperf_call *call);
+static int rxperf_process_call(struct rxperf_call *call);
+static void rxperf_charge_preallocation(struct work_struct *work);
+
+static DECLARE_WORK(rxperf_charge_preallocation_work,
+                   rxperf_charge_preallocation);
+
+static inline void rxperf_set_call_state(struct rxperf_call *call,
+                                        enum rxperf_call_state to)
+{
+       call->state = to;
+}
+
+static inline void rxperf_set_call_complete(struct rxperf_call *call,
+                                           int error, s32 remote_abort)
+{
+       if (call->state != RXPERF_CALL_COMPLETE) {
+               call->abort_code = remote_abort;
+               call->error = error;
+               call->state = RXPERF_CALL_COMPLETE;
+       }
+}
+
+static void rxperf_rx_discard_new_call(struct rxrpc_call *rxcall,
+                                      unsigned long user_call_ID)
+{
+       kfree((struct rxperf_call *)user_call_ID);
+}
+
+static void rxperf_rx_new_call(struct sock *sk, struct rxrpc_call *rxcall,
+                              unsigned long user_call_ID)
+{
+       queue_work(rxperf_workqueue, &rxperf_charge_preallocation_work);
+}
+
+static void rxperf_queue_call_work(struct rxperf_call *call)
+{
+       queue_work(rxperf_workqueue, &call->work);
+}
+
+static void rxperf_notify_rx(struct sock *sk, struct rxrpc_call *rxcall,
+                            unsigned long call_user_ID)
+{
+       struct rxperf_call *call = (struct rxperf_call *)call_user_ID;
+
+       if (call->state != RXPERF_CALL_COMPLETE)
+               rxperf_queue_call_work(call);
+}
+
+static void rxperf_rx_attach(struct rxrpc_call *rxcall, unsigned long user_call_ID)
+{
+       struct rxperf_call *call = (struct rxperf_call *)user_call_ID;
+
+       call->rxcall = rxcall;
+}
+
+static void rxperf_notify_end_reply_tx(struct sock *sock,
+                                      struct rxrpc_call *rxcall,
+                                      unsigned long call_user_ID)
+{
+       rxperf_set_call_state((struct rxperf_call *)call_user_ID,
+                             RXPERF_CALL_SV_AWAIT_ACK);
+}
+
+/*
+ * Charge the incoming call preallocation.
+ */
+static void rxperf_charge_preallocation(struct work_struct *work)
+{
+       struct rxperf_call *call;
+
+       for (;;) {
+               call = kzalloc(sizeof(*call), GFP_KERNEL);
+               if (!call)
+                       break;
+
+               call->type              = "unset";
+               call->debug_id          = atomic_inc_return(&rxrpc_debug_id);
+               call->deliver           = rxperf_deliver_param_block;
+               call->state             = RXPERF_CALL_SV_AWAIT_PARAMS;
+               call->service_id        = RX_PERF_SERVICE;
+               call->iov_len           = sizeof(call->params);
+               call->kvec[0].iov_len   = sizeof(call->params);
+               call->kvec[0].iov_base  = &call->params;
+               iov_iter_kvec(&call->iter, READ, call->kvec, 1, call->iov_len);
+               INIT_WORK(&call->work, rxperf_deliver_to_call);
+
+               if (rxrpc_kernel_charge_accept(rxperf_socket,
+                                              rxperf_notify_rx,
+                                              rxperf_rx_attach,
+                                              (unsigned long)call,
+                                              GFP_KERNEL,
+                                              call->debug_id) < 0)
+                       break;
+               call = NULL;
+       }
+
+       kfree(call);
+}
+
+/*
+ * Open an rxrpc socket and bind it to be the rxperf server
+ * - the socket is left in blocking mode and non-blocking ops use MSG_DONTWAIT
+ */
+static int rxperf_open_socket(void)
+{
+       struct sockaddr_rxrpc srx;
+       struct socket *socket;
+       int ret;
+
+       ret = sock_create_kern(&init_net, AF_RXRPC, SOCK_DGRAM, PF_INET6,
+                              &socket);
+       if (ret < 0)
+               goto error_1;
+
+       socket->sk->sk_allocation = GFP_NOFS;
+
+       /* bind the rxperf service's address to make this a server socket */
+       memset(&srx, 0, sizeof(srx));
+       srx.srx_family                  = AF_RXRPC;
+       srx.srx_service                 = RX_PERF_SERVICE;
+       srx.transport_type              = SOCK_DGRAM;
+       srx.transport_len               = sizeof(srx.transport.sin6);
+       srx.transport.sin6.sin6_family  = AF_INET6;
+       srx.transport.sin6.sin6_port    = htons(RXPERF_PORT);
+
+       ret = rxrpc_sock_set_min_security_level(socket->sk,
+                                               RXRPC_SECURITY_ENCRYPT);
+       if (ret < 0)
+               goto error_2;
+
+       ret = rxrpc_sock_set_security_keyring(socket->sk, rxperf_sec_keyring);
+       if (ret < 0)
+               goto error_2;
+
+       ret = kernel_bind(socket, (struct sockaddr *)&srx, sizeof(srx));
+       if (ret < 0)
+               goto error_2;
+
+       rxrpc_kernel_new_call_notification(socket, rxperf_rx_new_call,
+                                          rxperf_rx_discard_new_call);
+
+       ret = kernel_listen(socket, INT_MAX);
+       if (ret < 0)
+               goto error_2;
+
+       rxperf_socket = socket;
+       rxperf_charge_preallocation(&rxperf_charge_preallocation_work);
+       return 0;
+
+error_2:
+       sock_release(socket);
+error_1:
+       pr_err("Can't set up rxperf socket: %d\n", ret);
+       return ret;
+}
+
+/*
+ * close the rxrpc socket rxperf was using
+ */
+static void rxperf_close_socket(void)
+{
+       kernel_listen(rxperf_socket, 0);
+       kernel_sock_shutdown(rxperf_socket, SHUT_RDWR);
+       flush_workqueue(rxperf_workqueue);
+       sock_release(rxperf_socket);
+}
+
+/*
+ * Log remote abort codes that indicate that we have a protocol disagreement
+ * with the server.
+ */
+static void rxperf_log_error(struct rxperf_call *call, s32 remote_abort)
+{
+       static int max;
+       const char *msg;
+       int m;
+
+       switch (remote_abort) {
+       case RX_EOF:             msg = "unexpected EOF";        break;
+       case RXGEN_CC_MARSHAL:   msg = "client marshalling";    break;
+       case RXGEN_CC_UNMARSHAL: msg = "client unmarshalling";  break;
+       case RXGEN_SS_MARSHAL:   msg = "server marshalling";    break;
+       case RXGEN_SS_UNMARSHAL: msg = "server unmarshalling";  break;
+       case RXGEN_DECODE:       msg = "opcode decode";         break;
+       case RXGEN_SS_XDRFREE:   msg = "server XDR cleanup";    break;
+       case RXGEN_CC_XDRFREE:   msg = "client XDR cleanup";    break;
+       case -32:                msg = "insufficient data";     break;
+       default:
+               return;
+       }
+
+       m = max;
+       if (m < 3) {
+               max = m + 1;
+               pr_info("Peer reported %s failure on %s\n", msg, call->type);
+       }
+}
+
+/*
+ * deliver messages to a call
+ */
+static void rxperf_deliver_to_call(struct work_struct *work)
+{
+       struct rxperf_call *call = container_of(work, struct rxperf_call, work);
+       enum rxperf_call_state state;
+       u32 abort_code, remote_abort = 0;
+       int ret = 0;
+
+       if (call->state == RXPERF_CALL_COMPLETE)
+               return;
+
+       while (state = call->state,
+              state == RXPERF_CALL_SV_AWAIT_PARAMS ||
+              state == RXPERF_CALL_SV_AWAIT_REQUEST ||
+              state == RXPERF_CALL_SV_AWAIT_ACK
+              ) {
+               if (state == RXPERF_CALL_SV_AWAIT_ACK) {
+                       if (!rxrpc_kernel_check_life(rxperf_socket, call->rxcall))
+                               goto call_complete;
+                       return;
+               }
+
+               ret = call->deliver(call);
+               if (ret == 0)
+                       ret = rxperf_process_call(call);
+
+               switch (ret) {
+               case 0:
+                       continue;
+               case -EINPROGRESS:
+               case -EAGAIN:
+                       return;
+               case -ECONNABORTED:
+                       rxperf_log_error(call, call->abort_code);
+                       goto call_complete;
+               case -EOPNOTSUPP:
+                       abort_code = RXGEN_OPCODE;
+                       rxrpc_kernel_abort_call(rxperf_socket, call->rxcall,
+                                               abort_code, ret, "GOP");
+                       goto call_complete;
+               case -ENOTSUPP:
+                       abort_code = RX_USER_ABORT;
+                       rxrpc_kernel_abort_call(rxperf_socket, call->rxcall,
+                                               abort_code, ret, "GUA");
+                       goto call_complete;
+               case -EIO:
+                       pr_err("Call %u in bad state %u\n",
+                              call->debug_id, call->state);
+                       fallthrough;
+               case -ENODATA:
+               case -EBADMSG:
+               case -EMSGSIZE:
+               case -ENOMEM:
+               case -EFAULT:
+                       rxrpc_kernel_abort_call(rxperf_socket, call->rxcall,
+                                               RXGEN_SS_UNMARSHAL, ret, "GUM");
+                       goto call_complete;
+               default:
+                       rxrpc_kernel_abort_call(rxperf_socket, call->rxcall,
+                                               RX_CALL_DEAD, ret, "GER");
+                       goto call_complete;
+               }
+       }
+
+call_complete:
+       rxperf_set_call_complete(call, ret, remote_abort);
+       /* The call may have been requeued */
+       rxrpc_kernel_end_call(rxperf_socket, call->rxcall);
+       cancel_work(&call->work);
+       kfree(call);
+}
+
+/*
+ * Extract a piece of data from the received data socket buffers.
+ */
+static int rxperf_extract_data(struct rxperf_call *call, bool want_more)
+{
+       u32 remote_abort = 0;
+       int ret;
+
+       ret = rxrpc_kernel_recv_data(rxperf_socket, call->rxcall, &call->iter,
+                                    &call->iov_len, want_more, &remote_abort,
+                                    &call->service_id);
+       pr_debug("Extract i=%zu l=%zu m=%u ret=%d\n",
+                iov_iter_count(&call->iter), call->iov_len, want_more, ret);
+       if (ret == 0 || ret == -EAGAIN)
+               return ret;
+
+       if (ret == 1) {
+               switch (call->state) {
+               case RXPERF_CALL_SV_AWAIT_REQUEST:
+                       rxperf_set_call_state(call, RXPERF_CALL_SV_REPLYING);
+                       break;
+               case RXPERF_CALL_COMPLETE:
+                       pr_debug("premature completion %d", call->error);
+                       return call->error;
+               default:
+                       break;
+               }
+               return 0;
+       }
+
+       rxperf_set_call_complete(call, ret, remote_abort);
+       return ret;
+}
+
+/*
+ * Grab the operation ID from an incoming manager call.
+ */
+static int rxperf_deliver_param_block(struct rxperf_call *call)
+{
+       u32 version;
+       int ret;
+
+       /* Extract the parameter block */
+       ret = rxperf_extract_data(call, true);
+       if (ret < 0)
+               return ret;
+
+       version                 = ntohl(call->params.version);
+       call->operation_id      = ntohl(call->params.type);
+       call->deliver           = rxperf_deliver_request;
+
+       if (version != RX_PERF_VERSION) {
+               pr_info("Version mismatch %x\n", version);
+               return -ENOTSUPP;
+       }
+
+       switch (call->operation_id) {
+       case RX_PERF_SEND:
+               call->type = "send";
+               call->reply_len = 0;
+               call->iov_len = 4;      /* Expect req size */
+               break;
+       case RX_PERF_RECV:
+               call->type = "recv";
+               call->req_len = 0;
+               call->iov_len = 4;      /* Expect reply size */
+               break;
+       case RX_PERF_RPC:
+               call->type = "rpc";
+               call->iov_len = 8;      /* Expect req size and reply size */
+               break;
+       case RX_PERF_FILE:
+               call->type = "file";
+               fallthrough;
+       default:
+               return -EOPNOTSUPP;
+       }
+
+       rxperf_set_call_state(call, RXPERF_CALL_SV_AWAIT_REQUEST);
+       return call->deliver(call);
+}
+
+/*
+ * Deliver the request data.
+ */
+static int rxperf_deliver_request(struct rxperf_call *call)
+{
+       int ret;
+
+       switch (call->unmarshal) {
+       case 0:
+               call->kvec[0].iov_len   = call->iov_len;
+               call->kvec[0].iov_base  = call->tmp;
+               iov_iter_kvec(&call->iter, READ, call->kvec, 1, call->iov_len);
+               call->unmarshal++;
+               fallthrough;
+       case 1:
+               ret = rxperf_extract_data(call, true);
+               if (ret < 0)
+                       return ret;
+
+               switch (call->operation_id) {
+               case RX_PERF_SEND:
+                       call->type = "send";
+                       call->req_len   = ntohl(call->tmp[0]);
+                       call->reply_len = 0;
+                       break;
+               case RX_PERF_RECV:
+                       call->type = "recv";
+                       call->req_len = 0;
+                       call->reply_len = ntohl(call->tmp[0]);
+                       break;
+               case RX_PERF_RPC:
+                       call->type = "rpc";
+                       call->req_len   = ntohl(call->tmp[0]);
+                       call->reply_len = ntohl(call->tmp[1]);
+                       break;
+               default:
+                       pr_info("Can't parse extra params\n");
+                       return -EIO;
+               }
+
+               pr_debug("CALL op=%s rq=%zx rp=%zx\n",
+                        call->type, call->req_len, call->reply_len);
+
+               call->iov_len = call->req_len;
+               iov_iter_discard(&call->iter, READ, call->req_len);
+               call->unmarshal++;
+               fallthrough;
+       case 2:
+               ret = rxperf_extract_data(call, false);
+               if (ret < 0)
+                       return ret;
+               call->unmarshal++;
+               fallthrough;
+       default:
+               return 0;
+       }
+}
+
+/*
+ * Process a call for which we've received the request.
+ */
+static int rxperf_process_call(struct rxperf_call *call)
+{
+       struct msghdr msg = {};
+       struct bio_vec bv[1];
+       struct kvec iov[1];
+       ssize_t n;
+       size_t reply_len = call->reply_len, len;
+
+       rxrpc_kernel_set_tx_length(rxperf_socket, call->rxcall,
+                                  reply_len + sizeof(rxperf_magic_cookie));
+
+       while (reply_len > 0) {
+               len = min_t(size_t, reply_len, PAGE_SIZE);
+               bv[0].bv_page   = ZERO_PAGE(0);
+               bv[0].bv_offset = 0;
+               bv[0].bv_len    = len;
+               iov_iter_bvec(&msg.msg_iter, WRITE, bv, 1, len);
+               msg.msg_flags = MSG_MORE;
+               n = rxrpc_kernel_send_data(rxperf_socket, call->rxcall, &msg,
+                                          len, rxperf_notify_end_reply_tx);
+               if (n < 0)
+                       return n;
+               if (n == 0)
+                       return -EIO;
+               reply_len -= n;
+       }
+
+       len = sizeof(rxperf_magic_cookie);
+       iov[0].iov_base = (void *)rxperf_magic_cookie;
+       iov[0].iov_len  = len;
+       iov_iter_kvec(&msg.msg_iter, WRITE, iov, 1, len);
+       msg.msg_flags = 0;
+       n = rxrpc_kernel_send_data(rxperf_socket, call->rxcall, &msg, len,
+                                  rxperf_notify_end_reply_tx);
+       if (n >= 0)
+               return 0; /* Success */
+
+       if (n == -ENOMEM)
+               rxrpc_kernel_abort_call(rxperf_socket, call->rxcall,
+                                       RXGEN_SS_MARSHAL, -ENOMEM, "GOM");
+       return n;
+}
+
+/*
+ * Add a key to the security keyring.
+ */
+static int rxperf_add_key(struct key *keyring)
+{
+       key_ref_t kref;
+       int ret;
+
+       kref = key_create_or_update(make_key_ref(keyring, true),
+                                   "rxrpc_s",
+                                   __stringify(RX_PERF_SERVICE) ":2",
+                                   secret,
+                                   sizeof(secret),
+                                   KEY_POS_VIEW | KEY_POS_READ | KEY_POS_SEARCH
+                                   | KEY_USR_VIEW,
+                                   KEY_ALLOC_NOT_IN_QUOTA);
+
+       if (IS_ERR(kref)) {
+               pr_err("Can't allocate rxperf server key: %ld\n", PTR_ERR(kref));
+               return PTR_ERR(kref);
+       }
+
+       ret = key_link(keyring, key_ref_to_ptr(kref));
+       if (ret < 0)
+               pr_err("Can't link rxperf server key: %d\n", ret);
+       key_ref_put(kref);
+       return ret;
+}
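
The description passed to key_create_or_update() follows the "rxrpc_s" key type's "<service>:<security-index>" convention; the ":2" selects security index 2, which is rxkad. An equivalent open-coded description (sketch, assuming the usual uapi constant):

	char desc[16];

	/* Same description as __stringify(RX_PERF_SERVICE) ":2" above;
	 * RXRPC_SECURITY_RXKAD == 2 in <uapi/linux/rxrpc.h>.
	 */
	snprintf(desc, sizeof(desc), "%u:%u",
		 RX_PERF_SERVICE, RXRPC_SECURITY_RXKAD);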
+
+/*
+ * Initialise the rxperf server.
+ */
+static int __init rxperf_init(void)
+{
+       struct key *keyring;
+       int ret = -ENOMEM;
+
+       pr_info("Server registering\n");
+
+       rxperf_workqueue = alloc_workqueue("rxperf", 0, 0);
+       if (!rxperf_workqueue)
+               goto error_workqueue;
+
+       keyring = keyring_alloc("rxperf_server",
+                               GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, current_cred(),
+                               KEY_POS_VIEW | KEY_POS_READ | KEY_POS_SEARCH |
+                               KEY_POS_WRITE |
+                               KEY_USR_VIEW | KEY_USR_READ | KEY_USR_SEARCH |
+                               KEY_USR_WRITE |
+                               KEY_OTH_VIEW | KEY_OTH_READ | KEY_OTH_SEARCH,
+                               KEY_ALLOC_NOT_IN_QUOTA,
+                               NULL, NULL);
+       if (IS_ERR(keyring)) {
+               pr_err("Can't allocate rxperf server keyring: %ld\n",
+                      PTR_ERR(keyring));
+               goto error_keyring;
+       }
+       rxperf_sec_keyring = keyring;
+       ret = rxperf_add_key(keyring);
+       if (ret < 0)
+               goto error_key;
+
+       ret = rxperf_open_socket();
+       if (ret < 0)
+               goto error_socket;
+       return 0;
+
+error_socket:
+error_key:
+       key_put(rxperf_sec_keyring);
+error_keyring:
+       destroy_workqueue(rxperf_workqueue);
+       rcu_barrier();
+error_workqueue:
+       pr_err("Failed to register: %d\n", ret);
+       return ret;
+}
+late_initcall(rxperf_init); /* Must run after networking init so the socket can be created */
+
+static void __exit rxperf_exit(void)
+{
+       pr_info("Server unregistering.\n");
+
+       rxperf_close_socket();
+       key_put(rxperf_sec_keyring);
+       destroy_workqueue(rxperf_workqueue);
+       rcu_barrier();
+}
+module_exit(rxperf_exit);
+
index 50cb5f1..209f2c2 100644 (file)
@@ -63,13 +63,43 @@ const struct rxrpc_security *rxrpc_security_lookup(u8 security_index)
 }
 
 /*
+ * Initialise the security on a client call.
+ */
+int rxrpc_init_client_call_security(struct rxrpc_call *call)
+{
+       const struct rxrpc_security *sec;
+       struct rxrpc_key_token *token;
+       struct key *key = call->key;
+       int ret;
+
+       if (!key)
+               return 0;
+
+       ret = key_validate(key);
+       if (ret < 0)
+               return ret;
+
+       for (token = key->payload.data[0]; token; token = token->next) {
+               sec = rxrpc_security_lookup(token->security_index);
+               if (sec)
+                       goto found;
+       }
+       return -EKEYREJECTED;
+
+found:
+       call->security = sec;
+       _leave(" = 0");
+       return 0;
+}
+
+/*
  * initialise the security on a client connection
  */
 int rxrpc_init_client_conn_security(struct rxrpc_connection *conn)
 {
        const struct rxrpc_security *sec;
        struct rxrpc_key_token *token;
-       struct key *key = conn->params.key;
+       struct key *key = conn->key;
        int ret;
 
        _enter("{%d},{%x}", conn->debug_id, key_serial(key));
@@ -163,7 +193,7 @@ struct key *rxrpc_look_up_server_security(struct rxrpc_connection *conn,
 
        rcu_read_lock();
 
-       rx = rcu_dereference(conn->params.local->service);
+       rx = rcu_dereference(conn->local->service);
        if (!rx)
                goto out;
 
index e5fd8a9..9fa7e37 100644 (file)
  */
 static bool rxrpc_check_tx_space(struct rxrpc_call *call, rxrpc_seq_t *_tx_win)
 {
-       unsigned int win_size;
-       rxrpc_seq_t tx_win = smp_load_acquire(&call->acks_hard_ack);
-
-       /* If we haven't transmitted anything for >1RTT, we should reset the
-        * congestion management state.
-        */
-       if (ktime_before(ktime_add_us(call->tx_last_sent,
-                                     call->peer->srtt_us >> 3),
-                        ktime_get_real())) {
-               if (RXRPC_TX_SMSS > 2190)
-                       win_size = 2;
-               else if (RXRPC_TX_SMSS > 1095)
-                       win_size = 3;
-               else
-                       win_size = 4;
-               win_size += call->cong_extra;
-       } else {
-               win_size = min_t(unsigned int, call->tx_winsize,
-                                call->cong_cwnd + call->cong_extra);
-       }
-
        if (_tx_win)
-               *_tx_win = tx_win;
-       return call->tx_top - tx_win < win_size;
+               *_tx_win = call->tx_bottom;
+       return call->tx_prepared - call->tx_bottom < 256;
 }
 
 /*
@@ -66,11 +45,6 @@ static int rxrpc_wait_for_tx_window_intr(struct rxrpc_sock *rx,
                if (signal_pending(current))
                        return sock_intr_errno(*timeo);
 
-               if (READ_ONCE(call->acks_hard_ack) != call->tx_bottom) {
-                       rxrpc_shrink_call_tx_buffer(call);
-                       continue;
-               }
-
                trace_rxrpc_txqueue(call, rxrpc_txqueue_wait);
                *timeo = schedule_timeout(*timeo);
        }
@@ -107,11 +81,6 @@ static int rxrpc_wait_for_tx_window_waitall(struct rxrpc_sock *rx,
                    tx_win == tx_start && signal_pending(current))
                        return -EINTR;
 
-               if (READ_ONCE(call->acks_hard_ack) != call->tx_bottom) {
-                       rxrpc_shrink_call_tx_buffer(call);
-                       continue;
-               }
-
                if (tx_win != tx_start) {
                        timeout = rtt;
                        tx_start = tx_win;
@@ -137,11 +106,6 @@ static int rxrpc_wait_for_tx_window_nonintr(struct rxrpc_sock *rx,
                if (call->state >= RXRPC_CALL_COMPLETE)
                        return call->error;
 
-               if (READ_ONCE(call->acks_hard_ack) != call->tx_bottom) {
-                       rxrpc_shrink_call_tx_buffer(call);
-                       continue;
-               }
-
                trace_rxrpc_txqueue(call, rxrpc_txqueue_wait);
                *timeo = schedule_timeout(*timeo);
        }
@@ -206,33 +170,32 @@ static void rxrpc_queue_packet(struct rxrpc_sock *rx, struct rxrpc_call *call,
 {
        unsigned long now;
        rxrpc_seq_t seq = txb->seq;
-       bool last = test_bit(RXRPC_TXBUF_LAST, &txb->flags);
-       int ret;
+       bool last = test_bit(RXRPC_TXBUF_LAST, &txb->flags), poke;
 
        rxrpc_inc_stat(call->rxnet, stat_tx_data);
 
-       ASSERTCMP(seq, ==, call->tx_top + 1);
+       ASSERTCMP(txb->seq, ==, call->tx_prepared + 1);
 
        /* We have to set the timestamp before queueing as the retransmit
         * algorithm can see the packet as soon as we queue it.
         */
        txb->last_sent = ktime_get_real();
 
-       /* Add the packet to the call's output buffer */
-       rxrpc_get_txbuf(txb, rxrpc_txbuf_get_buffer);
-       spin_lock(&call->tx_lock);
-       list_add_tail(&txb->call_link, &call->tx_buffer);
-       call->tx_top = seq;
-       spin_unlock(&call->tx_lock);
-
        if (last)
                trace_rxrpc_txqueue(call, rxrpc_txqueue_queue_last);
        else
                trace_rxrpc_txqueue(call, rxrpc_txqueue_queue);
 
+       /* Add the packet to the call's output buffer */
+       spin_lock(&call->tx_lock);
+       poke = list_empty(&call->tx_sendmsg);
+       list_add_tail(&txb->call_link, &call->tx_sendmsg);
+       call->tx_prepared = seq;
+       spin_unlock(&call->tx_lock);
+
        if (last || call->state == RXRPC_CALL_SERVER_ACK_REQUEST) {
                _debug("________awaiting reply/ACK__________");
-               write_lock_bh(&call->state_lock);
+               write_lock(&call->state_lock);
                switch (call->state) {
                case RXRPC_CALL_CLIENT_SEND_REQUEST:
                        call->state = RXRPC_CALL_CLIENT_AWAIT_REPLY;
@@ -255,33 +218,11 @@ static void rxrpc_queue_packet(struct rxrpc_sock *rx, struct rxrpc_call *call,
                default:
                        break;
                }
-               write_unlock_bh(&call->state_lock);
+               write_unlock(&call->state_lock);
        }
 
-       if (seq == 1 && rxrpc_is_client_call(call))
-               rxrpc_expose_client_call(call);
-
-       ret = rxrpc_send_data_packet(call, txb);
-       if (ret < 0) {
-               switch (ret) {
-               case -ENETUNREACH:
-               case -EHOSTUNREACH:
-               case -ECONNREFUSED:
-                       rxrpc_set_call_completion(call, RXRPC_CALL_LOCAL_ERROR,
-                                                 0, ret);
-                       goto out;
-               }
-       } else {
-               unsigned long now = jiffies;
-               unsigned long resend_at = now + call->peer->rto_j;
-
-               WRITE_ONCE(call->resend_at, resend_at);
-               rxrpc_reduce_call_timer(call, resend_at, now,
-                                       rxrpc_timer_set_for_send);
-       }
-
-out:
-       rxrpc_put_txbuf(txb, rxrpc_txbuf_put_trans);
+       if (poke)
+               rxrpc_poke_call(call, rxrpc_call_poke_start);
 }
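
With this change sendmsg() becomes a pure producer: it appends the txbuf to call->tx_sendmsg under tx_lock and pokes the I/O thread only on the empty-to-non-empty transition, instead of transmitting inline. The poke-on-first-item idiom in isolation (illustrative sketch; names are hypothetical):

	spin_lock(&q->lock);
	was_empty = list_empty(&q->items);
	list_add_tail(&item->link, &q->items);
	spin_unlock(&q->lock);
	if (was_empty)
		wake_consumer(q);	/* only the transition needs a wakeup */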
 
 /*
@@ -335,8 +276,6 @@ reload:
                rxrpc_see_txbuf(txb, rxrpc_txbuf_see_send_more);
 
        do {
-               rxrpc_transmit_ack_packets(call->peer->local);
-
                if (!txb) {
                        size_t remain, bufsize, chunk, offset;
 
@@ -416,10 +355,10 @@ reload:
 success:
        ret = copied;
        if (READ_ONCE(call->state) == RXRPC_CALL_COMPLETE) {
-               read_lock_bh(&call->state_lock);
+               read_lock(&call->state_lock);
                if (call->error < 0)
                        ret = call->error;
-               read_unlock_bh(&call->state_lock);
+               read_unlock(&call->state_lock);
        }
 out:
        call->tx_pending = txb;
@@ -604,7 +543,7 @@ rxrpc_new_client_call_for_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg,
                                     atomic_inc_return(&rxrpc_debug_id));
        /* The socket is now unlocked */
 
-       rxrpc_put_peer(cp.peer);
+       rxrpc_put_peer(cp.peer, rxrpc_peer_put_discard_tmp);
        _leave(" = %p\n", call);
        return call;
 }
@@ -667,7 +606,7 @@ int rxrpc_do_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, size_t len)
                case RXRPC_CALL_CLIENT_AWAIT_CONN:
                case RXRPC_CALL_SERVER_PREALLOC:
                case RXRPC_CALL_SERVER_SECURING:
-                       rxrpc_put_call(call, rxrpc_call_put);
+                       rxrpc_put_call(call, rxrpc_call_put_sendmsg);
                        ret = -EBUSY;
                        goto error_release_sock;
                default:
@@ -737,7 +676,7 @@ out_put_unlock:
        if (!dropped_lock)
                mutex_unlock(&call->user_mutex);
 error_put:
-       rxrpc_put_call(call, rxrpc_call_put);
+       rxrpc_put_call(call, rxrpc_call_put_sendmsg);
        _leave(" = %d", ret);
        return ret;
 
@@ -784,9 +723,9 @@ int rxrpc_kernel_send_data(struct socket *sock, struct rxrpc_call *call,
                                      notify_end_tx, &dropped_lock);
                break;
        case RXRPC_CALL_COMPLETE:
-               read_lock_bh(&call->state_lock);
+               read_lock(&call->state_lock);
                ret = call->error;
-               read_unlock_bh(&call->state_lock);
+               read_unlock(&call->state_lock);
                break;
        default:
                /* Request phase complete for this client call */
index ee269e0..e519405 100644 (file)
@@ -144,3 +144,28 @@ int rxrpc_server_keyring(struct rxrpc_sock *rx, sockptr_t optval, int optlen)
        _leave(" = 0 [key %x]", key->serial);
        return 0;
 }
+
+/**
+ * rxrpc_sock_set_security_keyring - Set the security keyring for a kernel service
+ * @sk: The socket to set the keyring on
+ * @keyring: The keyring to set
+ *
+ * Set the server security keyring on an rxrpc socket.  This is used to provide
+ * the encryption keys for a kernel service.
+ */
+int rxrpc_sock_set_security_keyring(struct sock *sk, struct key *keyring)
+{
+       struct rxrpc_sock *rx = rxrpc_sk(sk);
+       int ret = 0;
+
+       lock_sock(sk);
+       if (rx->securities)
+               ret = -EINVAL;
+       else if (rx->sk.sk_state != RXRPC_UNBOUND)
+               ret = -EISCONN;
+       else
+               rx->securities = key_get(keyring);
+       release_sock(sk);
+       return ret;
+}
+EXPORT_SYMBOL(rxrpc_sock_set_security_keyring);
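
A hedged sketch of how a kernel service might use the new setter, following the same steps the rxperf server takes above (error handling and exact permission flags elided; names are illustrative):

	struct socket *sock;
	struct key *keyring;

	keyring = keyring_alloc("my_service", GLOBAL_ROOT_UID,
				GLOBAL_ROOT_GID, current_cred(),
				KEY_POS_VIEW | KEY_POS_READ | KEY_POS_SEARCH,
				KEY_ALLOC_NOT_IN_QUOTA, NULL, NULL);
	/* ...add "rxrpc_s" server keys to the keyring... */

	sock_create_kern(&init_net, AF_RXRPC, SOCK_DGRAM, PF_INET6, &sock);
	rxrpc_sock_set_security_keyring(sock->sk, keyring);
	/* bind and begin accepting secured calls as usual */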
index 0c827d5..ebe0c75 100644 (file)
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/* ar-skbuff.c: socket buffer destruction handling
+/* Socket buffer accounting
  *
  * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
  * Written by David Howells (dhowells@redhat.com)
 /*
  * Note the allocation or reception of a socket buffer.
  */
-void rxrpc_new_skb(struct sk_buff *skb, enum rxrpc_skb_trace op)
+void rxrpc_new_skb(struct sk_buff *skb, enum rxrpc_skb_trace why)
 {
-       const void *here = __builtin_return_address(0);
        int n = atomic_inc_return(select_skb_count(skb));
-       trace_rxrpc_skb(skb, op, refcount_read(&skb->users), n, here);
+       trace_rxrpc_skb(skb, refcount_read(&skb->users), n, why);
 }
 
 /*
  * Note the re-emergence of a socket buffer from a queue or buffer.
  */
-void rxrpc_see_skb(struct sk_buff *skb, enum rxrpc_skb_trace op)
+void rxrpc_see_skb(struct sk_buff *skb, enum rxrpc_skb_trace why)
 {
-       const void *here = __builtin_return_address(0);
        if (skb) {
                int n = atomic_read(select_skb_count(skb));
-               trace_rxrpc_skb(skb, op, refcount_read(&skb->users), n, here);
+               trace_rxrpc_skb(skb, refcount_read(&skb->users), n, why);
        }
 }
 
 /*
  * Note the addition of a ref on a socket buffer.
  */
-void rxrpc_get_skb(struct sk_buff *skb, enum rxrpc_skb_trace op)
+void rxrpc_get_skb(struct sk_buff *skb, enum rxrpc_skb_trace why)
 {
-       const void *here = __builtin_return_address(0);
        int n = atomic_inc_return(select_skb_count(skb));
-       trace_rxrpc_skb(skb, op, refcount_read(&skb->users), n, here);
+       trace_rxrpc_skb(skb, refcount_read(&skb->users), n, why);
        skb_get(skb);
 }
 
 /*
  * Note the dropping of a ref on a socket buffer by the core.
  */
-void rxrpc_eaten_skb(struct sk_buff *skb, enum rxrpc_skb_trace op)
+void rxrpc_eaten_skb(struct sk_buff *skb, enum rxrpc_skb_trace why)
 {
-       const void *here = __builtin_return_address(0);
        int n = atomic_inc_return(&rxrpc_n_rx_skbs);
-       trace_rxrpc_skb(skb, op, 0, n, here);
+       trace_rxrpc_skb(skb, 0, n, why);
 }
 
 /*
  * Note the destruction of a socket buffer.
  */
-void rxrpc_free_skb(struct sk_buff *skb, enum rxrpc_skb_trace op)
+void rxrpc_free_skb(struct sk_buff *skb, enum rxrpc_skb_trace why)
 {
-       const void *here = __builtin_return_address(0);
        if (skb) {
-               int n;
-               n = atomic_dec_return(select_skb_count(skb));
-               trace_rxrpc_skb(skb, op, refcount_read(&skb->users), n, here);
+               int n = atomic_dec_return(select_skb_count(skb));
+               trace_rxrpc_skb(skb, refcount_read(&skb->users), n, why);
                kfree_skb(skb);
        }
 }
@@ -78,12 +72,12 @@ void rxrpc_free_skb(struct sk_buff *skb, enum rxrpc_skb_trace op)
  */
 void rxrpc_purge_queue(struct sk_buff_head *list)
 {
-       const void *here = __builtin_return_address(0);
        struct sk_buff *skb;
+
        while ((skb = skb_dequeue((list))) != NULL) {
                int n = atomic_dec_return(select_skb_count(skb));
-               trace_rxrpc_skb(skb, rxrpc_skb_purged,
-                               refcount_read(&skb->users), n, here);
+               trace_rxrpc_skb(skb, refcount_read(&skb->users), n,
+                               rxrpc_skb_put_purge);
                kfree_skb(skb);
        }
 }
index 96bfee8..d2cf2aa 100644 (file)
@@ -26,7 +26,6 @@ struct rxrpc_txbuf *rxrpc_alloc_txbuf(struct rxrpc_call *call, u8 packet_type,
                INIT_LIST_HEAD(&txb->call_link);
                INIT_LIST_HEAD(&txb->tx_link);
                refcount_set(&txb->ref, 1);
-               txb->call               = call;
                txb->call_debug_id      = call->debug_id;
                txb->debug_id           = atomic_inc_return(&rxrpc_txbuf_debug_ids);
                txb->space              = sizeof(txb->data);
@@ -34,7 +33,7 @@ struct rxrpc_txbuf *rxrpc_alloc_txbuf(struct rxrpc_call *call, u8 packet_type,
                txb->offset             = 0;
                txb->flags              = 0;
                txb->ack_why            = 0;
-               txb->seq                = call->tx_top + 1;
+               txb->seq                = call->tx_prepared + 1;
                txb->wire.epoch         = htonl(call->conn->proto.epoch);
                txb->wire.cid           = htonl(call->cid);
                txb->wire.callNumber    = htonl(call->call_id);
@@ -44,7 +43,7 @@ struct rxrpc_txbuf *rxrpc_alloc_txbuf(struct rxrpc_call *call, u8 packet_type,
                txb->wire.userStatus    = 0;
                txb->wire.securityIndex = call->security_ix;
                txb->wire._rsvd         = 0;
-               txb->wire.serviceId     = htons(call->service_id);
+               txb->wire.serviceId     = htons(call->dest_srx.srx_service);
 
                trace_rxrpc_txbuf(txb->debug_id,
                                  txb->call_debug_id, txb->seq, 1,
@@ -107,6 +106,7 @@ void rxrpc_shrink_call_tx_buffer(struct rxrpc_call *call)
 {
        struct rxrpc_txbuf *txb;
        rxrpc_seq_t hard_ack = smp_load_acquire(&call->acks_hard_ack);
+       bool wake = false;
 
        _enter("%x/%x/%x", call->tx_bottom, call->acks_hard_ack, call->tx_top);
 
@@ -120,8 +120,10 @@ void rxrpc_shrink_call_tx_buffer(struct rxrpc_call *call)
                if (before(hard_ack, txb->seq))
                        break;
 
+               if (txb->seq != call->tx_bottom + 1)
+                       rxrpc_see_txbuf(txb, rxrpc_txbuf_see_out_of_step);
                ASSERTCMP(txb->seq, ==, call->tx_bottom + 1);
-               call->tx_bottom++;
+               smp_store_release(&call->tx_bottom, call->tx_bottom + 1);
                list_del_rcu(&txb->call_link);
 
                trace_rxrpc_txqueue(call, rxrpc_txqueue_dequeue);
@@ -129,7 +131,12 @@ void rxrpc_shrink_call_tx_buffer(struct rxrpc_call *call)
                spin_unlock(&call->tx_lock);
 
                rxrpc_put_txbuf(txb, rxrpc_txbuf_put_rotated);
+               if (after(call->acks_hard_ack, call->tx_bottom + 128))
+                       wake = true;
        }
 
        spin_unlock(&call->tx_lock);
+
+       if (wake)
+               wake_up(&call->waitq);
 }
index 4662a6c..777d6b5 100644 (file)
@@ -977,6 +977,7 @@ config NET_ACT_TUNNEL_KEY
 config NET_ACT_CT
        tristate "connection tracking tc action"
        depends on NET_CLS_ACT && NF_CONNTRACK && (!NF_NAT || NF_NAT) && NF_FLOW_TABLE
+       select NF_NAT_OVS if NF_NAT
        help
          Say Y here to allow sending the packets to conntrack module.
 
index 9b31a10..5b3c0ac 100644 (file)
@@ -23,6 +23,7 @@
 #include <net/act_api.h>
 #include <net/netlink.h>
 #include <net/flow_offload.h>
+#include <net/tc_wrapper.h>
 
 #ifdef CONFIG_INET
 DEFINE_STATIC_KEY_FALSE(tcf_frag_xmit_count);
@@ -1080,7 +1081,7 @@ restart_act_graph:
 
                repeat_ttl = 32;
 repeat:
-               ret = a->ops->act(skb, a, res);
+               ret = tc_act(skb, a, res);
                if (unlikely(ret == TC_ACT_REPEAT)) {
                        if (--repeat_ttl != 0)
                                goto repeat;
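
tc_act() here (and tc_classify() in cls_api.c below) come from the new <net/tc_wrapper.h> and exist to avoid a retpoline on this hot-path indirect call: the handler pointer is compared against the built-in actions, which TC_INDIRECT_SCOPE makes globally visible, and dispatched directly on a match. A simplified sketch of the idea; the real header differs in detail and compiles down to the plain indirect call when retpolines are off:

	static inline int tc_act(struct sk_buff *skb, const struct tc_action *a,
				 struct tcf_result *res)
	{
		/* tc_skip_wrapper is the static key defined in sch_api.c */
		if (static_branch_unlikely(&tc_skip_wrapper))
			goto skip;

		/* One direct-call candidate per built-in action module. */
		if (a->ops->act == tcf_gact_act)
			return tcf_gact_act(skb, a, res);
		if (a->ops->act == tcf_mirred_act)
			return tcf_mirred_act(skb, a, res);
		/* ... */
	skip:
		return a->ops->act(skb, a, res);
	}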
index b79eee4..b0455fd 100644 (file)
@@ -18,6 +18,7 @@
 
 #include <linux/tc_act/tc_bpf.h>
 #include <net/tc_act/tc_bpf.h>
+#include <net/tc_wrapper.h>
 
 #define ACT_BPF_NAME_LEN       256
 
@@ -31,8 +32,9 @@ struct tcf_bpf_cfg {
 
 static struct tc_action_ops act_bpf_ops;
 
-static int tcf_bpf_act(struct sk_buff *skb, const struct tc_action *act,
-                      struct tcf_result *res)
+TC_INDIRECT_SCOPE int tcf_bpf_act(struct sk_buff *skb,
+                                 const struct tc_action *act,
+                                 struct tcf_result *res)
 {
        bool at_ingress = skb_at_tc_ingress(skb);
        struct tcf_bpf *prog = to_bpf(act);
index d41002e..7e63ff7 100644 (file)
@@ -20,6 +20,7 @@
 #include <net/pkt_cls.h>
 #include <uapi/linux/tc_act/tc_connmark.h>
 #include <net/tc_act/tc_connmark.h>
+#include <net/tc_wrapper.h>
 
 #include <net/netfilter/nf_conntrack.h>
 #include <net/netfilter/nf_conntrack_core.h>
@@ -27,8 +28,9 @@
 
 static struct tc_action_ops act_connmark_ops;
 
-static int tcf_connmark_act(struct sk_buff *skb, const struct tc_action *a,
-                           struct tcf_result *res)
+TC_INDIRECT_SCOPE int tcf_connmark_act(struct sk_buff *skb,
+                                      const struct tc_action *a,
+                                      struct tcf_result *res)
 {
        const struct nf_conntrack_tuple_hash *thash;
        struct nf_conntrack_tuple tuple;
index 1366adf..95e9304 100644 (file)
@@ -32,6 +32,7 @@
 
 #include <linux/tc_act/tc_csum.h>
 #include <net/tc_act/tc_csum.h>
+#include <net/tc_wrapper.h>
 
 static const struct nla_policy csum_policy[TCA_CSUM_MAX + 1] = {
        [TCA_CSUM_PARMS] = { .len = sizeof(struct tc_csum), },
@@ -563,8 +564,9 @@ fail:
        return 0;
 }
 
-static int tcf_csum_act(struct sk_buff *skb, const struct tc_action *a,
-                       struct tcf_result *res)
+TC_INDIRECT_SCOPE int tcf_csum_act(struct sk_buff *skb,
+                                  const struct tc_action *a,
+                                  struct tcf_result *res)
 {
        struct tcf_csum *p = to_tcf_csum(a);
        bool orig_vlan_tag_present = false;
index dd5ae75..0ca2bb8 100644 (file)
@@ -24,6 +24,7 @@
 #include <net/ipv6_frag.h>
 #include <uapi/linux/tc_act/tc_ct.h>
 #include <net/tc_act/tc_ct.h>
+#include <net/tc_wrapper.h>
 
 #include <net/netfilter/nf_flow_table.h>
 #include <net/netfilter/nf_conntrack.h>
@@ -863,90 +864,6 @@ static void tcf_ct_params_free_rcu(struct rcu_head *head)
        tcf_ct_params_free(params);
 }
 
-#if IS_ENABLED(CONFIG_NF_NAT)
-/* Modelled after nf_nat_ipv[46]_fn().
- * range is only used for new, uninitialized NAT state.
- * Returns either NF_ACCEPT or NF_DROP.
- */
-static int ct_nat_execute(struct sk_buff *skb, struct nf_conn *ct,
-                         enum ip_conntrack_info ctinfo,
-                         const struct nf_nat_range2 *range,
-                         enum nf_nat_manip_type maniptype)
-{
-       __be16 proto = skb_protocol(skb, true);
-       int hooknum, err = NF_ACCEPT;
-
-       /* See HOOK2MANIP(). */
-       if (maniptype == NF_NAT_MANIP_SRC)
-               hooknum = NF_INET_LOCAL_IN; /* Source NAT */
-       else
-               hooknum = NF_INET_LOCAL_OUT; /* Destination NAT */
-
-       switch (ctinfo) {
-       case IP_CT_RELATED:
-       case IP_CT_RELATED_REPLY:
-               if (proto == htons(ETH_P_IP) &&
-                   ip_hdr(skb)->protocol == IPPROTO_ICMP) {
-                       if (!nf_nat_icmp_reply_translation(skb, ct, ctinfo,
-                                                          hooknum))
-                               err = NF_DROP;
-                       goto out;
-               } else if (IS_ENABLED(CONFIG_IPV6) && proto == htons(ETH_P_IPV6)) {
-                       __be16 frag_off;
-                       u8 nexthdr = ipv6_hdr(skb)->nexthdr;
-                       int hdrlen = ipv6_skip_exthdr(skb,
-                                                     sizeof(struct ipv6hdr),
-                                                     &nexthdr, &frag_off);
-
-                       if (hdrlen >= 0 && nexthdr == IPPROTO_ICMPV6) {
-                               if (!nf_nat_icmpv6_reply_translation(skb, ct,
-                                                                    ctinfo,
-                                                                    hooknum,
-                                                                    hdrlen))
-                                       err = NF_DROP;
-                               goto out;
-                       }
-               }
-               /* Non-ICMP, fall thru to initialize if needed. */
-               fallthrough;
-       case IP_CT_NEW:
-               /* Seen it before?  This can happen for loopback, retrans,
-                * or local packets.
-                */
-               if (!nf_nat_initialized(ct, maniptype)) {
-                       /* Initialize according to the NAT action. */
-                       err = (range && range->flags & NF_NAT_RANGE_MAP_IPS)
-                               /* Action is set up to establish a new
-                                * mapping.
-                                */
-                               ? nf_nat_setup_info(ct, range, maniptype)
-                               : nf_nat_alloc_null_binding(ct, hooknum);
-                       if (err != NF_ACCEPT)
-                               goto out;
-               }
-               break;
-
-       case IP_CT_ESTABLISHED:
-       case IP_CT_ESTABLISHED_REPLY:
-               break;
-
-       default:
-               err = NF_DROP;
-               goto out;
-       }
-
-       err = nf_nat_packet(ct, ctinfo, hooknum, skb);
-       if (err == NF_ACCEPT) {
-               if (maniptype == NF_NAT_MANIP_SRC)
-                       tc_skb_cb(skb)->post_ct_snat = 1;
-               if (maniptype == NF_NAT_MANIP_DST)
-                       tc_skb_cb(skb)->post_ct_dnat = 1;
-       }
-out:
-       return err;
-}
-#endif /* CONFIG_NF_NAT */
-
 static void tcf_ct_act_set_mark(struct nf_conn *ct, u32 mark, u32 mask)
 {
 #if IS_ENABLED(CONFIG_NF_CONNTRACK_MARK)
@@ -986,60 +903,30 @@ static int tcf_ct_act_nat(struct sk_buff *skb,
                          bool commit)
 {
 #if IS_ENABLED(CONFIG_NF_NAT)
-       int err;
-       enum nf_nat_manip_type maniptype;
+       int err, action = 0;
 
        if (!(ct_action & TCA_CT_ACT_NAT))
                return NF_ACCEPT;
+       if (ct_action & TCA_CT_ACT_NAT_SRC)
+               action |= BIT(NF_NAT_MANIP_SRC);
+       if (ct_action & TCA_CT_ACT_NAT_DST)
+               action |= BIT(NF_NAT_MANIP_DST);
 
-       /* Add NAT extension if not confirmed yet. */
-       if (!nf_ct_is_confirmed(ct) && !nf_ct_nat_ext_add(ct))
-               return NF_DROP;   /* Can't NAT. */
-
-       if (ctinfo != IP_CT_NEW && (ct->status & IPS_NAT_MASK) &&
-           (ctinfo != IP_CT_RELATED || commit)) {
-               /* NAT an established or related connection like before. */
-               if (CTINFO2DIR(ctinfo) == IP_CT_DIR_REPLY)
-                       /* This is the REPLY direction for a connection
-                        * for which NAT was applied in the forward
-                        * direction.  Do the reverse NAT.
-                        */
-                       maniptype = ct->status & IPS_SRC_NAT
-                               ? NF_NAT_MANIP_DST : NF_NAT_MANIP_SRC;
-               else
-                       maniptype = ct->status & IPS_SRC_NAT
-                               ? NF_NAT_MANIP_SRC : NF_NAT_MANIP_DST;
-       } else if (ct_action & TCA_CT_ACT_NAT_SRC) {
-               maniptype = NF_NAT_MANIP_SRC;
-       } else if (ct_action & TCA_CT_ACT_NAT_DST) {
-               maniptype = NF_NAT_MANIP_DST;
-       } else {
-               return NF_ACCEPT;
-       }
+       err = nf_ct_nat(skb, ct, ctinfo, &action, range, commit);
+
+       if (action & BIT(NF_NAT_MANIP_SRC))
+               tc_skb_cb(skb)->post_ct_snat = 1;
+       if (action & BIT(NF_NAT_MANIP_DST))
+               tc_skb_cb(skb)->post_ct_dnat = 1;
 
-       err = ct_nat_execute(skb, ct, ctinfo, range, maniptype);
-       if (err == NF_ACCEPT && ct->status & IPS_DST_NAT) {
-               if (ct->status & IPS_SRC_NAT) {
-                       if (maniptype == NF_NAT_MANIP_SRC)
-                               maniptype = NF_NAT_MANIP_DST;
-                       else
-                               maniptype = NF_NAT_MANIP_SRC;
-
-                       err = ct_nat_execute(skb, ct, ctinfo, range,
-                                            maniptype);
-               } else if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL) {
-                       err = ct_nat_execute(skb, ct, ctinfo, NULL,
-                                            NF_NAT_MANIP_SRC);
-               }
-       }
        return err;
 #else
        return NF_ACCEPT;
 #endif
 }
 
-static int tcf_ct_act(struct sk_buff *skb, const struct tc_action *a,
-                     struct tcf_result *res)
+TC_INDIRECT_SCOPE int tcf_ct_act(struct sk_buff *skb, const struct tc_action *a,
+                                struct tcf_result *res)
 {
        struct net *net = dev_net(skb->dev);
        enum ip_conntrack_info ctinfo;
index eaa02f0..4b1b59d 100644 (file)
@@ -18,6 +18,7 @@
 #include <net/pkt_cls.h>
 #include <uapi/linux/tc_act/tc_ctinfo.h>
 #include <net/tc_act/tc_ctinfo.h>
+#include <net/tc_wrapper.h>
 
 #include <net/netfilter/nf_conntrack.h>
 #include <net/netfilter/nf_conntrack_core.h>
@@ -75,8 +76,9 @@ static void tcf_ctinfo_cpmark_set(struct nf_conn *ct, struct tcf_ctinfo *ca,
        skb->mark = READ_ONCE(ct->mark) & cp->cpmarkmask;
 }
 
-static int tcf_ctinfo_act(struct sk_buff *skb, const struct tc_action *a,
-                         struct tcf_result *res)
+TC_INDIRECT_SCOPE int tcf_ctinfo_act(struct sk_buff *skb,
+                                    const struct tc_action *a,
+                                    struct tcf_result *res)
 {
        const struct nf_conntrack_tuple_hash *thash = NULL;
        struct tcf_ctinfo *ca = to_ctinfo(a);
index 62d682b..54f1b13 100644 (file)
@@ -18,6 +18,7 @@
 #include <net/pkt_cls.h>
 #include <linux/tc_act/tc_gact.h>
 #include <net/tc_act/tc_gact.h>
+#include <net/tc_wrapper.h>
 
 static struct tc_action_ops act_gact_ops;
 
@@ -145,8 +146,9 @@ release_idr:
        return err;
 }
 
-static int tcf_gact_act(struct sk_buff *skb, const struct tc_action *a,
-                       struct tcf_result *res)
+TC_INDIRECT_SCOPE int tcf_gact_act(struct sk_buff *skb,
+                                  const struct tc_action *a,
+                                  struct tcf_result *res)
 {
        struct tcf_gact *gact = to_gact(a);
        int action = READ_ONCE(gact->tcf_action);
index 3049878..9b8def0 100644 (file)
@@ -14,6 +14,7 @@
 #include <net/netlink.h>
 #include <net/pkt_cls.h>
 #include <net/tc_act/tc_gate.h>
+#include <net/tc_wrapper.h>
 
 static struct tc_action_ops act_gate_ops;
 
@@ -113,8 +114,9 @@ static enum hrtimer_restart gate_timer_func(struct hrtimer *timer)
        return HRTIMER_RESTART;
 }
 
-static int tcf_gate_act(struct sk_buff *skb, const struct tc_action *a,
-                       struct tcf_result *res)
+TC_INDIRECT_SCOPE int tcf_gate_act(struct sk_buff *skb,
+                                  const struct tc_action *a,
+                                  struct tcf_result *res)
 {
        struct tcf_gate *gact = to_gate(a);
 
index 41d63b3..bc7611b 100644 (file)
@@ -29,6 +29,7 @@
 #include <net/tc_act/tc_ife.h>
 #include <linux/etherdevice.h>
 #include <net/ife.h>
+#include <net/tc_wrapper.h>
 
 static int max_metacnt = IFE_META_MAX + 1;
 static struct tc_action_ops act_ife_ops;
@@ -861,8 +862,9 @@ static int tcf_ife_encode(struct sk_buff *skb, const struct tc_action *a,
        return action;
 }
 
-static int tcf_ife_act(struct sk_buff *skb, const struct tc_action *a,
-                      struct tcf_result *res)
+TC_INDIRECT_SCOPE int tcf_ife_act(struct sk_buff *skb,
+                                 const struct tc_action *a,
+                                 struct tcf_result *res)
 {
        struct tcf_ife_info *ife = to_ife(a);
        struct tcf_ife_params *p;
index 1625e10..5d96ffe 100644 (file)
@@ -20,6 +20,7 @@
 #include <net/pkt_sched.h>
 #include <linux/tc_act/tc_ipt.h>
 #include <net/tc_act/tc_ipt.h>
+#include <net/tc_wrapper.h>
 
 #include <linux/netfilter_ipv4/ip_tables.h>
 
@@ -216,8 +217,9 @@ static int tcf_xt_init(struct net *net, struct nlattr *nla,
                              a, &act_xt_ops, tp, flags);
 }
 
-static int tcf_ipt_act(struct sk_buff *skb, const struct tc_action *a,
-                      struct tcf_result *res)
+TC_INDIRECT_SCOPE int tcf_ipt_act(struct sk_buff *skb,
+                                 const struct tc_action *a,
+                                 struct tcf_result *res)
 {
        int ret = 0, result = 0;
        struct tcf_ipt *ipt = to_ipt(a);
index b8ad6ae..7284bce 100644 (file)
@@ -24,6 +24,7 @@
 #include <net/pkt_cls.h>
 #include <linux/tc_act/tc_mirred.h>
 #include <net/tc_act/tc_mirred.h>
+#include <net/tc_wrapper.h>
 
 static LIST_HEAD(mirred_list);
 static DEFINE_SPINLOCK(mirred_list_lock);
@@ -217,8 +218,9 @@ static int tcf_mirred_forward(bool want_ingress, struct sk_buff *skb)
        return err;
 }
 
-static int tcf_mirred_act(struct sk_buff *skb, const struct tc_action *a,
-                         struct tcf_result *res)
+TC_INDIRECT_SCOPE int tcf_mirred_act(struct sk_buff *skb,
+                                    const struct tc_action *a,
+                                    struct tcf_result *res)
 {
        struct tcf_mirred *m = to_mirred(a);
        struct sk_buff *skb2 = skb;
index 8ad25cc..ff47ce4 100644 (file)
@@ -14,6 +14,7 @@
 #include <net/pkt_sched.h>
 #include <net/pkt_cls.h>
 #include <net/tc_act/tc_mpls.h>
+#include <net/tc_wrapper.h>
 
 static struct tc_action_ops act_mpls_ops;
 
@@ -49,8 +50,9 @@ static __be32 tcf_mpls_get_lse(struct mpls_shim_hdr *lse,
        return cpu_to_be32(new_lse);
 }
 
-static int tcf_mpls_act(struct sk_buff *skb, const struct tc_action *a,
-                       struct tcf_result *res)
+TC_INDIRECT_SCOPE int tcf_mpls_act(struct sk_buff *skb,
+                                  const struct tc_action *a,
+                                  struct tcf_result *res)
 {
        struct tcf_mpls *m = to_mpls(a);
        struct tcf_mpls_params *p;
index 9265145..74c74be 100644 (file)
@@ -24,7 +24,7 @@
 #include <net/tc_act/tc_nat.h>
 #include <net/tcp.h>
 #include <net/udp.h>
-
+#include <net/tc_wrapper.h>
 
 static struct tc_action_ops act_nat_ops;
 
@@ -98,8 +98,9 @@ release_idr:
        return err;
 }
 
-static int tcf_nat_act(struct sk_buff *skb, const struct tc_action *a,
-                      struct tcf_result *res)
+TC_INDIRECT_SCOPE int tcf_nat_act(struct sk_buff *skb,
+                                 const struct tc_action *a,
+                                 struct tcf_result *res)
 {
        struct tcf_nat *p = to_tcf_nat(a);
        struct iphdr *iph;
index 94ed585..a0378e9 100644 (file)
@@ -20,6 +20,7 @@
 #include <net/tc_act/tc_pedit.h>
 #include <uapi/linux/tc_act/tc_pedit.h>
 #include <net/pkt_cls.h>
+#include <net/tc_wrapper.h>
 
 static struct tc_action_ops act_pedit_ops;
 
@@ -319,8 +320,9 @@ static int pedit_skb_hdr_offset(struct sk_buff *skb,
        return ret;
 }
 
-static int tcf_pedit_act(struct sk_buff *skb, const struct tc_action *a,
-                        struct tcf_result *res)
+TC_INDIRECT_SCOPE int tcf_pedit_act(struct sk_buff *skb,
+                                   const struct tc_action *a,
+                                   struct tcf_result *res)
 {
        struct tcf_pedit *p = to_pedit(a);
        u32 max_offset;
index 0adb26e..227cba5 100644 (file)
@@ -19,6 +19,7 @@
 #include <net/netlink.h>
 #include <net/pkt_cls.h>
 #include <net/tc_act/tc_police.h>
+#include <net/tc_wrapper.h>
 
 /* Each policer is serialized by its individual spinlock */
 
@@ -242,8 +243,9 @@ static bool tcf_police_mtu_check(struct sk_buff *skb, u32 limit)
        return len <= limit;
 }
 
-static int tcf_police_act(struct sk_buff *skb, const struct tc_action *a,
-                         struct tcf_result *res)
+TC_INDIRECT_SCOPE int tcf_police_act(struct sk_buff *skb,
+                                    const struct tc_action *a,
+                                    struct tcf_result *res)
 {
        struct tcf_police *police = to_police(a);
        s64 now, toks, ppstoks = 0, ptoks = 0;
index 7a25477..98dea08 100644 (file)
@@ -20,6 +20,7 @@
 #include <net/tc_act/tc_sample.h>
 #include <net/psample.h>
 #include <net/pkt_cls.h>
+#include <net/tc_wrapper.h>
 
 #include <linux/if_arp.h>
 
@@ -153,8 +154,9 @@ static bool tcf_sample_dev_ok_push(struct net_device *dev)
        }
 }
 
-static int tcf_sample_act(struct sk_buff *skb, const struct tc_action *a,
-                         struct tcf_result *res)
+TC_INDIRECT_SCOPE int tcf_sample_act(struct sk_buff *skb,
+                                    const struct tc_action *a,
+                                    struct tcf_result *res)
 {
        struct tcf_sample *s = to_sample(a);
        struct psample_group *psample_group;
index 18d3761..4b84514 100644 (file)
@@ -14,6 +14,7 @@
 #include <net/netlink.h>
 #include <net/pkt_sched.h>
 #include <net/pkt_cls.h>
+#include <net/tc_wrapper.h>
 
 #include <linux/tc_act/tc_defact.h>
 #include <net/tc_act/tc_defact.h>
@@ -21,8 +22,9 @@
 static struct tc_action_ops act_simp_ops;
 
 #define SIMP_MAX_DATA  32
-static int tcf_simp_act(struct sk_buff *skb, const struct tc_action *a,
-                       struct tcf_result *res)
+TC_INDIRECT_SCOPE int tcf_simp_act(struct sk_buff *skb,
+                                  const struct tc_action *a,
+                                  struct tcf_result *res)
 {
        struct tcf_defact *d = to_defact(a);
 
index 1710780..ce7008c 100644 (file)
@@ -16,6 +16,7 @@
 #include <net/ipv6.h>
 #include <net/dsfield.h>
 #include <net/pkt_cls.h>
+#include <net/tc_wrapper.h>
 
 #include <linux/tc_act/tc_skbedit.h>
 #include <net/tc_act/tc_skbedit.h>
@@ -36,8 +37,9 @@ static u16 tcf_skbedit_hash(struct tcf_skbedit_params *params,
        return netdev_cap_txqueue(skb->dev, queue_mapping);
 }
 
-static int tcf_skbedit_act(struct sk_buff *skb, const struct tc_action *a,
-                          struct tcf_result *res)
+TC_INDIRECT_SCOPE int tcf_skbedit_act(struct sk_buff *skb,
+                                     const struct tc_action *a,
+                                     struct tcf_result *res)
 {
        struct tcf_skbedit *d = to_skbedit(a);
        struct tcf_skbedit_params *params;
index d98758a..dffa990 100644 (file)
 #include <net/netlink.h>
 #include <net/pkt_sched.h>
 #include <net/pkt_cls.h>
+#include <net/tc_wrapper.h>
 
 #include <linux/tc_act/tc_skbmod.h>
 #include <net/tc_act/tc_skbmod.h>
 
 static struct tc_action_ops act_skbmod_ops;
 
-static int tcf_skbmod_act(struct sk_buff *skb, const struct tc_action *a,
-                         struct tcf_result *res)
+TC_INDIRECT_SCOPE int tcf_skbmod_act(struct sk_buff *skb,
+                                    const struct tc_action *a,
+                                    struct tcf_result *res)
 {
        struct tcf_skbmod *d = to_skbmod(a);
        int action, max_edit_len, err;
index 2691a3d..2d12d26 100644 (file)
 #include <net/pkt_sched.h>
 #include <net/dst.h>
 #include <net/pkt_cls.h>
+#include <net/tc_wrapper.h>
 
 #include <linux/tc_act/tc_tunnel_key.h>
 #include <net/tc_act/tc_tunnel_key.h>
 
 static struct tc_action_ops act_tunnel_key_ops;
 
-static int tunnel_key_act(struct sk_buff *skb, const struct tc_action *a,
-                         struct tcf_result *res)
+TC_INDIRECT_SCOPE int tunnel_key_act(struct sk_buff *skb,
+                                    const struct tc_action *a,
+                                    struct tcf_result *res)
 {
        struct tcf_tunnel_key *t = to_tunnel_key(a);
        struct tcf_tunnel_key_params *params;
index 7b24e89..0251442 100644 (file)
 #include <net/netlink.h>
 #include <net/pkt_sched.h>
 #include <net/pkt_cls.h>
+#include <net/tc_wrapper.h>
 
 #include <linux/tc_act/tc_vlan.h>
 #include <net/tc_act/tc_vlan.h>
 
 static struct tc_action_ops act_vlan_ops;
 
-static int tcf_vlan_act(struct sk_buff *skb, const struct tc_action *a,
-                       struct tcf_result *res)
+TC_INDIRECT_SCOPE int tcf_vlan_act(struct sk_buff *skb,
+                                  const struct tc_action *a,
+                                  struct tcf_result *res)
 {
        struct tcf_vlan *v = to_vlan(a);
        struct tcf_vlan_params *p;
index 23d1cfa..668130f 100644 (file)
@@ -40,6 +40,7 @@
 #include <net/tc_act/tc_mpls.h>
 #include <net/tc_act/tc_gate.h>
 #include <net/flow_offload.h>
+#include <net/tc_wrapper.h>
 
 extern const struct nla_policy rtm_tca_policy[TCA_MAX + 1];
 
@@ -1564,7 +1565,7 @@ reclassify:
                    tp->protocol != htons(ETH_P_ALL))
                        continue;
 
-               err = tp->classify(skb, tp, res);
+               err = tc_classify(skb, tp, res);
 #ifdef CONFIG_NET_CLS_ACT
                if (unlikely(err == TC_ACT_RECLASSIFY && !compat_mode)) {
                        first_tp = orig_tp;
index d229ce9..1b92c33 100644 (file)
@@ -18,6 +18,7 @@
 #include <net/netlink.h>
 #include <net/act_api.h>
 #include <net/pkt_cls.h>
+#include <net/tc_wrapper.h>
 
 struct basic_head {
        struct list_head        flist;
@@ -36,8 +37,9 @@ struct basic_filter {
        struct rcu_work         rwork;
 };
 
-static int basic_classify(struct sk_buff *skb, const struct tcf_proto *tp,
-                         struct tcf_result *res)
+TC_INDIRECT_SCOPE int basic_classify(struct sk_buff *skb,
+                                    const struct tcf_proto *tp,
+                                    struct tcf_result *res)
 {
        int r;
        struct basic_head *head = rcu_dereference_bh(tp->root);
index bc317b3..466c26d 100644 (file)
@@ -19,6 +19,7 @@
 #include <net/rtnetlink.h>
 #include <net/pkt_cls.h>
 #include <net/sock.h>
+#include <net/tc_wrapper.h>
 
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Daniel Borkmann <dborkman@redhat.com>");
@@ -77,8 +78,9 @@ static int cls_bpf_exec_opcode(int code)
        }
 }
 
-static int cls_bpf_classify(struct sk_buff *skb, const struct tcf_proto *tp,
-                           struct tcf_result *res)
+TC_INDIRECT_SCOPE int cls_bpf_classify(struct sk_buff *skb,
+                                      const struct tcf_proto *tp,
+                                      struct tcf_result *res)
 {
        struct cls_bpf_head *head = rcu_dereference_bh(tp->root);
        bool at_ingress = skb_at_tc_ingress(skb);
index ed00001..bd9322d 100644 (file)
@@ -13,6 +13,7 @@
 #include <net/pkt_cls.h>
 #include <net/sock.h>
 #include <net/cls_cgroup.h>
+#include <net/tc_wrapper.h>
 
 struct cls_cgroup_head {
        u32                     handle;
@@ -22,8 +23,9 @@ struct cls_cgroup_head {
        struct rcu_work         rwork;
 };
 
-static int cls_cgroup_classify(struct sk_buff *skb, const struct tcf_proto *tp,
-                              struct tcf_result *res)
+TC_INDIRECT_SCOPE int cls_cgroup_classify(struct sk_buff *skb,
+                                         const struct tcf_proto *tp,
+                                         struct tcf_result *res)
 {
        struct cls_cgroup_head *head = rcu_dereference_bh(tp->root);
        u32 classid = task_get_classid(skb);
index 014cd3d..535668e 100644 (file)
@@ -24,6 +24,7 @@
 #include <net/ip.h>
 #include <net/route.h>
 #include <net/flow_dissector.h>
+#include <net/tc_wrapper.h>
 
 #if IS_ENABLED(CONFIG_NF_CONNTRACK)
 #include <net/netfilter/nf_conntrack.h>
@@ -292,8 +293,9 @@ static u32 flow_key_get(struct sk_buff *skb, int key, struct flow_keys *flow)
                          (1 << FLOW_KEY_NFCT_PROTO_SRC) |      \
                          (1 << FLOW_KEY_NFCT_PROTO_DST))
 
-static int flow_classify(struct sk_buff *skb, const struct tcf_proto *tp,
-                        struct tcf_result *res)
+TC_INDIRECT_SCOPE int flow_classify(struct sk_buff *skb,
+                                   const struct tcf_proto *tp,
+                                   struct tcf_result *res)
 {
        struct flow_head *head = rcu_dereference_bh(tp->root);
        struct flow_filter *f;
index 25bc57e..0b15698 100644 (file)
@@ -27,6 +27,7 @@
 #include <net/vxlan.h>
 #include <net/erspan.h>
 #include <net/gtp.h>
+#include <net/tc_wrapper.h>
 
 #include <net/dst.h>
 #include <net/dst_metadata.h>
@@ -305,8 +306,9 @@ static u16 fl_ct_info_to_flower_map[] = {
                                        TCA_FLOWER_KEY_CT_FLAGS_NEW,
 };
 
-static int fl_classify(struct sk_buff *skb, const struct tcf_proto *tp,
-                      struct tcf_result *res)
+TC_INDIRECT_SCOPE int fl_classify(struct sk_buff *skb,
+                                 const struct tcf_proto *tp,
+                                 struct tcf_result *res)
 {
        struct cls_fl_head *head = rcu_dereference_bh(tp->root);
        bool post_ct = tc_skb_cb(skb)->post_ct;
index a32351d..ae9439a 100644 (file)
@@ -21,6 +21,7 @@
 #include <net/act_api.h>
 #include <net/pkt_cls.h>
 #include <net/sch_generic.h>
+#include <net/tc_wrapper.h>
 
 #define HTSIZE 256
 
@@ -47,8 +48,9 @@ static u32 fw_hash(u32 handle)
        return handle % HTSIZE;
 }
 
-static int fw_classify(struct sk_buff *skb, const struct tcf_proto *tp,
-                      struct tcf_result *res)
+TC_INDIRECT_SCOPE int fw_classify(struct sk_buff *skb,
+                                 const struct tcf_proto *tp,
+                                 struct tcf_result *res)
 {
        struct fw_head *head = rcu_dereference_bh(tp->root);
        struct fw_filter *f;
index 39a5d9c..705f63d 100644 (file)
@@ -12,6 +12,7 @@
 
 #include <net/sch_generic.h>
 #include <net/pkt_cls.h>
+#include <net/tc_wrapper.h>
 
 struct cls_mall_head {
        struct tcf_exts exts;
@@ -24,8 +25,9 @@ struct cls_mall_head {
        bool deleting;
 };
 
-static int mall_classify(struct sk_buff *skb, const struct tcf_proto *tp,
-                        struct tcf_result *res)
+TC_INDIRECT_SCOPE int mall_classify(struct sk_buff *skb,
+                                   const struct tcf_proto *tp,
+                                   struct tcf_result *res)
 {
        struct cls_mall_head *head = rcu_dereference_bh(tp->root);
 
index 9e43b92..d0c5372 100644 (file)
@@ -17,6 +17,7 @@
 #include <net/netlink.h>
 #include <net/act_api.h>
 #include <net/pkt_cls.h>
+#include <net/tc_wrapper.h>
 
 /*
  * 1. For now we assume that route tags < 256.
@@ -121,8 +122,9 @@ static inline int route4_hash_wild(void)
        return 0;                                               \
 }
 
-static int route4_classify(struct sk_buff *skb, const struct tcf_proto *tp,
-                          struct tcf_result *res)
+TC_INDIRECT_SCOPE int route4_classify(struct sk_buff *skb,
+                                     const struct tcf_proto *tp,
+                                     struct tcf_result *res)
 {
        struct route4_head *head = rcu_dereference_bh(tp->root);
        struct dst_entry *dst;
index de1c1d4..03d8619 100644 (file)
 #include <net/netlink.h>
 #include <net/act_api.h>
 #include <net/pkt_cls.h>
+#include <net/tc_wrapper.h>
 
 #define RSVP_DST_LEN   1
 #define RSVP_ID                "rsvp"
 #define RSVP_OPS       cls_rsvp_ops
+#define RSVP_CLS       rsvp_classify
 
 #include "cls_rsvp.h"
 MODULE_LICENSE("GPL");
index b00a7db..869efba 100644 (file)
@@ -124,8 +124,8 @@ static inline unsigned int hash_src(__be32 *src)
                return r;                               \
 }
 
-static int rsvp_classify(struct sk_buff *skb, const struct tcf_proto *tp,
-                        struct tcf_result *res)
+TC_INDIRECT_SCOPE int RSVP_CLS(struct sk_buff *skb, const struct tcf_proto *tp,
+                              struct tcf_result *res)
 {
        struct rsvp_head *head = rcu_dereference_bh(tp->root);
        struct rsvp_session *s;
@@ -738,7 +738,7 @@ static void rsvp_bind_class(void *fh, u32 classid, unsigned long cl, void *q,
 
 static struct tcf_proto_ops RSVP_OPS __read_mostly = {
        .kind           =       RSVP_ID,
-       .classify       =       rsvp_classify,
+       .classify       =       RSVP_CLS,
        .init           =       rsvp_init,
        .destroy        =       rsvp_destroy,
        .get            =       rsvp_get,
index 6407884..e627cc3 100644 (file)
 #include <net/act_api.h>
 #include <net/pkt_cls.h>
 #include <net/netlink.h>
+#include <net/tc_wrapper.h>
 
 #define RSVP_DST_LEN   4
 #define RSVP_ID                "rsvp6"
 #define RSVP_OPS       cls_rsvp6_ops
+#define RSVP_CLS       rsvp6_classify
 
 #include "cls_rsvp.h"
 MODULE_LICENSE("GPL");
index 1c9eeb9..eb0e945 100644 (file)
@@ -16,6 +16,7 @@
 #include <net/netlink.h>
 #include <net/pkt_cls.h>
 #include <net/sch_generic.h>
+#include <net/tc_wrapper.h>
 
 /*
  * Passing parameters to the root seems to be done more awkwardly than really
@@ -98,9 +99,9 @@ static struct tcindex_filter_result *tcindex_lookup(struct tcindex_data *p,
        return NULL;
 }
 
-
-static int tcindex_classify(struct sk_buff *skb, const struct tcf_proto *tp,
-                           struct tcf_result *res)
+TC_INDIRECT_SCOPE int tcindex_classify(struct sk_buff *skb,
+                                      const struct tcf_proto *tp,
+                                      struct tcf_result *res)
 {
        struct tcindex_data *p = rcu_dereference_bh(tp->root);
        struct tcindex_filter_result *f;
index 34d25f7..4e2e269 100644 (file)
@@ -39,6 +39,7 @@
 #include <net/act_api.h>
 #include <net/pkt_cls.h>
 #include <linux/idr.h>
+#include <net/tc_wrapper.h>
 
 struct tc_u_knode {
        struct tc_u_knode __rcu *next;
@@ -100,8 +101,9 @@ static inline unsigned int u32_hash_fold(__be32 key,
        return h;
 }
 
-static int u32_classify(struct sk_buff *skb, const struct tcf_proto *tp,
-                       struct tcf_result *res)
+TC_INDIRECT_SCOPE int u32_classify(struct sk_buff *skb,
+                                  const struct tcf_proto *tp,
+                                  struct tcf_result *res)
 {
        struct {
                struct tc_u_knode *knode;
index 4a27dfb..2317db0 100644 (file)
@@ -31,6 +31,7 @@
 #include <net/netlink.h>
 #include <net/pkt_sched.h>
 #include <net/pkt_cls.h>
+#include <net/tc_wrapper.h>
 
 #include <trace/events/qdisc.h>
 
@@ -2273,6 +2274,8 @@ static struct pernet_operations psched_net_ops = {
        .exit = psched_net_exit,
 };
 
+DEFINE_STATIC_KEY_FALSE(tc_skip_wrapper);
+
 static int __init pktsched_init(void)
 {
        int err;
@@ -2300,6 +2303,8 @@ static int __init pktsched_init(void)
        rtnl_register(PF_UNSPEC, RTM_GETTCLASS, tc_ctl_tclass, tc_dump_tclass,
                      0);
 
+       tc_wrapper_init();
+
        return 0;
 }
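
tc_wrapper_init() flips the tc_skip_wrapper static key when the direct-call dispatch cannot help. A plausible shape, as a sketch rather than the verbatim header:

	static inline void tc_wrapper_init(void)
	{
	#ifdef CONFIG_RETPOLINE
		/* Booting with mitigations=off disables retpolines, so the
		 * plain indirect call is already cheap; skip the wrapper.
		 */
		if (cpu_mitigations_off())
			static_branch_enable(&tc_skip_wrapper);
	#endif
	}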
 
index 7f40ed1..a7a9136 100644 (file)
@@ -84,17 +84,18 @@ static struct ctl_table sctp_table[] = {
        { /* sentinel */ }
 };
 
+/* The following index defines are used in sctp_sysctl_net_register().
+ * If you add new entries to sctp_net_table, please make sure these
+ * defines continue to index the entries that their macro names
+ * refer to.
+ */
+#define SCTP_RTO_MIN_IDX       0
+#define SCTP_RTO_MAX_IDX       1
+#define SCTP_PF_RETRANS_IDX    2
+#define SCTP_PS_RETRANS_IDX    3
+
 static struct ctl_table sctp_net_table[] = {
-       {
-               .procname       = "rto_initial",
-               .data           = &init_net.sctp.rto_initial,
-               .maxlen         = sizeof(unsigned int),
-               .mode           = 0644,
-               .proc_handler   = proc_dointvec_minmax,
-               .extra1         = SYSCTL_ONE,
-               .extra2         = &timer_max
-       },
-       {
+       [SCTP_RTO_MIN_IDX] = {
                .procname       = "rto_min",
                .data           = &init_net.sctp.rto_min,
                .maxlen         = sizeof(unsigned int),
@@ -103,7 +104,7 @@ static struct ctl_table sctp_net_table[] = {
                .extra1         = SYSCTL_ONE,
                .extra2         = &init_net.sctp.rto_max
        },
-       {
+       [SCTP_RTO_MAX_IDX] =  {
                .procname       = "rto_max",
                .data           = &init_net.sctp.rto_max,
                .maxlen         = sizeof(unsigned int),
@@ -112,6 +113,33 @@ static struct ctl_table sctp_net_table[] = {
                .extra1         = &init_net.sctp.rto_min,
                .extra2         = &timer_max
        },
+       [SCTP_PF_RETRANS_IDX] = {
+               .procname       = "pf_retrans",
+               .data           = &init_net.sctp.pf_retrans,
+               .maxlen         = sizeof(int),
+               .mode           = 0644,
+               .proc_handler   = proc_dointvec_minmax,
+               .extra1         = SYSCTL_ZERO,
+               .extra2         = &init_net.sctp.ps_retrans,
+       },
+       [SCTP_PS_RETRANS_IDX] = {
+               .procname       = "ps_retrans",
+               .data           = &init_net.sctp.ps_retrans,
+               .maxlen         = sizeof(int),
+               .mode           = 0644,
+               .proc_handler   = proc_dointvec_minmax,
+               .extra1         = &init_net.sctp.pf_retrans,
+               .extra2         = &ps_retrans_max,
+       },
+       {
+               .procname       = "rto_initial",
+               .data           = &init_net.sctp.rto_initial,
+               .maxlen         = sizeof(unsigned int),
+               .mode           = 0644,
+               .proc_handler   = proc_dointvec_minmax,
+               .extra1         = SYSCTL_ONE,
+               .extra2         = &timer_max
+       },
        {
                .procname       = "rto_alpha_exp_divisor",
                .data           = &init_net.sctp.rto_alpha,
@@ -208,24 +236,6 @@ static struct ctl_table sctp_net_table[] = {
                .extra2         = SYSCTL_INT_MAX,
        },
        {
-               .procname       = "pf_retrans",
-               .data           = &init_net.sctp.pf_retrans,
-               .maxlen         = sizeof(int),
-               .mode           = 0644,
-               .proc_handler   = proc_dointvec_minmax,
-               .extra1         = SYSCTL_ZERO,
-               .extra2         = &init_net.sctp.ps_retrans,
-       },
-       {
-               .procname       = "ps_retrans",
-               .data           = &init_net.sctp.ps_retrans,
-               .maxlen         = sizeof(int),
-               .mode           = 0644,
-               .proc_handler   = proc_dointvec_minmax,
-               .extra1         = &init_net.sctp.pf_retrans,
-               .extra2         = &ps_retrans_max,
-       },
-       {
                .procname       = "sndbuf_policy",
                .data           = &init_net.sctp.sndbuf_policy,
                .maxlen         = sizeof(int),
@@ -597,6 +607,11 @@ int sctp_sysctl_net_register(struct net *net)
        for (i = 0; table[i].data; i++)
                table[i].data += (char *)(&net->sctp) - (char *)&init_net.sctp;
 
+       table[SCTP_RTO_MIN_IDX].extra2 = &net->sctp.rto_max;
+       table[SCTP_RTO_MAX_IDX].extra1 = &net->sctp.rto_min;
+       table[SCTP_PF_RETRANS_IDX].extra2 = &net->sctp.ps_retrans;
+       table[SCTP_PS_RETRANS_IDX].extra1 = &net->sctp.pf_retrans;
+
        net->sctp.sysctl_header = register_net_sysctl(net, "net/sctp", table);
        if (net->sctp.sysctl_header == NULL) {
                kfree(table);
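
The loop above only rebases each entry's .data pointer into the new netns; extra1/extra2 would still point at init_net's fields, which matters for the four entries that validate against another per-net value (rto_min against rto_max, pf_retrans against ps_retrans). The stable indices make those fix-ups safe to write by name; the failure mode they prevent, in miniature:

	/* Without the fix-up, a child netns would clamp its rto_min
	 * against init_net's rto_max rather than its own.
	 */
	table[SCTP_RTO_MIN_IDX].extra2 = &net->sctp.rto_max;  /* rto_min <= rto_max */
	table[SCTP_RTO_MAX_IDX].extra1 = &net->sctp.rto_min;  /* rto_max >= rto_min */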
index e260c0d..b3ce248 100644 (file)
@@ -2224,7 +2224,9 @@ static int tipc_link_proto_rcv(struct tipc_link *l, struct sk_buff *skb,
        if (tipc_own_addr(l->net) > msg_prevnode(hdr))
                l->net_plane = msg_net_plane(hdr);
 
-       skb_linearize(skb);
+       if (skb_linearize(skb))
+               goto exit;
+
        hdr = buf_msg(skb);
        data = msg_data(hdr);
 
index b48d97c..49ddc48 100644 (file)
@@ -1689,6 +1689,7 @@ int tipc_node_xmit(struct net *net, struct sk_buff_head *list,
        struct tipc_node *n;
        struct sk_buff_head xmitq;
        bool node_up = false;
+       struct net *peer_net;
        int bearer_id;
        int rc;
 
@@ -1705,18 +1706,23 @@ int tipc_node_xmit(struct net *net, struct sk_buff_head *list,
                return -EHOSTUNREACH;
        }
 
+       rcu_read_lock();
        tipc_node_read_lock(n);
        node_up = node_is_up(n);
-       if (node_up && n->peer_net && check_net(n->peer_net)) {
+       peer_net = n->peer_net;
+       tipc_node_read_unlock(n);
+       if (node_up && peer_net && check_net(peer_net)) {
                /* xmit inner linux container */
-               tipc_lxc_xmit(n->peer_net, list);
+               tipc_lxc_xmit(peer_net, list);
                if (likely(skb_queue_empty(list))) {
-                       tipc_node_read_unlock(n);
+                       rcu_read_unlock();
                        tipc_node_put(n);
                        return 0;
                }
        }
+       rcu_read_unlock();
 
+       tipc_node_read_lock(n);
        bearer_id = n->active_links[selector & 1];
        if (unlikely(bearer_id == INVALID_BEARER_ID)) {
                tipc_node_read_unlock(n);
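
The reshuffle above narrows the node read lock: n->peer_net is snapshotted under the lock, the lock is dropped, and the potentially slow container transmit runs under rcu_read_lock() instead. Leaving RCU's lifetime guarantees aside, the locking discipline reduces to this pthread sketch (names hypothetical):

#include <pthread.h>
#include <stdio.h>

struct node {
	pthread_rwlock_t lock;
	void *peer_net;		/* may be changed by other threads */
};

static void node_xmit(struct node *n)
{
	void *peer_net;

	pthread_rwlock_rdlock(&n->lock);
	peer_net = n->peer_net;		/* snapshot under the lock */
	pthread_rwlock_unlock(&n->lock);

	/* Use only the snapshot from here on; in the kernel patch,
	 * rcu_read_lock() is what keeps the snapshotted object alive.
	 */
	if (peer_net)
		printf("xmit via %p\n", peer_net);
}

int main(void)
{
	struct node n = { PTHREAD_RWLOCK_INITIALIZER, (void *)&n };

	node_xmit(&n);
	return 0;
}
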
index 264cf36..9ed9786 100644 (file)
@@ -792,7 +792,7 @@ static int bpf_exec_tx_verdict(struct sk_msg *msg, struct sock *sk,
        struct sk_psock *psock;
        struct sock *sk_redir;
        struct tls_rec *rec;
-       bool enospc, policy;
+       bool enospc, policy, redir_ingress;
        int err = 0, send;
        u32 delta = 0;
 
@@ -837,6 +837,7 @@ more_data:
                }
                break;
        case __SK_REDIRECT:
+               redir_ingress = psock->redir_ingress;
                sk_redir = psock->sk_redir;
                memcpy(&msg_redir, msg, sizeof(*msg));
                if (msg->apply_bytes < send)
@@ -846,7 +847,8 @@ more_data:
                sk_msg_return_zero(sk, msg, send);
                msg->sg.size -= send;
                release_sock(sk);
-               err = tcp_bpf_sendmsg_redir(sk_redir, &msg_redir, send, flags);
+               err = tcp_bpf_sendmsg_redir(sk_redir, redir_ingress,
+                                           &msg_redir, send, flags);
                lock_sock(sk);
                if (err < 0) {
                        *copied -= sk_msg_free_nocharge(sk, &msg_redir);
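
Same discipline in the TLS hunk: psock->redir_ingress has to be latched before release_sock(), because the psock may change once the socket lock is dropped, and the captured value is passed to tcp_bpf_sendmsg_redir() explicitly instead of being re-read by the callee. Reduced to a sketch (types are stand-ins):

#include <stdio.h>
#include <stdbool.h>

struct psock { bool redir_ingress; };

/* The callee receives the captured flag rather than re-reading shared
 * state that may already be stale by the time it runs.
 */
static int sendmsg_redir(bool ingress, const char *msg)
{
	printf("redirect %s (%s)\n", msg, ingress ? "ingress" : "egress");
	return 0;
}

int main(void)
{
	struct psock p = { .redir_ingress = true };
	bool redir_ingress = p.redir_ingress;	/* latched under the lock */

	/* ...socket lock released here; p may mutate or go away... */
	return sendmsg_redir(redir_ingress, "payload");
}
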
index 105f522..616b55c 100644 (file)
@@ -114,14 +114,16 @@ static int sk_diag_show_rqlen(struct sock *sk, struct sk_buff *nlskb)
        return nla_put(nlskb, UNIX_DIAG_RQLEN, sizeof(rql), &rql);
 }
 
-static int sk_diag_dump_uid(struct sock *sk, struct sk_buff *nlskb)
+static int sk_diag_dump_uid(struct sock *sk, struct sk_buff *nlskb,
+                           struct user_namespace *user_ns)
 {
-       uid_t uid = from_kuid_munged(sk_user_ns(nlskb->sk), sock_i_uid(sk));
+       uid_t uid = from_kuid_munged(user_ns, sock_i_uid(sk));
        return nla_put(nlskb, UNIX_DIAG_UID, sizeof(uid_t), &uid);
 }
 
 static int sk_diag_fill(struct sock *sk, struct sk_buff *skb, struct unix_diag_req *req,
-               u32 portid, u32 seq, u32 flags, int sk_ino)
+                       struct user_namespace *user_ns,
+                       u32 portid, u32 seq, u32 flags, int sk_ino)
 {
        struct nlmsghdr *nlh;
        struct unix_diag_msg *rep;
@@ -167,7 +169,7 @@ static int sk_diag_fill(struct sock *sk, struct sk_buff *skb, struct unix_diag_r
                goto out_nlmsg_trim;
 
        if ((req->udiag_show & UDIAG_SHOW_UID) &&
-           sk_diag_dump_uid(sk, skb))
+           sk_diag_dump_uid(sk, skb, user_ns))
                goto out_nlmsg_trim;
 
        nlmsg_end(skb, nlh);
@@ -179,7 +181,8 @@ out_nlmsg_trim:
 }
 
 static int sk_diag_dump(struct sock *sk, struct sk_buff *skb, struct unix_diag_req *req,
-               u32 portid, u32 seq, u32 flags)
+                       struct user_namespace *user_ns,
+                       u32 portid, u32 seq, u32 flags)
 {
        int sk_ino;
 
@@ -190,7 +193,7 @@ static int sk_diag_dump(struct sock *sk, struct sk_buff *skb, struct unix_diag_r
        if (!sk_ino)
                return 0;
 
-       return sk_diag_fill(sk, skb, req, portid, seq, flags, sk_ino);
+       return sk_diag_fill(sk, skb, req, user_ns, portid, seq, flags, sk_ino);
 }
 
 static int unix_diag_dump(struct sk_buff *skb, struct netlink_callback *cb)
@@ -214,7 +217,7 @@ static int unix_diag_dump(struct sk_buff *skb, struct netlink_callback *cb)
                                goto next;
                        if (!(req->udiag_states & (1 << sk->sk_state)))
                                goto next;
-                       if (sk_diag_dump(sk, skb, req,
+                       if (sk_diag_dump(sk, skb, req, sk_user_ns(skb->sk),
                                         NETLINK_CB(cb->skb).portid,
                                         cb->nlh->nlmsg_seq,
                                         NLM_F_MULTI) < 0) {
@@ -282,7 +285,8 @@ again:
        if (!rep)
                goto out;
 
-       err = sk_diag_fill(sk, rep, req, NETLINK_CB(in_skb).portid,
+       err = sk_diag_fill(sk, rep, req, sk_user_ns(NETLINK_CB(in_skb).sk),
+                          NETLINK_CB(in_skb).portid,
                           nlh->nlmsg_seq, 0, req->udiag_ino);
        if (err < 0) {
                nlmsg_free(rep);
index 494aa74..cd47f88 100644 (file)
@@ -3,6 +3,14 @@
 # Makefile for the XFRM subsystem.
 #
 
+xfrm_interface-$(CONFIG_XFRM_INTERFACE) += xfrm_interface_core.o
+
+ifeq ($(CONFIG_XFRM_INTERFACE),m)
+xfrm_interface-$(CONFIG_DEBUG_INFO_BTF_MODULES) += xfrm_interface_bpf.o
+else ifeq ($(CONFIG_XFRM_INTERFACE),y)
+xfrm_interface-$(CONFIG_DEBUG_INFO_BTF) += xfrm_interface_bpf.o
+endif
+
 obj-$(CONFIG_XFRM) := xfrm_policy.o xfrm_state.o xfrm_hash.o \
                      xfrm_input.o xfrm_output.o \
                      xfrm_sysctl.o xfrm_replay.o xfrm_device.o
index 21269e8..4aff76c 100644 (file)
@@ -132,6 +132,16 @@ struct sk_buff *validate_xmit_xfrm(struct sk_buff *skb, netdev_features_t featur
        if (xo->flags & XFRM_GRO || x->xso.dir == XFRM_DEV_OFFLOAD_IN)
                return skb;
 
+       /* The packet was sent to the HW IPsec packet offload engine,
+        * but to the wrong device. Drop the packet so it won't bypass
+        * the XFRM stack.
+        */
+       if (x->xso.type == XFRM_DEV_OFFLOAD_PACKET && x->xso.dev != dev) {
+               kfree_skb(skb);
+               dev_core_stats_tx_dropped_inc(dev);
+               return NULL;
+       }
+
        /* This skb was already validated on the upper/virtual dev */
        if ((x->xso.dev != dev) && (x->xso.real_dev == dev))
                return skb;
@@ -229,6 +239,7 @@ int xfrm_dev_state_add(struct net *net, struct xfrm_state *x,
        struct xfrm_dev_offload *xso = &x->xso;
        xfrm_address_t *saddr;
        xfrm_address_t *daddr;
+       bool is_packet_offload;
 
        if (!x->type_offload) {
                NL_SET_ERR_MSG(extack, "Type doesn't support offload");
@@ -241,11 +252,13 @@ int xfrm_dev_state_add(struct net *net, struct xfrm_state *x,
                return -EINVAL;
        }
 
-       if (xuo->flags & ~(XFRM_OFFLOAD_IPV6 | XFRM_OFFLOAD_INBOUND)) {
+       if (xuo->flags &
+           ~(XFRM_OFFLOAD_IPV6 | XFRM_OFFLOAD_INBOUND | XFRM_OFFLOAD_PACKET)) {
                NL_SET_ERR_MSG(extack, "Unrecognized flags in offload request");
                return -EINVAL;
        }
 
+       is_packet_offload = xuo->flags & XFRM_OFFLOAD_PACKET;
        dev = dev_get_by_index(net, xuo->ifindex);
        if (!dev) {
                if (!(xuo->flags & XFRM_OFFLOAD_INBOUND)) {
@@ -260,7 +273,7 @@ int xfrm_dev_state_add(struct net *net, struct xfrm_state *x,
                                        x->props.family,
                                        xfrm_smark_get(0, x));
                if (IS_ERR(dst))
-                       return 0;
+                       return (is_packet_offload) ? -EINVAL : 0;
 
                dev = dst->dev;
 
@@ -271,7 +284,7 @@ int xfrm_dev_state_add(struct net *net, struct xfrm_state *x,
        if (!dev->xfrmdev_ops || !dev->xfrmdev_ops->xdo_dev_state_add) {
                xso->dev = NULL;
                dev_put(dev);
-               return 0;
+               return (is_packet_offload) ? -EINVAL : 0;
        }
 
        if (x->props.flags & XFRM_STATE_ESN &&
@@ -291,14 +304,28 @@ int xfrm_dev_state_add(struct net *net, struct xfrm_state *x,
        else
                xso->dir = XFRM_DEV_OFFLOAD_OUT;
 
+       if (is_packet_offload)
+               xso->type = XFRM_DEV_OFFLOAD_PACKET;
+       else
+               xso->type = XFRM_DEV_OFFLOAD_CRYPTO;
+
        err = dev->xfrmdev_ops->xdo_dev_state_add(x);
        if (err) {
                xso->dev = NULL;
                xso->dir = 0;
                xso->real_dev = NULL;
                netdev_put(dev, &xso->dev_tracker);
-
-               if (err != -EOPNOTSUPP) {
+               xso->type = XFRM_DEV_OFFLOAD_UNSPECIFIED;
+
+               /* The user explicitly requested packet offload mode and
+                * configured a policy in addition to the XFRM state, so be
+                * civil and return an error instead of taking the fallback path.
+                *
+                * This WARN_ON() serves as documentation for driver authors:
+                * do not return -EOPNOTSUPP in packet offload mode.
+                */
+               WARN_ON(err == -EOPNOTSUPP && is_packet_offload);
+               if (err != -EOPNOTSUPP || is_packet_offload) {
                        NL_SET_ERR_MSG(extack, "Device failed to offload this state");
                        return err;
                }
@@ -308,6 +335,69 @@ int xfrm_dev_state_add(struct net *net, struct xfrm_state *x,
 }
 EXPORT_SYMBOL_GPL(xfrm_dev_state_add);
 
+int xfrm_dev_policy_add(struct net *net, struct xfrm_policy *xp,
+                       struct xfrm_user_offload *xuo, u8 dir,
+                       struct netlink_ext_ack *extack)
+{
+       struct xfrm_dev_offload *xdo = &xp->xdo;
+       struct net_device *dev;
+       int err;
+
+       if (!xuo->flags || xuo->flags & ~XFRM_OFFLOAD_PACKET) {
+               /* Only packet offload mode is supported, which means
+                * the user must set the XFRM_OFFLOAD_PACKET bit.
+                */
+               NL_SET_ERR_MSG(extack, "Unrecognized flags in offload request");
+               return -EINVAL;
+       }
+
+       dev = dev_get_by_index(net, xuo->ifindex);
+       if (!dev)
+               return -EINVAL;
+
+       if (!dev->xfrmdev_ops || !dev->xfrmdev_ops->xdo_dev_policy_add) {
+               xdo->dev = NULL;
+               dev_put(dev);
+               NL_SET_ERR_MSG(extack, "Policy offload is not supported");
+               return -EINVAL;
+       }
+
+       xdo->dev = dev;
+       netdev_tracker_alloc(dev, &xdo->dev_tracker, GFP_ATOMIC);
+       xdo->real_dev = dev;
+       xdo->type = XFRM_DEV_OFFLOAD_PACKET;
+       switch (dir) {
+       case XFRM_POLICY_IN:
+               xdo->dir = XFRM_DEV_OFFLOAD_IN;
+               break;
+       case XFRM_POLICY_OUT:
+               xdo->dir = XFRM_DEV_OFFLOAD_OUT;
+               break;
+       case XFRM_POLICY_FWD:
+               xdo->dir = XFRM_DEV_OFFLOAD_FWD;
+               break;
+       default:
+               xdo->dev = NULL;
+               dev_put(dev);
+               NL_SET_ERR_MSG(extack, "Unrecognized offload direction");
+               return -EINVAL;
+       }
+
+       err = dev->xfrmdev_ops->xdo_dev_policy_add(xp);
+       if (err) {
+               xdo->dev = NULL;
+               xdo->real_dev = NULL;
+               xdo->type = XFRM_DEV_OFFLOAD_UNSPECIFIED;
+               xdo->dir = 0;
+               netdev_put(dev, &xdo->dev_tracker);
+               NL_SET_ERR_MSG(extack, "Device failed to offload this policy");
+               return err;
+       }
+
+       return 0;
+}
+EXPORT_SYMBOL_GPL(xfrm_dev_policy_add);
+
 bool xfrm_dev_offload_ok(struct sk_buff *skb, struct xfrm_state *x)
 {
        int mtu;
@@ -318,8 +408,9 @@ bool xfrm_dev_offload_ok(struct sk_buff *skb, struct xfrm_state *x)
        if (!x->type_offload || x->encap)
                return false;
 
-       if ((!dev || (dev == xfrm_dst_path(dst)->dev)) &&
-           (!xdst->child->xfrm)) {
+       if (x->xso.type == XFRM_DEV_OFFLOAD_PACKET ||
+           ((!dev || (dev == xfrm_dst_path(dst)->dev)) &&
+            !xdst->child->xfrm)) {
                mtu = xfrm_state_mtu(x, xdst->child_mtu_cached);
                if (skb->len <= mtu)
                        goto ok;
@@ -410,8 +501,10 @@ static int xfrm_api_check(struct net_device *dev)
 
 static int xfrm_dev_down(struct net_device *dev)
 {
-       if (dev->features & NETIF_F_HW_ESP)
+       if (dev->features & NETIF_F_HW_ESP) {
                xfrm_dev_state_flush(dev_net(dev), dev, true);
+               xfrm_dev_policy_flush(dev_net(dev), dev, true);
+       }
 
        return NOTIFY_DONE;
 }
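
From userspace, the new mode is requested through the same XFRMA_OFFLOAD_DEV attribute as crypto offload, with the new XFRM_OFFLOAD_PACKET bit set; per xfrm_dev_state_add()/xfrm_dev_policy_add() above, the kernel then fails the request outright rather than silently falling back to software. A hedged sketch of the attribute payload only (struct duplicated locally for self-containment; a real keying daemon would attach it to XFRM_MSG_NEWSA or XFRM_MSG_NEWPOLICY):

#include <stdio.h>
#include <net/if.h>
#include <linux/types.h>

/* Payload of XFRMA_OFFLOAD_DEV, as in include/uapi/linux/xfrm.h
 * extended by this series; copied here so the sketch compiles alone.
 */
struct xfrm_user_offload {
	int	ifindex;
	__u8	flags;
};
#define XFRM_OFFLOAD_INBOUND	2
#define XFRM_OFFLOAD_PACKET	4

int main(void)
{
	struct xfrm_user_offload xuo = {
		.ifindex = if_nametoindex("eth0"),  /* offloading device */
		.flags	 = XFRM_OFFLOAD_PACKET,	    /* TX packet offload */
	};

	if (!xuo.ifindex) {
		perror("if_nametoindex");
		return 1;
	}
	/* For a policy, only XFRM_OFFLOAD_PACKET is accepted; for a state,
	 * XFRM_OFFLOAD_INBOUND may be OR'ed in for the RX direction.
	 */
	printf("ifindex=%d flags=%#x\n", xuo.ifindex, xuo.flags);
	return 0;
}
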
diff --git a/net/xfrm/xfrm_interface_bpf.c b/net/xfrm/xfrm_interface_bpf.c
new file mode 100644 (file)
index 0000000..1ef2162
--- /dev/null
@@ -0,0 +1,115 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Unstable XFRM Helpers for TC-BPF hook
+ *
+ * These are called from SCHED_CLS BPF programs. Note that it is
+ * allowed to break compatibility for these functions since the interface
+ * through which they are exposed to BPF programs is explicitly unstable.
+ */
+
+#include <linux/bpf.h>
+#include <linux/btf_ids.h>
+
+#include <net/dst_metadata.h>
+#include <net/xfrm.h>
+
+/* bpf_xfrm_info - XFRM metadata information
+ *
+ * Members:
+ * @if_id      - XFRM if_id:
+ *                 Transmit: if_id to be used in policy and state lookups
+ *                 Receive: if_id of the state matched for the incoming packet
+ * @link       - Underlying device ifindex:
+ *                 Transmit: used as the underlying device in VRF routing
+ *                 Receive: the device on which the packet had been received
+ */
+struct bpf_xfrm_info {
+       u32 if_id;
+       int link;
+};
+
+__diag_push();
+__diag_ignore_all("-Wmissing-prototypes",
+                 "Global functions as their definitions will be in xfrm_interface BTF");
+
+/* bpf_skb_get_xfrm_info - Get XFRM metadata
+ *
+ * Parameters:
+ * @skb_ctx    - Pointer to ctx (__sk_buff) in TC program
+ *                 Cannot be NULL
+ * @to         - Pointer to memory to which the metadata will be copied
+ *                 Cannot be NULL
+ */
+__used noinline
+int bpf_skb_get_xfrm_info(struct __sk_buff *skb_ctx, struct bpf_xfrm_info *to)
+{
+       struct sk_buff *skb = (struct sk_buff *)skb_ctx;
+       struct xfrm_md_info *info;
+
+       info = skb_xfrm_md_info(skb);
+       if (!info)
+               return -EINVAL;
+
+       to->if_id = info->if_id;
+       to->link = info->link;
+       return 0;
+}
+
+/* bpf_skb_set_xfrm_info - Set XFRM metadata
+ *
+ * Parameters:
+ * @skb_ctx    - Pointer to ctx (__sk_buff) in TC program
+ *                 Cannot be NULL
+ * @from       - Pointer to memory from which the metadata will be copied
+ *                 Cannot be NULL
+ */
+__used noinline
+int bpf_skb_set_xfrm_info(struct __sk_buff *skb_ctx,
+                         const struct bpf_xfrm_info *from)
+{
+       struct sk_buff *skb = (struct sk_buff *)skb_ctx;
+       struct metadata_dst *md_dst;
+       struct xfrm_md_info *info;
+
+       if (unlikely(skb_metadata_dst(skb)))
+               return -EINVAL;
+
+       if (!xfrm_bpf_md_dst) {
+               struct metadata_dst __percpu *tmp;
+
+               tmp = metadata_dst_alloc_percpu(0, METADATA_XFRM, GFP_ATOMIC);
+               if (!tmp)
+                       return -ENOMEM;
+               if (cmpxchg(&xfrm_bpf_md_dst, NULL, tmp))
+                       metadata_dst_free_percpu(tmp);
+       }
+       md_dst = this_cpu_ptr(xfrm_bpf_md_dst);
+
+       info = &md_dst->u.xfrm_info;
+
+       info->if_id = from->if_id;
+       info->link = from->link;
+       skb_dst_force(skb);
+       info->dst_orig = skb_dst(skb);
+
+       dst_hold((struct dst_entry *)md_dst);
+       skb_dst_set(skb, (struct dst_entry *)md_dst);
+       return 0;
+}
+
+__diag_pop()
+
+BTF_SET8_START(xfrm_ifc_kfunc_set)
+BTF_ID_FLAGS(func, bpf_skb_get_xfrm_info)
+BTF_ID_FLAGS(func, bpf_skb_set_xfrm_info)
+BTF_SET8_END(xfrm_ifc_kfunc_set)
+
+static const struct btf_kfunc_id_set xfrm_interface_kfunc_set = {
+       .owner = THIS_MODULE,
+       .set   = &xfrm_ifc_kfunc_set,
+};
+
+int __init register_xfrm_interface_bpf(void)
+{
+       return register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS,
+                                        &xfrm_interface_kfunc_set);
+}
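
On the consumer side, these kfuncs are declared with __ksym and resolved against kernel BTF at load time. A trimmed-down SCHED_CLS program in the spirit of the xfrm_info selftest added elsewhere in this series; treat it as a sketch rather than the selftest itself:

#include <linux/bpf.h>
#include <linux/pkt_cls.h>
#include <bpf/bpf_helpers.h>

/* Mirrors the struct defined in xfrm_interface_bpf.c above. */
struct bpf_xfrm_info {
	__u32 if_id;
	int link;
};

extern int bpf_skb_get_xfrm_info(struct __sk_buff *skb_ctx,
				 struct bpf_xfrm_info *to) __ksym;
extern int bpf_skb_set_xfrm_info(struct __sk_buff *skb_ctx,
				 const struct bpf_xfrm_info *from) __ksym;

SEC("tc")
int set_xfrm_if_id(struct __sk_buff *skb)
{
	struct bpf_xfrm_info info = { .if_id = 42 };

	/* Steer this packet to the xfrm policy/state with if_id 42. */
	if (bpf_skb_set_xfrm_info(skb, &info))
		return TC_ACT_SHOT;
	return TC_ACT_OK;
}

char _license[] SEC("license") = "GPL";
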
similarity index 98%
rename from net/xfrm/xfrm_interface.c
rename to net/xfrm/xfrm_interface_core.c
index 5a67b12..1f99dc4 100644 (file)
@@ -396,6 +396,14 @@ xfrmi_xmit2(struct sk_buff *skb, struct net_device *dev, struct flowi *fl)
 
                if_id = md_info->if_id;
                fl->flowi_oif = md_info->link;
+               if (md_info->dst_orig) {
+                       struct dst_entry *tmp_dst = dst;
+
+                       dst = md_info->dst_orig;
+                       skb_dst_set(skb, dst);
+                       md_info->dst_orig = NULL;
+                       dst_release(tmp_dst);
+               }
        } else {
                if_id = xi->p.if_id;
        }
@@ -1162,12 +1170,18 @@ static int __init xfrmi_init(void)
        if (err < 0)
                goto rtnl_link_failed;
 
+       err = register_xfrm_interface_bpf();
+       if (err < 0)
+               goto kfunc_failed;
+
        lwtunnel_encap_add_ops(&xfrmi_encap_ops, LWTUNNEL_ENCAP_XFRM);
 
        xfrm_if_register_cb(&xfrm_if_cb);
 
        return err;
 
+kfunc_failed:
+       rtnl_link_unregister(&xfrmi_link_ops);
 rtnl_link_failed:
        xfrmi6_fini();
 xfrmi6_failed:
index 78cb8d0..ff114d6 100644 (file)
@@ -492,7 +492,7 @@ static int xfrm_output_one(struct sk_buff *skb, int err)
        struct xfrm_state *x = dst->xfrm;
        struct net *net = xs_net(x);
 
-       if (err <= 0)
+       if (err <= 0 || x->xso.type == XFRM_DEV_OFFLOAD_PACKET)
                goto resume;
 
        do {
@@ -717,6 +717,16 @@ int xfrm_output(struct sock *sk, struct sk_buff *skb)
                break;
        }
 
+       if (x->xso.type == XFRM_DEV_OFFLOAD_PACKET) {
+               if (!xfrm_dev_offload_ok(skb, x)) {
+                       XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTERROR);
+                       kfree_skb(skb);
+                       return -EHOSTUNREACH;
+               }
+
+               return xfrm_output_resume(sk, skb, 0);
+       }
+
        secpath_reset(skb);
 
        if (xfrm_dev_offload_ok(skb, x)) {
index 9b9e276..e9eb82c 100644 (file)
@@ -425,6 +425,7 @@ void xfrm_policy_destroy(struct xfrm_policy *policy)
        if (del_timer(&policy->timer) || del_timer(&policy->polq.hold_timer))
                BUG();
 
+       xfrm_dev_policy_free(policy);
        call_rcu(&policy->rcu, xfrm_policy_destroy_rcu);
 }
 EXPORT_SYMBOL(xfrm_policy_destroy);
@@ -535,7 +536,7 @@ redo:
                __get_hash_thresh(net, pol->family, dir, &dbits, &sbits);
                h = __addr_hash(&pol->selector.daddr, &pol->selector.saddr,
                                pol->family, nhashmask, dbits, sbits);
-               if (!entry0) {
+               if (!entry0 || pol->xdo.type == XFRM_DEV_OFFLOAD_PACKET) {
                        hlist_del_rcu(&pol->bydst);
                        hlist_add_head_rcu(&pol->bydst, ndsttable + h);
                        h0 = h;
@@ -866,7 +867,7 @@ static void xfrm_policy_inexact_list_reinsert(struct net *net,
                                break;
                }
 
-               if (newpos)
+               if (newpos && policy->xdo.type != XFRM_DEV_OFFLOAD_PACKET)
                        hlist_add_behind_rcu(&policy->bydst, newpos);
                else
                        hlist_add_head_rcu(&policy->bydst, &n->hhead);
@@ -1347,7 +1348,7 @@ static void xfrm_hash_rebuild(struct work_struct *work)
                        else
                                break;
                }
-               if (newpos)
+               if (newpos && policy->xdo.type != XFRM_DEV_OFFLOAD_PACKET)
                        hlist_add_behind_rcu(&policy->bydst, newpos);
                else
                        hlist_add_head_rcu(&policy->bydst, chain);
@@ -1524,7 +1525,7 @@ static void xfrm_policy_insert_inexact_list(struct hlist_head *chain,
                        break;
        }
 
-       if (newpos)
+       if (newpos && policy->xdo.type != XFRM_DEV_OFFLOAD_PACKET)
                hlist_add_behind_rcu(&policy->bydst_inexact_list, newpos);
        else
                hlist_add_head_rcu(&policy->bydst_inexact_list, chain);
@@ -1561,9 +1562,12 @@ static struct xfrm_policy *xfrm_policy_insert_list(struct hlist_head *chain,
                        break;
        }
 
-       if (newpos)
+       if (newpos && policy->xdo.type != XFRM_DEV_OFFLOAD_PACKET)
                hlist_add_behind_rcu(&policy->bydst, &newpos->bydst);
        else
+               /* Packet offload policies are inserted at the head
+                * to speed up lookups.
+                */
                hlist_add_head_rcu(&policy->bydst, chain);
 
        return delpol;
@@ -1769,12 +1773,41 @@ xfrm_policy_flush_secctx_check(struct net *net, u8 type, bool task_valid)
        }
        return err;
 }
+
+static inline int xfrm_dev_policy_flush_secctx_check(struct net *net,
+                                                    struct net_device *dev,
+                                                    bool task_valid)
+{
+       struct xfrm_policy *pol;
+       int err = 0;
+
+       list_for_each_entry(pol, &net->xfrm.policy_all, walk.all) {
+               if (pol->walk.dead ||
+                   xfrm_policy_id2dir(pol->index) >= XFRM_POLICY_MAX ||
+                   pol->xdo.dev != dev)
+                       continue;
+
+               err = security_xfrm_policy_delete(pol->security);
+               if (err) {
+                       xfrm_audit_policy_delete(pol, 0, task_valid);
+                       return err;
+               }
+       }
+       return err;
+}
 #else
 static inline int
 xfrm_policy_flush_secctx_check(struct net *net, u8 type, bool task_valid)
 {
        return 0;
 }
+
+static inline int xfrm_dev_policy_flush_secctx_check(struct net *net,
+                                                    struct net_device *dev,
+                                                    bool task_valid)
+{
+       return 0;
+}
 #endif
 
 int xfrm_policy_flush(struct net *net, u8 type, bool task_valid)
@@ -1814,6 +1847,44 @@ out:
 }
 EXPORT_SYMBOL(xfrm_policy_flush);
 
+int xfrm_dev_policy_flush(struct net *net, struct net_device *dev,
+                         bool task_valid)
+{
+       int dir, err = 0, cnt = 0;
+       struct xfrm_policy *pol;
+
+       spin_lock_bh(&net->xfrm.xfrm_policy_lock);
+
+       err = xfrm_dev_policy_flush_secctx_check(net, dev, task_valid);
+       if (err)
+               goto out;
+
+again:
+       list_for_each_entry(pol, &net->xfrm.policy_all, walk.all) {
+               dir = xfrm_policy_id2dir(pol->index);
+               if (pol->walk.dead ||
+                   dir >= XFRM_POLICY_MAX ||
+                   pol->xdo.dev != dev)
+                       continue;
+
+               __xfrm_policy_unlink(pol, dir);
+               spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
+               cnt++;
+               xfrm_audit_policy_delete(pol, 1, task_valid);
+               xfrm_policy_kill(pol);
+               spin_lock_bh(&net->xfrm.xfrm_policy_lock);
+               goto again;
+       }
+       if (cnt)
+               __xfrm_policy_inexact_flush(net);
+       else
+               err = -ESRCH;
+out:
+       spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
+       return err;
+}
+EXPORT_SYMBOL(xfrm_dev_policy_flush);
+
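
The flush loop above uses a classic pattern worth spelling out: each policy is unlinked under the spinlock, but xfrm_policy_kill() must run with the lock dropped, and since the list can change in that window the walk restarts from the head (goto again). A user-space analogue, assuming a hypothetical singly-linked list:

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct pol { struct pol *next; int dev; };

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static struct pol *all;

/* Must run without the lock held, like xfrm_policy_kill(). */
static void kill_pol(struct pol *p) { free(p); }

static int flush_dev(int dev)
{
	struct pol *p, **pp;
	int cnt = 0;

	pthread_mutex_lock(&lock);
again:
	for (pp = &all; (p = *pp); pp = &p->next) {
		if (p->dev != dev)
			continue;
		*pp = p->next;			/* unlink under the lock */
		pthread_mutex_unlock(&lock);
		kill_pol(p);			/* slow work, lock dropped */
		cnt++;
		pthread_mutex_lock(&lock);
		goto again;	/* list may have changed: restart the walk */
	}
	pthread_mutex_unlock(&lock);
	return cnt ? 0 : -1;			/* -ESRCH analogue */
}

int main(void)
{
	for (int i = 0; i < 4; i++) {
		struct pol *p = malloc(sizeof(*p));

		p->dev = i & 1;
		p->next = all;
		all = p;
	}
	printf("flushed dev 1: %d\n", flush_dev(1));
	return 0;
}
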
 int xfrm_policy_walk(struct net *net, struct xfrm_policy_walk *walk,
                     int (*func)(struct xfrm_policy *, int, int, void*),
                     void *data)
@@ -2113,6 +2184,9 @@ static struct xfrm_policy *xfrm_policy_lookup_bytype(struct net *net, u8 type,
                        break;
                }
        }
+       if (ret && ret->xdo.type == XFRM_DEV_OFFLOAD_PACKET)
+               goto skip_inexact;
+
        bin = xfrm_policy_inexact_lookup_rcu(net, type, family, dir, if_id);
        if (!bin || !xfrm_policy_find_inexact_candidates(&cand, bin, saddr,
                                                         daddr))
@@ -2245,6 +2319,7 @@ int xfrm_policy_delete(struct xfrm_policy *pol, int dir)
        pol = __xfrm_policy_unlink(pol, dir);
        spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
        if (pol) {
+               xfrm_dev_policy_delete(pol);
                xfrm_policy_kill(pol);
                return 0;
        }
index 9ec481f..cc1d0ea 100644 (file)
@@ -84,6 +84,25 @@ static unsigned int xfrm_seq_hash(struct net *net, u32 seq)
        return __xfrm_seq_hash(seq, net->xfrm.state_hmask);
 }
 
+#define XFRM_STATE_INSERT(by, _n, _h, _type)                               \
+       {                                                                  \
+               struct xfrm_state *_x = NULL;                              \
+                                                                          \
+               if (_type != XFRM_DEV_OFFLOAD_PACKET) {                    \
+                       hlist_for_each_entry_rcu(_x, _h, by) {             \
+                               if (_x->xso.type == XFRM_DEV_OFFLOAD_PACKET) \
+                                       continue;                          \
+                               break;                                     \
+                       }                                                  \
+               }                                                          \
+                                                                          \
+               if (!_x || _x->xso.type == XFRM_DEV_OFFLOAD_PACKET)        \
+                       /* SAD is empty or consists of HW SAs only */    \
+                       hlist_add_head_rcu(_n, _h);                        \
+               else                                                       \
+                       hlist_add_before_rcu(_n, &_x->by);                 \
+       }
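
The macro's effect is an ordering invariant rather than an algorithm: packet-offload (HW) states stay clustered at the head of each hash chain, and a SW state is inserted just before the first existing SW entry, which is what lets the lookup paths further down break at the first non-HW entry. A toy model of the invariant (illustrative names, plain pointers instead of RCU hlists):

#include <stdio.h>
#include <stdbool.h>

struct state { struct state *next; bool hw; const char *name; };

static void insert(struct state **head, struct state *n)
{
	struct state **pp = head;

	if (!n->hw)			/* SW entry: skip the HW prefix */
		while (*pp && (*pp)->hw)
			pp = &(*pp)->next;
	n->next = *pp;			/* HW entry: straight to the head */
	*pp = n;
}

int main(void)
{
	struct state sw1 = { .hw = false, .name = "sw1" };
	struct state hw1 = { .hw = true,  .name = "hw1" };
	struct state sw2 = { .hw = false, .name = "sw2" };
	struct state *head = NULL, *s;

	insert(&head, &sw1);
	insert(&head, &hw1);
	insert(&head, &sw2);
	for (s = head; s; s = s->next)
		printf("%s ", s->name);	/* hw1 sw2 sw1: HW prefix intact */
	printf("\n");
	return 0;
}
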
+
 static void xfrm_hash_transfer(struct hlist_head *list,
                               struct hlist_head *ndsttable,
                               struct hlist_head *nsrctable,
@@ -100,23 +119,25 @@ static void xfrm_hash_transfer(struct hlist_head *list,
                h = __xfrm_dst_hash(&x->id.daddr, &x->props.saddr,
                                    x->props.reqid, x->props.family,
                                    nhashmask);
-               hlist_add_head_rcu(&x->bydst, ndsttable + h);
+               XFRM_STATE_INSERT(bydst, &x->bydst, ndsttable + h, x->xso.type);
 
                h = __xfrm_src_hash(&x->id.daddr, &x->props.saddr,
                                    x->props.family,
                                    nhashmask);
-               hlist_add_head_rcu(&x->bysrc, nsrctable + h);
+               XFRM_STATE_INSERT(bysrc, &x->bysrc, nsrctable + h, x->xso.type);
 
                if (x->id.spi) {
                        h = __xfrm_spi_hash(&x->id.daddr, x->id.spi,
                                            x->id.proto, x->props.family,
                                            nhashmask);
-                       hlist_add_head_rcu(&x->byspi, nspitable + h);
+                       XFRM_STATE_INSERT(byspi, &x->byspi, nspitable + h,
+                                         x->xso.type);
                }
 
                if (x->km.seq) {
                        h = __xfrm_seq_hash(x->km.seq, nhashmask);
-                       hlist_add_head_rcu(&x->byseq, nseqtable + h);
+                       XFRM_STATE_INSERT(byseq, &x->byseq, nseqtable + h,
+                                         x->xso.type);
                }
        }
 }
@@ -549,6 +570,8 @@ static enum hrtimer_restart xfrm_timer_handler(struct hrtimer *me)
        int err = 0;
 
        spin_lock(&x->lock);
+       xfrm_dev_state_update_curlft(x);
+
        if (x->km.state == XFRM_STATE_DEAD)
                goto out;
        if (x->km.state == XFRM_STATE_EXPIRED)
@@ -951,6 +974,49 @@ xfrm_init_tempstate(struct xfrm_state *x, const struct flowi *fl,
        x->props.family = tmpl->encap_family;
 }
 
+static struct xfrm_state *__xfrm_state_lookup_all(struct net *net, u32 mark,
+                                                 const xfrm_address_t *daddr,
+                                                 __be32 spi, u8 proto,
+                                                 unsigned short family,
+                                                 struct xfrm_dev_offload *xdo)
+{
+       unsigned int h = xfrm_spi_hash(net, daddr, spi, proto, family);
+       struct xfrm_state *x;
+
+       hlist_for_each_entry_rcu(x, net->xfrm.state_byspi + h, byspi) {
+#ifdef CONFIG_XFRM_OFFLOAD
+               if (xdo->type == XFRM_DEV_OFFLOAD_PACKET) {
+                       if (x->xso.type != XFRM_DEV_OFFLOAD_PACKET)
+                               /* HW states are at the head of the list;
+                                * there is no need to iterate further.
+                                */
+                               break;
+
+                       /* Packet offload: the policy and the SA must
+                        * be on the same device.
+                        */
+                       if (xdo->dev != x->xso.dev)
+                               continue;
+               } else if (x->xso.type == XFRM_DEV_OFFLOAD_PACKET)
+                       /* Skip HW states for SW lookups */
+                       continue;
+#endif
+               if (x->props.family != family ||
+                   x->id.spi       != spi ||
+                   x->id.proto     != proto ||
+                   !xfrm_addr_equal(&x->id.daddr, daddr, family))
+                       continue;
+
+               if ((mark & x->mark.m) != x->mark.v)
+                       continue;
+               if (!xfrm_state_hold_rcu(x))
+                       continue;
+               return x;
+       }
+
+       return NULL;
+}
+
 static struct xfrm_state *__xfrm_state_lookup(struct net *net, u32 mark,
                                              const xfrm_address_t *daddr,
                                              __be32 spi, u8 proto,
@@ -1092,6 +1158,23 @@ xfrm_state_find(const xfrm_address_t *daddr, const xfrm_address_t *saddr,
        rcu_read_lock();
        h = xfrm_dst_hash(net, daddr, saddr, tmpl->reqid, encap_family);
        hlist_for_each_entry_rcu(x, net->xfrm.state_bydst + h, bydst) {
+#ifdef CONFIG_XFRM_OFFLOAD
+               if (pol->xdo.type == XFRM_DEV_OFFLOAD_PACKET) {
+                       if (x->xso.type != XFRM_DEV_OFFLOAD_PACKET)
+                               /* HW states are at the head of the list;
+                                * there is no need to iterate further.
+                                */
+                               break;
+
+                       /* Packet offload: the policy and the SA must
+                        * be on the same device.
+                        */
+                       if (pol->xdo.dev != x->xso.dev)
+                               continue;
+               } else if (x->xso.type == XFRM_DEV_OFFLOAD_PACKET)
+                       /* Skip HW states for SW lookups */
+                       continue;
+#endif
                if (x->props.family == encap_family &&
                    x->props.reqid == tmpl->reqid &&
                    (mark & x->mark.m) == x->mark.v &&
@@ -1109,6 +1192,23 @@ xfrm_state_find(const xfrm_address_t *daddr, const xfrm_address_t *saddr,
 
        h_wildcard = xfrm_dst_hash(net, daddr, &saddr_wildcard, tmpl->reqid, encap_family);
        hlist_for_each_entry_rcu(x, net->xfrm.state_bydst + h_wildcard, bydst) {
+#ifdef CONFIG_XFRM_OFFLOAD
+               if (pol->xdo.type == XFRM_DEV_OFFLOAD_PACKET) {
+                       if (x->xso.type != XFRM_DEV_OFFLOAD_PACKET)
+                               /* HW states are at the head of the list;
+                                * there is no need to iterate further.
+                                */
+                               break;
+
+                       /* Packet offload: the policy and the SA must
+                        * be on the same device.
+                        */
+                       if (pol->xdo.dev != x->xso.dev)
+                               continue;
+               } else if (x->xso.type == XFRM_DEV_OFFLOAD_PACKET)
+                       /* Skip HW states for SW lookups */
+                       continue;
+#endif
                if (x->props.family == encap_family &&
                    x->props.reqid == tmpl->reqid &&
                    (mark & x->mark.m) == x->mark.v &&
@@ -1126,8 +1226,10 @@ found:
        x = best;
        if (!x && !error && !acquire_in_progress) {
                if (tmpl->id.spi &&
-                   (x0 = __xfrm_state_lookup(net, mark, daddr, tmpl->id.spi,
-                                             tmpl->id.proto, encap_family)) != NULL) {
+                   (x0 = __xfrm_state_lookup_all(net, mark, daddr,
+                                                 tmpl->id.spi, tmpl->id.proto,
+                                                 encap_family,
+                                                 &pol->xdo)) != NULL) {
                        to_put = x0;
                        error = -EEXIST;
                        goto out;
@@ -1161,21 +1263,53 @@ found:
                        x = NULL;
                        goto out;
                }
-
+#ifdef CONFIG_XFRM_OFFLOAD
+               if (pol->xdo.type == XFRM_DEV_OFFLOAD_PACKET) {
+                       struct xfrm_dev_offload *xdo = &pol->xdo;
+                       struct xfrm_dev_offload *xso = &x->xso;
+
+                       xso->type = XFRM_DEV_OFFLOAD_PACKET;
+                       xso->dir = xdo->dir;
+                       xso->dev = xdo->dev;
+                       xso->real_dev = xdo->real_dev;
+                       netdev_tracker_alloc(xso->dev, &xso->dev_tracker,
+                                            GFP_ATOMIC);
+                       error = xso->dev->xfrmdev_ops->xdo_dev_state_add(x);
+                       if (error) {
+                               xso->dir = 0;
+                               netdev_put(xso->dev, &xso->dev_tracker);
+                               xso->dev = NULL;
+                               xso->real_dev = NULL;
+                               xso->type = XFRM_DEV_OFFLOAD_UNSPECIFIED;
+                               x->km.state = XFRM_STATE_DEAD;
+                               to_put = x;
+                               x = NULL;
+                               goto out;
+                       }
+               }
+#endif
                if (km_query(x, tmpl, pol) == 0) {
                        spin_lock_bh(&net->xfrm.xfrm_state_lock);
                        x->km.state = XFRM_STATE_ACQ;
                        list_add(&x->km.all, &net->xfrm.state_all);
-                       hlist_add_head_rcu(&x->bydst, net->xfrm.state_bydst + h);
+                       XFRM_STATE_INSERT(bydst, &x->bydst,
+                                         net->xfrm.state_bydst + h,
+                                         x->xso.type);
                        h = xfrm_src_hash(net, daddr, saddr, encap_family);
-                       hlist_add_head_rcu(&x->bysrc, net->xfrm.state_bysrc + h);
+                       XFRM_STATE_INSERT(bysrc, &x->bysrc,
+                                         net->xfrm.state_bysrc + h,
+                                         x->xso.type);
                        if (x->id.spi) {
                                h = xfrm_spi_hash(net, &x->id.daddr, x->id.spi, x->id.proto, encap_family);
-                               hlist_add_head_rcu(&x->byspi, net->xfrm.state_byspi + h);
+                               XFRM_STATE_INSERT(byspi, &x->byspi,
+                                                 net->xfrm.state_byspi + h,
+                                                 x->xso.type);
                        }
                        if (x->km.seq) {
                                h = xfrm_seq_hash(net, x->km.seq);
-                               hlist_add_head_rcu(&x->byseq, net->xfrm.state_byseq + h);
+                               XFRM_STATE_INSERT(byseq, &x->byseq,
+                                                 net->xfrm.state_byseq + h,
+                                                 x->xso.type);
                        }
                        x->lft.hard_add_expires_seconds = net->xfrm.sysctl_acq_expires;
                        hrtimer_start(&x->mtimer,
@@ -1185,6 +1319,18 @@ found:
                        xfrm_hash_grow_check(net, x->bydst.next != NULL);
                        spin_unlock_bh(&net->xfrm.xfrm_state_lock);
                } else {
+#ifdef CONFIG_XFRM_OFFLOAD
+                       struct xfrm_dev_offload *xso = &x->xso;
+
+                       if (xso->type == XFRM_DEV_OFFLOAD_PACKET) {
+                               xso->dev->xfrmdev_ops->xdo_dev_state_delete(x);
+                               xso->dir = 0;
+                               netdev_put(xso->dev, &xso->dev_tracker);
+                               xso->dev = NULL;
+                               xso->real_dev = NULL;
+                               xso->type = XFRM_DEV_OFFLOAD_UNSPECIFIED;
+                       }
+#endif
                        x->km.state = XFRM_STATE_DEAD;
                        to_put = x;
                        x = NULL;
@@ -1280,22 +1426,26 @@ static void __xfrm_state_insert(struct xfrm_state *x)
 
        h = xfrm_dst_hash(net, &x->id.daddr, &x->props.saddr,
                          x->props.reqid, x->props.family);
-       hlist_add_head_rcu(&x->bydst, net->xfrm.state_bydst + h);
+       XFRM_STATE_INSERT(bydst, &x->bydst, net->xfrm.state_bydst + h,
+                         x->xso.type);
 
        h = xfrm_src_hash(net, &x->id.daddr, &x->props.saddr, x->props.family);
-       hlist_add_head_rcu(&x->bysrc, net->xfrm.state_bysrc + h);
+       XFRM_STATE_INSERT(bysrc, &x->bysrc, net->xfrm.state_bysrc + h,
+                         x->xso.type);
 
        if (x->id.spi) {
                h = xfrm_spi_hash(net, &x->id.daddr, x->id.spi, x->id.proto,
                                  x->props.family);
 
-               hlist_add_head_rcu(&x->byspi, net->xfrm.state_byspi + h);
+               XFRM_STATE_INSERT(byspi, &x->byspi, net->xfrm.state_byspi + h,
+                                 x->xso.type);
        }
 
        if (x->km.seq) {
                h = xfrm_seq_hash(net, x->km.seq);
 
-               hlist_add_head_rcu(&x->byseq, net->xfrm.state_byseq + h);
+               XFRM_STATE_INSERT(byseq, &x->byseq, net->xfrm.state_byseq + h,
+                                 x->xso.type);
        }
 
        hrtimer_start(&x->mtimer, ktime_set(1, 0), HRTIMER_MODE_REL_SOFT);
@@ -1409,9 +1559,11 @@ static struct xfrm_state *__find_acq_core(struct net *net,
                              ktime_set(net->xfrm.sysctl_acq_expires, 0),
                              HRTIMER_MODE_REL_SOFT);
                list_add(&x->km.all, &net->xfrm.state_all);
-               hlist_add_head_rcu(&x->bydst, net->xfrm.state_bydst + h);
+               XFRM_STATE_INSERT(bydst, &x->bydst, net->xfrm.state_bydst + h,
+                                 x->xso.type);
                h = xfrm_src_hash(net, daddr, saddr, family);
-               hlist_add_head_rcu(&x->bysrc, net->xfrm.state_bysrc + h);
+               XFRM_STATE_INSERT(bysrc, &x->bysrc, net->xfrm.state_bysrc + h,
+                                 x->xso.type);
 
                net->xfrm.state_num++;
 
@@ -1786,6 +1938,8 @@ EXPORT_SYMBOL(xfrm_state_update);
 
 int xfrm_state_check_expire(struct xfrm_state *x)
 {
+       xfrm_dev_state_update_curlft(x);
+
        if (!x->curlft.use_time)
                x->curlft.use_time = ktime_get_real_seconds();
 
@@ -2094,7 +2248,8 @@ int xfrm_alloc_spi(struct xfrm_state *x, u32 low, u32 high,
                spin_lock_bh(&net->xfrm.xfrm_state_lock);
                x->id.spi = newspi;
                h = xfrm_spi_hash(net, &x->id.daddr, x->id.spi, x->id.proto, x->props.family);
-               hlist_add_head_rcu(&x->byspi, net->xfrm.state_byspi + h);
+               XFRM_STATE_INSERT(byspi, &x->byspi, net->xfrm.state_byspi + h,
+                                 x->xso.type);
                spin_unlock_bh(&net->xfrm.xfrm_state_lock);
 
                err = 0;
index 0eb4696..cf5172d 100644 (file)
@@ -956,6 +956,8 @@ static int copy_user_offload(struct xfrm_dev_offload *xso, struct sk_buff *skb)
        xuo->ifindex = xso->dev->ifindex;
        if (xso->dir == XFRM_DEV_OFFLOAD_IN)
                xuo->flags = XFRM_OFFLOAD_INBOUND;
+       if (xso->type == XFRM_DEV_OFFLOAD_PACKET)
+               xuo->flags |= XFRM_OFFLOAD_PACKET;
 
        return 0;
 }
@@ -1890,6 +1892,15 @@ static struct xfrm_policy *xfrm_policy_construct(struct net *net,
        if (attrs[XFRMA_IF_ID])
                xp->if_id = nla_get_u32(attrs[XFRMA_IF_ID]);
 
+       /* configure the hardware if offload is requested */
+       if (attrs[XFRMA_OFFLOAD_DEV]) {
+               err = xfrm_dev_policy_add(net, xp,
+                                         nla_data(attrs[XFRMA_OFFLOAD_DEV]),
+                                         p->dir, extack);
+               if (err)
+                       goto error;
+       }
+
        return xp;
  error:
        *errp = err;
@@ -1929,6 +1940,7 @@ static int xfrm_add_policy(struct sk_buff *skb, struct nlmsghdr *nlh,
        xfrm_audit_policy_add(xp, err ? 0 : 1, true);
 
        if (err) {
+               xfrm_dev_policy_delete(xp);
                security_xfrm_policy_free(xp->security);
                kfree(xp);
                return err;
@@ -2041,6 +2053,8 @@ static int dump_one_policy(struct xfrm_policy *xp, int dir, int count, void *ptr
                err = xfrm_mark_put(skb, &xp->mark);
        if (!err)
                err = xfrm_if_id_put(skb, xp->if_id);
+       if (!err && xp->xdo.dev)
+               err = copy_user_offload(&xp->xdo, skb);
        if (err) {
                nlmsg_cancel(skb, nlh);
                return err;
@@ -3379,6 +3393,8 @@ static int build_acquire(struct sk_buff *skb, struct xfrm_state *x,
                err = xfrm_mark_put(skb, &xp->mark);
        if (!err)
                err = xfrm_if_id_put(skb, xp->if_id);
+       if (!err && xp->xdo.dev)
+               err = copy_user_offload(&xp->xdo, skb);
        if (err) {
                nlmsg_cancel(skb, nlh);
                return err;
@@ -3497,6 +3513,8 @@ static int build_polexpire(struct sk_buff *skb, struct xfrm_policy *xp,
                err = xfrm_mark_put(skb, &xp->mark);
        if (!err)
                err = xfrm_if_id_put(skb, xp->if_id);
+       if (!err && xp->xdo.dev)
+               err = copy_user_offload(&xp->xdo, skb);
        if (err) {
                nlmsg_cancel(skb, nlh);
                return err;
@@ -3580,6 +3598,8 @@ static int xfrm_notify_policy(struct xfrm_policy *xp, int dir, const struct km_e
                err = xfrm_mark_put(skb, &xp->mark);
        if (!err)
                err = xfrm_if_id_put(skb, xp->if_id);
+       if (!err && xp->xdo.dev)
+               err = copy_user_offload(&xp->xdo, skb);
        if (err)
                goto out_free_skb;
 
index fdb0aff..e8d9082 100755 (executable)
@@ -752,6 +752,7 @@ class PrinterHelpers(Printer):
             'struct bpf_timer',
             'struct mptcp_sock',
             'struct bpf_dynptr',
+            'const struct bpf_dynptr',
             'struct iphdr',
             'struct ipv6hdr',
     }
index f99e000..4c677c8 100644 (file)
@@ -59,7 +59,7 @@ int snd_dice_stream_get_rate_mode(struct snd_dice *dice, unsigned int rate,
 
 static int select_clock(struct snd_dice *dice, unsigned int rate)
 {
-       __be32 reg;
+       __be32 reg, new;
        u32 data;
        int i;
        int err;
@@ -83,15 +83,17 @@ static int select_clock(struct snd_dice *dice, unsigned int rate)
        if (completion_done(&dice->clock_accepted))
                reinit_completion(&dice->clock_accepted);
 
-       reg = cpu_to_be32(data);
+       new = cpu_to_be32(data);
        err = snd_dice_transaction_write_global(dice, GLOBAL_CLOCK_SELECT,
-                                               &reg, sizeof(reg));
+                                               &new, sizeof(new));
        if (err < 0)
                return err;
 
        if (wait_for_completion_timeout(&dice->clock_accepted,
-                       msecs_to_jiffies(NOTIFICATION_TIMEOUT_MS)) == 0)
-               return -ETIMEDOUT;
+                       msecs_to_jiffies(NOTIFICATION_TIMEOUT_MS)) == 0) {
+               if (reg != new)
+                       return -ETIMEDOUT;
+       }
 
        return 0;
 }
index 51721ed..e88d9ff 100644 (file)
@@ -143,7 +143,7 @@ static const struct snd_kcontrol_new cs42l51_snd_controls[] = {
                        0, 0xA0, 96, adc_att_tlv),
        SOC_DOUBLE_R_SX_TLV("PGA Volume",
                        CS42L51_ALC_PGA_CTL, CS42L51_ALC_PGB_CTL,
-                       0, 0x19, 30, pga_tlv),
+                       0, 0x1A, 30, pga_tlv),
        SOC_SINGLE("Playback Deemphasis Switch", CS42L51_DAC_CTL, 3, 1, 0),
        SOC_SINGLE("Auto-Mute Switch", CS42L51_DAC_CTL, 2, 1, 0),
        SOC_SINGLE("Soft Ramp Switch", CS42L51_DAC_CTL, 1, 1, 0),
index a969547..52bb557 100644 (file)
@@ -14,6 +14,7 @@
 
 #include <dt-bindings/sound/tlv320adc3xxx.h>
 #include <linux/clk.h>
+#include <linux/gpio/consumer.h>
 #include <linux/module.h>
 #include <linux/moduleparam.h>
 #include <linux/io.h>
@@ -1025,7 +1026,9 @@ static const struct gpio_chip adc3xxx_gpio_chip = {
 
 static void adc3xxx_free_gpio(struct adc3xxx *adc3xxx)
 {
+#ifdef CONFIG_GPIOLIB
        gpiochip_remove(&adc3xxx->gpio_chip);
+#endif
 }
 
 static void adc3xxx_init_gpio(struct adc3xxx *adc3xxx)
index 79ef4e2..4b86ef8 100644 (file)
@@ -194,6 +194,25 @@ static int fsl_micfil_reset(struct device *dev)
        if (ret)
                return ret;
 
+       /*
+        * SRES is a self-clearing bit, but REG_MICFIL_CTRL1 is defined
+        * as a non-volatile register, so SRES remains set in the regmap
+        * cache after it is written. Every later update of REG_MICFIL_CTRL1
+        * would then trigger another software reset, so clear it explicitly.
+        */
+       ret = regmap_clear_bits(micfil->regmap, REG_MICFIL_CTRL1,
+                               MICFIL_CTRL1_SRES);
+       if (ret)
+               return ret;
+
+       /*
+        * Setting SRES should clear the CHnF flags, but even with an added
+        * delay they are sometimes not cleared, so clear CHnF explicitly.
+        */
+       ret = regmap_write_bits(micfil->regmap, REG_MICFIL_STAT, 0xFF, 0xFF);
+       if (ret)
+               return ret;
+
        return 0;
 }
 
index bd88de0..55b009d 100644 (file)
@@ -452,7 +452,7 @@ int snd_soc_put_volsw_sx(struct snd_kcontrol *kcontrol,
        val = ucontrol->value.integer.value[0];
        if (mc->platform_max && val > mc->platform_max)
                return -EINVAL;
-       if (val > max - min)
+       if (val > max)
                return -EINVAL;
        val_mask = mask << shift;
        val = (val + min) & mask;
@@ -464,10 +464,15 @@ int snd_soc_put_volsw_sx(struct snd_kcontrol *kcontrol,
        ret = err;
 
        if (snd_soc_volsw_is_stereo(mc)) {
-               unsigned int val2;
+               unsigned int val2 = ucontrol->value.integer.value[1];
+
+               if (mc->platform_max && val2 > mc->platform_max)
+                       return -EINVAL;
+               if (val2 > max)
+                       return -EINVAL;
 
                val_mask = mask << rshift;
-               val2 = (ucontrol->value.integer.value[1] + min) & mask;
+               val2 = (val2 + min) & mask;
                val2 = val2 << rshift;
 
                err = snd_soc_component_update_bits(component, reg2, val_mask,
index c90b756..6200320 100644 (file)
@@ -501,6 +501,7 @@ static int do_build_table_cb(const char *fpath, const struct stat *sb,
        if (err) {
                p_err("failed to append entry to hashmap for ID %u, path '%s': %s",
                      pinned_info.id, path, strerror(errno));
+               free(path);
                goto out_close;
        }
 
index f89de51..464ca3f 100644 (file)
@@ -5293,7 +5293,7 @@ union bpf_attr {
  *     Return
  *             Nothing. Always succeeds.
  *
- * long bpf_dynptr_read(void *dst, u32 len, struct bpf_dynptr *src, u32 offset, u64 flags)
+ * long bpf_dynptr_read(void *dst, u32 len, const struct bpf_dynptr *src, u32 offset, u64 flags)
  *     Description
  *             Read *len* bytes from *src* into *dst*, starting from *offset*
  *             into *src*.
@@ -5303,7 +5303,7 @@ union bpf_attr {
  *             of *src*'s data, -EINVAL if *src* is an invalid dynptr or if
  *             *flags* is not 0.
  *
- * long bpf_dynptr_write(struct bpf_dynptr *dst, u32 offset, void *src, u32 len, u64 flags)
+ * long bpf_dynptr_write(const struct bpf_dynptr *dst, u32 offset, void *src, u32 len, u64 flags)
  *     Description
  *             Write *len* bytes from *src* into *dst*, starting from *offset*
  *             into *dst*.
@@ -5313,7 +5313,7 @@ union bpf_attr {
  *             of *dst*'s data, -EINVAL if *dst* is an invalid dynptr or if *dst*
  *             is a read-only dynptr or if *flags* is not 0.
  *
- * void *bpf_dynptr_data(struct bpf_dynptr *ptr, u32 offset, u32 len)
+ * void *bpf_dynptr_data(const struct bpf_dynptr *ptr, u32 offset, u32 len)
  *     Description
  *             Get a pointer to the underlying dynptr data.
  *
@@ -5414,7 +5414,7 @@ union bpf_attr {
  *             Drain samples from the specified user ring buffer, and invoke
  *             the provided callback for each such sample:
  *
- *             long (\*callback_fn)(struct bpf_dynptr \*dynptr, void \*ctx);
+ *             long (\*callback_fn)(const struct bpf_dynptr \*dynptr, void \*ctx);
  *
  *             If **callback_fn** returns 0, the helper will continue to try
  *             and drain the next sample, up to a maximum of
index 0242f31..901d98b 100644 (file)
@@ -673,6 +673,7 @@ enum {
        IFLA_XFRM_UNSPEC,
        IFLA_XFRM_LINK,
        IFLA_XFRM_IF_ID,
+       IFLA_XFRM_COLLECT_METADATA,
        __IFLA_XFRM_MAX
 };
 
index 4c904ef..477666f 100644 (file)
@@ -286,3 +286,20 @@ tags:
 
 # Delete partially updated (corrupted) files on error
 .DELETE_ON_ERROR:
+
+help:
+       @echo 'libbpf common targets:'
+       @echo '  HINT: use "V=1" to enable verbose build'
+       @echo '  all     - build libraries and pkgconfig'
+       @echo '  clean   - remove all generated files'
+       @echo '  check   - check abi and version info'
+       @echo ''
+       @echo 'libbpf install targets:'
+       @echo '  HINT: use "prefix"(defaults to "/usr/local") or "DESTDIR" (defaults to "/")'
+       @echo '        to adjust target desitantion, e.g. "make prefix=/usr/local install"'
+       @echo '  install          - build and install all headers, libraries and pkgconfig'
+       @echo '  install_headers  - install only headers to include/bpf'
+       @echo ''
+       @echo 'libbpf make targets:'
+       @echo '  tags    - use ctags to make tag information for source code browsing'
+       @echo '  cscope  - use cscope to make interactive source code browsing database'
index a112e0e..7468978 100644 (file)
@@ -409,8 +409,15 @@ LIBBPF_API int bpf_task_fd_query(int pid, int fd, __u32 flags, char *buf,
                                 __u32 *buf_len, __u32 *prog_id, __u32 *fd_type,
                                 __u64 *probe_offset, __u64 *probe_addr);
 
+#ifdef __cplusplus
+/* Forward-declaring enums in C++ isn't compatible with pure C enums, so
+ * instead declare bpf_enable_stats() as accepting an int as input.
+ */
+LIBBPF_API int bpf_enable_stats(int type);
+#else
 enum bpf_stats_type; /* defined in up-to-date linux/bpf.h */
 LIBBPF_API int bpf_enable_stats(enum bpf_stats_type type);
+#endif
 
 struct bpf_prog_bind_opts {
        size_t sz; /* size of this struct for forward/backward compatibility */
index b8daae2..75b411f 100644 (file)
@@ -1233,6 +1233,14 @@ static int parse_usdt_arg(const char *arg_str, int arg_num, struct usdt_arg_spec
                if (reg_off < 0)
                        return reg_off;
                arg->reg_off = reg_off;
+       } else if (sscanf(arg_str, " %d @ ( %%%15[^)] ) %n", &arg_sz, reg_name, &len) == 2) {
+               /* Memory dereference case without offset, e.g., 8@(%rsp) */
+               arg->arg_type = USDT_ARG_REG_DEREF;
+               arg->val_off = 0;
+               reg_off = calc_pt_regs_off(reg_name);
+               if (reg_off < 0)
+                       return reg_off;
+               arg->reg_off = reg_off;
        } else if (sscanf(arg_str, " %d @ %%%15s %n", &arg_sz, reg_name, &len) == 2) {
                /* Register read case, e.g., -4@%eax */
                arg->arg_type = USDT_ARG_REG;
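
The new branch accepts USDT argument specs like "8@(%rsp)", i.e. a register dereference with no displacement. The scanf format is dense enough to deserve a standalone demonstration: "%%%" matches a literal '%', %15[^)] captures the register name, and the trailing %n (which does not count toward the return value of 2) records how much of the spec was consumed:

#include <stdio.h>

int main(void)
{
	const char *spec = "8@(%rsp)";
	char reg_name[16];
	int arg_sz, len = 0;

	if (sscanf(spec, " %d @ ( %%%15[^)] ) %n",
		   &arg_sz, reg_name, &len) == 2)
		printf("size=%d reg=%s consumed=%d\n", arg_sz, reg_name, len);
	return 0;
}
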
index 8e77515..99cc33c 100644 (file)
@@ -28,6 +28,7 @@ kfree_skb                                        # attach fentry unexpected erro
 kfunc_call/subprog                               # extern (var ksym) 'bpf_prog_active': not found in kernel BTF
 kfunc_call/subprog_lskel                         # skel unexpected error: -2
 kfunc_dynptr_param/dynptr_data_null              # libbpf: prog 'dynptr_data_null': failed to attach: ERROR: strerror_r(-524)=22
+kprobe_multi_bench_attach                        # bpf_program__attach_kprobe_multi_opts unexpected error: -95
 kprobe_multi_test/attach_api_addrs               # bpf_program__attach_kprobe_multi_opts unexpected error: -95
 kprobe_multi_test/attach_api_pattern             # bpf_program__attach_kprobe_multi_opts unexpected error: -95
 kprobe_multi_test/attach_api_syms                # bpf_program__attach_kprobe_multi_opts unexpected error: -95
index 648a8a1..585fcf7 100644 (file)
@@ -29,6 +29,7 @@ htab_update                              # failed to attach: ERROR: strerror_r(-
 kfree_skb                                # attach fentry unexpected error: -524                                        (trampoline)
 kfunc_call                               # 'bpf_prog_active': not found in kernel BTF                                  (?)
 kfunc_dynptr_param                       # JIT does not support calling kernel function                                (kfunc)
+kprobe_multi_bench_attach                # bpf_program__attach_kprobe_multi_opts unexpected error: -95
 kprobe_multi_test                        # relies on fentry
 ksyms_module                             # test_ksyms_module__open_and_load unexpected error: -9                       (?)
 ksyms_module_libbpf                      # JIT does not support calling kernel function                                (kfunc)
@@ -84,3 +85,4 @@ xdp_bonding                              # failed to auto-attach program 'trace_
 xdp_bpf2bpf                              # failed to auto-attach program 'trace_on_entry': -524                        (trampoline)
 xdp_do_redirect                          # prog_run_max_size unexpected error: -22 (errno 22)
 xdp_synproxy                             # JIT does not support calling kernel function                                (kfunc)
+xfrm_info                                # JIT does not support calling kernel function                                (kfunc)
index 6a0f043..c22c43b 100644 (file)
@@ -527,13 +527,15 @@ TRUNNER_BPF_PROGS_DIR := progs
 TRUNNER_EXTRA_SOURCES := test_progs.c cgroup_helpers.c trace_helpers.c \
                         network_helpers.c testing_helpers.c            \
                         btf_helpers.c flow_dissector_load.h            \
-                        cap_helpers.c
+                        cap_helpers.c test_loader.c
 TRUNNER_EXTRA_FILES := $(OUTPUT)/urandom_read $(OUTPUT)/bpf_testmod.ko \
                       $(OUTPUT)/liburandom_read.so                     \
                       $(OUTPUT)/xdp_synproxy                           \
                       $(OUTPUT)/sign-file                              \
-                      ima_setup.sh verify_sig_setup.sh                 \
-                      $(wildcard progs/btf_dump_test_case_*.c)
+                      ima_setup.sh                                     \
+                      verify_sig_setup.sh                              \
+                      $(wildcard progs/btf_dump_test_case_*.c)         \
+                      $(wildcard progs/*.bpf.o)
 TRUNNER_BPF_BUILD_RULE := CLANG_BPF_BUILD_RULE
 TRUNNER_BPF_CFLAGS := $(BPF_CFLAGS) $(CLANG_CFLAGS) -DENABLE_ATOMICS_TESTS
 $(eval $(call DEFINE_TEST_RUNNER,test_progs))
index 8452095..bc4555a 100644 (file)
@@ -2,15 +2,22 @@
 #ifndef __BPF_LEGACY__
 #define __BPF_LEGACY__
 
+#if __GNUC__ && !__clang__
+/* Functions to emit BPF_LD_ABS and BPF_LD_IND instructions.  We
+ * provide the "standard" names as synonyms of the corresponding GCC
+ * builtins. Note how the SKB argument is ignored.
+ */
+#define load_byte(skb, off) __builtin_bpf_load_byte(off)
+#define load_half(skb, off) __builtin_bpf_load_half(off)
+#define load_word(skb, off) __builtin_bpf_load_word(off)
+#else
 /* llvm builtin functions that eBPF C program may use to
  * emit BPF_LD_ABS and BPF_LD_IND instructions
  */
-unsigned long long load_byte(void *skb,
-                            unsigned long long off) asm("llvm.bpf.load.byte");
-unsigned long long load_half(void *skb,
-                            unsigned long long off) asm("llvm.bpf.load.half");
-unsigned long long load_word(void *skb,
-                            unsigned long long off) asm("llvm.bpf.load.word");
+unsigned long long load_byte(void *skb, unsigned long long off) asm("llvm.bpf.load.byte");
+unsigned long long load_half(void *skb, unsigned long long off) asm("llvm.bpf.load.half");
+unsigned long long load_word(void *skb, unsigned long long off) asm("llvm.bpf.load.word");
+#endif
 
 #endif
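
The GCC and LLVM variants above are call-compatible, so consumers keep the classic (skb, offset) shape either way. A minimal sketch of typical use from a socket filter program; the section and program names here are illustrative, not part of this patch:

    #include <linux/bpf.h>
    #include <linux/in.h>
    #include <bpf/bpf_helpers.h>
    #include "bpf_legacy.h"

    SEC("socket")
    int filter_udp(struct __sk_buff *skb)
    {
            /* ETH_HLEN (14) + offsetof(struct iphdr, protocol) (9) == 23 */
            return load_byte(skb, 23) == IPPROTO_UDP;
    }

    char _license[] SEC("license") = "GPL";
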
 
index f9034ea..612f699 100644 (file)
@@ -8,7 +8,7 @@ CONFIG_BPF_LIRC_MODE2=y
 CONFIG_BPF_LSM=y
 CONFIG_BPF_STREAM_PARSER=y
 CONFIG_BPF_SYSCALL=y
-CONFIG_BPF_UNPRIV_DEFAULT_OFF=n
+# CONFIG_BPF_UNPRIV_DEFAULT_OFF is not set
 CONFIG_CGROUP_BPF=y
 CONFIG_CRYPTO_HMAC=y
 CONFIG_CRYPTO_SHA256=y
@@ -23,6 +23,7 @@ CONFIG_IKCONFIG_PROC=y
 CONFIG_IMA=y
 CONFIG_IMA_READ_POLICY=y
 CONFIG_IMA_WRITE_POLICY=y
+CONFIG_INET_ESP=y
 CONFIG_IP_NF_FILTER=y
 CONFIG_IP_NF_RAW=y
 CONFIG_IP_NF_TARGET_SYNPROXY=y
@@ -70,7 +71,8 @@ CONFIG_NF_NAT=y
 CONFIG_RC_CORE=y
 CONFIG_SECURITY=y
 CONFIG_SECURITYFS=y
-CONFIG_TEST_BPF=y
+CONFIG_TEST_BPF=m
 CONFIG_USERFAULTFD=y
 CONFIG_VXLAN=y
 CONFIG_XDP_SOCKETS=y
+CONFIG_XFRM_INTERFACE=y
index 1f37adf..01de331 100644 (file)
@@ -390,49 +390,6 @@ struct nstoken {
        int orig_netns_fd;
 };
 
-static int setns_by_fd(int nsfd)
-{
-       int err;
-
-       err = setns(nsfd, CLONE_NEWNET);
-       close(nsfd);
-
-       if (!ASSERT_OK(err, "setns"))
-               return err;
-
-       /* Switch /sys to the new namespace so that e.g. /sys/class/net
-        * reflects the devices in the new namespace.
-        */
-       err = unshare(CLONE_NEWNS);
-       if (!ASSERT_OK(err, "unshare"))
-               return err;
-
-       /* Make our /sys mount private, so the following umount won't
-        * trigger the global umount in case it's shared.
-        */
-       err = mount("none", "/sys", NULL, MS_PRIVATE, NULL);
-       if (!ASSERT_OK(err, "remount private /sys"))
-               return err;
-
-       err = umount2("/sys", MNT_DETACH);
-       if (!ASSERT_OK(err, "umount2 /sys"))
-               return err;
-
-       err = mount("sysfs", "/sys", "sysfs", 0, NULL);
-       if (!ASSERT_OK(err, "mount /sys"))
-               return err;
-
-       err = mount("bpffs", "/sys/fs/bpf", "bpf", 0, NULL);
-       if (!ASSERT_OK(err, "mount /sys/fs/bpf"))
-               return err;
-
-       err = mount("debugfs", "/sys/kernel/debug", "debugfs", 0, NULL);
-       if (!ASSERT_OK(err, "mount /sys/kernel/debug"))
-               return err;
-
-       return 0;
-}
-
 struct nstoken *open_netns(const char *name)
 {
        int nsfd;
@@ -453,8 +410,9 @@ struct nstoken *open_netns(const char *name)
        if (!ASSERT_GE(nsfd, 0, "open netns fd"))
                goto fail;
 
-       err = setns_by_fd(nsfd);
-       if (!ASSERT_OK(err, "setns_by_fd"))
+       err = setns(nsfd, CLONE_NEWNET);
+       close(nsfd);
+       if (!ASSERT_OK(err, "setns"))
                goto fail;
 
        return token;
@@ -465,6 +423,7 @@ fail:
 
 void close_netns(struct nstoken *token)
 {
-       ASSERT_OK(setns_by_fd(token->orig_netns_fd), "setns_by_fd");
+       ASSERT_OK(setns(token->orig_netns_fd, CLONE_NEWNET), "setns");
+       close(token->orig_netns_fd);
        free(token);
 }
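
With setns_by_fd() folded away, entering a namespace no longer remounts /sys, so a caller only pays for the setns() itself. A caller-side sketch of the simplified pattern (namespace name hypothetical):

    struct nstoken *tok;

    tok = open_netns("testns");     /* switches CLONE_NEWNET only */
    if (!ASSERT_OK_PTR(tok, "open_netns"))
            return;
    /* ... test body runs inside the namespace ... */
    close_netns(tok);               /* restores the original netns */
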
index 7a27703..ef4d6a3 100644 (file)
@@ -9,6 +9,7 @@
 #include <string.h>
 #include <errno.h>
 #include <sched.h>
+#include <net/if.h>
 #include <linux/compiler.h>
 #include <bpf/libbpf.h>
 
@@ -20,10 +21,12 @@ static struct test_btf_skc_cls_ingress *skel;
 static struct sockaddr_in6 srv_sa6;
 static __u32 duration;
 
-#define PROG_PIN_FILE "/sys/fs/bpf/btf_skc_cls_ingress"
-
 static int prepare_netns(void)
 {
+       LIBBPF_OPTS(bpf_tc_hook, qdisc_lo, .attach_point = BPF_TC_INGRESS);
+       LIBBPF_OPTS(bpf_tc_opts, tc_attach,
+                   .prog_fd = bpf_program__fd(skel->progs.cls_ingress));
+
        if (CHECK(unshare(CLONE_NEWNET), "create netns",
                  "unshare(CLONE_NEWNET): %s (%d)",
                  strerror(errno), errno))
@@ -33,12 +36,12 @@ static int prepare_netns(void)
                  "ip link set dev lo up", "failed\n"))
                return -1;
 
-       if (CHECK(system("tc qdisc add dev lo clsact"),
-                 "tc qdisc add dev lo clsact", "failed\n"))
+       qdisc_lo.ifindex = if_nametoindex("lo");
+       if (!ASSERT_OK(bpf_tc_hook_create(&qdisc_lo), "qdisc add dev lo clsact"))
                return -1;
 
-       if (CHECK(system("tc filter add dev lo ingress bpf direct-action object-pinned " PROG_PIN_FILE),
-                 "install tc cls-prog at ingress", "failed\n"))
+       if (!ASSERT_OK(bpf_tc_attach(&qdisc_lo, &tc_attach),
+                      "filter add dev lo ingress"))
                return -1;
 
        /* Ensure 20 bytes options (i.e. in total 40 bytes tcp header) for the
@@ -195,19 +198,12 @@ static struct test tests[] = {
 
 void test_btf_skc_cls_ingress(void)
 {
-       int i, err;
+       int i;
 
        skel = test_btf_skc_cls_ingress__open_and_load();
        if (CHECK(!skel, "test_btf_skc_cls_ingress__open_and_load", "failed\n"))
                return;
 
-       err = bpf_program__pin(skel->progs.cls_ingress, PROG_PIN_FILE);
-       if (CHECK(err, "bpf_program__pin",
-                 "cannot pin bpf prog to %s. err:%d\n", PROG_PIN_FILE, err)) {
-               test_btf_skc_cls_ingress__destroy(skel);
-               return;
-       }
-
        for (i = 0; i < ARRAY_SIZE(tests); i++) {
                if (!test__start_subtest(tests[i].desc))
                        continue;
@@ -221,6 +217,5 @@ void test_btf_skc_cls_ingress(void)
                reset_test();
        }
 
-       bpf_program__unpin(skel->progs.cls_ingress, PROG_PIN_FILE);
        test_btf_skc_cls_ingress__destroy(skel);
 }
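
Nothing in the test detaches the filter or deletes the qdisc: prepare_netns() runs under unshare(CLONE_NEWNET), so the clsact hook vanishes with the namespace. In a long-lived namespace, the explicit teardown counterpart would look roughly like this sketch (not part of this patch):

    /* Setting both directions makes libbpf delete the clsact qdisc
     * itself, taking every attached filter with it.
     */
    qdisc_lo.attach_point = BPF_TC_INGRESS | BPF_TC_EGRESS;
    bpf_tc_hook_destroy(&qdisc_lo);
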
index 1c30412..33a2776 100644 (file)
@@ -10,7 +10,9 @@
 #include "cgrp_ls_recursion.skel.h"
 #include "cgrp_ls_attach_cgroup.skel.h"
 #include "cgrp_ls_negative.skel.h"
+#include "cgrp_ls_sleepable.skel.h"
 #include "network_helpers.h"
+#include "cgroup_helpers.h"
 
 struct socket_cookie {
        __u64 cookie_key;
@@ -150,14 +152,100 @@ static void test_negative(void)
        }
 }
 
+static void test_cgroup_iter_sleepable(int cgroup_fd, __u64 cgroup_id)
+{
+       DECLARE_LIBBPF_OPTS(bpf_iter_attach_opts, opts);
+       union bpf_iter_link_info linfo;
+       struct cgrp_ls_sleepable *skel;
+       struct bpf_link *link;
+       int err, iter_fd;
+       char buf[16];
+
+       skel = cgrp_ls_sleepable__open();
+       if (!ASSERT_OK_PTR(skel, "skel_open"))
+               return;
+
+       bpf_program__set_autoload(skel->progs.cgroup_iter, true);
+       err = cgrp_ls_sleepable__load(skel);
+       if (!ASSERT_OK(err, "skel_load"))
+               goto out;
+
+       memset(&linfo, 0, sizeof(linfo));
+       linfo.cgroup.cgroup_fd = cgroup_fd;
+       linfo.cgroup.order = BPF_CGROUP_ITER_SELF_ONLY;
+       opts.link_info = &linfo;
+       opts.link_info_len = sizeof(linfo);
+       link = bpf_program__attach_iter(skel->progs.cgroup_iter, &opts);
+       if (!ASSERT_OK_PTR(link, "attach_iter"))
+               goto out;
+
+       iter_fd = bpf_iter_create(bpf_link__fd(link));
+       if (!ASSERT_GE(iter_fd, 0, "iter_create"))
+               goto out;
+
+       /* trigger the program run */
+       (void)read(iter_fd, buf, sizeof(buf));
+
+       ASSERT_EQ(skel->bss->cgroup_id, cgroup_id, "cgroup_id");
+
+       close(iter_fd);
+out:
+       cgrp_ls_sleepable__destroy(skel);
+}
+
+static void test_no_rcu_lock(__u64 cgroup_id)
+{
+       struct cgrp_ls_sleepable *skel;
+       int err;
+
+       skel = cgrp_ls_sleepable__open();
+       if (!ASSERT_OK_PTR(skel, "skel_open"))
+               return;
+
+       skel->bss->target_pid = syscall(SYS_gettid);
+
+       bpf_program__set_autoload(skel->progs.no_rcu_lock, true);
+       err = cgrp_ls_sleepable__load(skel);
+       if (!ASSERT_OK(err, "skel_load"))
+               goto out;
+
+       err = cgrp_ls_sleepable__attach(skel);
+       if (!ASSERT_OK(err, "skel_attach"))
+               goto out;
+
+       syscall(SYS_getpgid);
+
+       ASSERT_EQ(skel->bss->cgroup_id, cgroup_id, "cgroup_id");
+out:
+       cgrp_ls_sleepable__destroy(skel);
+}
+
+static void test_rcu_lock(void)
+{
+       struct cgrp_ls_sleepable *skel;
+       int err;
+
+       skel = cgrp_ls_sleepable__open();
+       if (!ASSERT_OK_PTR(skel, "skel_open"))
+               return;
+
+       bpf_program__set_autoload(skel->progs.yes_rcu_lock, true);
+       err = cgrp_ls_sleepable__load(skel);
+       ASSERT_ERR(err, "skel_load");
+
+       cgrp_ls_sleepable__destroy(skel);
+}
+
 void test_cgrp_local_storage(void)
 {
+       __u64 cgroup_id;
        int cgroup_fd;
 
        cgroup_fd = test__join_cgroup("/cgrp_local_storage");
        if (!ASSERT_GE(cgroup_fd, 0, "join_cgroup /cgrp_local_storage"))
                return;
 
+       cgroup_id = get_cgroup_id("/cgrp_local_storage");
        if (test__start_subtest("tp_btf"))
                test_tp_btf(cgroup_fd);
        if (test__start_subtest("attach_cgroup"))
@@ -166,6 +254,12 @@ void test_cgrp_local_storage(void)
                test_recursion(cgroup_fd);
        if (test__start_subtest("negative"))
                test_negative();
+       if (test__start_subtest("cgroup_iter_sleepable"))
+               test_cgroup_iter_sleepable(cgroup_fd, cgroup_id);
+       if (test__start_subtest("no_rcu_lock"))
+               test_no_rcu_lock(cgroup_id);
+       if (test__start_subtest("rcu_lock"))
+               test_rcu_lock();
 
        close(cgroup_fd);
 }
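
A single read() suffices in test_cgroup_iter_sleepable() because a BPF_CGROUP_ITER_SELF_ONLY link visits exactly one cgroup. For walks that can produce more output than one buffer holds, the usual pattern drains the iterator, as in this sketch (ssize_t len assumed declared):

    while ((len = read(iter_fd, buf, sizeof(buf))) > 0)
            ;       /* each read() resumes the iterator where it left off */
    ASSERT_GE(len, 0, "read iter_fd");
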
index b0c06f8..7faaf6d 100644 (file)
@@ -5,86 +5,16 @@
 #include "dynptr_fail.skel.h"
 #include "dynptr_success.skel.h"
 
-static size_t log_buf_sz = 1048576; /* 1 MB */
-static char obj_log_buf[1048576];
-
 static struct {
        const char *prog_name;
        const char *expected_err_msg;
 } dynptr_tests[] = {
-       /* failure cases */
-       {"ringbuf_missing_release1", "Unreleased reference id=1"},
-       {"ringbuf_missing_release2", "Unreleased reference id=2"},
-       {"ringbuf_missing_release_callback", "Unreleased reference id"},
-       {"use_after_invalid", "Expected an initialized dynptr as arg #3"},
-       {"ringbuf_invalid_api", "type=mem expected=ringbuf_mem"},
-       {"add_dynptr_to_map1", "invalid indirect read from stack"},
-       {"add_dynptr_to_map2", "invalid indirect read from stack"},
-       {"data_slice_out_of_bounds_ringbuf", "value is outside of the allowed memory range"},
-       {"data_slice_out_of_bounds_map_value", "value is outside of the allowed memory range"},
-       {"data_slice_use_after_release1", "invalid mem access 'scalar'"},
-       {"data_slice_use_after_release2", "invalid mem access 'scalar'"},
-       {"data_slice_missing_null_check1", "invalid mem access 'mem_or_null'"},
-       {"data_slice_missing_null_check2", "invalid mem access 'mem_or_null'"},
-       {"invalid_helper1", "invalid indirect read from stack"},
-       {"invalid_helper2", "Expected an initialized dynptr as arg #3"},
-       {"invalid_write1", "Expected an initialized dynptr as arg #1"},
-       {"invalid_write2", "Expected an initialized dynptr as arg #3"},
-       {"invalid_write3", "Expected an initialized dynptr as arg #1"},
-       {"invalid_write4", "arg 1 is an unacquired reference"},
-       {"invalid_read1", "invalid read from stack"},
-       {"invalid_read2", "cannot pass in dynptr at an offset"},
-       {"invalid_read3", "invalid read from stack"},
-       {"invalid_read4", "invalid read from stack"},
-       {"invalid_offset", "invalid write to stack"},
-       {"global", "type=map_value expected=fp"},
-       {"release_twice", "arg 1 is an unacquired reference"},
-       {"release_twice_callback", "arg 1 is an unacquired reference"},
-       {"dynptr_from_mem_invalid_api",
-               "Unsupported reg type fp for bpf_dynptr_from_mem data"},
-
        /* success cases */
        {"test_read_write", NULL},
        {"test_data_slice", NULL},
        {"test_ringbuf", NULL},
 };
 
-static void verify_fail(const char *prog_name, const char *expected_err_msg)
-{
-       LIBBPF_OPTS(bpf_object_open_opts, opts);
-       struct bpf_program *prog;
-       struct dynptr_fail *skel;
-       int err;
-
-       opts.kernel_log_buf = obj_log_buf;
-       opts.kernel_log_size = log_buf_sz;
-       opts.kernel_log_level = 1;
-
-       skel = dynptr_fail__open_opts(&opts);
-       if (!ASSERT_OK_PTR(skel, "dynptr_fail__open_opts"))
-               goto cleanup;
-
-       prog = bpf_object__find_program_by_name(skel->obj, prog_name);
-       if (!ASSERT_OK_PTR(prog, "bpf_object__find_program_by_name"))
-               goto cleanup;
-
-       bpf_program__set_autoload(prog, true);
-
-       bpf_map__set_max_entries(skel->maps.ringbuf, getpagesize());
-
-       err = dynptr_fail__load(skel);
-       if (!ASSERT_ERR(err, "unexpected load success"))
-               goto cleanup;
-
-       if (!ASSERT_OK_PTR(strstr(obj_log_buf, expected_err_msg), "expected_err_msg")) {
-               fprintf(stderr, "Expected err_msg: %s\n", expected_err_msg);
-               fprintf(stderr, "Verifier output: %s\n", obj_log_buf);
-       }
-
-cleanup:
-       dynptr_fail__destroy(skel);
-}
-
 static void verify_success(const char *prog_name)
 {
        struct dynptr_success *skel;
@@ -97,8 +27,6 @@ static void verify_success(const char *prog_name)
 
        skel->bss->pid = getpid();
 
-       bpf_map__set_max_entries(skel->maps.ringbuf, getpagesize());
-
        dynptr_success__load(skel);
        if (!ASSERT_OK_PTR(skel, "dynptr_success__load"))
                goto cleanup;
@@ -129,10 +57,8 @@ void test_dynptr(void)
                if (!test__start_subtest(dynptr_tests[i].prog_name))
                        continue;
 
-               if (dynptr_tests[i].expected_err_msg)
-                       verify_fail(dynptr_tests[i].prog_name,
-                                   dynptr_tests[i].expected_err_msg);
-               else
-                       verify_success(dynptr_tests[i].prog_name);
+               verify_success(dynptr_tests[i].prog_name);
        }
+
+       RUN_TESTS(dynptr_fail);
 }
index 0613f3b..32dd731 100644 (file)
@@ -9,7 +9,7 @@
                goto out; \
 })
 
-void serial_test_empty_skb(void)
+void test_empty_skb(void)
 {
        LIBBPF_OPTS(bpf_test_run_opts, tattr);
        struct empty_skb *bpf_obj = NULL;
index 55d641c..a922926 100644 (file)
@@ -18,11 +18,8 @@ static struct {
        const char *expected_verifier_err_msg;
        int expected_runtime_err;
 } kfunc_dynptr_tests[] = {
-       {"dynptr_type_not_supp",
-        "arg#0 pointer type STRUCT bpf_dynptr_kern points to unsupported dynamic pointer type", 0},
-       {"not_valid_dynptr",
-        "arg#0 pointer type STRUCT bpf_dynptr_kern must be valid and initialized", 0},
-       {"not_ptr_to_stack", "arg#0 expected pointer to stack", 0},
+       {"not_valid_dynptr", "Expected an initialized dynptr as arg #1", 0},
+       {"not_ptr_to_stack", "arg#0 expected pointer to stack or dynptr_ptr", 0},
        {"dynptr_data_null", NULL, -EBADMSG},
 };
 
index 0d66b15..3533a4e 100644 (file)
@@ -5,83 +5,6 @@
 #include "map_kptr.skel.h"
 #include "map_kptr_fail.skel.h"
 
-static char log_buf[1024 * 1024];
-
-struct {
-       const char *prog_name;
-       const char *err_msg;
-} map_kptr_fail_tests[] = {
-       { "size_not_bpf_dw", "kptr access size must be BPF_DW" },
-       { "non_const_var_off", "kptr access cannot have variable offset" },
-       { "non_const_var_off_kptr_xchg", "R1 doesn't have constant offset. kptr has to be" },
-       { "misaligned_access_write", "kptr access misaligned expected=8 off=7" },
-       { "misaligned_access_read", "kptr access misaligned expected=8 off=1" },
-       { "reject_var_off_store", "variable untrusted_ptr_ access var_off=(0x0; 0x1e0)" },
-       { "reject_bad_type_match", "invalid kptr access, R1 type=untrusted_ptr_prog_test_ref_kfunc" },
-       { "marked_as_untrusted_or_null", "R1 type=untrusted_ptr_or_null_ expected=percpu_ptr_" },
-       { "correct_btf_id_check_size", "access beyond struct prog_test_ref_kfunc at off 32 size 4" },
-       { "inherit_untrusted_on_walk", "R1 type=untrusted_ptr_ expected=percpu_ptr_" },
-       { "reject_kptr_xchg_on_unref", "off=8 kptr isn't referenced kptr" },
-       { "reject_kptr_get_no_map_val", "arg#0 expected pointer to map value" },
-       { "reject_kptr_get_no_null_map_val", "arg#0 expected pointer to map value" },
-       { "reject_kptr_get_no_kptr", "arg#0 no referenced kptr at map value offset=0" },
-       { "reject_kptr_get_on_unref", "arg#0 no referenced kptr at map value offset=8" },
-       { "reject_kptr_get_bad_type_match", "kernel function bpf_kfunc_call_test_kptr_get args#0" },
-       { "mark_ref_as_untrusted_or_null", "R1 type=untrusted_ptr_or_null_ expected=percpu_ptr_" },
-       { "reject_untrusted_store_to_ref", "store to referenced kptr disallowed" },
-       { "reject_bad_type_xchg", "invalid kptr access, R2 type=ptr_prog_test_ref_kfunc expected=ptr_prog_test_member" },
-       { "reject_untrusted_xchg", "R2 type=untrusted_ptr_ expected=ptr_" },
-       { "reject_member_of_ref_xchg", "invalid kptr access, R2 type=ptr_prog_test_ref_kfunc" },
-       { "reject_indirect_helper_access", "kptr cannot be accessed indirectly by helper" },
-       { "reject_indirect_global_func_access", "kptr cannot be accessed indirectly by helper" },
-       { "kptr_xchg_ref_state", "Unreleased reference id=5 alloc_insn=" },
-       { "kptr_get_ref_state", "Unreleased reference id=3 alloc_insn=" },
-};
-
-static void test_map_kptr_fail_prog(const char *prog_name, const char *err_msg)
-{
-       LIBBPF_OPTS(bpf_object_open_opts, opts, .kernel_log_buf = log_buf,
-                                               .kernel_log_size = sizeof(log_buf),
-                                               .kernel_log_level = 1);
-       struct map_kptr_fail *skel;
-       struct bpf_program *prog;
-       int ret;
-
-       skel = map_kptr_fail__open_opts(&opts);
-       if (!ASSERT_OK_PTR(skel, "map_kptr_fail__open_opts"))
-               return;
-
-       prog = bpf_object__find_program_by_name(skel->obj, prog_name);
-       if (!ASSERT_OK_PTR(prog, "bpf_object__find_program_by_name"))
-               goto end;
-
-       bpf_program__set_autoload(prog, true);
-
-       ret = map_kptr_fail__load(skel);
-       if (!ASSERT_ERR(ret, "map_kptr__load must fail"))
-               goto end;
-
-       if (!ASSERT_OK_PTR(strstr(log_buf, err_msg), "expected error message")) {
-               fprintf(stderr, "Expected: %s\n", err_msg);
-               fprintf(stderr, "Verifier: %s\n", log_buf);
-       }
-
-end:
-       map_kptr_fail__destroy(skel);
-}
-
-static void test_map_kptr_fail(void)
-{
-       int i;
-
-       for (i = 0; i < ARRAY_SIZE(map_kptr_fail_tests); i++) {
-               if (!test__start_subtest(map_kptr_fail_tests[i].prog_name))
-                       continue;
-               test_map_kptr_fail_prog(map_kptr_fail_tests[i].prog_name,
-                                       map_kptr_fail_tests[i].err_msg);
-       }
-}
-
 static void test_map_kptr_success(bool test_run)
 {
        LIBBPF_OPTS(bpf_test_run_opts, opts,
@@ -145,5 +68,6 @@ void test_map_kptr(void)
                 */
                test_map_kptr_success(true);
        }
-       test_map_kptr_fail();
+
+       RUN_TESTS(map_kptr_fail);
 }
index ffd8ef4..18848c3 100644 (file)
@@ -103,6 +103,7 @@ static struct {
        {"task_kfunc_release_null", "arg#0 is ptr_or_null_ expected ptr_ or socket"},
        {"task_kfunc_release_unacquired", "release kernel function bpf_task_release expects"},
        {"task_kfunc_from_pid_no_null_check", "arg#0 is ptr_or_null_ expected ptr_ or socket"},
+       {"task_kfunc_from_lsm_task_free", "reg type unsupported for arg#0 function"},
 };
 
 static void verify_fail(const char *prog_name, const char *expected_err_msg)
index cb6a53b..bca5e68 100644 (file)
  */
 
 #include <arpa/inet.h>
-#include <linux/if.h>
 #include <linux/if_tun.h>
 #include <linux/limits.h>
 #include <linux/sysctl.h>
 #include <linux/time_types.h>
 #include <linux/net_tstamp.h>
+#include <net/if.h>
 #include <stdbool.h>
 #include <stdio.h>
 #include <sys/stat.h>
 #define IFADDR_STR_LEN 18
 #define PING_ARGS "-i 0.2 -c 3 -w 10 -q"
 
-#define SRC_PROG_PIN_FILE "/sys/fs/bpf/test_tc_src"
-#define DST_PROG_PIN_FILE "/sys/fs/bpf/test_tc_dst"
-#define CHK_PROG_PIN_FILE "/sys/fs/bpf/test_tc_chk"
-
 #define TIMEOUT_MILLIS 10000
 #define NSEC_PER_SEC 1000000000ULL
 
@@ -115,7 +111,9 @@ static void netns_setup_namespaces_nofail(const char *verb)
 }
 
 struct netns_setup_result {
+       int ifindex_veth_src;
        int ifindex_veth_src_fwd;
+       int ifindex_veth_dst;
        int ifindex_veth_dst_fwd;
 };
 
@@ -139,27 +137,6 @@ static int get_ifaddr(const char *name, char *ifaddr)
        return 0;
 }
 
-static int get_ifindex(const char *name)
-{
-       char path[PATH_MAX];
-       char buf[32];
-       FILE *f;
-       int ret;
-
-       snprintf(path, PATH_MAX, "/sys/class/net/%s/ifindex", name);
-       f = fopen(path, "r");
-       if (!ASSERT_OK_PTR(f, path))
-               return -1;
-
-       ret = fread(buf, 1, sizeof(buf), f);
-       if (!ASSERT_GT(ret, 0, "fread ifindex")) {
-               fclose(f);
-               return -1;
-       }
-       fclose(f);
-       return atoi(buf);
-}
-
 #define SYS(fmt, ...)                                          \
        ({                                                      \
                char cmd[1024];                                 \
@@ -182,11 +159,20 @@ static int netns_setup_links_and_routes(struct netns_setup_result *result)
        if (get_ifaddr("veth_src_fwd", veth_src_fwd_addr))
                goto fail;
 
-       result->ifindex_veth_src_fwd = get_ifindex("veth_src_fwd");
-       if (result->ifindex_veth_src_fwd < 0)
+       result->ifindex_veth_src = if_nametoindex("veth_src");
+       if (!ASSERT_GT(result->ifindex_veth_src, 0, "ifindex_veth_src"))
+               goto fail;
+
+       result->ifindex_veth_src_fwd = if_nametoindex("veth_src_fwd");
+       if (!ASSERT_GT(result->ifindex_veth_src_fwd, 0, "ifindex_veth_src_fwd"))
                goto fail;
-       result->ifindex_veth_dst_fwd = get_ifindex("veth_dst_fwd");
-       if (result->ifindex_veth_dst_fwd < 0)
+
+       result->ifindex_veth_dst = if_nametoindex("veth_dst");
+       if (!ASSERT_GT(result->ifindex_veth_dst, 0, "ifindex_veth_dst"))
+               goto fail;
+
+       result->ifindex_veth_dst_fwd = if_nametoindex("veth_dst_fwd");
+       if (!ASSERT_GT(result->ifindex_veth_dst_fwd, 0, "ifindex_veth_dst_fwd"))
                goto fail;
 
        SYS("ip link set veth_src netns " NS_SRC);
@@ -260,19 +246,78 @@ fail:
        return -1;
 }
 
-static int netns_load_bpf(void)
+static int qdisc_clsact_create(struct bpf_tc_hook *qdisc_hook, int ifindex)
+{
+       char err_str[128], ifname[16];
+       int err;
+
+       qdisc_hook->ifindex = ifindex;
+       qdisc_hook->attach_point = BPF_TC_INGRESS | BPF_TC_EGRESS;
+       err = bpf_tc_hook_create(qdisc_hook);
+       snprintf(err_str, sizeof(err_str),
+                "qdisc add dev %s clsact",
+                if_indextoname(qdisc_hook->ifindex, ifname) ? : "<unknown_iface>");
+       err_str[sizeof(err_str) - 1] = 0;
+       ASSERT_OK(err, err_str);
+
+       return err;
+}
+
+static int xgress_filter_add(struct bpf_tc_hook *qdisc_hook,
+                            enum bpf_tc_attach_point xgress,
+                            const struct bpf_program *prog, int priority)
+{
+       LIBBPF_OPTS(bpf_tc_opts, tc_attach);
+       char err_str[128], ifname[16];
+       int err;
+
+       qdisc_hook->attach_point = xgress;
+       tc_attach.prog_fd = bpf_program__fd(prog);
+       tc_attach.priority = priority;
+       err = bpf_tc_attach(qdisc_hook, &tc_attach);
+       snprintf(err_str, sizeof(err_str),
+                "filter add dev %s %s prio %d bpf da %s",
+                if_indextoname(qdisc_hook->ifindex, ifname) ? : "<unknown_iface>",
+                xgress == BPF_TC_INGRESS ? "ingress" : "egress",
+                priority, bpf_program__name(prog));
+       err_str[sizeof(err_str) - 1] = 0;
+       ASSERT_OK(err, err_str);
+
+       return err;
+}
+
+#define QDISC_CLSACT_CREATE(qdisc_hook, ifindex) ({            \
+       if ((err = qdisc_clsact_create(qdisc_hook, ifindex)))   \
+               goto fail;                                      \
+})
+
+#define XGRESS_FILTER_ADD(qdisc_hook, xgress, prog, priority) ({               \
+       if ((err = xgress_filter_add(qdisc_hook, xgress, prog, priority)))      \
+               goto fail;                                                      \
+})
+
+static int netns_load_bpf(const struct bpf_program *src_prog,
+                         const struct bpf_program *dst_prog,
+                         const struct bpf_program *chk_prog,
+                         const struct netns_setup_result *setup_result)
 {
-       SYS("tc qdisc add dev veth_src_fwd clsact");
-       SYS("tc filter add dev veth_src_fwd ingress bpf da object-pinned "
-           SRC_PROG_PIN_FILE);
-       SYS("tc filter add dev veth_src_fwd egress bpf da object-pinned "
-           CHK_PROG_PIN_FILE);
-
-       SYS("tc qdisc add dev veth_dst_fwd clsact");
-       SYS("tc filter add dev veth_dst_fwd ingress bpf da object-pinned "
-           DST_PROG_PIN_FILE);
-       SYS("tc filter add dev veth_dst_fwd egress bpf da object-pinned "
-           CHK_PROG_PIN_FILE);
+       LIBBPF_OPTS(bpf_tc_hook, qdisc_veth_src_fwd);
+       LIBBPF_OPTS(bpf_tc_hook, qdisc_veth_dst_fwd);
+       int err;
+
+       /* tc qdisc add dev veth_src_fwd clsact */
+       QDISC_CLSACT_CREATE(&qdisc_veth_src_fwd, setup_result->ifindex_veth_src_fwd);
+       /* tc filter add dev veth_src_fwd ingress bpf da src_prog */
+       XGRESS_FILTER_ADD(&qdisc_veth_src_fwd, BPF_TC_INGRESS, src_prog, 0);
+       /* tc filter add dev veth_src_fwd egress bpf da chk_prog */
+       XGRESS_FILTER_ADD(&qdisc_veth_src_fwd, BPF_TC_EGRESS, chk_prog, 0);
+
+       /* tc qdisc add dev veth_dst_fwd clsact */
+       QDISC_CLSACT_CREATE(&qdisc_veth_dst_fwd, setup_result->ifindex_veth_dst_fwd);
+       /* tc filter add dev veth_dst_fwd ingress bpf da dst_prog */
+       XGRESS_FILTER_ADD(&qdisc_veth_dst_fwd, BPF_TC_INGRESS, dst_prog, 0);
+       /* tc filter add dev veth_dst_fwd egress bpf da chk_prog */
+       XGRESS_FILTER_ADD(&qdisc_veth_dst_fwd, BPF_TC_EGRESS, chk_prog, 0);
 
        return 0;
 fail:
@@ -499,78 +544,79 @@ done:
                close(client_fd);
 }
 
-static int netns_load_dtime_bpf(struct test_tc_dtime *skel)
+static int netns_load_dtime_bpf(struct test_tc_dtime *skel,
+                               const struct netns_setup_result *setup_result)
 {
+       LIBBPF_OPTS(bpf_tc_hook, qdisc_veth_src_fwd);
+       LIBBPF_OPTS(bpf_tc_hook, qdisc_veth_dst_fwd);
+       LIBBPF_OPTS(bpf_tc_hook, qdisc_veth_src);
+       LIBBPF_OPTS(bpf_tc_hook, qdisc_veth_dst);
        struct nstoken *nstoken;
-
-#define PIN_FNAME(__file) "/sys/fs/bpf/" #__file
-#define PIN(__prog) ({                                                 \
-               int err = bpf_program__pin(skel->progs.__prog, PIN_FNAME(__prog)); \
-               if (!ASSERT_OK(err, "pin " #__prog))            \
-                       goto fail;                                      \
-               })
+       int err;
 
        /* setup ns_src tc progs */
        nstoken = open_netns(NS_SRC);
        if (!ASSERT_OK_PTR(nstoken, "setns " NS_SRC))
                return -1;
-       PIN(egress_host);
-       PIN(ingress_host);
-       SYS("tc qdisc add dev veth_src clsact");
-       SYS("tc filter add dev veth_src ingress bpf da object-pinned "
-           PIN_FNAME(ingress_host));
-       SYS("tc filter add dev veth_src egress bpf da object-pinned "
-           PIN_FNAME(egress_host));
+       /* tc qdisc add dev veth_src clsact */
+       QDISC_CLSACT_CREATE(&qdisc_veth_src, setup_result->ifindex_veth_src);
+       /* tc filter add dev veth_src ingress bpf da ingress_host */
+       XGRESS_FILTER_ADD(&qdisc_veth_src, BPF_TC_INGRESS, skel->progs.ingress_host, 0);
+       /* tc filter add dev veth_src egress bpf da egress_host */
+       XGRESS_FILTER_ADD(&qdisc_veth_src, BPF_TC_EGRESS, skel->progs.egress_host, 0);
        close_netns(nstoken);
 
        /* setup ns_dst tc progs */
        nstoken = open_netns(NS_DST);
        if (!ASSERT_OK_PTR(nstoken, "setns " NS_DST))
                return -1;
-       PIN(egress_host);
-       PIN(ingress_host);
-       SYS("tc qdisc add dev veth_dst clsact");
-       SYS("tc filter add dev veth_dst ingress bpf da object-pinned "
-           PIN_FNAME(ingress_host));
-       SYS("tc filter add dev veth_dst egress bpf da object-pinned "
-           PIN_FNAME(egress_host));
+       /* tc qdisc add dev veth_dst clsact */
+       QDISC_CLSACT_CREATE(&qdisc_veth_dst, setup_result->ifindex_veth_dst);
+       /* tc filter add dev veth_dst ingress bpf da ingress_host */
+       XGRESS_FILTER_ADD(&qdisc_veth_dst, BPF_TC_INGRESS, skel->progs.ingress_host, 0);
+       /* tc filter add dev veth_dst egress bpf da egress_host */
+       XGRESS_FILTER_ADD(&qdisc_veth_dst, BPF_TC_EGRESS, skel->progs.egress_host, 0);
        close_netns(nstoken);
 
        /* setup ns_fwd tc progs */
        nstoken = open_netns(NS_FWD);
        if (!ASSERT_OK_PTR(nstoken, "setns " NS_FWD))
                return -1;
-       PIN(ingress_fwdns_prio100);
-       PIN(egress_fwdns_prio100);
-       PIN(ingress_fwdns_prio101);
-       PIN(egress_fwdns_prio101);
-       SYS("tc qdisc add dev veth_dst_fwd clsact");
-       SYS("tc filter add dev veth_dst_fwd ingress prio 100 bpf da object-pinned "
-           PIN_FNAME(ingress_fwdns_prio100));
-       SYS("tc filter add dev veth_dst_fwd ingress prio 101 bpf da object-pinned "
-           PIN_FNAME(ingress_fwdns_prio101));
-       SYS("tc filter add dev veth_dst_fwd egress prio 100 bpf da object-pinned "
-           PIN_FNAME(egress_fwdns_prio100));
-       SYS("tc filter add dev veth_dst_fwd egress prio 101 bpf da object-pinned "
-           PIN_FNAME(egress_fwdns_prio101));
-       SYS("tc qdisc add dev veth_src_fwd clsact");
-       SYS("tc filter add dev veth_src_fwd ingress prio 100 bpf da object-pinned "
-           PIN_FNAME(ingress_fwdns_prio100));
-       SYS("tc filter add dev veth_src_fwd ingress prio 101 bpf da object-pinned "
-           PIN_FNAME(ingress_fwdns_prio101));
-       SYS("tc filter add dev veth_src_fwd egress prio 100 bpf da object-pinned "
-           PIN_FNAME(egress_fwdns_prio100));
-       SYS("tc filter add dev veth_src_fwd egress prio 101 bpf da object-pinned "
-           PIN_FNAME(egress_fwdns_prio101));
+       /* tc qdisc add dev veth_dst_fwd clsact */
+       QDISC_CLSACT_CREATE(&qdisc_veth_dst_fwd, setup_result->ifindex_veth_dst_fwd);
+       /* tc filter add dev veth_dst_fwd ingress prio 100 bpf da ingress_fwdns_prio100 */
+       XGRESS_FILTER_ADD(&qdisc_veth_dst_fwd, BPF_TC_INGRESS,
+                         skel->progs.ingress_fwdns_prio100, 100);
+       /* tc filter add dev veth_dst_fwd ingress prio 101 bpf da ingress_fwdns_prio101 */
+       XGRESS_FILTER_ADD(&qdisc_veth_dst_fwd, BPF_TC_INGRESS,
+                         skel->progs.ingress_fwdns_prio101, 101);
+       /* tc filter add dev veth_dst_fwd egress prio 100 bpf da egress_fwdns_prio100 */
+       XGRESS_FILTER_ADD(&qdisc_veth_dst_fwd, BPF_TC_EGRESS,
+                         skel->progs.egress_fwdns_prio100, 100);
+       /* tc filter add dev veth_dst_fwd egress prio 101 bpf da egress_fwdns_prio101 */
+       XGRESS_FILTER_ADD(&qdisc_veth_dst_fwd, BPF_TC_EGRESS,
+                         skel->progs.egress_fwdns_prio101, 101);
+
+       /* tc qdisc add dev veth_src_fwd clsact */
+       QDISC_CLSACT_CREATE(&qdisc_veth_src_fwd, setup_result->ifindex_veth_src_fwd);
+       /* tc filter add dev veth_src_fwd ingress prio 100 bpf da ingress_fwdns_prio100 */
+       XGRESS_FILTER_ADD(&qdisc_veth_src_fwd, BPF_TC_INGRESS,
+                         skel->progs.ingress_fwdns_prio100, 100);
+       /* tc filter add dev veth_src_fwd ingress prio 101 bpf da ingress_fwdns_prio101 */
+       XGRESS_FILTER_ADD(&qdisc_veth_src_fwd, BPF_TC_INGRESS,
+                         skel->progs.ingress_fwdns_prio101, 101);
+       /* tc filter add dev veth_src_fwd egress prio 100 bpf da egress_fwdns_prio100 */
+       XGRESS_FILTER_ADD(&qdisc_veth_src_fwd, BPF_TC_EGRESS,
+                         skel->progs.egress_fwdns_prio100, 100);
+       /* tc filter add dev veth_src_fwd egress prio 101 bpf da egress_fwdns_prio101 */
+       XGRESS_FILTER_ADD(&qdisc_veth_src_fwd, BPF_TC_EGRESS,
+                         skel->progs.egress_fwdns_prio101, 101);
        close_netns(nstoken);
-
-#undef PIN
-
        return 0;
 
 fail:
        close_netns(nstoken);
-       return -1;
+       return err;
 }
 
 enum {
@@ -746,7 +792,7 @@ static void test_tc_redirect_dtime(struct netns_setup_result *setup_result)
        if (!ASSERT_OK(err, "test_tc_dtime__load"))
                goto done;
 
-       if (netns_load_dtime_bpf(skel))
+       if (netns_load_dtime_bpf(skel, setup_result))
                goto done;
 
        nstoken = open_netns(NS_FWD);
@@ -788,7 +834,6 @@ static void test_tc_redirect_neigh_fib(struct netns_setup_result *setup_result)
 {
        struct nstoken *nstoken = NULL;
        struct test_tc_neigh_fib *skel = NULL;
-       int err;
 
        nstoken = open_netns(NS_FWD);
        if (!ASSERT_OK_PTR(nstoken, "setns fwd"))
@@ -801,19 +846,8 @@ static void test_tc_redirect_neigh_fib(struct netns_setup_result *setup_result)
        if (!ASSERT_OK(test_tc_neigh_fib__load(skel), "test_tc_neigh_fib__load"))
                goto done;
 
-       err = bpf_program__pin(skel->progs.tc_src, SRC_PROG_PIN_FILE);
-       if (!ASSERT_OK(err, "pin " SRC_PROG_PIN_FILE))
-               goto done;
-
-       err = bpf_program__pin(skel->progs.tc_chk, CHK_PROG_PIN_FILE);
-       if (!ASSERT_OK(err, "pin " CHK_PROG_PIN_FILE))
-               goto done;
-
-       err = bpf_program__pin(skel->progs.tc_dst, DST_PROG_PIN_FILE);
-       if (!ASSERT_OK(err, "pin " DST_PROG_PIN_FILE))
-               goto done;
-
-       if (netns_load_bpf())
+       if (netns_load_bpf(skel->progs.tc_src, skel->progs.tc_dst,
+                          skel->progs.tc_chk, setup_result))
                goto done;
 
        /* bpf_fib_lookup() checks if forwarding is enabled */
@@ -849,19 +883,8 @@ static void test_tc_redirect_neigh(struct netns_setup_result *setup_result)
        if (!ASSERT_OK(err, "test_tc_neigh__load"))
                goto done;
 
-       err = bpf_program__pin(skel->progs.tc_src, SRC_PROG_PIN_FILE);
-       if (!ASSERT_OK(err, "pin " SRC_PROG_PIN_FILE))
-               goto done;
-
-       err = bpf_program__pin(skel->progs.tc_chk, CHK_PROG_PIN_FILE);
-       if (!ASSERT_OK(err, "pin " CHK_PROG_PIN_FILE))
-               goto done;
-
-       err = bpf_program__pin(skel->progs.tc_dst, DST_PROG_PIN_FILE);
-       if (!ASSERT_OK(err, "pin " DST_PROG_PIN_FILE))
-               goto done;
-
-       if (netns_load_bpf())
+       if (netns_load_bpf(skel->progs.tc_src, skel->progs.tc_dst,
+                          skel->progs.tc_chk, setup_result))
                goto done;
 
        if (!ASSERT_OK(set_forwarding(false), "disable forwarding"))
@@ -896,19 +919,8 @@ static void test_tc_redirect_peer(struct netns_setup_result *setup_result)
        if (!ASSERT_OK(err, "test_tc_peer__load"))
                goto done;
 
-       err = bpf_program__pin(skel->progs.tc_src, SRC_PROG_PIN_FILE);
-       if (!ASSERT_OK(err, "pin " SRC_PROG_PIN_FILE))
-               goto done;
-
-       err = bpf_program__pin(skel->progs.tc_chk, CHK_PROG_PIN_FILE);
-       if (!ASSERT_OK(err, "pin " CHK_PROG_PIN_FILE))
-               goto done;
-
-       err = bpf_program__pin(skel->progs.tc_dst, DST_PROG_PIN_FILE);
-       if (!ASSERT_OK(err, "pin " DST_PROG_PIN_FILE))
-               goto done;
-
-       if (netns_load_bpf())
+       if (netns_load_bpf(skel->progs.tc_src, skel->progs.tc_dst,
+                          skel->progs.tc_chk, setup_result))
                goto done;
 
        if (!ASSERT_OK(set_forwarding(false), "disable forwarding"))
@@ -991,6 +1003,8 @@ static int tun_relay_loop(int src_fd, int target_fd)
 
 static void test_tc_redirect_peer_l3(struct netns_setup_result *setup_result)
 {
+       LIBBPF_OPTS(bpf_tc_hook, qdisc_tun_fwd);
+       LIBBPF_OPTS(bpf_tc_hook, qdisc_veth_dst_fwd);
        struct test_tc_peer *skel = NULL;
        struct nstoken *nstoken = NULL;
        int err;
@@ -1034,8 +1048,8 @@ static void test_tc_redirect_peer_l3(struct netns_setup_result *setup_result)
        if (!ASSERT_OK_PTR(skel, "test_tc_peer__open"))
                goto fail;
 
-       ifindex = get_ifindex("tun_fwd");
-       if (!ASSERT_GE(ifindex, 0, "get_ifindex tun_fwd"))
+       ifindex = if_nametoindex("tun_fwd");
+       if (!ASSERT_GT(ifindex, 0, "if_indextoname tun_fwd"))
                goto fail;
 
        skel->rodata->IFINDEX_SRC = ifindex;
@@ -1045,31 +1059,21 @@ static void test_tc_redirect_peer_l3(struct netns_setup_result *setup_result)
        if (!ASSERT_OK(err, "test_tc_peer__load"))
                goto fail;
 
-       err = bpf_program__pin(skel->progs.tc_src_l3, SRC_PROG_PIN_FILE);
-       if (!ASSERT_OK(err, "pin " SRC_PROG_PIN_FILE))
-               goto fail;
-
-       err = bpf_program__pin(skel->progs.tc_dst_l3, DST_PROG_PIN_FILE);
-       if (!ASSERT_OK(err, "pin " DST_PROG_PIN_FILE))
-               goto fail;
-
-       err = bpf_program__pin(skel->progs.tc_chk, CHK_PROG_PIN_FILE);
-       if (!ASSERT_OK(err, "pin " CHK_PROG_PIN_FILE))
-               goto fail;
-
        /* Load "tc_src_l3" to the tun_fwd interface to redirect packets
         * towards dst, and "tc_dst" to redirect packets
         * and "tc_chk" on veth_dst_fwd to drop non-redirected packets.
         */
-       SYS("tc qdisc add dev tun_fwd clsact");
-       SYS("tc filter add dev tun_fwd ingress bpf da object-pinned "
-           SRC_PROG_PIN_FILE);
-
-       SYS("tc qdisc add dev veth_dst_fwd clsact");
-       SYS("tc filter add dev veth_dst_fwd ingress bpf da object-pinned "
-           DST_PROG_PIN_FILE);
-       SYS("tc filter add dev veth_dst_fwd egress bpf da object-pinned "
-           CHK_PROG_PIN_FILE);
+       /* tc qdisc add dev tun_fwd clsact */
+       QDISC_CLSACT_CREATE(&qdisc_tun_fwd, ifindex);
+       /* tc filter add dev tun_fwd ingress bpf da tc_src_l3 */
+       XGRESS_FILTER_ADD(&qdisc_tun_fwd, BPF_TC_INGRESS, skel->progs.tc_src_l3, 0);
+
+       /* tc qdisc add dev veth_dst_fwd clsact */
+       QDISC_CLSACT_CREATE(&qdisc_veth_dst_fwd, setup_result->ifindex_veth_dst_fwd);
+       /* tc filter add dev veth_dst_fwd ingress bpf da tc_dst_l3 */
+       XGRESS_FILTER_ADD(&qdisc_veth_dst_fwd, BPF_TC_INGRESS, skel->progs.tc_dst_l3, 0);
+       /* tc filter add dev veth_dst_fwd egress bpf da tc_chk */
+       XGRESS_FILTER_ADD(&qdisc_veth_dst_fwd, BPF_TC_EGRESS, skel->progs.tc_chk, 0);
 
        /* Setup route and neigh tables */
        SYS("ip -netns " NS_SRC " addr add dev tun_src " IP4_TUN_SRC "/24");
@@ -1134,7 +1138,7 @@ static void *test_tc_redirect_run_tests(void *arg)
        return NULL;
 }
 
-void serial_test_tc_redirect(void)
+void test_tc_redirect(void)
 {
        pthread_t test_thread;
        int err;
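
The sysfs-scraping get_ifindex() is gone in favor of the libc lookup used throughout the file. For reference, a sketch of the round trip the new code relies on, with the error conventions noted inline (interface name taken from the test setup):

    #include <net/if.h>
    #include <stdio.h>

    static unsigned int lookup_ifindex(const char *name)
    {
            unsigned int ifindex = if_nametoindex(name); /* 0 on error, errno set */
            char ifname[IF_NAMESIZE];

            if (ifindex && !if_indextoname(ifindex, ifname)) /* NULL on error */
                    perror("if_indextoname");
            return ifindex;
    }
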
index eea2741..07ad457 100644 (file)
@@ -421,7 +421,7 @@ static void *test_tunnel_run_tests(void *arg)
        return NULL;
 }
 
-void serial_test_tunnel(void)
+void test_tunnel(void)
 {
        pthread_t test_thread;
        int err;
index 02b18d0..dae68de 100644 (file)
@@ -673,9 +673,11 @@ static struct {
        {"user_ringbuf_callback_write_forbidden", "invalid mem access 'dynptr_ptr'"},
        {"user_ringbuf_callback_null_context_write", "invalid mem access 'scalar'"},
        {"user_ringbuf_callback_null_context_read", "invalid mem access 'scalar'"},
-       {"user_ringbuf_callback_discard_dynptr", "arg 1 is an unacquired reference"},
-       {"user_ringbuf_callback_submit_dynptr", "arg 1 is an unacquired reference"},
+       {"user_ringbuf_callback_discard_dynptr", "cannot release unowned const bpf_dynptr"},
+       {"user_ringbuf_callback_submit_dynptr", "cannot release unowned const bpf_dynptr"},
        {"user_ringbuf_callback_invalid_return", "At callback return the register R0 has value"},
+       {"user_ringbuf_callback_reinit_dynptr_mem", "Dynptr has to be an uninitialized dynptr"},
+       {"user_ringbuf_callback_reinit_dynptr_ringbuf", "Dynptr has to be an uninitialized dynptr"},
 };
 
 #define SUCCESS_TEST(_func) { _func, #_func }
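
The two reinit entries cover a newly rejected shape: re-initializing the dynptr that bpf_user_ringbuf_drain() hands to its callback. An illustrative sketch of such a callback (not the actual selftest program):

    static char scratch[16];

    static long try_reinit_cb(struct bpf_dynptr *dynptr, void *ctx)
    {
            /* Rejected: the callback's dynptr arrives already initialized,
             * and bpf_dynptr_from_mem() demands an uninitialized one.
             */
            bpf_dynptr_from_mem(scratch, sizeof(scratch), 0, dynptr);
            return 0;
    }
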
index 9ac6f6a..a50971c 100644 (file)
@@ -85,7 +85,7 @@ static void test_max_pkt_size(int fd)
 }
 
 #define NUM_PKTS 10000
-void serial_test_xdp_do_redirect(void)
+void test_xdp_do_redirect(void)
 {
        int err, xdp_prog_fd, tc_prog_fd, ifindex_src, ifindex_dst;
        char data[sizeof(pkt_udp) + sizeof(__u32)];
index 13daa37..c720838 100644 (file)
@@ -174,7 +174,7 @@ out:
        system("ip netns del synproxy");
 }
 
-void serial_test_xdp_synproxy(void)
+void test_xdp_synproxy(void)
 {
        if (test__start_subtest("xdp"))
                test_synproxy(true);
diff --git a/tools/testing/selftests/bpf/prog_tests/xfrm_info.c b/tools/testing/selftests/bpf/prog_tests/xfrm_info.c
new file mode 100644 (file)
index 0000000..8b03c9b
--- /dev/null
@@ -0,0 +1,362 @@
+// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
+
+/*
+ * Topology:
+ * ---------
+ *   NS0 namespace         |   NS1 namespace        | NS2 namespace
+ *                         |                        |
+ *   +---------------+     |   +---------------+    |
+ *   |    ipsec0     |---------|    ipsec0     |    |
+ *   | 192.168.1.100 |     |   | 192.168.1.200 |    |
+ *   | if_id: bpf    |     |   +---------------+    |
+ *   +---------------+     |                        |
+ *           |             |                        |   +---------------+
+ *           |             |                        |   |    ipsec0     |
+ *           \------------------------------------------| 192.168.1.200 |
+ *                         |                        |   +---------------+
+ *                         |                        |
+ *                         |                        | (overlay network)
+ *      ------------------------------------------------------
+ *                         |                        | (underlay network)
+ *   +--------------+      |   +--------------+     |
+ *   |    veth01    |----------|    veth10    |     |
+ *   | 172.16.1.100 |      |   | 172.16.1.200 |     |
+ *   +--------------+      |   +--------------+     |
+ *                         |                        |
+ *   +--------------+      |                        |   +--------------+
+ *   |    veth02    |-----------------------------------|    veth20    |
+ *   | 172.16.2.100 |      |                        |   | 172.16.2.200 |
+ *   +--------------+      |                        |   +--------------+
+ *
+ *
+ * Test packet flow
+ * ----------------
+ *  The tests perform 'ping 192.168.1.200' from the NS0 namespace:
+ *  1) request is routed to NS0 ipsec0
+ *  2) NS0 ipsec0 tc egress BPF program is triggered and sets the if_id based
+ *     on the requested value. This makes the ipsec0 device in external mode
+ *     select the destination tunnel
+ *  3) ping reaches the other namespace (NS1 or NS2 based on which if_id was
+ *     used) and response is sent
+ *  4) response is received on NS0 ipsec0, tc ingress program is triggered and
+ *     records the response if_id
+ *  5) requested if_id is compared with received if_id
+ */
+
+#include <net/if.h>
+#include <linux/rtnetlink.h>
+#include <linux/if_link.h>
+
+#include "test_progs.h"
+#include "network_helpers.h"
+#include "xfrm_info.skel.h"
+
+#define NS0 "xfrm_test_ns0"
+#define NS1 "xfrm_test_ns1"
+#define NS2 "xfrm_test_ns2"
+
+#define IF_ID_0_TO_1 1
+#define IF_ID_0_TO_2 2
+#define IF_ID_1 3
+#define IF_ID_2 4
+
+#define IP4_ADDR_VETH01 "172.16.1.100"
+#define IP4_ADDR_VETH10 "172.16.1.200"
+#define IP4_ADDR_VETH02 "172.16.2.100"
+#define IP4_ADDR_VETH20 "172.16.2.200"
+
+#define ESP_DUMMY_PARAMS \
+    "proto esp aead 'rfc4106(gcm(aes))' " \
+    "0xe4d8f4b4da1df18a3510b3781496daa82488b713 128 mode tunnel "
+
+#define SYS(fmt, ...)                                          \
+       ({                                                      \
+               char cmd[1024];                                 \
+               snprintf(cmd, sizeof(cmd), fmt, ##__VA_ARGS__); \
+               if (!ASSERT_OK(system(cmd), cmd))               \
+                       goto fail;                              \
+       })
+
+#define SYS_NOFAIL(fmt, ...)                                   \
+       ({                                                      \
+               char cmd[1024];                                 \
+               snprintf(cmd, sizeof(cmd), fmt, ##__VA_ARGS__); \
+               system(cmd);                                    \
+       })
+
+static int attach_tc_prog(struct bpf_tc_hook *hook, int igr_fd, int egr_fd)
+{
+       LIBBPF_OPTS(bpf_tc_opts, opts1, .handle = 1, .priority = 1,
+                   .prog_fd = igr_fd);
+       LIBBPF_OPTS(bpf_tc_opts, opts2, .handle = 1, .priority = 1,
+                   .prog_fd = egr_fd);
+       int ret;
+
+       ret = bpf_tc_hook_create(hook);
+       if (!ASSERT_OK(ret, "create tc hook"))
+               return ret;
+
+       if (igr_fd >= 0) {
+               hook->attach_point = BPF_TC_INGRESS;
+               ret = bpf_tc_attach(hook, &opts1);
+               if (!ASSERT_OK(ret, "bpf_tc_attach")) {
+                       bpf_tc_hook_destroy(hook);
+                       return ret;
+               }
+       }
+
+       if (egr_fd >= 0) {
+               hook->attach_point = BPF_TC_EGRESS;
+               ret = bpf_tc_attach(hook, &opts2);
+               if (!ASSERT_OK(ret, "bpf_tc_attach")) {
+                       bpf_tc_hook_destroy(hook);
+                       return ret;
+               }
+       }
+
+       return 0;
+}
+
+static void cleanup(void)
+{
+       SYS_NOFAIL("test -f /var/run/netns/" NS0 " && ip netns delete " NS0);
+       SYS_NOFAIL("test -f /var/run/netns/" NS1 " && ip netns delete " NS1);
+       SYS_NOFAIL("test -f /var/run/netns/" NS2 " && ip netns delete " NS2);
+}
+
+static int config_underlay(void)
+{
+       SYS("ip netns add " NS0);
+       SYS("ip netns add " NS1);
+       SYS("ip netns add " NS2);
+
+       /* NS0 <-> NS1 [veth01 <-> veth10] */
+       SYS("ip link add veth01 netns " NS0 " type veth peer name veth10 netns " NS1);
+       SYS("ip -net " NS0 " addr add " IP4_ADDR_VETH01 "/24 dev veth01");
+       SYS("ip -net " NS0 " link set dev veth01 up");
+       SYS("ip -net " NS1 " addr add " IP4_ADDR_VETH10 "/24 dev veth10");
+       SYS("ip -net " NS1 " link set dev veth10 up");
+
+       /* NS0 <-> NS2 [veth02 <-> veth20] */
+       SYS("ip link add veth02 netns " NS0 " type veth peer name veth20 netns " NS2);
+       SYS("ip -net " NS0 " addr add " IP4_ADDR_VETH02 "/24 dev veth02");
+       SYS("ip -net " NS0 " link set dev veth02 up");
+       SYS("ip -net " NS2 " addr add " IP4_ADDR_VETH20 "/24 dev veth20");
+       SYS("ip -net " NS2 " link set dev veth20 up");
+
+       return 0;
+fail:
+       return -1;
+}
+
+static int setup_xfrm_tunnel_ns(const char *ns, const char *ipv4_local,
+                               const char *ipv4_remote, int if_id)
+{
+       /* State: local -> remote */
+       SYS("ip -net %s xfrm state add src %s dst %s spi 1 "
+           ESP_DUMMY_PARAMS "if_id %d", ns, ipv4_local, ipv4_remote, if_id);
+
+       /* State: local <- remote */
+       SYS("ip -net %s xfrm state add src %s dst %s spi 1 "
+           ESP_DUMMY_PARAMS "if_id %d", ns, ipv4_remote, ipv4_local, if_id);
+
+       /* Policy: local -> remote */
+       SYS("ip -net %s xfrm policy add dir out src 0.0.0.0/0 dst 0.0.0.0/0 "
+           "if_id %d tmpl src %s dst %s proto esp mode tunnel if_id %d", ns,
+           if_id, ipv4_local, ipv4_remote, if_id);
+
+       /* Policy: local <- remote */
+       SYS("ip -net %s xfrm policy add dir in src 0.0.0.0/0 dst 0.0.0.0/0 "
+           "if_id %d tmpl src %s dst %s proto esp mode tunnel if_id %d", ns,
+           if_id, ipv4_remote, ipv4_local, if_id);
+
+       return 0;
+fail:
+       return -1;
+}
+
+static int setup_xfrm_tunnel(const char *ns_a, const char *ns_b,
+                            const char *ipv4_a, const char *ipv4_b,
+                            int if_id_a, int if_id_b)
+{
+       return setup_xfrm_tunnel_ns(ns_a, ipv4_a, ipv4_b, if_id_a) ||
+               setup_xfrm_tunnel_ns(ns_b, ipv4_b, ipv4_a, if_id_b);
+}
+
+static struct rtattr *rtattr_add(struct nlmsghdr *nh, unsigned short type,
+                                unsigned short len)
+{
+       struct rtattr *rta =
+               (struct rtattr *)((uint8_t *)nh + RTA_ALIGN(nh->nlmsg_len));
+       rta->rta_type = type;
+       rta->rta_len = RTA_LENGTH(len);
+       nh->nlmsg_len = RTA_ALIGN(nh->nlmsg_len) + RTA_ALIGN(rta->rta_len);
+       return rta;
+}
+
+static struct rtattr *rtattr_add_str(struct nlmsghdr *nh, unsigned short type,
+                                    const char *s)
+{
+       struct rtattr *rta = rtattr_add(nh, type, strlen(s));
+
+       memcpy(RTA_DATA(rta), s, strlen(s));
+       return rta;
+}
+
+static struct rtattr *rtattr_begin(struct nlmsghdr *nh, unsigned short type)
+{
+       return rtattr_add(nh, type, 0);
+}
+
+static void rtattr_end(struct nlmsghdr *nh, struct rtattr *attr)
+{
+       uint8_t *end = (uint8_t *)nh + nh->nlmsg_len;
+
+       attr->rta_len = end - (uint8_t *)attr;
+}
+
+static int setup_xfrmi_external_dev(const char *ns)
+{
+       struct {
+               struct nlmsghdr nh;
+               struct ifinfomsg info;
+               unsigned char data[128];
+       } req;
+       struct rtattr *link_info, *info_data;
+       struct nstoken *nstoken;
+       int ret = -1, sock = -1;
+       struct nlmsghdr *nh;
+
+       memset(&req, 0, sizeof(req));
+       nh = &req.nh;
+       nh->nlmsg_len = NLMSG_LENGTH(sizeof(req.info));
+       nh->nlmsg_type = RTM_NEWLINK;
+       nh->nlmsg_flags |= NLM_F_CREATE | NLM_F_REQUEST;
+
+       rtattr_add_str(nh, IFLA_IFNAME, "ipsec0");
+       link_info = rtattr_begin(nh, IFLA_LINKINFO);
+       rtattr_add_str(nh, IFLA_INFO_KIND, "xfrm");
+       info_data = rtattr_begin(nh, IFLA_INFO_DATA);
+       rtattr_add(nh, IFLA_XFRM_COLLECT_METADATA, 0);
+       rtattr_end(nh, info_data);
+       rtattr_end(nh, link_info);
+
+       nstoken = open_netns(ns);
+       if (!ASSERT_OK_PTR(nstoken, "setns"))
+               goto done;
+
+       sock = socket(AF_NETLINK, SOCK_RAW | SOCK_CLOEXEC, NETLINK_ROUTE);
+       if (!ASSERT_GE(sock, 0, "netlink socket"))
+               goto done;
+       ret = send(sock, nh, nh->nlmsg_len, 0);
+       if (!ASSERT_EQ(ret, nh->nlmsg_len, "netlink send length"))
+               goto done;
+
+       ret = 0;
+done:
+       if (sock != -1)
+               close(sock);
+       if (nstoken)
+               close_netns(nstoken);
+       return ret;
+}
+
+static int config_overlay(void)
+{
+       if (setup_xfrm_tunnel(NS0, NS1, IP4_ADDR_VETH01, IP4_ADDR_VETH10,
+                             IF_ID_0_TO_1, IF_ID_1))
+               goto fail;
+       if (setup_xfrm_tunnel(NS0, NS2, IP4_ADDR_VETH02, IP4_ADDR_VETH20,
+                             IF_ID_0_TO_2, IF_ID_2))
+               goto fail;
+
+       /* Older iproute2 doesn't support this option */
+       if (!ASSERT_OK(setup_xfrmi_external_dev(NS0), "xfrmi"))
+               goto fail;
+
+       SYS("ip -net " NS0 " addr add 192.168.1.100/24 dev ipsec0");
+       SYS("ip -net " NS0 " link set dev ipsec0 up");
+
+       SYS("ip -net " NS1 " link add ipsec0 type xfrm if_id %d", IF_ID_1);
+       SYS("ip -net " NS1 " addr add 192.168.1.200/24 dev ipsec0");
+       SYS("ip -net " NS1 " link set dev ipsec0 up");
+
+       SYS("ip -net " NS2 " link add ipsec0 type xfrm if_id %d", IF_ID_2);
+       SYS("ip -net " NS2 " addr add 192.168.1.200/24 dev ipsec0");
+       SYS("ip -net " NS2 " link set dev ipsec0 up");
+
+       return 0;
+fail:
+       return -1;
+}
+
+static int test_xfrm_ping(struct xfrm_info *skel, u32 if_id)
+{
+       skel->bss->req_if_id = if_id;
+
+       SYS("ping -i 0.01 -c 3 -w 10 -q 192.168.1.200 > /dev/null");
+
+       if (!ASSERT_EQ(skel->bss->resp_if_id, if_id, "if_id"))
+               goto fail;
+
+       return 0;
+fail:
+       return -1;
+}
+
+static void _test_xfrm_info(void)
+{
+       LIBBPF_OPTS(bpf_tc_hook, tc_hook, .attach_point = BPF_TC_INGRESS);
+       int get_xfrm_info_prog_fd, set_xfrm_info_prog_fd;
+       struct nstoken *nstoken = NULL;
+       struct xfrm_info *skel;
+       int ifindex;
+
+       /* load and attach bpf progs to ipsec dev tc hook point */
+       skel = xfrm_info__open_and_load();
+       if (!ASSERT_OK_PTR(skel, "xfrm_info__open_and_load"))
+               goto done;
+       nstoken = open_netns(NS0);
+       if (!ASSERT_OK_PTR(nstoken, "setns " NS0))
+               goto done;
+       ifindex = if_nametoindex("ipsec0");
+       if (!ASSERT_NEQ(ifindex, 0, "ipsec0 ifindex"))
+               goto done;
+       tc_hook.ifindex = ifindex;
+       set_xfrm_info_prog_fd = bpf_program__fd(skel->progs.set_xfrm_info);
+       get_xfrm_info_prog_fd = bpf_program__fd(skel->progs.get_xfrm_info);
+       if (!ASSERT_GE(set_xfrm_info_prog_fd, 0, "bpf_program__fd set_xfrm_info"))
+               goto done;
+       if (!ASSERT_GE(get_xfrm_info_prog_fd, 0, "bpf_program__fd get_xfrm_info"))
+               goto done;
+       if (attach_tc_prog(&tc_hook, get_xfrm_info_prog_fd,
+                          set_xfrm_info_prog_fd))
+               goto done;
+
+       /* perform test */
+       if (!ASSERT_EQ(test_xfrm_ping(skel, IF_ID_0_TO_1), 0, "ping " NS1))
+               goto done;
+       if (!ASSERT_EQ(test_xfrm_ping(skel, IF_ID_0_TO_2), 0, "ping " NS2))
+               goto done;
+
+done:
+       if (nstoken)
+               close_netns(nstoken);
+       xfrm_info__destroy(skel);
+}
+
+void test_xfrm_info(void)
+{
+       cleanup();
+
+       if (!ASSERT_OK(config_underlay(), "config_underlay"))
+               goto done;
+       if (!ASSERT_OK(config_overlay(), "config_overlay"))
+               goto done;
+
+       if (test__start_subtest("xfrm_info"))
+               _test_xfrm_info();
+
+done:
+       cleanup();
+}
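
The BPF object this test loads (progs/xfrm_info.c, not shown in this section) pivots on the xfrm metadata kfuncs added earlier in the series. A rough sketch of the two tc programs follows; the bpf_skb_{set,get}_xfrm_info signatures and the struct bpf_xfrm_info layout are assumptions based on that patch, while req_if_id and resp_if_id mirror the skel->bss fields used above:

    #include <linux/bpf.h>
    #include <bpf/bpf_helpers.h>
    #include "bpf_tracing_net.h"    /* TC_ACT_UNSPEC, TC_ACT_SHOT */

    __u32 req_if_id;
    __u32 resp_if_id;

    struct bpf_xfrm_info {
            __u32 if_id;
            int link;
    };

    int bpf_skb_set_xfrm_info(struct __sk_buff *skb,
                              const struct bpf_xfrm_info *from) __ksym;
    int bpf_skb_get_xfrm_info(struct __sk_buff *skb,
                              struct bpf_xfrm_info *to) __ksym;

    SEC("tc")
    int set_xfrm_info(struct __sk_buff *skb)
    {
            struct bpf_xfrm_info info = { .if_id = req_if_id };

            /* Steer the egress packet into the tunnel selected by if_id */
            return bpf_skb_set_xfrm_info(skb, &info) ? TC_ACT_SHOT : TC_ACT_UNSPEC;
    }

    SEC("tc")
    int get_xfrm_info(struct __sk_buff *skb)
    {
            struct bpf_xfrm_info info = {};

            /* Record which tunnel the ingress packet arrived on */
            if (!bpf_skb_get_xfrm_info(skb, &info))
                    resp_if_id = info.if_id;
            return TC_ACT_UNSPEC;
    }

    char _license[] SEC("license") = "GPL";
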
index 285c008..9ba14c3 100644 (file)
@@ -7,14 +7,14 @@ char _license[] SEC("license") = "GPL";
 
 unsigned long last_sym_value = 0;
 
-static inline char tolower(char c)
+static inline char to_lower(char c)
 {
        if (c >= 'A' && c <= 'Z')
                c += ('a' - 'A');
        return c;
 }
 
-static inline char toupper(char c)
+static inline char to_upper(char c)
 {
        if (c >= 'a' && c <= 'z')
                c -= ('a' - 'A');
@@ -54,7 +54,7 @@ int dump_ksym(struct bpf_iter__ksym *ctx)
        type = iter->type;
 
        if (iter->module_name[0]) {
-               type = iter->exported ? toupper(type) : tolower(type);
+               type = iter->exported ? to_upper(type) : to_lower(type);
                BPF_SEQ_PRINTF(seq, "0x%llx %c %s [ %s ] ",
                               value, type, iter->name, iter->module_name);
        } else {
index 5bb11fe..4a01ea9 100644 (file)
@@ -2,6 +2,11 @@
 #ifndef __BPF_MISC_H__
 #define __BPF_MISC_H__
 
+#define __msg(msg)             __attribute__((btf_decl_tag("comment:test_expect_msg=" msg)))
+#define __failure              __attribute__((btf_decl_tag("comment:test_expect_failure")))
+#define __success              __attribute__((btf_decl_tag("comment:test_expect_success")))
+#define __log_level(lvl)       __attribute__((btf_decl_tag("comment:test_log_level="#lvl)))
+
 #if defined(__TARGET_ARCH_x86)
 #define SYSCALL_WRAPPER 1
 #define SYS_PREFIX "__x64_"
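
The dynptr hunk later in this series shows __failure/__msg in action; the other two tags drive the positive direction. A hypothetical program annotated for test_loader might read:

    SEC("?raw_tp")
    __success __log_level(2)
    int loads_cleanly(void *ctx)
    {
            /* test_loader expects the verifier to accept this program,
             * loading it with a verbose (level 2) verifier log.
             */
            return 0;
    }
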
index adb087a..b394817 100644 (file)
@@ -25,6 +25,9 @@
 #define IPV6_TCLASS            67
 #define IPV6_AUTOFLOWLABEL     70
 
+#define TC_ACT_UNSPEC          (-1)
+#define TC_ACT_SHOT            2
+
 #define SOL_TCP                        6
 #define TCP_NODELAY            1
 #define TCP_MAXSEG             2
index 8feddb8..38f78d9 100644 (file)
@@ -64,3 +64,4 @@ int BPF_PROG(test_percpu_helper, struct cgroup *cgrp, const char *path)
 
        return 0;
 }
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/cgrp_ls_sleepable.c b/tools/testing/selftests/bpf/progs/cgrp_ls_sleepable.c
new file mode 100644 (file)
index 0000000..2d11ed5
--- /dev/null
@@ -0,0 +1,80 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2022 Meta Platforms, Inc. and affiliates. */
+
+#include "bpf_iter.h"
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+#include "bpf_misc.h"
+
+char _license[] SEC("license") = "GPL";
+
+struct {
+       __uint(type, BPF_MAP_TYPE_CGRP_STORAGE);
+       __uint(map_flags, BPF_F_NO_PREALLOC);
+       __type(key, int);
+       __type(value, long);
+} map_a SEC(".maps");
+
+__u32 target_pid;
+__u64 cgroup_id;
+
+void bpf_rcu_read_lock(void) __ksym;
+void bpf_rcu_read_unlock(void) __ksym;
+
+SEC("?iter.s/cgroup")
+int cgroup_iter(struct bpf_iter__cgroup *ctx)
+{
+       struct seq_file *seq = ctx->meta->seq;
+       struct cgroup *cgrp = ctx->cgroup;
+       long *ptr;
+
+       if (cgrp == NULL)
+               return 0;
+
+       ptr = bpf_cgrp_storage_get(&map_a, cgrp, 0,
+                                  BPF_LOCAL_STORAGE_GET_F_CREATE);
+       if (ptr)
+               cgroup_id = cgrp->kn->id;
+       return 0;
+}
+
+SEC("?fentry.s/" SYS_PREFIX "sys_getpgid")
+int no_rcu_lock(void *ctx)
+{
+       struct task_struct *task;
+       struct cgroup *cgrp;
+       long *ptr;
+
+       task = bpf_get_current_task_btf();
+       if (task->pid != target_pid)
+               return 0;
+
+       /* ptr_to_btf_id semantics. should work. */
+       cgrp = task->cgroups->dfl_cgrp;
+       ptr = bpf_cgrp_storage_get(&map_a, cgrp, 0,
+                                  BPF_LOCAL_STORAGE_GET_F_CREATE);
+       if (ptr)
+               cgroup_id = cgrp->kn->id;
+       return 0;
+}
+
+SEC("?fentry.s/" SYS_PREFIX "sys_getpgid")
+int yes_rcu_lock(void *ctx)
+{
+       struct task_struct *task;
+       struct cgroup *cgrp;
+       long *ptr;
+
+       task = bpf_get_current_task_btf();
+       if (task->pid != target_pid)
+               return 0;
+
+       bpf_rcu_read_lock();
+       cgrp = task->cgroups->dfl_cgrp;
+       /* cgrp is untrusted and cannot be passed to the bpf_cgrp_storage_get() helper. */
+       ptr = bpf_cgrp_storage_get(&map_a, cgrp, 0, BPF_LOCAL_STORAGE_GET_F_CREATE);
+       if (ptr)
+               cgroup_id = cgrp->kn->id;
+       bpf_rcu_read_unlock();
+       return 0;
+}
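Both fentry.s programs above bail out unless task->pid matches target_pid, so the user-space half of the test (not included in this excerpt) presumably pins its own pid and then issues the traced syscall. A hypothetical driver, assuming a generated skeleton named cgrp_ls_sleepable that has already been loaded and attached:

#define _GNU_SOURCE
#include <unistd.h>
#include <sys/syscall.h>
#include "cgrp_ls_sleepable.skel.h"

static void trigger_progs(struct cgrp_ls_sleepable *skel)
{
        skel->bss->target_pid = getpid();       /* trace only this process */
        syscall(SYS_getpgid);                   /* fires the fentry.s hooks */
        /* on success, skel->bss->cgroup_id holds the default cgroup's kernfs id */
}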
index b0f08ff..78debc1 100644 (file)
@@ -43,6 +43,7 @@ struct sample {
 
 struct {
        __uint(type, BPF_MAP_TYPE_RINGBUF);
+       __uint(max_entries, 4096);
 } ringbuf SEC(".maps");
 
 int err, val;
@@ -66,6 +67,7 @@ static int get_map_val_dynptr(struct bpf_dynptr *ptr)
  * bpf_ringbuf_submit/discard_dynptr call
  */
 SEC("?raw_tp")
+__failure __msg("Unreleased reference id=1")
 int ringbuf_missing_release1(void *ctx)
 {
        struct bpf_dynptr ptr;
@@ -78,6 +80,7 @@ int ringbuf_missing_release1(void *ctx)
 }
 
 SEC("?raw_tp")
+__failure __msg("Unreleased reference id=2")
 int ringbuf_missing_release2(void *ctx)
 {
        struct bpf_dynptr ptr1, ptr2;
@@ -113,6 +116,7 @@ static int missing_release_callback_fn(__u32 index, void *data)
 
 /* Any dynptr initialized within a callback must have bpf_dynptr_put called */
 SEC("?raw_tp")
+__failure __msg("Unreleased reference id")
 int ringbuf_missing_release_callback(void *ctx)
 {
        bpf_loop(10, missing_release_callback_fn, NULL, 0);
@@ -121,6 +125,7 @@ int ringbuf_missing_release_callback(void *ctx)
 
 /* Can't call bpf_ringbuf_submit/discard_dynptr on a non-initialized dynptr */
 SEC("?raw_tp")
+__failure __msg("arg 1 is an unacquired reference")
 int ringbuf_release_uninit_dynptr(void *ctx)
 {
        struct bpf_dynptr ptr;
@@ -133,6 +138,7 @@ int ringbuf_release_uninit_dynptr(void *ctx)
 
 /* A dynptr can't be used after it has been invalidated */
 SEC("?raw_tp")
+__failure __msg("Expected an initialized dynptr as arg #3")
 int use_after_invalid(void *ctx)
 {
        struct bpf_dynptr ptr;
@@ -152,6 +158,7 @@ int use_after_invalid(void *ctx)
 
 /* Can't call non-dynptr ringbuf APIs on a dynptr ringbuf sample */
 SEC("?raw_tp")
+__failure __msg("type=mem expected=ringbuf_mem")
 int ringbuf_invalid_api(void *ctx)
 {
        struct bpf_dynptr ptr;
@@ -174,6 +181,7 @@ done:
 
 /* Can't add a dynptr to a map */
 SEC("?raw_tp")
+__failure __msg("invalid indirect read from stack")
 int add_dynptr_to_map1(void *ctx)
 {
        struct bpf_dynptr ptr;
@@ -191,6 +199,7 @@ int add_dynptr_to_map1(void *ctx)
 
 /* Can't add a struct with an embedded dynptr to a map */
 SEC("?raw_tp")
+__failure __msg("invalid indirect read from stack")
 int add_dynptr_to_map2(void *ctx)
 {
        struct test_info x;
@@ -208,6 +217,7 @@ int add_dynptr_to_map2(void *ctx)
 
 /* A data slice can't be accessed out of bounds */
 SEC("?raw_tp")
+__failure __msg("value is outside of the allowed memory range")
 int data_slice_out_of_bounds_ringbuf(void *ctx)
 {
        struct bpf_dynptr ptr;
@@ -228,6 +238,7 @@ done:
 }
 
 SEC("?raw_tp")
+__failure __msg("value is outside of the allowed memory range")
 int data_slice_out_of_bounds_map_value(void *ctx)
 {
        __u32 key = 0, map_val;
@@ -248,6 +259,7 @@ int data_slice_out_of_bounds_map_value(void *ctx)
 
 /* A data slice can't be used after it has been released */
 SEC("?raw_tp")
+__failure __msg("invalid mem access 'scalar'")
 int data_slice_use_after_release1(void *ctx)
 {
        struct bpf_dynptr ptr;
@@ -279,6 +291,7 @@ done:
  * ptr2 is at fp - 16).
  */
 SEC("?raw_tp")
+__failure __msg("invalid mem access 'scalar'")
 int data_slice_use_after_release2(void *ctx)
 {
        struct bpf_dynptr ptr1, ptr2;
@@ -310,6 +323,7 @@ done:
 
 /* A data slice must first be checked for NULL */
 SEC("?raw_tp")
+__failure __msg("invalid mem access 'mem_or_null'")
 int data_slice_missing_null_check1(void *ctx)
 {
        struct bpf_dynptr ptr;
@@ -330,6 +344,7 @@ int data_slice_missing_null_check1(void *ctx)
 
 /* A data slice can't be dereferenced if it wasn't checked for null */
 SEC("?raw_tp")
+__failure __msg("invalid mem access 'mem_or_null'")
 int data_slice_missing_null_check2(void *ctx)
 {
        struct bpf_dynptr ptr;
@@ -352,6 +367,7 @@ done:
  * dynptr argument
  */
 SEC("?raw_tp")
+__failure __msg("invalid indirect read from stack")
 int invalid_helper1(void *ctx)
 {
        struct bpf_dynptr ptr;
@@ -366,6 +382,7 @@ int invalid_helper1(void *ctx)
 
 /* A dynptr can't be passed into a helper function at a non-zero offset */
 SEC("?raw_tp")
+__failure __msg("Expected an initialized dynptr as arg #3")
 int invalid_helper2(void *ctx)
 {
        struct bpf_dynptr ptr;
@@ -381,6 +398,7 @@ int invalid_helper2(void *ctx)
 
 /* A bpf_dynptr is invalidated if it's been written into */
 SEC("?raw_tp")
+__failure __msg("Expected an initialized dynptr as arg #1")
 int invalid_write1(void *ctx)
 {
        struct bpf_dynptr ptr;
@@ -402,6 +420,7 @@ int invalid_write1(void *ctx)
  * offset
  */
 SEC("?raw_tp")
+__failure __msg("Expected an initialized dynptr as arg #3")
 int invalid_write2(void *ctx)
 {
        struct bpf_dynptr ptr;
@@ -425,6 +444,7 @@ int invalid_write2(void *ctx)
  * non-const offset
  */
 SEC("?raw_tp")
+__failure __msg("Expected an initialized dynptr as arg #1")
 int invalid_write3(void *ctx)
 {
        struct bpf_dynptr ptr;
@@ -456,6 +476,7 @@ static int invalid_write4_callback(__u32 index, void *data)
  * be invalidated as a dynptr
  */
 SEC("?raw_tp")
+__failure __msg("arg 1 is an unacquired reference")
 int invalid_write4(void *ctx)
 {
        struct bpf_dynptr ptr;
@@ -472,7 +493,9 @@ int invalid_write4(void *ctx)
 
 /* A globally-defined bpf_dynptr can't be used (it must reside on the stack) */
 struct bpf_dynptr global_dynptr;
+
 SEC("?raw_tp")
+__failure __msg("type=map_value expected=fp")
 int global(void *ctx)
 {
        /* this should fail */
@@ -485,6 +508,7 @@ int global(void *ctx)
 
 /* A direct read should fail */
 SEC("?raw_tp")
+__failure __msg("invalid read from stack")
 int invalid_read1(void *ctx)
 {
        struct bpf_dynptr ptr;
@@ -501,6 +525,7 @@ int invalid_read1(void *ctx)
 
 /* A direct read at an offset should fail */
 SEC("?raw_tp")
+__failure __msg("cannot pass in dynptr at an offset")
 int invalid_read2(void *ctx)
 {
        struct bpf_dynptr ptr;
@@ -516,6 +541,7 @@ int invalid_read2(void *ctx)
 
 /* A direct read at an offset into the lower stack slot should fail */
 SEC("?raw_tp")
+__failure __msg("invalid read from stack")
 int invalid_read3(void *ctx)
 {
        struct bpf_dynptr ptr1, ptr2;
@@ -542,6 +568,7 @@ static int invalid_read4_callback(__u32 index, void *data)
 
 /* A direct read within a callback function should fail */
 SEC("?raw_tp")
+__failure __msg("invalid read from stack")
 int invalid_read4(void *ctx)
 {
        struct bpf_dynptr ptr;
@@ -557,6 +584,7 @@ int invalid_read4(void *ctx)
 
 /* Initializing a dynptr at an offset should fail */
 SEC("?raw_tp")
+__failure __msg("invalid write to stack")
 int invalid_offset(void *ctx)
 {
        struct bpf_dynptr ptr;
@@ -571,6 +599,7 @@ int invalid_offset(void *ctx)
 
 /* Can't release a dynptr twice */
 SEC("?raw_tp")
+__failure __msg("arg 1 is an unacquired reference")
 int release_twice(void *ctx)
 {
        struct bpf_dynptr ptr;
@@ -597,6 +626,7 @@ static int release_twice_callback_fn(__u32 index, void *data)
  * within a callback function, fails
  */
 SEC("?raw_tp")
+__failure __msg("arg 1 is an unacquired reference")
 int release_twice_callback(void *ctx)
 {
        struct bpf_dynptr ptr;
@@ -612,6 +642,7 @@ int release_twice_callback(void *ctx)
 
 /* Reject unsupported local mem types for dynptr_from_mem API */
 SEC("?raw_tp")
+__failure __msg("Unsupported reg type fp for bpf_dynptr_from_mem data")
 int dynptr_from_mem_invalid_api(void *ctx)
 {
        struct bpf_dynptr ptr;
index a3a6103..35db7c6 100644 (file)
@@ -20,6 +20,7 @@ struct sample {
 
 struct {
        __uint(type, BPF_MAP_TYPE_RINGBUF);
+       __uint(max_entries, 4096);
 } ringbuf SEC(".maps");
 
 struct {
index 2c7b615..4ad88da 100644 (file)
@@ -99,13 +99,28 @@ int list_push_pop_multiple(struct bpf_spin_lock *lock, struct bpf_list_head *hea
        struct foo *f[8], *pf;
        int i;
 
-       for (i = 0; i < ARRAY_SIZE(f); i++) {
+       /* The loop following this check adds nodes two at a time in order to
+        * validate the release logic for multiple release_on_unlock references
+        */
+       if (ARRAY_SIZE(f) % 2)
+               return 10;
+
+       for (i = 0; i < ARRAY_SIZE(f); i += 2) {
                f[i] = bpf_obj_new(typeof(**f));
                if (!f[i])
                        return 2;
                f[i]->data = i;
+
+               f[i + 1] = bpf_obj_new(typeof(**f));
+               if (!f[i + 1]) {
+                       bpf_obj_drop(f[i]);
+                       return 9;
+               }
+               f[i + 1]->data = i + 1;
+
                bpf_spin_lock(lock);
                bpf_list_push_front(head, &f[i]->node);
+               bpf_list_push_front(head, &f[i + 1]->node);
                bpf_spin_unlock(lock);
        }
 
index 05e209b..760e41e 100644 (file)
@@ -3,6 +3,7 @@
 #include <bpf/bpf_tracing.h>
 #include <bpf/bpf_helpers.h>
 #include <bpf/bpf_core_read.h>
+#include "bpf_misc.h"
 
 struct map_value {
        char buf[8];
@@ -23,6 +24,7 @@ extern struct prog_test_ref_kfunc *
 bpf_kfunc_call_test_kptr_get(struct prog_test_ref_kfunc **p, int a, int b) __ksym;
 
 SEC("?tc")
+__failure __msg("kptr access size must be BPF_DW")
 int size_not_bpf_dw(struct __sk_buff *ctx)
 {
        struct map_value *v;
@@ -37,6 +39,7 @@ int size_not_bpf_dw(struct __sk_buff *ctx)
 }
 
 SEC("?tc")
+__failure __msg("kptr access cannot have variable offset")
 int non_const_var_off(struct __sk_buff *ctx)
 {
        struct map_value *v;
@@ -55,6 +58,7 @@ int non_const_var_off(struct __sk_buff *ctx)
 }
 
 SEC("?tc")
+__failure __msg("R1 doesn't have constant offset. kptr has to be")
 int non_const_var_off_kptr_xchg(struct __sk_buff *ctx)
 {
        struct map_value *v;
@@ -73,6 +77,7 @@ int non_const_var_off_kptr_xchg(struct __sk_buff *ctx)
 }
 
 SEC("?tc")
+__failure __msg("kptr access misaligned expected=8 off=7")
 int misaligned_access_write(struct __sk_buff *ctx)
 {
        struct map_value *v;
@@ -88,6 +93,7 @@ int misaligned_access_write(struct __sk_buff *ctx)
 }
 
 SEC("?tc")
+__failure __msg("kptr access misaligned expected=8 off=1")
 int misaligned_access_read(struct __sk_buff *ctx)
 {
        struct map_value *v;
@@ -101,6 +107,7 @@ int misaligned_access_read(struct __sk_buff *ctx)
 }
 
 SEC("?tc")
+__failure __msg("variable untrusted_ptr_ access var_off=(0x0; 0x1e0)")
 int reject_var_off_store(struct __sk_buff *ctx)
 {
        struct prog_test_ref_kfunc *unref_ptr;
@@ -124,6 +131,7 @@ int reject_var_off_store(struct __sk_buff *ctx)
 }
 
 SEC("?tc")
+__failure __msg("invalid kptr access, R1 type=untrusted_ptr_prog_test_ref_kfunc")
 int reject_bad_type_match(struct __sk_buff *ctx)
 {
        struct prog_test_ref_kfunc *unref_ptr;
@@ -144,6 +152,7 @@ int reject_bad_type_match(struct __sk_buff *ctx)
 }
 
 SEC("?tc")
+__failure __msg("R1 type=untrusted_ptr_or_null_ expected=percpu_ptr_")
 int marked_as_untrusted_or_null(struct __sk_buff *ctx)
 {
        struct map_value *v;
@@ -158,6 +167,7 @@ int marked_as_untrusted_or_null(struct __sk_buff *ctx)
 }
 
 SEC("?tc")
+__failure __msg("access beyond struct prog_test_ref_kfunc at off 32 size 4")
 int correct_btf_id_check_size(struct __sk_buff *ctx)
 {
        struct prog_test_ref_kfunc *p;
@@ -175,6 +185,7 @@ int correct_btf_id_check_size(struct __sk_buff *ctx)
 }
 
 SEC("?tc")
+__failure __msg("R1 type=untrusted_ptr_ expected=percpu_ptr_")
 int inherit_untrusted_on_walk(struct __sk_buff *ctx)
 {
        struct prog_test_ref_kfunc *unref_ptr;
@@ -194,6 +205,7 @@ int inherit_untrusted_on_walk(struct __sk_buff *ctx)
 }
 
 SEC("?tc")
+__failure __msg("off=8 kptr isn't referenced kptr")
 int reject_kptr_xchg_on_unref(struct __sk_buff *ctx)
 {
        struct map_value *v;
@@ -208,6 +220,7 @@ int reject_kptr_xchg_on_unref(struct __sk_buff *ctx)
 }
 
 SEC("?tc")
+__failure __msg("arg#0 expected pointer to map value")
 int reject_kptr_get_no_map_val(struct __sk_buff *ctx)
 {
        bpf_kfunc_call_test_kptr_get((void *)&ctx, 0, 0);
@@ -215,6 +228,7 @@ int reject_kptr_get_no_map_val(struct __sk_buff *ctx)
 }
 
 SEC("?tc")
+__failure __msg("arg#0 expected pointer to map value")
 int reject_kptr_get_no_null_map_val(struct __sk_buff *ctx)
 {
        bpf_kfunc_call_test_kptr_get(bpf_map_lookup_elem(&array_map, &(int){0}), 0, 0);
@@ -222,6 +236,7 @@ int reject_kptr_get_no_null_map_val(struct __sk_buff *ctx)
 }
 
 SEC("?tc")
+__failure __msg("arg#0 no referenced kptr at map value offset=0")
 int reject_kptr_get_no_kptr(struct __sk_buff *ctx)
 {
        struct map_value *v;
@@ -236,6 +251,7 @@ int reject_kptr_get_no_kptr(struct __sk_buff *ctx)
 }
 
 SEC("?tc")
+__failure __msg("arg#0 no referenced kptr at map value offset=8")
 int reject_kptr_get_on_unref(struct __sk_buff *ctx)
 {
        struct map_value *v;
@@ -250,6 +266,7 @@ int reject_kptr_get_on_unref(struct __sk_buff *ctx)
 }
 
 SEC("?tc")
+__failure __msg("kernel function bpf_kfunc_call_test_kptr_get args#0")
 int reject_kptr_get_bad_type_match(struct __sk_buff *ctx)
 {
        struct map_value *v;
@@ -264,6 +281,7 @@ int reject_kptr_get_bad_type_match(struct __sk_buff *ctx)
 }
 
 SEC("?tc")
+__failure __msg("R1 type=untrusted_ptr_or_null_ expected=percpu_ptr_")
 int mark_ref_as_untrusted_or_null(struct __sk_buff *ctx)
 {
        struct map_value *v;
@@ -278,6 +296,7 @@ int mark_ref_as_untrusted_or_null(struct __sk_buff *ctx)
 }
 
 SEC("?tc")
+__failure __msg("store to referenced kptr disallowed")
 int reject_untrusted_store_to_ref(struct __sk_buff *ctx)
 {
        struct prog_test_ref_kfunc *p;
@@ -297,6 +316,7 @@ int reject_untrusted_store_to_ref(struct __sk_buff *ctx)
 }
 
 SEC("?tc")
+__failure __msg("R2 type=untrusted_ptr_ expected=ptr_")
 int reject_untrusted_xchg(struct __sk_buff *ctx)
 {
        struct prog_test_ref_kfunc *p;
@@ -315,6 +335,8 @@ int reject_untrusted_xchg(struct __sk_buff *ctx)
 }
 
 SEC("?tc")
+__failure
+__msg("invalid kptr access, R2 type=ptr_prog_test_ref_kfunc expected=ptr_prog_test_member")
 int reject_bad_type_xchg(struct __sk_buff *ctx)
 {
        struct prog_test_ref_kfunc *ref_ptr;
@@ -333,6 +355,7 @@ int reject_bad_type_xchg(struct __sk_buff *ctx)
 }
 
 SEC("?tc")
+__failure __msg("invalid kptr access, R2 type=ptr_prog_test_ref_kfunc")
 int reject_member_of_ref_xchg(struct __sk_buff *ctx)
 {
        struct prog_test_ref_kfunc *ref_ptr;
@@ -351,6 +374,7 @@ int reject_member_of_ref_xchg(struct __sk_buff *ctx)
 }
 
 SEC("?syscall")
+__failure __msg("kptr cannot be accessed indirectly by helper")
 int reject_indirect_helper_access(struct __sk_buff *ctx)
 {
        struct map_value *v;
@@ -371,6 +395,7 @@ int write_func(int *p)
 }
 
 SEC("?tc")
+__failure __msg("kptr cannot be accessed indirectly by helper")
 int reject_indirect_global_func_access(struct __sk_buff *ctx)
 {
        struct map_value *v;
@@ -384,6 +409,7 @@ int reject_indirect_global_func_access(struct __sk_buff *ctx)
 }
 
 SEC("?tc")
+__failure __msg("Unreleased reference id=5 alloc_insn=")
 int kptr_xchg_ref_state(struct __sk_buff *ctx)
 {
        struct prog_test_ref_kfunc *p;
@@ -402,6 +428,7 @@ int kptr_xchg_ref_state(struct __sk_buff *ctx)
 }
 
 SEC("?tc")
+__failure __msg("Unreleased reference id=3 alloc_insn=")
 int kptr_get_ref_state(struct __sk_buff *ctx)
 {
        struct map_value *v;
index 94a9700..125f908 100644 (file)
@@ -23,13 +23,14 @@ struct bpf_key *bpf_lookup_user_key(__u32 serial, __u64 flags) __ksym;
 void bpf_key_put(struct bpf_key *key) __ksym;
 void bpf_rcu_read_lock(void) __ksym;
 void bpf_rcu_read_unlock(void) __ksym;
-struct task_struct *bpf_task_acquire(struct task_struct *p) __ksym;
+struct task_struct *bpf_task_acquire_not_zero(struct task_struct *p) __ksym;
 void bpf_task_release(struct task_struct *p) __ksym;
 
 SEC("?fentry.s/" SYS_PREFIX "sys_getpgid")
 int get_cgroup_id(void *ctx)
 {
        struct task_struct *task;
+       struct css_set *cgroups;
 
        task = bpf_get_current_task_btf();
        if (task->pid != target_pid)
@@ -37,7 +38,11 @@ int get_cgroup_id(void *ctx)
 
        /* simulate bpf_get_current_cgroup_id() helper */
        bpf_rcu_read_lock();
-       cgroup_id = task->cgroups->dfl_cgrp->kn->id;
+       cgroups = task->cgroups;
+       if (!cgroups)
+               goto unlock;
+       cgroup_id = cgroups->dfl_cgrp->kn->id;
+unlock:
        bpf_rcu_read_unlock();
        return 0;
 }
@@ -56,6 +61,8 @@ int task_succ(void *ctx)
        bpf_rcu_read_lock();
        /* region including helper using rcu ptr real_parent */
        real_parent = task->real_parent;
+       if (!real_parent)
+               goto out;
        ptr = bpf_task_storage_get(&map_a, real_parent, &init_val,
                                   BPF_LOCAL_STORAGE_GET_F_CREATE);
        if (!ptr)
@@ -92,7 +99,10 @@ int two_regions(void *ctx)
        bpf_rcu_read_unlock();
        bpf_rcu_read_lock();
        real_parent = task->real_parent;
+       if (!real_parent)
+               goto out;
        (void)bpf_task_storage_get(&map_a, real_parent, 0, 0);
+out:
        bpf_rcu_read_unlock();
        return 0;
 }
@@ -105,7 +115,10 @@ int non_sleepable_1(void *ctx)
        task = bpf_get_current_task_btf();
        bpf_rcu_read_lock();
        real_parent = task->real_parent;
+       if (!real_parent)
+               goto out;
        (void)bpf_task_storage_get(&map_a, real_parent, 0, 0);
+out:
        bpf_rcu_read_unlock();
        return 0;
 }
@@ -121,7 +134,10 @@ int non_sleepable_2(void *ctx)
 
        bpf_rcu_read_lock();
        real_parent = task->real_parent;
+       if (!real_parent)
+               goto out;
        (void)bpf_task_storage_get(&map_a, real_parent, 0, 0);
+out:
        bpf_rcu_read_unlock();
        return 0;
 }
@@ -129,16 +145,33 @@ int non_sleepable_2(void *ctx)
 SEC("?fentry.s/" SYS_PREFIX "sys_nanosleep")
 int task_acquire(void *ctx)
 {
-       struct task_struct *task, *real_parent;
+       struct task_struct *task, *real_parent, *gparent;
 
        task = bpf_get_current_task_btf();
        bpf_rcu_read_lock();
        real_parent = task->real_parent;
+       if (!real_parent)
+               goto out;
+
+       /* rcu_ptr->rcu_field */
+       gparent = real_parent->real_parent;
+       if (!gparent)
+               goto out;
+
        /* acquire a reference which can be used outside rcu read lock region */
-       real_parent = bpf_task_acquire(real_parent);
+       gparent = bpf_task_acquire_not_zero(gparent);
+       if (!gparent)
+               /* Until we resolve the issues with using task->rcu_users, we
+                * expect bpf_task_acquire_not_zero() to return a NULL task.
+                * See the comment at the definition of
+                * bpf_task_acquire_not_zero() for more details.
+                */
+               goto out;
+
+       (void)bpf_task_storage_get(&map_a, gparent, 0, 0);
+       bpf_task_release(gparent);
+out:
        bpf_rcu_read_unlock();
-       (void)bpf_task_storage_get(&map_a, real_parent, 0, 0);
-       bpf_task_release(real_parent);
        return 0;
 }
 
@@ -181,9 +214,12 @@ int non_sleepable_rcu_mismatch(void *ctx)
        /* non-sleepable: missing bpf_rcu_read_unlock() in one path */
        bpf_rcu_read_lock();
        real_parent = task->real_parent;
+       if (!real_parent)
+               goto out;
        (void)bpf_task_storage_get(&map_a, real_parent, 0, 0);
        if (real_parent)
                bpf_rcu_read_unlock();
+out:
        return 0;
 }
 
@@ -199,16 +235,17 @@ int inproper_sleepable_helper(void *ctx)
        /* sleepable helper in rcu read lock region */
        bpf_rcu_read_lock();
        real_parent = task->real_parent;
+       if (!real_parent)
+               goto out;
        regs = (struct pt_regs *)bpf_task_pt_regs(real_parent);
-       if (!regs) {
-               bpf_rcu_read_unlock();
-               return 0;
-       }
+       if (!regs)
+               goto out;
 
        ptr = (void *)PT_REGS_IP(regs);
        (void)bpf_copy_from_user_task(&value, sizeof(uint32_t), ptr, task, 0);
        user_data = value;
        (void)bpf_task_storage_get(&map_a, real_parent, 0, 0);
+out:
        bpf_rcu_read_unlock();
        return 0;
 }
@@ -239,7 +276,10 @@ int nested_rcu_region(void *ctx)
        bpf_rcu_read_lock();
        bpf_rcu_read_lock();
        real_parent = task->real_parent;
+       if (!real_parent)
+               goto out;
        (void)bpf_task_storage_get(&map_a, real_parent, 0, 0);
+out:
        bpf_rcu_read_unlock();
        bpf_rcu_read_unlock();
        return 0;
index e310473..87fa1db 100644 (file)
@@ -271,3 +271,14 @@ int BPF_PROG(task_kfunc_from_pid_no_null_check, struct task_struct *task, u64 cl
 
        return 0;
 }
+
+SEC("lsm/task_free")
+int BPF_PROG(task_kfunc_from_lsm_task_free, struct task_struct *task)
+{
+       struct task_struct *acquired;
+
+       /* The argument of the lsm task_free hook is untrusted. */
+       acquired = bpf_task_acquire(task);
+       bpf_task_release(acquired);
+       return 0;
+}
index 60c7ead..9f359cf 100644 (file)
@@ -123,12 +123,17 @@ int BPF_PROG(test_task_get_release, struct task_struct *task, u64 clone_flags)
        }
 
        kptr = bpf_task_kptr_get(&v->task);
-       if (!kptr) {
+       if (kptr) {
+               /* Until we resolve the issues with using task->rcu_users, we
+                * expect bpf_task_kptr_get() to return a NULL task. See the
+                * comment at the definition of bpf_task_acquire_not_zero() for
+                * more details.
+                */
+               bpf_task_release(kptr);
                err = 3;
                return 0;
        }
 
-       bpf_task_release(kptr);
 
        return 0;
 }
index ce39d09..f4a8250 100644 (file)
@@ -33,18 +33,6 @@ int err, pid;
 char _license[] SEC("license") = "GPL";
 
 SEC("?lsm.s/bpf")
-int BPF_PROG(dynptr_type_not_supp, int cmd, union bpf_attr *attr,
-            unsigned int size)
-{
-       char write_data[64] = "hello there, world!!";
-       struct bpf_dynptr ptr;
-
-       bpf_ringbuf_reserve_dynptr(&ringbuf, sizeof(write_data), 0, &ptr);
-
-       return bpf_verify_pkcs7_signature(&ptr, &ptr, NULL);
-}
-
-SEC("?lsm.s/bpf")
 int BPF_PROG(not_valid_dynptr, int cmd, union bpf_attr *attr, unsigned int size)
 {
        unsigned long val;
index 82aba45..f3201dc 100644 (file)
@@ -18,6 +18,13 @@ struct {
        __uint(type, BPF_MAP_TYPE_USER_RINGBUF);
 } user_ringbuf SEC(".maps");
 
+struct {
+       __uint(type, BPF_MAP_TYPE_RINGBUF);
+       __uint(max_entries, 2);
+} ringbuf SEC(".maps");
+
+static int map_value;
+
 static long
 bad_access1(struct bpf_dynptr *dynptr, void *context)
 {
@@ -32,7 +39,7 @@ bad_access1(struct bpf_dynptr *dynptr, void *context)
 /* A callback that accesses a dynptr in a bpf_user_ringbuf_drain callback should
  * not be able to read before the pointer.
  */
-SEC("?raw_tp/sys_nanosleep")
+SEC("?raw_tp/")
 int user_ringbuf_callback_bad_access1(void *ctx)
 {
        bpf_user_ringbuf_drain(&user_ringbuf, bad_access1, NULL, 0);
@@ -54,7 +61,7 @@ bad_access2(struct bpf_dynptr *dynptr, void *context)
 /* A callback that accesses a dynptr in a bpf_user_ringbuf_drain callback should
  * not be able to read past the end of the pointer.
  */
-SEC("?raw_tp/sys_nanosleep")
+SEC("?raw_tp/")
 int user_ringbuf_callback_bad_access2(void *ctx)
 {
        bpf_user_ringbuf_drain(&user_ringbuf, bad_access2, NULL, 0);
@@ -73,7 +80,7 @@ write_forbidden(struct bpf_dynptr *dynptr, void *context)
 /* A callback that accesses a dynptr in a bpf_user_ringbuf_drain callback should
  * not be able to write to that pointer.
  */
-SEC("?raw_tp/sys_nanosleep")
+SEC("?raw_tp/")
 int user_ringbuf_callback_write_forbidden(void *ctx)
 {
        bpf_user_ringbuf_drain(&user_ringbuf, write_forbidden, NULL, 0);
@@ -92,7 +99,7 @@ null_context_write(struct bpf_dynptr *dynptr, void *context)
 /* A callback that accesses the context in a bpf_user_ringbuf_drain callback
  * should not be able to write through a NULL context pointer.
  */
-SEC("?raw_tp/sys_nanosleep")
+SEC("?raw_tp/")
 int user_ringbuf_callback_null_context_write(void *ctx)
 {
        bpf_user_ringbuf_drain(&user_ringbuf, null_context_write, NULL, 0);
@@ -113,7 +120,7 @@ null_context_read(struct bpf_dynptr *dynptr, void *context)
 /* A callback that accesses the context in a bpf_user_ringbuf_drain callback
  * should not be able to read from a NULL context pointer.
  */
-SEC("?raw_tp/sys_nanosleep")
+SEC("?raw_tp/")
 int user_ringbuf_callback_null_context_read(void *ctx)
 {
        bpf_user_ringbuf_drain(&user_ringbuf, null_context_read, NULL, 0);
@@ -132,7 +139,7 @@ try_discard_dynptr(struct bpf_dynptr *dynptr, void *context)
 /* A callback that accesses a dynptr in a bpf_user_ringbuf_drain callback
  * should not be able to discard that dynptr.
  */
-SEC("?raw_tp/sys_nanosleep")
+SEC("?raw_tp/")
 int user_ringbuf_callback_discard_dynptr(void *ctx)
 {
        bpf_user_ringbuf_drain(&user_ringbuf, try_discard_dynptr, NULL, 0);
@@ -151,7 +158,7 @@ try_submit_dynptr(struct bpf_dynptr *dynptr, void *context)
 /* A callback that accesses a dynptr in a bpf_user_ringbuf_drain callback
  * should not be able to submit that dynptr.
  */
-SEC("?raw_tp/sys_nanosleep")
+SEC("?raw_tp/")
 int user_ringbuf_callback_submit_dynptr(void *ctx)
 {
        bpf_user_ringbuf_drain(&user_ringbuf, try_submit_dynptr, NULL, 0);
@@ -168,10 +175,38 @@ invalid_drain_callback_return(struct bpf_dynptr *dynptr, void *context)
 /* A bpf_user_ringbuf_drain callback may only return 0 or 1; any other
  * return value should be rejected.
  */
-SEC("?raw_tp/sys_nanosleep")
+SEC("?raw_tp/")
 int user_ringbuf_callback_invalid_return(void *ctx)
 {
        bpf_user_ringbuf_drain(&user_ringbuf, invalid_drain_callback_return, NULL, 0);
 
        return 0;
 }
+
+static long
+try_reinit_dynptr_mem(struct bpf_dynptr *dynptr, void *context)
+{
+       bpf_dynptr_from_mem(&map_value, 4, 0, dynptr);
+       return 0;
+}
+
+static long
+try_reinit_dynptr_ringbuf(struct bpf_dynptr *dynptr, void *context)
+{
+       bpf_ringbuf_reserve_dynptr(&ringbuf, 8, 0, dynptr);
+       return 0;
+}
+
+SEC("?raw_tp/")
+int user_ringbuf_callback_reinit_dynptr_mem(void *ctx)
+{
+       bpf_user_ringbuf_drain(&user_ringbuf, try_reinit_dynptr_mem, NULL, 0);
+       return 0;
+}
+
+SEC("?raw_tp/")
+int user_ringbuf_callback_reinit_dynptr_ringbuf(void *ctx)
+{
+       bpf_user_ringbuf_drain(&user_ringbuf, try_reinit_dynptr_ringbuf, NULL, 0);
+       return 0;
+}
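For context, the samples that bpf_user_ringbuf_drain() hands to these callbacks are produced from user space via libbpf's user ring buffer API. A minimal producer sketch (obtaining map_fd from the skeleton is elided):

#include <string.h>
#include <bpf/libbpf.h>

static int produce_one_sample(int map_fd)
{
        struct user_ring_buffer *rb;
        void *sample;

        rb = user_ring_buffer__new(map_fd, NULL);
        if (!rb)
                return -1;

        sample = user_ring_buffer__reserve(rb, 8);
        if (sample) {
                memset(sample, 0, 8);   /* payload seen by the drain callback */
                user_ring_buffer__submit(rb, sample);
        }

        user_ring_buffer__free(rb);
        return 0;
}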
diff --git a/tools/testing/selftests/bpf/progs/xfrm_info.c b/tools/testing/selftests/bpf/progs/xfrm_info.c
new file mode 100644 (file)
index 0000000..f6a501f
--- /dev/null
@@ -0,0 +1,40 @@
+// SPDX-License-Identifier: GPL-2.0
+#include "vmlinux.h"
+#include "bpf_tracing_net.h"
+#include <bpf/bpf_helpers.h>
+
+struct bpf_xfrm_info___local {
+       u32 if_id;
+       int link;
+} __attribute__((preserve_access_index));
+
+__u32 req_if_id;
+__u32 resp_if_id;
+
+int bpf_skb_set_xfrm_info(struct __sk_buff *skb_ctx,
+                         const struct bpf_xfrm_info___local *from) __ksym;
+int bpf_skb_get_xfrm_info(struct __sk_buff *skb_ctx,
+                         struct bpf_xfrm_info___local *to) __ksym;
+
+SEC("tc")
+int set_xfrm_info(struct __sk_buff *skb)
+{
+       struct bpf_xfrm_info___local info = { .if_id = req_if_id };
+
+       return bpf_skb_set_xfrm_info(skb, &info) ? TC_ACT_SHOT : TC_ACT_UNSPEC;
+}
+
+SEC("tc")
+int get_xfrm_info(struct __sk_buff *skb)
+{
+       struct bpf_xfrm_info___local info = {};
+
+       if (bpf_skb_get_xfrm_info(skb, &info) < 0)
+               return TC_ACT_SHOT;
+
+       resp_if_id = info.if_id;
+
+       return TC_ACT_UNSPEC;
+}
+
+char _license[] SEC("license") = "GPL";
index 19ad172..0bd9990 100644 (file)
@@ -1,9 +1,9 @@
 /* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */
 #include <iostream>
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
+#include <unistd.h>
+#include <linux/bpf.h>
+#include <linux/btf.h>
 #include <bpf/libbpf.h>
-#pragma GCC diagnostic pop
 #include <bpf/bpf.h>
 #include <bpf/btf.h>
 #include "test_core_extern.skel.h"
@@ -99,6 +99,7 @@ int main(int argc, char *argv[])
        struct btf_dump_opts opts = { };
        struct test_core_extern *skel;
        struct btf *btf;
+       int fd;
 
        try_skeleton_template();
 
@@ -117,6 +118,12 @@ int main(int argc, char *argv[])
        skel = test_core_extern__open_and_load();
        test_core_extern__destroy(skel);
 
+       fd = bpf_enable_stats(BPF_STATS_RUN_TIME);
+       if (fd < 0)
+               std::cout << "FAILED to enable stats: " << fd << std::endl;
+       else
+               ::close(fd);
+
        std::cout << "DONE!" << std::endl;
 
        return 0;
diff --git a/tools/testing/selftests/bpf/test_loader.c b/tools/testing/selftests/bpf/test_loader.c
new file mode 100644 (file)
index 0000000..679efb3
--- /dev/null
@@ -0,0 +1,233 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2022 Meta Platforms, Inc. and affiliates. */
+#include <stdlib.h>
+#include <test_progs.h>
+#include <bpf/btf.h>
+
+#define str_has_pfx(str, pfx) \
+       (strncmp(str, pfx, __builtin_constant_p(pfx) ? sizeof(pfx) - 1 : strlen(pfx)) == 0)
+
+#define TEST_LOADER_LOG_BUF_SZ 1048576
+
+#define TEST_TAG_EXPECT_FAILURE "comment:test_expect_failure"
+#define TEST_TAG_EXPECT_SUCCESS "comment:test_expect_success"
+#define TEST_TAG_EXPECT_MSG_PFX "comment:test_expect_msg="
+#define TEST_TAG_LOG_LEVEL_PFX "comment:test_log_level="
+
+struct test_spec {
+       const char *name;
+       bool expect_failure;
+       const char *expect_msg;
+       int log_level;
+};
+
+static int tester_init(struct test_loader *tester)
+{
+       if (!tester->log_buf) {
+               tester->log_buf_sz = TEST_LOADER_LOG_BUF_SZ;
+               tester->log_buf = malloc(tester->log_buf_sz);
+               if (!ASSERT_OK_PTR(tester->log_buf, "tester_log_buf"))
+                       return -ENOMEM;
+       }
+
+       return 0;
+}
+
+void test_loader_fini(struct test_loader *tester)
+{
+       if (!tester)
+               return;
+
+       free(tester->log_buf);
+}
+
+static int parse_test_spec(struct test_loader *tester,
+                          struct bpf_object *obj,
+                          struct bpf_program *prog,
+                          struct test_spec *spec)
+{
+       struct btf *btf;
+       int func_id, i;
+
+       memset(spec, 0, sizeof(*spec));
+
+       spec->name = bpf_program__name(prog);
+
+       btf = bpf_object__btf(obj);
+       if (!btf) {
+               ASSERT_FAIL("BPF object has no BTF");
+               return -EINVAL;
+       }
+
+       func_id = btf__find_by_name_kind(btf, spec->name, BTF_KIND_FUNC);
+       if (func_id < 0) {
+               ASSERT_FAIL("failed to find FUNC BTF type for '%s'", spec->name);
+               return -EINVAL;
+       }
+
+       for (i = 1; i < btf__type_cnt(btf); i++) {
+               const struct btf_type *t;
+               const char *s;
+
+               t = btf__type_by_id(btf, i);
+               if (!btf_is_decl_tag(t))
+                       continue;
+
+               if (t->type != func_id || btf_decl_tag(t)->component_idx != -1)
+                       continue;
+
+               s = btf__str_by_offset(btf, t->name_off);
+               if (strcmp(s, TEST_TAG_EXPECT_FAILURE) == 0) {
+                       spec->expect_failure = true;
+               } else if (strcmp(s, TEST_TAG_EXPECT_SUCCESS) == 0) {
+                       spec->expect_failure = false;
+               } else if (str_has_pfx(s, TEST_TAG_EXPECT_MSG_PFX)) {
+                       spec->expect_msg = s + sizeof(TEST_TAG_EXPECT_MSG_PFX) - 1;
+               } else if (str_has_pfx(s, TEST_TAG_LOG_LEVEL_PFX)) {
+                       errno = 0;
+                       spec->log_level = strtol(s + sizeof(TEST_TAG_LOG_LEVEL_PFX) - 1, NULL, 0);
+                       if (errno) {
+                               ASSERT_FAIL("failed to parse test log level from '%s'", s);
+                               return -EINVAL;
+                       }
+               }
+       }
+
+       return 0;
+}
+
+static void prepare_case(struct test_loader *tester,
+                        struct test_spec *spec,
+                        struct bpf_object *obj,
+                        struct bpf_program *prog)
+{
+       int min_log_level = 0;
+
+       if (env.verbosity > VERBOSE_NONE)
+               min_log_level = 1;
+       if (env.verbosity > VERBOSE_VERY)
+               min_log_level = 2;
+
+       bpf_program__set_log_buf(prog, tester->log_buf, tester->log_buf_sz);
+
+       /* Make sure we set at least a minimal log level, unless the test
+        * already requests an even higher level. Make sure to preserve the
+        * independent log level 4 (verifier stats), though.
+        */
+       if ((spec->log_level & 3) < min_log_level)
+               bpf_program__set_log_level(prog, (spec->log_level & 4) | min_log_level);
+       else
+               bpf_program__set_log_level(prog, spec->log_level);
+
+       tester->log_buf[0] = '\0';
+}
+
+static void emit_verifier_log(const char *log_buf, bool force)
+{
+       if (!force && env.verbosity == VERBOSE_NONE)
+               return;
+       fprintf(stdout, "VERIFIER LOG:\n=============\n%s=============\n", log_buf);
+}
+
+static void validate_case(struct test_loader *tester,
+                         struct test_spec *spec,
+                         struct bpf_object *obj,
+                         struct bpf_program *prog,
+                         int load_err)
+{
+       if (spec->expect_msg) {
+               char *match;
+
+               match = strstr(tester->log_buf, spec->expect_msg);
+               if (!ASSERT_OK_PTR(match, "expect_msg")) {
+                       /* if we are in verbose mode, we've already emitted the log */
+                       if (env.verbosity == VERBOSE_NONE)
+                               emit_verifier_log(tester->log_buf, true /*force*/);
+                       fprintf(stderr, "EXPECTED MSG: '%s'\n", spec->expect_msg);
+                       return;
+               }
+       }
+}
+
+/* This function is forced noinline and has a short, generic name to look
+ * better in test_progs output (in case of a failure).
+ */
+static noinline
+void run_subtest(struct test_loader *tester,
+                const char *skel_name,
+                skel_elf_bytes_fn elf_bytes_factory)
+{
+       LIBBPF_OPTS(bpf_object_open_opts, open_opts, .object_name = skel_name);
+       struct bpf_object *obj = NULL, *tobj;
+       struct bpf_program *prog, *tprog;
+       const void *obj_bytes;
+       size_t obj_byte_cnt;
+       int err;
+
+       if (tester_init(tester) < 0)
+               return; /* failed to initialize tester */
+
+       obj_bytes = elf_bytes_factory(&obj_byte_cnt);
+       obj = bpf_object__open_mem(obj_bytes, obj_byte_cnt, &open_opts);
+       if (!ASSERT_OK_PTR(obj, "obj_open_mem"))
+               return;
+
+       bpf_object__for_each_program(prog, obj) {
+               const char *prog_name = bpf_program__name(prog);
+               struct test_spec spec;
+
+               if (!test__start_subtest(prog_name))
+                       continue;
+
+               /* if we can't derive test specification, go to the next test */
+               err = parse_test_spec(tester, obj, prog, &spec);
+               if (!ASSERT_OK(err, "parse_test_spec"))
+                       continue;
+
+               tobj = bpf_object__open_mem(obj_bytes, obj_byte_cnt, &open_opts);
+               if (!ASSERT_OK_PTR(tobj, "obj_open_mem")) /* shouldn't happen */
+                       continue;
+
+               bpf_object__for_each_program(tprog, tobj)
+                       bpf_program__set_autoload(tprog, false);
+
+               bpf_object__for_each_program(tprog, tobj) {
+                       /* only load specified program */
+                       if (strcmp(bpf_program__name(tprog), prog_name) == 0) {
+                               bpf_program__set_autoload(tprog, true);
+                               break;
+                       }
+               }
+
+               prepare_case(tester, &spec, tobj, tprog);
+
+               err = bpf_object__load(tobj);
+               if (spec.expect_failure) {
+                       if (!ASSERT_ERR(err, "unexpected_load_success")) {
+                               emit_verifier_log(tester->log_buf, false /*force*/);
+                               goto tobj_cleanup;
+                       }
+               } else {
+                       if (!ASSERT_OK(err, "unexpected_load_failure")) {
+                               emit_verifier_log(tester->log_buf, true /*force*/);
+                               goto tobj_cleanup;
+                       }
+               }
+
+               emit_verifier_log(tester->log_buf, false /*force*/);
+               validate_case(tester, &spec, tobj, tprog, err);
+
+tobj_cleanup:
+               bpf_object__close(tobj);
+       }
+
+       bpf_object__close(obj);
+}
+
+void test_loader__run_subtests(struct test_loader *tester,
+                              const char *skel_name,
+                              skel_elf_bytes_fn elf_bytes_factory)
+{
+       /* see comment in run_subtest() for why we do this function nesting */
+       run_subtest(tester, skel_name, elf_bytes_factory);
+}
index 7fc15e0..7cb1bc0 100755 (executable)
@@ -769,12 +769,14 @@ skip(ret != 0, "bpftool not installed")
 base_progs = progs
 _, base_maps = bpftool("map")
 base_map_names = [
-    'pid_iter.rodata' # created on each bpftool invocation
+    'pid_iter.rodata', # created on each bpftool invocation
+    'libbpf_det_bind', # created on each bpftool invocation
 ]
 
 # Check netdevsim
-ret, out = cmd("modprobe netdevsim", fail=False)
-skip(ret != 0, "netdevsim module could not be loaded")
+if not os.path.isdir("/sys/bus/netdevsim/"):
+    ret, out = cmd("modprobe netdevsim", fail=False)
+    skip(ret != 0, "netdevsim module could not be loaded")
 
 # Check debugfs
 _, out = cmd("mount")
index b090996..3f058df 100644 (file)
@@ -1,4 +1,7 @@
 /* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __TEST_PROGS_H
+#define __TEST_PROGS_H
+
 #include <stdio.h>
 #include <unistd.h>
 #include <errno.h>
@@ -210,6 +213,12 @@ int test__join_cgroup(const char *path);
 #define CHECK_ATTR(condition, tag, format...) \
        _CHECK(condition, tag, tattr.duration, format)
 
+#define ASSERT_FAIL(fmt, args...) ({                                   \
+       static int duration = 0;                                        \
+       CHECK(false, "", fmt"\n", ##args);                              \
+       false;                                                          \
+})
+
 #define ASSERT_TRUE(actual, name) ({                                   \
        static int duration = 0;                                        \
        bool ___ok = (actual);                                          \
@@ -397,3 +406,27 @@ int write_sysctl(const char *sysctl, const char *value);
 #endif
 
 #define BPF_TESTMOD_TEST_FILE "/sys/kernel/bpf_testmod"
+
+struct test_loader {
+       char *log_buf;
+       size_t log_buf_sz;
+
+       struct bpf_object *obj;
+};
+
+typedef const void *(*skel_elf_bytes_fn)(size_t *sz);
+
+extern void test_loader__run_subtests(struct test_loader *tester,
+                                     const char *skel_name,
+                                     skel_elf_bytes_fn elf_bytes_factory);
+
+extern void test_loader_fini(struct test_loader *tester);
+
+#define RUN_TESTS(skel) ({                                                    \
+       struct test_loader tester = {};                                        \
+                                                                              \
+       test_loader__run_subtests(&tester, #skel, skel##__elf_bytes);          \
+       test_loader_fini(&tester);                                             \
+})
+
+#endif /* __TEST_PROGS_H */
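With RUN_TESTS() in place, exercising every annotated program in a BPF object becomes a one-liner per skeleton. A hypothetical prog_tests/ caller (the skeleton name is illustrative):

#include <test_progs.h>
#include "dynptr_fail.skel.h"

void test_dynptr_fail(void)
{
        RUN_TESTS(dynptr_fail);
}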
index e768181..024a0fa 100644 (file)
@@ -1690,24 +1690,42 @@ static void test_txmsg_apply(int cgrp, struct sockmap_options *opt)
 {
        txmsg_pass = 1;
        txmsg_redir = 0;
+       txmsg_ingress = 0;
        txmsg_apply = 1;
        txmsg_cork = 0;
        test_send_one(opt, cgrp);
 
        txmsg_pass = 0;
        txmsg_redir = 1;
+       txmsg_ingress = 0;
+       txmsg_apply = 1;
+       txmsg_cork = 0;
+       test_send_one(opt, cgrp);
+
+       txmsg_pass = 0;
+       txmsg_redir = 1;
+       txmsg_ingress = 1;
        txmsg_apply = 1;
        txmsg_cork = 0;
        test_send_one(opt, cgrp);
 
        txmsg_pass = 1;
        txmsg_redir = 0;
+       txmsg_ingress = 0;
+       txmsg_apply = 1024;
+       txmsg_cork = 0;
+       test_send_large(opt, cgrp);
+
+       txmsg_pass = 0;
+       txmsg_redir = 1;
+       txmsg_ingress = 0;
        txmsg_apply = 1024;
        txmsg_cork = 0;
        test_send_large(opt, cgrp);
 
        txmsg_pass = 0;
        txmsg_redir = 1;
+       txmsg_ingress = 1;
        txmsg_apply = 1024;
        txmsg_cork = 0;
        test_send_large(opt, cgrp);
index 3193915..9d99392 100644 (file)
@@ -76,7 +76,7 @@
        },
        .prog_type = BPF_PROG_TYPE_SCHED_CLS,
        .result = REJECT,
-       .errstr = "arg#0 expected pointer to ctx, but got PTR",
+       .errstr = "R1 must have zero offset when passed to release func or trusted arg to kfunc",
        .fixup_kfunc_btf_id = {
                { "bpf_kfunc_call_test_pass_ctx", 2 },
        },
        .errstr = "!read_ok",
        .result = REJECT,
 },
+/* Make sure that verifier.c:states_equal() considers IDs from all
+ * frames when building 'idmap' for check_ids().
+ */
+{
+       "calls: check_ids() across call boundary",
+       .insns = {
+       /* Function main() */
+       BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+       /* fp[-24] = map_lookup_elem(...) ; get a MAP_VALUE_PTR_OR_NULL with some ID */
+       BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+       BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+       BPF_LD_MAP_FD(BPF_REG_1,
+                     0),
+       BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
+       BPF_STX_MEM(BPF_DW, BPF_REG_FP, BPF_REG_0, -24),
+       /* fp[-32] = map_lookup_elem(...) ; get a MAP_VALUE_PTR_OR_NULL with some ID */
+       BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+       BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+       BPF_LD_MAP_FD(BPF_REG_1,
+                     0),
+       BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
+       BPF_STX_MEM(BPF_DW, BPF_REG_FP, BPF_REG_0, -32),
+       /* call foo(&fp[-24], &fp[-32])   ; both arguments have IDs in the current
+        *                                ; stack frame
+        */
+       BPF_MOV64_REG(BPF_REG_1, BPF_REG_FP),
+       BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -24),
+       BPF_MOV64_REG(BPF_REG_2, BPF_REG_FP),
+       BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -32),
+       BPF_CALL_REL(2),
+       /* exit 0 */
+       BPF_MOV64_IMM(BPF_REG_0, 0),
+       BPF_EXIT_INSN(),
+       /* Function foo()
+        *
+        * r9 = &frame[0].fp[-24]  ; save arguments in the callee-saved registers,
+        * r8 = &frame[0].fp[-32]  ; arguments are pointers to pointers to map value
+        */
+       BPF_MOV64_REG(BPF_REG_9, BPF_REG_1),
+       BPF_MOV64_REG(BPF_REG_8, BPF_REG_2),
+       /* r7 = ktime_get_ns() */
+       BPF_EMIT_CALL(BPF_FUNC_ktime_get_ns),
+       BPF_MOV64_REG(BPF_REG_7, BPF_REG_0),
+       /* r6 = ktime_get_ns() */
+       BPF_EMIT_CALL(BPF_FUNC_ktime_get_ns),
+       BPF_MOV64_REG(BPF_REG_6, BPF_REG_0),
+       /* if r6 > r7 goto +1      ; no new information about the state is derived from
+        *                         ; this check, thus produced verifier states differ
+        *                         ; only in 'insn_idx'
+        * r9 = r8
+        */
+       BPF_JMP_REG(BPF_JGT, BPF_REG_6, BPF_REG_7, 1),
+       BPF_MOV64_REG(BPF_REG_9, BPF_REG_8),
+       /* r9 = *r9                ; verifier gets to this point via two paths:
+        *                         ; (I) one including r9 = r8, verified first;
+        *                         ; (II) one excluding r9 = r8, verified next.
+        *                         ; After load of *r9 to r9 the frame[0].fp[-24].id == r9.id.
+        *                         ; Suppose that checkpoint is created here via path (I).
+        *                         ; When verifying via (II) the r9.id must be compared against
+        *                         ; frame[0].fp[-24].id, otherwise (I) and (II) would be
+        *                         ; incorrectly deemed equivalent.
+        * if r9 == 0 goto <exit>
+        */
+       BPF_LDX_MEM(BPF_DW, BPF_REG_9, BPF_REG_9, 0),
+       BPF_JMP_IMM(BPF_JEQ, BPF_REG_9, 0, 1),
+       /* r8 = *r8                ; read map value via r8, this is not safe
+        * r0 = *r8                ; because r8 might be not equal to r9.
+        */
+       BPF_LDX_MEM(BPF_DW, BPF_REG_8, BPF_REG_8, 0),
+       BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_8, 0),
+       /* exit 0 */
+       BPF_MOV64_IMM(BPF_REG_0, 0),
+       BPF_EXIT_INSN(),
+       },
+       .flags = BPF_F_TEST_STATE_FREQ,
+       .fixup_map_hash_8b = { 3, 9 },
+       .result = REJECT,
+       .errstr = "R8 invalid mem access 'map_value_or_null'",
+       .result_unpriv = REJECT,
+       .errstr_unpriv = "",
+       .prog_type = BPF_PROG_TYPE_CGROUP_SKB,
+},
index 11acd18..dce2e28 100644 (file)
        .result = ACCEPT,
        .prog_type = BPF_PROG_TYPE_SCHED_CLS,
 },
+{
+       "direct packet access: test30 (check_id() in regsafe(), bad access)",
+       .insns = {
+       /* r9 = ctx */
+       BPF_MOV64_REG(BPF_REG_9, BPF_REG_1),
+       /* r7 = ktime_get_ns() */
+       BPF_EMIT_CALL(BPF_FUNC_ktime_get_ns),
+       BPF_MOV64_REG(BPF_REG_7, BPF_REG_0),
+       /* r6 = ktime_get_ns() */
+       BPF_EMIT_CALL(BPF_FUNC_ktime_get_ns),
+       BPF_MOV64_REG(BPF_REG_6, BPF_REG_0),
+       /* r2 = ctx->data
+        * r3 = ctx->data
+        * r4 = ctx->data_end
+        */
+       BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_9, offsetof(struct __sk_buff, data)),
+       BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_9, offsetof(struct __sk_buff, data)),
+       BPF_LDX_MEM(BPF_W, BPF_REG_4, BPF_REG_9, offsetof(struct __sk_buff, data_end)),
+       /* if r6 > 100 goto exit
+        * if r7 > 100 goto exit
+        */
+       BPF_JMP_IMM(BPF_JGT, BPF_REG_6, 100, 9),
+       BPF_JMP_IMM(BPF_JGT, BPF_REG_7, 100, 8),
+       /* r2 += r6              ; this forces assignment of ID to r2
+        * r2 += 1               ; get some fixed off for r2
+        * r3 += r7              ; this forces assignment of ID to r3
+        * r3 += 1               ; get some fixed off for r3
+        */
+       BPF_ALU64_REG(BPF_ADD, BPF_REG_2, BPF_REG_6),
+       BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, 1),
+       BPF_ALU64_REG(BPF_ADD, BPF_REG_3, BPF_REG_7),
+       BPF_ALU64_IMM(BPF_ADD, BPF_REG_3, 1),
+       /* if r6 > r7 goto +1    ; no new information about the state is derived from
+        *                       ; this check, thus produced verifier states differ
+        *                       ; only in 'insn_idx'
+        * r2 = r3               ; optionally share ID between r2 and r3
+        */
+       BPF_JMP_REG(BPF_JNE, BPF_REG_6, BPF_REG_7, 1),
+       BPF_MOV64_REG(BPF_REG_2, BPF_REG_3),
+       /* if r3 > ctx->data_end goto exit */
+       BPF_JMP_REG(BPF_JGT, BPF_REG_3, BPF_REG_4, 1),
+       /* r5 = *(u8 *) (r2 - 1) ; access packet memory using r2,
+        *                       ; this is not always safe
+        */
+       BPF_LDX_MEM(BPF_B, BPF_REG_5, BPF_REG_2, -1),
+       /* exit(0) */
+       BPF_MOV64_IMM(BPF_REG_0, 0),
+       BPF_EXIT_INSN(),
+       },
+       .flags = BPF_F_TEST_STATE_FREQ,
+       .result = REJECT,
+       .errstr = "invalid access to packet, off=0 size=1, R2",
+       .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+},
index 1f82021..17ee84d 100644 (file)
@@ -9,7 +9,7 @@
        },
        .fixup_map_array_48b = { 1 },
        .result_unpriv = REJECT,
-       .errstr_unpriv = "bpf_array access is allowed only to CAP_PERFMON and CAP_SYS_ADMIN",
+       .errstr_unpriv = "access is allowed only to CAP_PERFMON and CAP_SYS_ADMIN",
        .result = REJECT,
        .errstr = "R1 is bpf_array invalid negative access: off=-8",
 },
@@ -26,7 +26,7 @@
        },
        .fixup_map_array_48b = { 3 },
        .result_unpriv = REJECT,
-       .errstr_unpriv = "bpf_array access is allowed only to CAP_PERFMON and CAP_SYS_ADMIN",
+       .errstr_unpriv = "access is allowed only to CAP_PERFMON and CAP_SYS_ADMIN",
        .result = REJECT,
        .errstr = "only read from bpf_array is supported",
 },
@@ -41,7 +41,7 @@
        },
        .fixup_map_array_48b = { 1 },
        .result_unpriv = REJECT,
-       .errstr_unpriv = "bpf_array access is allowed only to CAP_PERFMON and CAP_SYS_ADMIN",
+       .errstr_unpriv = "access is allowed only to CAP_PERFMON and CAP_SYS_ADMIN",
        .result = REJECT,
        .errstr = "cannot access ptr member ops with moff 0 in struct bpf_map with off 1 size 4",
        .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
@@ -57,7 +57,7 @@
        },
        .fixup_map_array_48b = { 1 },
        .result_unpriv = REJECT,
-       .errstr_unpriv = "bpf_array access is allowed only to CAP_PERFMON and CAP_SYS_ADMIN",
+       .errstr_unpriv = "access is allowed only to CAP_PERFMON and CAP_SYS_ADMIN",
        .result = ACCEPT,
        .retval = 1,
 },
index 84838fe..92e3f6a 100644 (file)
@@ -28,7 +28,7 @@
        },
        .fixup_map_ringbuf = { 1 },
        .result = REJECT,
-       .errstr = "dereference of modified ringbuf_mem ptr R1",
+       .errstr = "R1 must have zero offset when passed to release func",
 },
 {
        "ringbuf: invalid reservation offset 2",
index 781621f..eaf114f 100644 (file)
        .errstr = "inside bpf_spin_lock",
        .prog_type = BPF_PROG_TYPE_SCHED_CLS,
 },
+{
+       "spin_lock: regsafe compare reg->id for map value",
+       .insns = {
+       BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
+       BPF_LDX_MEM(BPF_W, BPF_REG_6, BPF_REG_6, offsetof(struct __sk_buff, mark)),
+       BPF_LD_MAP_FD(BPF_REG_1, 0),
+       BPF_MOV64_REG(BPF_REG_9, BPF_REG_1),
+       BPF_ST_MEM(BPF_W, BPF_REG_10, -4, 0),
+       BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+       BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4),
+       BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+       BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1),
+       BPF_EXIT_INSN(),
+       BPF_MOV64_REG(BPF_REG_7, BPF_REG_0),
+       BPF_MOV64_REG(BPF_REG_1, BPF_REG_9),
+       BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+       BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4),
+       BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+       BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1),
+       BPF_EXIT_INSN(),
+       BPF_MOV64_REG(BPF_REG_8, BPF_REG_0),
+       BPF_MOV64_REG(BPF_REG_1, BPF_REG_7),
+       BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 4),
+       BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_spin_lock),
+       BPF_JMP_IMM(BPF_JEQ, BPF_REG_6, 0, 1),
+       BPF_JMP_IMM(BPF_JA, 0, 0, 1),
+       BPF_MOV64_REG(BPF_REG_7, BPF_REG_8),
+       BPF_MOV64_REG(BPF_REG_1, BPF_REG_7),
+       BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 4),
+       BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_spin_unlock),
+       BPF_MOV64_IMM(BPF_REG_0, 0),
+       BPF_EXIT_INSN(),
+       },
+       .fixup_map_spin_lock = { 2 },
+       .result = REJECT,
+       .errstr = "bpf_spin_unlock of different lock",
+       .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+       .flags = BPF_F_TEST_STATE_FREQ,
+},
+/* Make sure that regsafe() compares ids for spin lock records using
+ * check_ids():
+ *  1: r9 = map_lookup_elem(...)  ; r9.id == 1
+ *  2: r8 = map_lookup_elem(...)  ; r8.id == 2
+ *  3: r7 = ktime_get_ns()
+ *  4: r6 = ktime_get_ns()
+ *  5: if r6 > r7 goto <9>
+ *  6: spin_lock(r8)
+ *  7: r9 = r8
+ *  8: goto <10>
+ *  9: spin_lock(r9)
+ * 10: spin_unlock(r9)             ; r9.id == 1 || r9.id == 2 and lock is active,
+ *                                 ; second visit to (10) should be considered safe
+ *                                 ; if check_ids() is used.
+ * 11: exit(0)
+ */
+{
+       "spin_lock: regsafe() check_ids() similar id mappings",
+       .insns = {
+       BPF_ST_MEM(BPF_W, BPF_REG_10, -4, 0),
+       /* r9 = map_lookup_elem(...) */
+       BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+       BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4),
+       BPF_LD_MAP_FD(BPF_REG_1,
+                     0),
+       BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
+       BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 24),
+       BPF_MOV64_REG(BPF_REG_9, BPF_REG_0),
+       /* r8 = map_lookup_elem(...) */
+       BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+       BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4),
+       BPF_LD_MAP_FD(BPF_REG_1,
+                     0),
+       BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
+       BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 18),
+       BPF_MOV64_REG(BPF_REG_8, BPF_REG_0),
+       /* r7 = ktime_get_ns() */
+       BPF_EMIT_CALL(BPF_FUNC_ktime_get_ns),
+       BPF_MOV64_REG(BPF_REG_7, BPF_REG_0),
+       /* r6 = ktime_get_ns() */
+       BPF_EMIT_CALL(BPF_FUNC_ktime_get_ns),
+       BPF_MOV64_REG(BPF_REG_6, BPF_REG_0),
+       /* if r6 > r7 goto +5      ; no new information about the state is derived from
+        *                         ; this check, thus produced verifier states differ
+        *                         ; only in 'insn_idx'
+        * spin_lock(r8)
+        * r9 = r8
+        * goto unlock
+        */
+       BPF_JMP_REG(BPF_JGT, BPF_REG_6, BPF_REG_7, 5),
+       BPF_MOV64_REG(BPF_REG_1, BPF_REG_8),
+       BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 4),
+       BPF_EMIT_CALL(BPF_FUNC_spin_lock),
+       BPF_MOV64_REG(BPF_REG_9, BPF_REG_8),
+       BPF_JMP_A(3),
+       /* spin_lock(r9) */
+       BPF_MOV64_REG(BPF_REG_1, BPF_REG_9),
+       BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 4),
+       BPF_EMIT_CALL(BPF_FUNC_spin_lock),
+       /* spin_unlock(r9) */
+       BPF_MOV64_REG(BPF_REG_1, BPF_REG_9),
+       BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 4),
+       BPF_EMIT_CALL(BPF_FUNC_spin_unlock),
+       /* exit(0) */
+       BPF_MOV64_IMM(BPF_REG_0, 0),
+       BPF_EXIT_INSN(),
+       },
+       .fixup_map_spin_lock = { 3, 10 },
+       .result = VERBOSE_ACCEPT,
+       .errstr = "28: safe",
+       .result_unpriv = REJECT,
+       .errstr_unpriv = "",
+       .prog_type = BPF_PROG_TYPE_CGROUP_SKB,
+       .flags = BPF_F_TEST_STATE_FREQ,
+},
index 3ecb70a..52a8bca 100644 (file)
        .prog_type = BPF_PROG_TYPE_SCHED_CLS,
        .result = ACCEPT,
 },
+{
+       "MAP_VALUE_OR_NULL check_ids() in regsafe()",
+       .insns = {
+       BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+       /* r9 = map_lookup_elem(...) */
+       BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+       BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+       BPF_LD_MAP_FD(BPF_REG_1,
+                     0),
+       BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
+       BPF_MOV64_REG(BPF_REG_9, BPF_REG_0),
+       /* r8 = map_lookup_elem(...) */
+       BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+       BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+       BPF_LD_MAP_FD(BPF_REG_1,
+                     0),
+       BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
+       BPF_MOV64_REG(BPF_REG_8, BPF_REG_0),
+       /* r7 = ktime_get_ns() */
+       BPF_EMIT_CALL(BPF_FUNC_ktime_get_ns),
+       BPF_MOV64_REG(BPF_REG_7, BPF_REG_0),
+       /* r6 = ktime_get_ns() */
+       BPF_EMIT_CALL(BPF_FUNC_ktime_get_ns),
+       BPF_MOV64_REG(BPF_REG_6, BPF_REG_0),
+       /* if r6 > r7 goto +1    ; no new information about the state is derived from
+        *                       ; this check, thus produced verifier states differ
+        *                       ; only in 'insn_idx'
+        * r9 = r8               ; optionally share ID between r9 and r8
+        */
+       BPF_JMP_REG(BPF_JGT, BPF_REG_6, BPF_REG_7, 1),
+       BPF_MOV64_REG(BPF_REG_9, BPF_REG_8),
+       /* if r9 == 0 goto <exit> */
+       BPF_JMP_IMM(BPF_JEQ, BPF_REG_9, 0, 1),
+       /* read map value via r8, this is not always
+        * safe because r8 might not be equal to r9.
+        */
+       BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_8, 0),
+       /* exit 0 */
+       BPF_MOV64_IMM(BPF_REG_0, 0),
+       BPF_EXIT_INSN(),
+       },
+       .flags = BPF_F_TEST_STATE_FREQ,
+       .fixup_map_hash_8b = { 3, 9 },
+       .result = REJECT,
+       .errstr = "R8 invalid mem access 'map_value_or_null'",
+       .result_unpriv = REJECT,
+       .errstr_unpriv = "",
+       .prog_type = BPF_PROG_TYPE_CGROUP_SKB,
+},
@@ -31,7 +31,7 @@
 # |    2001:db8:10::2/64    |
 # +-------------------------+
 
-lib_dir=$(dirname $0)/../../../../net/forwarding
+lib_dir=$(dirname $0)/../../../net/forwarding
 
 ALL_TESTS="
        decap_error_test
index ee38ca8..9cc8411 100644 (file)
@@ -2,6 +2,7 @@
 bind_bhash
 csum
 cmsg_sender
+diag_uid
 fin_ack_lat
 gro
 hwtstamp_config
index 969620a..1e4b397 100644 (file)
@@ -1,3 +1,3 @@
-TEST_GEN_PROGS := test_unix_oob unix_connect
+TEST_GEN_PROGS := diag_uid test_unix_oob unix_connect
 
 include ../../lib.mk
diff --git a/tools/testing/selftests/net/af_unix/diag_uid.c b/tools/testing/selftests/net/af_unix/diag_uid.c
new file mode 100644 (file)
index 0000000..5b88f71
--- /dev/null
@@ -0,0 +1,178 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright Amazon.com Inc. or its affiliates. */
+
+#define _GNU_SOURCE
+#include <sched.h>
+
+#include <unistd.h>
+#include <linux/netlink.h>
+#include <linux/rtnetlink.h>
+#include <linux/sock_diag.h>
+#include <linux/unix_diag.h>
+#include <sys/socket.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <sys/un.h>
+
+#include "../../kselftest_harness.h"
+
+FIXTURE(diag_uid)
+{
+       int netlink_fd;
+       int unix_fd;
+       __u32 inode;
+       __u64 cookie;
+};
+
+FIXTURE_VARIANT(diag_uid)
+{
+       int unshare;
+       int udiag_show;
+};
+
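+/* The "uid" variant queries the socket in the initial user namespace;
+ * "uid_unshare" first unshares a user namespace, so the UID reported by
+ * sock_diag is expected to be translated into the receiver's namespace
+ * and still match getuid().
+ */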
+FIXTURE_VARIANT_ADD(diag_uid, uid)
+{
+       .unshare = 0,
+       .udiag_show = UDIAG_SHOW_UID
+};
+
+FIXTURE_VARIANT_ADD(diag_uid, uid_unshare)
+{
+       .unshare = CLONE_NEWUSER,
+       .udiag_show = UDIAG_SHOW_UID
+};
+
+FIXTURE_SETUP(diag_uid)
+{
+       struct stat file_stat;
+       socklen_t optlen;
+       int ret;
+
+       if (variant->unshare)
+               ASSERT_EQ(unshare(variant->unshare), 0);
+
+       self->netlink_fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_SOCK_DIAG);
+       ASSERT_NE(self->netlink_fd, -1);
+
+       self->unix_fd = socket(AF_UNIX, SOCK_STREAM, 0);
+       ASSERT_NE(self->unix_fd, -1);
+
+       ret = fstat(self->unix_fd, &file_stat);
+       ASSERT_EQ(ret, 0);
+
+       self->inode = file_stat.st_ino;
+
+       optlen = sizeof(self->cookie);
+       ret = getsockopt(self->unix_fd, SOL_SOCKET, SO_COOKIE, &self->cookie, &optlen);
+       ASSERT_EQ(ret, 0);
+}
+
+FIXTURE_TEARDOWN(diag_uid)
+{
+       close(self->netlink_fd);
+       close(self->unix_fd);
+}
+
+int send_request(struct __test_metadata *_metadata,
+                FIXTURE_DATA(diag_uid) *self,
+                const FIXTURE_VARIANT(diag_uid) *variant)
+{
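+       /* Build a single SOCK_DIAG_BY_FAMILY request keyed by inode and by
+        * the 64-bit socket cookie, which the uAPI carries as two __u32
+        * halves (low word first).
+        */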
+       struct {
+               struct nlmsghdr nlh;
+               struct unix_diag_req udr;
+       } req = {
+               .nlh = {
+                       .nlmsg_len = sizeof(req),
+                       .nlmsg_type = SOCK_DIAG_BY_FAMILY,
+                       .nlmsg_flags = NLM_F_REQUEST
+               },
+               .udr = {
+                       .sdiag_family = AF_UNIX,
+                       .udiag_ino = self->inode,
+                       .udiag_cookie = {
+                               (__u32)self->cookie,
+                               (__u32)(self->cookie >> 32)
+                       },
+                       .udiag_show = variant->udiag_show
+               }
+       };
+       struct sockaddr_nl nladdr = {
+               .nl_family = AF_NETLINK
+       };
+       struct iovec iov = {
+               .iov_base = &req,
+               .iov_len = sizeof(req)
+       };
+       struct msghdr msg = {
+               .msg_name = &nladdr,
+               .msg_namelen = sizeof(nladdr),
+               .msg_iov = &iov,
+               .msg_iovlen = 1
+       };
+
+       return sendmsg(self->netlink_fd, &msg, 0);
+}
+
+void render_response(struct __test_metadata *_metadata,
+                    struct unix_diag_msg *udm, __u32 len)
+{
+       unsigned int rta_len = len - NLMSG_LENGTH(sizeof(*udm));
+       struct rtattr *attr;
+       uid_t uid;
+
+       ASSERT_GT(len, sizeof(*udm));
+       ASSERT_EQ(udm->udiag_family, AF_UNIX);
+
+       attr = (struct rtattr *)(udm + 1);
+       ASSERT_NE(RTA_OK(attr, rta_len), 0);
+       ASSERT_EQ(attr->rta_type, UNIX_DIAG_UID);
+
+       uid = *(uid_t *)RTA_DATA(attr);
+       ASSERT_EQ(uid, getuid());
+}
+
+void receive_response(struct __test_metadata *_metadata,
+                     FIXTURE_DATA(diag_uid) *self)
+{
+       long buf[8192 / sizeof(long)];
+       struct sockaddr_nl nladdr = {
+               .nl_family = AF_NETLINK
+       };
+       struct iovec iov = {
+               .iov_base = buf,
+               .iov_len = sizeof(buf)
+       };
+       struct msghdr msg = {
+               .msg_name = &nladdr,
+               .msg_namelen = sizeof(nladdr),
+               .msg_iov = &iov,
+               .msg_iovlen = 1
+       };
+       struct nlmsghdr *nlh;
+       int ret;
+
+       ret = recvmsg(self->netlink_fd, &msg, 0);
+       ASSERT_GT(ret, 0);
+
+       nlh = (struct nlmsghdr *)buf;
+       ASSERT_NE(NLMSG_OK(nlh, ret), 0);
+       ASSERT_EQ(nlh->nlmsg_type, SOCK_DIAG_BY_FAMILY);
+
+       render_response(_metadata, NLMSG_DATA(nlh), nlh->nlmsg_len);
+
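+       /* A non-dump request (no NLM_F_DUMP) is answered with exactly one
+        * message, so stepping past it must not yield another valid header.
+        */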
+       nlh = NLMSG_NEXT(nlh, ret);
+       ASSERT_EQ(NLMSG_OK(nlh, ret), 0);
+}
+
+TEST_F(diag_uid, 1)
+{
+       int ret;
+
+       ret = send_request(_metadata, self, variant);
+       ASSERT_GT(ret, 0);
+
+       receive_response(_metadata, self);
+}
+
+TEST_HARNESS_MAIN
index a26cb94..4abaf16 100644 (file)
@@ -12,7 +12,7 @@ CCINCLUDE += -I$(SCRATCH_DIR)/include
 
 BPFOBJ := $(BUILD_DIR)/libbpf/libbpf.a
 
-MAKE_DIRS := $(BUILD_DIR)/libbpf
+MAKE_DIRS := $(BUILD_DIR)/libbpf $(OUTPUT)/bpf
 $(MAKE_DIRS):
        mkdir -p $@
 
@@ -37,8 +37,8 @@ endif
 
 CLANG_SYS_INCLUDES = $(call get_sys_includes,$(CLANG),$(CLANG_TARGET_ARCH))
 
-$(TEST_CUSTOM_PROGS): $(BPFOBJ)
-       $(CLANG) -O2 -target bpf -c $(@:.o=.c) $(CCINCLUDE) $(CLANG_SYS_INCLUDES) -o $@
+$(TEST_CUSTOM_PROGS): $(OUTPUT)/%.o: %.c $(BPFOBJ) | $(MAKE_DIRS)
+       $(CLANG) -O2 -target bpf -c $< $(CCINCLUDE) $(CLANG_SYS_INCLUDES) -o $@
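+# Static pattern rule: each $(OUTPUT)/%.o is built from its %.c source ($<
+# in the recipe), and "| $(MAKE_DIRS)" is an order-only prerequisite, so the
+# output directory is created without its timestamp forcing rebuilds.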
 
 $(BPFOBJ): $(wildcard $(BPFDIR)/*.[ch] $(BPFDIR)/Makefile)                    \
           $(APIDIR)/linux/bpf.h                                               \
index ead7963..bd89198 100644 (file)
@@ -43,5 +43,5 @@ CONFIG_NET_ACT_TUNNEL_KEY=m
 CONFIG_NET_ACT_MIRRED=m
 CONFIG_BAREUDP=m
 CONFIG_IPV6_IOAM6_LWTUNNEL=y
-CONFIG_CRYPTO_SM4=y
+CONFIG_CRYPTO_SM4_GENERIC=y
 CONFIG_AMT=m
index 2271a87..5637b5d 100755 (executable)
@@ -1711,13 +1711,21 @@ ipv4_del_addr_test()
 
        $IP addr add dev dummy1 172.16.104.1/24
        $IP addr add dev dummy1 172.16.104.11/24
+       $IP addr add dev dummy1 172.16.104.12/24
+       $IP addr add dev dummy1 172.16.104.13/24
        $IP addr add dev dummy2 172.16.104.1/24
        $IP addr add dev dummy2 172.16.104.11/24
+       $IP addr add dev dummy2 172.16.104.12/24
        $IP route add 172.16.105.0/24 via 172.16.104.2 src 172.16.104.11
+       $IP route add 172.16.106.0/24 dev lo src 172.16.104.12
+       $IP route add table 0 172.16.107.0/24 via 172.16.104.2 src 172.16.104.13
        $IP route add vrf red 172.16.105.0/24 via 172.16.104.2 src 172.16.104.11
+       $IP route add vrf red 172.16.106.0/24 dev lo src 172.16.104.12
        set +e
 
        # removing address from device in vrf should only remove route from vrf table
+       echo "    Regular FIB info"
+
        $IP addr del dev dummy2 172.16.104.11/24
        $IP ro ls vrf red | grep -q 172.16.105.0/24
        log_test $? 1 "Route removed from VRF when source address deleted"
@@ -1735,6 +1743,35 @@ ipv4_del_addr_test()
        $IP ro ls vrf red | grep -q 172.16.105.0/24
        log_test $? 0 "Route in VRF is not removed by address delete"
 
+       # removing address from device in vrf should only remove route from vrf
+       # table even when the associated fib info only differs in table ID
+       echo "    Identical FIB info with different table ID"
+
+       $IP addr del dev dummy2 172.16.104.12/24
+       $IP ro ls vrf red | grep -q 172.16.106.0/24
+       log_test $? 1 "Route removed from VRF when source address deleted"
+
+       $IP ro ls | grep -q 172.16.106.0/24
+       log_test $? 0 "Route in default VRF not removed"
+
+       $IP addr add dev dummy2 172.16.104.12/24
+       $IP route add vrf red 172.16.106.0/24 dev lo src 172.16.104.12
+
+       $IP addr del dev dummy1 172.16.104.12/24
+       $IP ro ls | grep -q 172.16.106.0/24
+       log_test $? 1 "Route removed in default VRF when source address deleted"
+
+       $IP ro ls vrf red | grep -q 172.16.106.0/24
+       log_test $? 0 "Route in VRF is not removed by address delete"
+
+       # removing address from device in default vrf should remove route from
+       # the default vrf even when route was inserted with a table ID of 0.
+       echo "    Table ID 0"
+
+       $IP addr del dev dummy1 172.16.104.13/24
+       $IP ro ls | grep -q 172.16.107.0/24
+       log_test $? 1 "Route removed in default VRF when source address deleted"
+
        $IP li del dummy1
        $IP li del dummy2
        cleanup
index a9c5c1b..453ae00 100644 (file)
@@ -3,6 +3,7 @@
 TEST_PROGS = bridge_igmp.sh \
        bridge_locked_port.sh \
        bridge_mdb.sh \
+       bridge_mdb_host.sh \
        bridge_mdb_port_down.sh \
        bridge_mld.sh \
        bridge_port_isolation.sh \
index b1ba687..2fa5973 100755 (executable)
 #!/bin/bash
 # SPDX-License-Identifier: GPL-2.0
-#
-# Verify that adding host mdb entries work as intended for all types of
-# multicast filters: ipv4, ipv6, and mac
 
-ALL_TESTS="mdb_add_del_test"
-NUM_NETIFS=2
+# +-----------------------+                          +------------------------+
+# | H1 (vrf)              |                          | H2 (vrf)               |
+# | + $h1.10              |                          | + $h2.10               |
+# | | 192.0.2.1/28        |                          | | 192.0.2.2/28         |
+# | | 2001:db8:1::1/64    |                          | | 2001:db8:1::2/64     |
+# | |                     |                          | |                      |
+# | |  + $h1.20           |                          | |  + $h2.20            |
+# | \  | 198.51.100.1/24  |                          | \  | 198.51.100.2/24   |
+# |  \ | 2001:db8:2::1/64 |                          |  \ | 2001:db8:2::2/64  |
+# |   \|                  |                          |   \|                   |
+# |    + $h1              |                          |    + $h2               |
+# +----|------------------+                          +----|-------------------+
+#      |                                                  |
+# +----|--------------------------------------------------|-------------------+
+# | SW |                                                  |                   |
+# | +--|--------------------------------------------------|-----------------+ |
+# | |  + $swp1                   BR0 (802.1q)             + $swp2           | |
+# | |     vid 10                                             vid 10         | |
+# | |     vid 20                                             vid 20         | |
+# | |                                                                       | |
+# | +-----------------------------------------------------------------------+ |
+# +---------------------------------------------------------------------------+
 
-TEST_GROUP_IP4="225.1.2.3"
-TEST_GROUP_IP6="ff02::42"
-TEST_GROUP_MAC="01:00:01:c0:ff:ee"
+ALL_TESTS="
+       cfg_test
+       fwd_test
+       ctrl_test
+"
 
+NUM_NETIFS=4
 source lib.sh
+source tc_common.sh
 
 h1_create()
 {
-       simple_if_init $h1 192.0.2.1/24 2001:db8:1::1/64
+       simple_if_init $h1
+       vlan_create $h1 10 v$h1 192.0.2.1/28 2001:db8:1::1/64
+       vlan_create $h1 20 v$h1 198.51.100.1/24 2001:db8:2::1/64
 }
 
 h1_destroy()
 {
-       simple_if_fini $h1 192.0.2.1/24 2001:db8:1::1/64
+       vlan_destroy $h1 20
+       vlan_destroy $h1 10
+       simple_if_fini $h1
 }
 
-switch_create()
+h2_create()
 {
-       # Enable multicast filtering
-       ip link add dev br0 type bridge mcast_snooping 1
+       simple_if_init $h2
+       vlan_create $h2 10 v$h2 192.0.2.2/28
+       vlan_create $h2 20 v$h2 198.51.100.2/24
+}
 
-       ip link set dev $swp1 master br0
+h2_destroy()
+{
+       vlan_destroy $h2 20
+       vlan_destroy $h2 10
+       simple_if_fini $h2
+}
 
+switch_create()
+{
+       ip link add name br0 type bridge vlan_filtering 1 vlan_default_pvid 0 \
+               mcast_snooping 1 mcast_igmp_version 3 mcast_mld_version 2
+       bridge vlan add vid 10 dev br0 self
+       bridge vlan add vid 20 dev br0 self
        ip link set dev br0 up
+
+       ip link set dev $swp1 master br0
        ip link set dev $swp1 up
+       bridge vlan add vid 10 dev $swp1
+       bridge vlan add vid 20 dev $swp1
+
+       ip link set dev $swp2 master br0
+       ip link set dev $swp2 up
+       bridge vlan add vid 10 dev $swp2
+       bridge vlan add vid 20 dev $swp2
+
+       tc qdisc add dev br0 clsact
+       tc qdisc add dev $h2 clsact
 }
 
 switch_destroy()
 {
+       tc qdisc del dev $h2 clsact
+       tc qdisc del dev br0 clsact
+
+       bridge vlan del vid 20 dev $swp2
+       bridge vlan del vid 10 dev $swp2
+       ip link set dev $swp2 down
+       ip link set dev $swp2 nomaster
+
+       bridge vlan del vid 20 dev $swp1
+       bridge vlan del vid 10 dev $swp1
        ip link set dev $swp1 down
+       ip link set dev $swp1 nomaster
+
+       ip link set dev br0 down
+       bridge vlan del vid 20 dev br0 self
+       bridge vlan del vid 10 dev br0 self
        ip link del dev br0
 }
 
@@ -45,9 +110,14 @@ setup_prepare()
        h1=${NETIFS[p1]}
        swp1=${NETIFS[p2]}
 
+       swp2=${NETIFS[p3]}
+       h2=${NETIFS[p4]}
+
        vrf_prepare
+       forwarding_enable
 
        h1_create
+       h2_create
        switch_create
 }
 
@@ -56,48 +126,1039 @@ cleanup()
        pre_cleanup
 
        switch_destroy
+       h2_destroy
        h1_destroy
 
+       forwarding_restore
        vrf_cleanup
 }
 
-do_mdb_add_del()
+cfg_test_host_common()
+{
+       local name=$1; shift
+       local grp=$1; shift
+       local src=$1; shift
+       local state=$1; shift
+       local invalid_state=$1; shift
+
+       RET=0
+
+       # Check basic add, replace and delete behavior.
+       bridge mdb add dev br0 port br0 grp $grp $state vid 10
+       bridge mdb show dev br0 vid 10 | grep -q "$grp"
+       check_err $? "Failed to add $name host entry"
+
+       bridge mdb replace dev br0 port br0 grp $grp $state vid 10 &> /dev/null
+       check_fail $? "Managed to replace $name host entry"
+
+       bridge mdb del dev br0 port br0 grp $grp $state vid 10
+       bridge mdb show dev br0 vid 10 | grep -q "$grp"
+       check_fail $? "Failed to delete $name host entry"
+
+       # Check error cases.
+       bridge mdb add dev br0 port br0 grp $grp $invalid_state vid 10 \
+               &> /dev/null
+       check_fail $? "Managed to add $name host entry with a $invalid_state state"
+
+       bridge mdb add dev br0 port br0 grp $grp src $src $state vid 10 \
+               &> /dev/null
+       check_fail $? "Managed to add $name host entry with a source"
+
+       bridge mdb add dev br0 port br0 grp $grp $state vid 10 \
+               filter_mode exclude &> /dev/null
+       check_fail $? "Managed to add $name host entry with a filter mode"
+
+       bridge mdb add dev br0 port br0 grp $grp $state vid 10 \
+               source_list $src &> /dev/null
+       check_fail $? "Managed to add $name host entry with a source list"
+
+       bridge mdb add dev br0 port br0 grp $grp $state vid 10 \
+               proto 123 &> /dev/null
+       check_fail $? "Managed to add $name host entry with a protocol"
+
+       log_test "Common host entries configuration tests ($name)"
+}
+
+# Check configuration of host entries from all types.
+cfg_test_host()
+{
+       echo
+       log_info "# Host entries configuration tests"
+
+       cfg_test_host_common "IPv4" "239.1.1.1" "192.0.2.1" "temp" "permanent"
+       cfg_test_host_common "IPv6" "ff0e::1" "2001:db8:1::1" "temp" "permanent"
+       cfg_test_host_common "L2" "01:02:03:04:05:06" "00:00:00:00:00:01" \
+               "permanent" "temp"
+}
+
+cfg_test_port_common()
+{
+       local name=$1; shift
+       local grp_key=$1; shift
+
+       RET=0
+
+       # Check basic add, replace and delete behavior.
+       bridge mdb add dev br0 port $swp1 $grp_key permanent vid 10
+       bridge mdb show dev br0 vid 10 | grep -q "$grp_key"
+       check_err $? "Failed to add $name entry"
+
+       bridge mdb replace dev br0 port $swp1 $grp_key permanent vid 10 \
+               &> /dev/null
+       check_err $? "Failed to replace $name entry"
+
+       bridge mdb del dev br0 port $swp1 $grp_key permanent vid 10
+       bridge mdb show dev br0 vid 10 | grep -q "$grp_key"
+       check_fail $? "Failed to delete $name entry"
+
+       # Check default protocol and replacement.
+       bridge mdb add dev br0 port $swp1 $grp_key permanent vid 10
+       bridge -d mdb show dev br0 vid 10 | grep "$grp_key" | grep -q "static"
+       check_err $? "$name entry not added with default \"static\" protocol"
+
+       bridge mdb replace dev br0 port $swp1 $grp_key permanent vid 10 \
+               proto 123
+       bridge -d mdb show dev br0 vid 10 | grep "$grp_key" | grep -q "123"
+       check_err $? "Failed to replace protocol of $name entry"
+       bridge mdb del dev br0 port $swp1 $grp_key permanent vid 10
+
+       # Check behavior when VLAN is not specified.
+       bridge mdb add dev br0 port $swp1 $grp_key permanent
+       bridge mdb show dev br0 vid 10 | grep -q "$grp_key"
+       check_err $? "$name entry with VLAN 10 not added when VLAN was not specified"
+       bridge mdb show dev br0 vid 20 | grep -q "$grp_key"
+       check_err $? "$name entry with VLAN 20 not added when VLAN was not specified"
+
+       bridge mdb del dev br0 port $swp1 $grp_key permanent
+       bridge mdb show dev br0 vid 10 | grep -q "$grp_key"
+       check_fail $? "$name entry with VLAN 10 not deleted when VLAN was not specified"
+       bridge mdb show dev br0 vid 20 | grep -q "$grp_key"
+       check_fail $? "$name entry with VLAN 20 not deleted when VLAN was not specified"
+
+       # Check behavior when bridge port is down.
+       ip link set dev $swp1 down
+
+       bridge mdb add dev br0 port $swp1 $grp_key permanent vid 10
+       check_err $? "Failed to add $name permanent entry when bridge port is down"
+
+       bridge mdb del dev br0 port $swp1 $grp_key permanent vid 10
+
+       bridge mdb add dev br0 port $swp1 $grp_key temp vid 10 &> /dev/null
+       check_fail $? "Managed to add $name temporary entry when bridge port is down"
+
+       ip link set dev $swp1 up
+       setup_wait_dev $swp1
+
+       # Check error cases.
+       ip link set dev br0 down
+       bridge mdb add dev br0 port $swp1 $grp_key permanent vid 10 \
+               &> /dev/null
+       check_fail $? "Managed to add $name entry when bridge is down"
+       ip link set dev br0 up
+
+       ip link set dev br0 type bridge mcast_snooping 0
+       bridge mdb add dev br0 port $swp1 $grp_key permanent vid 10 \
+               &> /dev/null
+       check_fail $? "Managed to add $name entry when multicast snooping is disabled"
+       ip link set dev br0 type bridge mcast_snooping 1
+
+       bridge mdb add dev br0 port $swp1 $grp_key permanent vid 5000 \
+               &> /dev/null
+       check_fail $? "Managed to add $name entry with an invalid VLAN"
+
+       log_test "Common port group entries configuration tests ($name)"
+}
+
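+# Emit a comma-separated source list built from a prefix and a count, e.g.
+# "src_list_create 192.0.2. 3" yields "192.0.2.1,192.0.2.2,192.0.2.3".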
+src_list_create()
+{
+       local src_prefix=$1; shift
+       local num_srcs=$1; shift
+       local src_list
+       local i
+
+       for i in $(seq 1 $num_srcs); do
+               src_list=${src_list},${src_prefix}${i}
+       done
+
+       echo $src_list | cut -c 2-
+}
+
+__cfg_test_port_ip_star_g()
+{
+       local name=$1; shift
+       local grp=$1; shift
+       local invalid_grp=$1; shift
+       local src_prefix=$1; shift
+       local src1=${src_prefix}1
+       local src2=${src_prefix}2
+       local src3=${src_prefix}3
+       local max_srcs=31
+       local num_srcs
+
+       RET=0
+
+       bridge mdb add dev br0 port $swp1 grp $grp vid 10
+       bridge -d mdb show dev br0 vid 10 | grep "$grp" | grep -q "exclude"
+       check_err $? "Default filter mode is not \"exclude\""
+       bridge mdb del dev br0 port $swp1 grp $grp vid 10
+
+       # Check basic add and delete behavior.
+       bridge mdb add dev br0 port $swp1 grp $grp vid 10 filter_mode exclude \
+               source_list $src1
+       bridge -d mdb show dev br0 vid 10 | grep "$grp" | grep -q -v "src"
+       check_err $? "(*, G) entry not created"
+       bridge -d mdb show dev br0 vid 10 | grep "$grp" | grep -q "src $src1"
+       check_err $? "(S, G) entry not created"
+       bridge mdb del dev br0 port $swp1 grp $grp vid 10
+       bridge -d mdb show dev br0 vid 10 | grep "$grp" | grep -q -v "src"
+       check_fail $? "(*, G) entry not deleted"
+       bridge -d mdb show dev br0 vid 10 | grep "$grp" | grep -q "src $src1"
+       check_fail $? "(S, G) entry not deleted"
+
+       ## State (permanent / temp) tests.
+
+       # Check that group and source timer are not set for permanent entries.
+       bridge mdb add dev br0 port $swp1 grp $grp permanent vid 10 \
+               filter_mode exclude source_list $src1
+
+       bridge -d mdb show dev br0 vid 10 | grep "$grp" | grep -v "src" | \
+               grep -q "permanent"
+       check_err $? "(*, G) entry not added as \"permanent\" when should"
+       bridge -d mdb show dev br0 vid 10 | grep "$grp" | grep "src" | \
+               grep -q "permanent"
+       check_err $? "(S, G) entry not added as \"permanent\" when should"
+
+       bridge -d -s mdb show dev br0 vid 10 | grep "$grp" | grep -v "src" | \
+               grep -q " 0.00"
+       check_err $? "(*, G) \"permanent\" entry has a pending group timer"
+       bridge -d -s mdb show dev br0 vid 10 | grep "$grp" | grep -v "src" | \
+               grep -q "\/0.00"
+       check_err $? "\"permanent\" source entry has a pending source timer"
+
+       bridge mdb del dev br0 port $swp1 grp $grp vid 10
+
+       # Check that group timer is set for temporary (*, G) EXCLUDE, but not
+       # the source timer.
+       bridge mdb add dev br0 port $swp1 grp $grp temp vid 10 \
+               filter_mode exclude source_list $src1
+
+       bridge -d mdb show dev br0 vid 10 | grep "$grp" | grep -v "src" | \
+               grep -q "temp"
+       check_err $? "(*, G) EXCLUDE entry not added as \"temp\" when should"
+       bridge -d mdb show dev br0 vid 10 | grep "$grp" | grep "src" | \
+               grep -q "temp"
+       check_err $? "(S, G) \"blocked\" entry not added as \"temp\" when should"
+
+       bridge -d -s mdb show dev br0 vid 10 | grep "$grp" | grep -v "src" | \
+               grep -q " 0.00"
+       check_fail $? "(*, G) EXCLUDE entry does not have a pending group timer"
+       bridge -d -s mdb show dev br0 vid 10 | grep "$grp" | grep -v "src" | \
+               grep -q "\/0.00"
+       check_err $? "\"blocked\" source entry has a pending source timer"
+
+       bridge mdb del dev br0 port $swp1 grp $grp vid 10
+
+       # Check that group timer is not set for temporary (*, G) INCLUDE, but
+       # that the source timer is set.
+       bridge mdb add dev br0 port $swp1 grp $grp temp vid 10 \
+               filter_mode include source_list $src1
+
+       bridge -d mdb show dev br0 vid 10 | grep "$grp" | grep -v "src" | \
+               grep -q "temp"
+       check_err $? "(*, G) INCLUDE entry not added as \"temp\" when should"
+       bridge -d mdb show dev br0 vid 10 | grep "$grp" | grep "src" | \
+               grep -q "temp"
+       check_err $? "(S, G) entry not added as \"temp\" when should"
+
+       bridge -d -s mdb show dev br0 vid 10 | grep "$grp" | grep -v "src" | \
+               grep -q " 0.00"
+       check_err $? "(*, G) INCLUDE entry has a pending group timer"
+       bridge -d -s mdb show dev br0 vid 10 | grep "$grp" | grep -v "src" | \
+               grep -q "\/0.00"
+       check_fail $? "Source entry does not have a pending source timer"
+
+       bridge mdb del dev br0 port $swp1 grp $grp vid 10
+
+       # Check that group timer is never set for (S, G) entries.
+       bridge mdb add dev br0 port $swp1 grp $grp temp vid 10 \
+               filter_mode include source_list $src1
+
+       bridge -d -s mdb show dev br0 vid 10 | grep "$grp" | grep "src" | \
+               grep -q " 0.00"
+       check_err $? "(S, G) entry has a pending group timer"
+
+       bridge mdb del dev br0 port $swp1 grp $grp vid 10
+
+       ## Filter mode (include / exclude) tests.
+
+       # Check that (*, G) INCLUDE entries are added with correct filter mode
+       # and that (S, G) entries are not marked as "blocked".
+       bridge mdb add dev br0 port $swp1 grp $grp vid 10 \
+               filter_mode include source_list $src1
+
+       bridge -d mdb show dev br0 vid 10 | grep "$grp" | grep -v "src" | \
+               grep -q "include"
+       check_err $? "(*, G) INCLUDE not added with \"include\" filter mode"
+       bridge -d mdb show dev br0 vid 10 | grep "$grp" | grep "src" | \
+               grep -q "blocked"
+       check_fail $? "(S, G) entry marked as \"blocked\" when should not"
+
+       bridge mdb del dev br0 port $swp1 grp $grp vid 10
+
+       # Check that (*, G) EXCLUDE entries are added with correct filter mode
+       # and that (S, G) entries are marked as "blocked".
+       bridge mdb add dev br0 port $swp1 grp $grp vid 10 \
+               filter_mode exclude source_list $src1
+
+       bridge -d mdb show dev br0 vid 10 | grep "$grp" | grep -v "src" | \
+               grep -q "exclude"
+       check_err $? "(*, G) EXCLUDE not added with \"exclude\" filter mode"
+       bridge -d mdb show dev br0 vid 10 | grep "$grp" | grep "src" | \
+               grep -q "blocked"
+       check_err $? "(S, G) entry not marked as \"blocked\" when should"
+
+       bridge mdb del dev br0 port $swp1 grp $grp vid 10
+
+       ## Protocol tests.
+
+       # Check that (*, G) and (S, G) entries are added with the specified
+       # protocol.
+       bridge mdb add dev br0 port $swp1 grp $grp vid 10 \
+               filter_mode exclude source_list $src1 proto zebra
+
+       bridge -d mdb show dev br0 vid 10 | grep "$grp" | grep -v "src" | \
+               grep -q "zebra"
+       check_err $? "(*, G) entry not added with \"zebra\" protocol"
+       bridge -d mdb show dev br0 vid 10 | grep "$grp" | grep "src" | \
+               grep -q "zebra"
+       check_err $? "(S, G) entry not marked added with \"zebra\" protocol"
+
+       bridge mdb del dev br0 port $swp1 grp $grp vid 10
+
+       ## Replace tests.
+
+       # Check that state can be modified.
+       bridge mdb add dev br0 port $swp1 grp $grp temp vid 10 \
+               filter_mode exclude source_list $src1
+
+       bridge mdb replace dev br0 port $swp1 grp $grp permanent vid 10 \
+               filter_mode exclude source_list $src1
+       bridge -d mdb show dev br0 vid 10 | grep "$grp" | grep -v "src" | \
+               grep -q "permanent"
+       check_err $? "(*, G) entry not marked as \"permanent\" after replace"
+       bridge -d mdb show dev br0 vid 10 | grep "$grp" | grep "src" | \
+               grep -q "permanent"
+       check_err $? "(S, G) entry not marked as \"permanent\" after replace"
+
+       bridge mdb replace dev br0 port $swp1 grp $grp temp vid 10 \
+               filter_mode exclude source_list $src1
+       bridge -d mdb show dev br0 vid 10 | grep "$grp" | grep -v "src" | \
+               grep -q "temp"
+       check_err $? "(*, G) entry not marked as \"temp\" after replace"
+       bridge -d mdb show dev br0 vid 10 | grep "$grp" | grep "src" | \
+               grep -q "temp"
+       check_err $? "(S, G) entry not marked as \"temp\" after replace"
+
+       bridge mdb del dev br0 port $swp1 grp $grp vid 10
+
+       # Check that filter mode can be modified.
+       bridge mdb add dev br0 port $swp1 grp $grp temp vid 10 \
+               filter_mode exclude source_list $src1
+
+       bridge mdb replace dev br0 port $swp1 grp $grp temp vid 10 \
+               filter_mode include source_list $src1
+       bridge -d mdb show dev br0 vid 10 | grep "$grp" | grep -v "src" | \
+               grep -q "include"
+       check_err $? "(*, G) not marked with \"include\" filter mode after replace"
+       bridge -d mdb show dev br0 vid 10 | grep "$grp" | grep "src" | \
+               grep -q "blocked"
+       check_fail $? "(S, G) marked as \"blocked\" after replace"
+
+       bridge mdb replace dev br0 port $swp1 grp $grp temp vid 10 \
+               filter_mode exclude source_list $src1
+       bridge -d mdb show dev br0 vid 10 | grep "$grp" | grep -v "src" | \
+               grep -q "exclude"
+       check_err $? "(*, G) not marked with \"exclude\" filter mode after replace"
+       bridge -d mdb show dev br0 vid 10 | grep "$grp" | grep "src" | \
+               grep -q "blocked"
+       check_err $? "(S, G) not marked as \"blocked\" after replace"
+
+       bridge mdb del dev br0 port $swp1 grp $grp vid 10
+
+       # Check that sources can be added to and removed from the source list.
+       bridge mdb add dev br0 port $swp1 grp $grp temp vid 10 \
+               filter_mode exclude source_list $src1
+
+       bridge mdb replace dev br0 port $swp1 grp $grp temp vid 10 \
+               filter_mode exclude source_list $src1,$src2,$src3
+       bridge -d mdb show dev br0 vid 10 | grep "$grp" | grep -q "src $src1"
+       check_err $? "(S, G) entry for source $src1 not created after replace"
+       bridge -d mdb show dev br0 vid 10 | grep "$grp" | grep -q "src $src2"
+       check_err $? "(S, G) entry for source $src2 not created after replace"
+       bridge -d mdb show dev br0 vid 10 | grep "$grp" | grep -q "src $src3"
+       check_err $? "(S, G) entry for source $src3 not created after replace"
+
+       bridge mdb replace dev br0 port $swp1 grp $grp temp vid 10 \
+               filter_mode exclude source_list $src1,$src3
+       bridge -d mdb show dev br0 vid 10 | grep "$grp" | grep -q "src $src1"
+       check_err $? "(S, G) entry for source $src1 not created after second replace"
+       bridge -d mdb show dev br0 vid 10 | grep "$grp" | grep -q "src $src2"
+       check_fail $? "(S, G) entry for source $src2 created after second replace"
+       bridge -d mdb show dev br0 vid 10 | grep "$grp" | grep -q "src $src3"
+       check_err $? "(S, G) entry for source $src3 not created after second replace"
+
+       bridge mdb del dev br0 port $swp1 grp $grp vid 10
+
+       # Check that protocol can be modified.
+       bridge mdb add dev br0 port $swp1 grp $grp temp vid 10 \
+               filter_mode exclude source_list $src1 proto zebra
+
+       bridge mdb replace dev br0 port $swp1 grp $grp temp vid 10 \
+               filter_mode exclude source_list $src1 proto bgp
+       bridge -d mdb show dev br0 vid 10 | grep "$grp" | grep -v "src" | \
+               grep -q "bgp"
+       check_err $? "(*, G) protocol not changed to \"bgp\" after replace"
+       bridge -d mdb show dev br0 vid 10 | grep "$grp" | grep "src" | \
+               grep -q "bgp"
+       check_err $? "(S, G) protocol not changed to \"bgp\" after replace"
+
+       bridge mdb del dev br0 port $swp1 grp $grp vid 10
+
+       ## Star exclude tests.
+
+       # Check star exclude functionality. When adding a new EXCLUDE (*, G),
+       # it needs to be also added to all (S, G) entries for proper
+       # replication.
+       bridge mdb add dev br0 port $swp2 grp $grp vid 10 \
+               filter_mode include source_list $src1
+       bridge mdb add dev br0 port $swp1 grp $grp vid 10
+       bridge -d mdb show dev br0 vid 10 | grep "$swp1" | grep "$grp" | \
+               grep "$src1" | grep -q "added_by_star_ex"
+       check_err $? "\"added_by_star_ex\" entry not created after adding (*, G) entry"
+       bridge mdb del dev br0 port $swp1 grp $grp vid 10
+       bridge mdb del dev br0 port $swp2 grp $grp src $src1 vid 10
+
+       ## Error cases tests.
+
+       bridge mdb add dev br0 port $swp1 grp $invalid_grp vid 10 &> /dev/null
+       check_fail $? "Managed to add an entry with an invalid group"
+
+       bridge mdb add dev br0 port $swp1 grp $grp vid 10 filter_mode include \
+               &> /dev/null
+       check_fail $? "Managed to add an INCLUDE entry with an empty source list"
+
+       bridge mdb add dev br0 port $swp1 grp $grp vid 10 filter_mode include \
+               source_list $grp &> /dev/null
+       check_fail $? "Managed to add an entry with an invalid source in source list"
+
+       bridge mdb add dev br0 port $swp1 grp $grp vid 10 \
+               source_list $src1 &> /dev/null
+       check_fail $? "Managed to add an entry with a source list and no filter mode"
+
+       bridge mdb add dev br0 port $swp1 grp $grp vid 10 filter_mode include \
+               source_list $src1
+       bridge mdb add dev br0 port $swp1 grp $grp vid 10 filter_mode exclude \
+               source_list $src1 &> /dev/null
+       check_fail $? "Managed to replace an entry without using replace"
+       bridge mdb del dev br0 port $swp1 grp $grp vid 10
+
+       bridge mdb add dev br0 port $swp1 grp $grp src $src2 vid 10
+       bridge mdb add dev br0 port $swp1 grp $grp vid 10 filter_mode include \
+               source_list $src1,$src2,$src3 &> /dev/null
+       check_fail $? "Managed to add a source that already has a forwarding entry"
+       bridge mdb del dev br0 port $swp1 grp $grp src $src2 vid 10
+
+       # Check maximum number of sources.
+       bridge mdb add dev br0 port $swp1 grp $grp vid 10 filter_mode exclude \
+               source_list $(src_list_create $src_prefix $max_srcs)
+       num_srcs=$(bridge -d mdb show dev br0 vid 10 | grep "$grp" | \
+               grep "src" | wc -l)
+       [[ $num_srcs -eq $max_srcs ]]
+       check_err $? "Failed to configure maximum number of sources ($max_srcs)"
+       bridge mdb del dev br0 port $swp1 grp $grp vid 10
+
+       bridge mdb add dev br0 port $swp1 grp $grp vid 10 filter_mode exclude \
+               source_list $(src_list_create $src_prefix $((max_srcs + 1))) \
+               &> /dev/null
+       check_fail $? "Managed to exceed maximum number of sources ($max_srcs)"
+
+       log_test "$name (*, G) port group entries configuration tests"
+}
+
+cfg_test_port_ip_star_g()
+{
+       echo
+       log_info "# Port group entries configuration tests - (*, G)"
+
+       cfg_test_port_common "IPv4 (*, G)" "grp 239.1.1.1"
+       cfg_test_port_common "IPv6 (*, G)" "grp ff0e::1"
+       __cfg_test_port_ip_star_g "IPv4" "239.1.1.1" "224.0.0.1" "192.0.2."
+       __cfg_test_port_ip_star_g "IPv6" "ff0e::1" "ff02::1" "2001:db8:1::"
+}
+
+__cfg_test_port_ip_sg()
+{
+       local name=$1; shift
+       local grp=$1; shift
+       local src=$1; shift
+       local grp_key="grp $grp src $src"
+
+       RET=0
+
+       bridge mdb add dev br0 port $swp1 $grp_key vid 10
+       bridge -d mdb show dev br0 vid 10 | grep "$grp_key" | grep -q "include"
+       check_err $? "Default filter mode is not \"include\""
+       bridge mdb del dev br0 port $swp1 $grp_key vid 10
+
+       # Check that entries can be added as both permanent and temp and that
+       # group timer is set correctly.
+       bridge mdb add dev br0 port $swp1 $grp_key permanent vid 10
+       bridge -d mdb show dev br0 vid 10 | grep "$grp_key" | \
+               grep -q "permanent"
+       check_err $? "Entry not added as \"permanent\" when should"
+       bridge -d -s mdb show dev br0 vid 10 | grep "$grp_key" | \
+               grep -q "0.00"
+       check_err $? "\"permanent\" entry has a pending group timer"
+       bridge mdb del dev br0 port $swp1 $grp_key vid 10
+
+       bridge mdb add dev br0 port $swp1 $grp_key temp vid 10
+       bridge -d mdb show dev br0 vid 10 | grep "$grp_key" | \
+               grep -q "temp"
+       check_err $? "Entry not added as \"temp\" when should"
+       bridge -d -s mdb show dev br0 vid 10 | grep "$grp_key" | \
+               grep -q "0.00"
+       check_fail $? "\"temp\" entry has an unpending group timer"
+       bridge mdb del dev br0 port $swp1 $grp_key vid 10
+
+       # Check error cases.
+       bridge mdb add dev br0 port $swp1 $grp_key vid 10 \
+               filter_mode include &> /dev/null
+       check_fail $? "Managed to add an entry with a filter mode"
+
+       bridge mdb add dev br0 port $swp1 $grp_key vid 10 \
+               filter_mode include source_list $src &> /dev/null
+       check_fail $? "Managed to add an entry with a source list"
+
+       bridge mdb add dev br0 port $swp1 grp $grp src $grp vid 10 &> /dev/null
+       check_fail $? "Managed to add an entry with an invalid source"
+
+       bridge mdb add dev br0 port $swp1 $grp_key vid 10 temp
+       bridge mdb add dev br0 port $swp1 $grp_key vid 10 permanent &> /dev/null
+       check_fail $? "Managed to replace an entry without using replace"
+       bridge mdb del dev br0 port $swp1 $grp_key vid 10
+
+       # Check that we can replace available attributes.
+       bridge mdb add dev br0 port $swp1 $grp_key vid 10 proto 123
+       bridge mdb replace dev br0 port $swp1 $grp_key vid 10 proto 111
+       bridge -d mdb show dev br0 vid 10 | grep "$grp_key" | \
+               grep -q "111"
+       check_err $? "Failed to replace protocol"
+
+       bridge mdb replace dev br0 port $swp1 $grp_key vid 10 permanent
+       bridge -d mdb show dev br0 vid 10 | grep "$grp_key" | \
+               grep -q "permanent"
+       check_err $? "Entry not marked as \"permanent\" after replace"
+       bridge -d -s mdb show dev br0 vid 10 | grep "$grp_key" | \
+               grep -q "0.00"
+       check_err $? "Entry has a pending group timer after replace"
+
+       bridge mdb replace dev br0 port $swp1 $grp_key vid 10 temp
+       bridge -d mdb show dev br0 vid 10 | grep "$grp_key" | \
+               grep -q "temp"
+       check_err $? "Entry not marked as \"temp\" after replace"
+       bridge -d -s mdb show dev br0 vid 10 | grep "$grp_key" | \
+               grep -q "0.00"
+       check_fail $? "Entry has an unpending group timer after replace"
+       bridge mdb del dev br0 port $swp1 $grp_key vid 10
+
+       # Check star exclude functionality. When adding a (S, G), all matching
+       # (*, G) ports need to be added to it.
+       bridge mdb add dev br0 port $swp2 grp $grp vid 10
+       bridge mdb add dev br0 port $swp1 $grp_key vid 10
+       bridge mdb show dev br0 vid 10 | grep "$grp_key" | grep $swp2 | \
+               grep -q "added_by_star_ex"
+       check_err $? "\"added_by_star_ex\" entry not created after adding (S, G) entry"
+       bridge mdb del dev br0 port $swp1 $grp_key vid 10
+       bridge mdb del dev br0 port $swp2 grp $grp vid 10
+
+       log_test "$name (S, G) port group entries configuration tests"
+}
+
+cfg_test_port_ip_sg()
+{
+       echo
+       log_info "# Port group entries configuration tests - (S, G)"
+
+       cfg_test_port_common "IPv4 (S, G)" "grp 239.1.1.1 src 192.0.2.1"
+       cfg_test_port_common "IPv6 (S, G)" "grp ff0e::1 src 2001:db8:1::1"
+       __cfg_test_port_ip_sg "IPv4" "239.1.1.1" "192.0.2.1"
+       __cfg_test_port_ip_sg "IPv6" "ff0e::1" "2001:db8:1::1"
+}
+
+cfg_test_port_ip()
+{
+       cfg_test_port_ip_star_g
+       cfg_test_port_ip_sg
+}
+
+__cfg_test_port_l2()
+{
+       local grp="01:02:03:04:05:06"
+
+       RET=0
+
+       bridge mdb add dev br0 port $swp1 grp 00:01:02:03:04:05 \
+               permanent vid 10 &> /dev/null
+       check_fail $? "Managed to add an entry with unicast MAC"
+
+       bridge mdb add dev br0 port $swp1 grp $grp src 00:01:02:03:04:05 \
+               permanent vid 10 &> /dev/null
+       check_fail $? "Managed to add an entry with a source"
+
+       bridge mdb add dev br0 port $swp1 grp $grp permanent vid 10 \
+               filter_mode include &> /dev/null
+       check_fail $? "Managed to add an entry with a filter mode"
+
+       bridge mdb add dev br0 port $swp1 grp $grp permanent vid 10 \
+               source_list 00:01:02:03:04:05 &> /dev/null
+       check_fail $? "Managed to add an entry with a source list"
+
+       log_test "L2 (*, G) port group entries configuration tests"
+}
+
+cfg_test_port_l2()
+{
+       echo
+       log_info "# Port group entries configuration tests - L2"
+
+       cfg_test_port_common "L2 (*, G)" "grp 01:02:03:04:05:06"
+       __cfg_test_port_l2
+}
+
+# Check configuration of regular (port) entries of all types.
+cfg_test_port()
+{
+       cfg_test_port_ip
+       cfg_test_port_l2
+}
+
+cfg_test()
 {
-       local group=$1
-       local flag=$2
+       cfg_test_host
+       cfg_test_port
+}
+
+__fwd_test_host_ip()
+{
+       local grp=$1; shift
+       local src=$1; shift
+       local mode=$1; shift
+       local name
+       local eth_type
 
        RET=0
-       bridge mdb add dev br0 port br0 grp $group $flag 2>/dev/null
-       check_err $? "Failed adding $group to br0, port br0"
 
-       if [ -z "$flag" ]; then
-           flag="temp"
+       if [[ $mode == "-4" ]]; then
+               name="IPv4"
+               eth_type="ipv4"
+       else
+               name="IPv6"
+               eth_type="ipv6"
        fi
 
-       bridge mdb show dev br0 | grep $group | grep -q $flag 2>/dev/null
-       check_err $? "$group not added with $flag flag"
+       tc filter add dev br0 ingress protocol 802.1q pref 1 handle 1 flower \
+               vlan_ethtype $eth_type vlan_id 10 dst_ip $grp src_ip $src \
+               action drop
+
+       # Packet should only be flooded to multicast router ports when there is
+       # no matching MDB entry. The bridge is not configured as a multicast
+       # router port.
+       $MZ $mode $h1.10 -c 1 -p 128 -A $src -B $grp -t udp -q
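+       # tc_check_packets <qdisc spec> <filter handle> <expected hit count>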
+       tc_check_packets "dev br0 ingress" 1 0
+       check_err $? "Packet locally received after flood"
+
+       # Install a regular port group entry and expect the packet to not be
+       # locally received.
+       bridge mdb add dev br0 port $swp2 grp $grp temp vid 10
+       $MZ $mode $h1.10 -c 1 -p 128 -A $src -B $grp -t udp -q
+       tc_check_packets "dev br0 ingress" 1 0
+       check_err $? "Packet locally received after installing a regular entry"
+
+       # Add a host entry and expect the packet to be locally received.
+       bridge mdb add dev br0 port br0 grp $grp temp vid 10
+       $MZ $mode $h1.10 -c 1 -p 128 -A $src -B $grp -t udp -q
+       tc_check_packets "dev br0 ingress" 1 1
+       check_err $? "Packet not locally received after adding a host entry"
+
+       # Remove the host entry and expect the packet to not be locally
+       # received.
+       bridge mdb del dev br0 port br0 grp $grp vid 10
+       $MZ $mode $h1.10 -c 1 -p 128 -A $src -B $grp -t udp -q
+       tc_check_packets "dev br0 ingress" 1 1
+       check_err $? "Packet locally received after removing a host entry"
+
+       bridge mdb del dev br0 port $swp2 grp $grp vid 10
+
+       tc filter del dev br0 ingress protocol 802.1q pref 1 handle 1 flower
+
+       log_test "$name host entries forwarding tests"
+}
+
+fwd_test_host_ip()
+{
+       __fwd_test_host_ip "239.1.1.1" "192.0.2.1" "-4"
+       __fwd_test_host_ip "ff0e::1" "2001:db8:1::1" "-6"
+}
+
+fwd_test_host_l2()
+{
+       local dmac=01:02:03:04:05:06
+
+       RET=0
+
+       tc filter add dev br0 ingress protocol all pref 1 handle 1 flower \
+               dst_mac $dmac action drop
+
+       # Packet should be flooded and locally received when there is no
+       # matching MDB entry.
+       $MZ $h1.10 -c 1 -p 128 -a own -b $dmac -q
+       tc_check_packets "dev br0 ingress" 1 1
+       check_err $? "Packet not locally received after flood"
+
+       # Install a regular port group entry and expect the packet to not be
+       # locally received.
+       bridge mdb add dev br0 port $swp2 grp $dmac permanent vid 10
+       $MZ $h1.10 -c 1 -p 128 -a own -b $dmac -q
+       tc_check_packets "dev br0 ingress" 1 1
+       check_err $? "Packet locally received after installing a regular entry"
+
+       # Add a host entry and expect the packet to be locally received.
+       bridge mdb add dev br0 port br0 grp $dmac permanent vid 10
+       $MZ $h1.10 -c 1 -p 128 -a own -b $dmac -q
+       tc_check_packets "dev br0 ingress" 1 2
+       check_err $? "Packet not locally received after adding a host entry"
+
+       # Remove the host entry and expect the packet to not be locally
+       # received.
+       bridge mdb del dev br0 port br0 grp $dmac permanent vid 10
+       $MZ $h1.10 -c 1 -p 128 -a own -b $dmac -q
+       tc_check_packets "dev br0 ingress" 1 2
+       check_err $? "Packet locally received after removing a host entry"
+
+       bridge mdb del dev br0 port $swp2 grp $dmac permanent vid 10
+
+       tc filter del dev br0 ingress protocol all pref 1 handle 1 flower
+
+       log_test "L2 host entries forwarding tests"
+}
+
+fwd_test_host()
+{
+       # Disable multicast router on the bridge to ensure that packets are
+       # only locally received when a matching host entry is present.
+       ip link set dev br0 type bridge mcast_router 0
+
+       fwd_test_host_ip
+       fwd_test_host_l2
+
+       ip link set dev br0 type bridge mcast_router 1
+}
+
+__fwd_test_port_ip()
+{
+       local grp=$1; shift
+       local valid_src=$1; shift
+       local invalid_src=$1; shift
+       local mode=$1; shift
+       local filter_mode=$1; shift
+       local name
+       local eth_type
+       local src_list
+
+       RET=0
+
+       if [[ $mode == "-4" ]]; then
+               name="IPv4"
+               eth_type="ipv4"
+       else
+               name="IPv6"
+               eth_type="ipv6"
+       fi
+
+       # The valid source is the one we expect to get packets from after
+       # adding the entry.
+       if [[ $filter_mode == "include" ]]; then
+               src_list=$valid_src
+       else
+               src_list=$invalid_src
+       fi
+
+       tc filter add dev $h2 ingress protocol 802.1q pref 1 handle 1 flower \
+               vlan_ethtype $eth_type vlan_id 10 dst_ip $grp \
+               src_ip $valid_src action drop
+       tc filter add dev $h2 ingress protocol 802.1q pref 1 handle 2 flower \
+               vlan_ethtype $eth_type vlan_id 10 dst_ip $grp \
+               src_ip $invalid_src action drop
+
+       $MZ $mode $h1.10 -c 1 -p 128 -A $valid_src -B $grp -t udp -q
+       tc_check_packets "dev $h2 ingress" 1 0
+       check_err $? "Packet from valid source received on H2 before adding entry"
+
+       $MZ $mode $h1.10 -c 1 -p 128 -A $invalid_src -B $grp -t udp -q
+       tc_check_packets "dev $h2 ingress" 2 0
+       check_err $? "Packet from invalid source received on H2 before adding entry"
+
+       bridge mdb add dev br0 port $swp2 grp $grp vid 10 \
+               filter_mode $filter_mode source_list $src_list
 
-       bridge mdb del dev br0 port br0 grp $group 2>/dev/null
-       check_err $? "Failed deleting $group from br0, port br0"
+       $MZ $mode $h1.10 -c 1 -p 128 -A $valid_src -B $grp -t udp -q
+       tc_check_packets "dev $h2 ingress" 1 1
+       check_err $? "Packet from valid source not received on H2 after adding entry"
 
-       bridge mdb show dev br0 | grep -q $group >/dev/null
-       check_err_fail 1 $? "$group still in mdb after delete"
+       $MZ $mode $h1.10 -c 1 -p 128 -A $invalid_src -B $grp -t udp -q
+       tc_check_packets "dev $h2 ingress" 2 0
+       check_err $? "Packet from invalid source received on H2 after adding entry"
 
-       log_test "MDB add/del group $group to bridge port br0"
+       bridge mdb replace dev br0 port $swp2 grp $grp vid 10 \
+               filter_mode exclude
+
+       $MZ $mode $h1.10 -c 1 -p 128 -A $valid_src -B $grp -t udp -q
+       tc_check_packets "dev $h2 ingress" 1 2
+       check_err $? "Packet from valid source not received on H2 after allowing all sources"
+
+       $MZ $mode $h1.10 -c 1 -p 128 -A $invalid_src -B $grp -t udp -q
+       tc_check_packets "dev $h2 ingress" 2 1
+       check_err $? "Packet from invalid source not received on H2 after allowing all sources"
+
+       bridge mdb del dev br0 port $swp2 grp $grp vid 10
+
+       $MZ $mode $h1.10 -c 1 -p 128 -A $valid_src -B $grp -t udp -q
+       tc_check_packets "dev $h2 ingress" 1 2
+       check_err $? "Packet from valid source received on H2 after deleting entry"
+
+       $MZ $mode $h1.10 -c 1 -p 128 -A $invalid_src -B $grp -t udp -q
+       tc_check_packets "dev $h2 ingress" 2 1
+       check_err $? "Packet from invalid source received on H2 after deleting entry"
+
+       tc filter del dev $h2 ingress protocol 802.1q pref 1 handle 2 flower
+       tc filter del dev $h2 ingress protocol 802.1q pref 1 handle 1 flower
+
+       log_test "$name port group \"$filter_mode\" entries forwarding tests"
+}
+
+fwd_test_port_ip()
+{
+       __fwd_test_port_ip "239.1.1.1" "192.0.2.1" "192.0.2.2" "-4" "exclude"
+       __fwd_test_port_ip "ff0e::1" "2001:db8:1::1" "2001:db8:1::2" "-6" \
+               "exclude"
+       __fwd_test_port_ip "239.1.1.1" "192.0.2.1" "192.0.2.2" "-4" "include"
+       __fwd_test_port_ip "ff0e::1" "2001:db8:1::1" "2001:db8:1::2" "-6" \
+               "include"
+}
+
+fwd_test_port_l2()
+{
+       local dmac=01:02:03:04:05:06
+
+       RET=0
+
+       tc filter add dev $h2 ingress protocol all pref 1 handle 1 flower \
+               dst_mac $dmac action drop
+
+       $MZ $h1.10 -c 1 -p 128 -a own -b $dmac -q
+       tc_check_packets "dev $h2 ingress" 1 0
+       check_err $? "Packet received on H2 before adding entry"
+
+       bridge mdb add dev br0 port $swp2 grp $dmac permanent vid 10
+       $MZ $h1.10 -c 1 -p 128 -a own -b $dmac -q
+       tc_check_packets "dev $h2 ingress" 1 1
+       check_err $? "Packet not received on H2 after adding entry"
+
+       bridge mdb del dev br0 port $swp2 grp $dmac permanent vid 10
+       $MZ $h1.10 -c 1 -p 128 -a own -b $dmac -q
+       tc_check_packets "dev $h2 ingress" 1 1
+       check_err $? "Packet received on H2 after deleting entry"
+
+       tc filter del dev $h2 ingress protocol all pref 1 handle 1 flower
+
+       log_test "L2 port entries forwarding tests"
+}
+
+fwd_test_port()
+{
+       # Disable multicast flooding to ensure that packets are only forwarded
+       # out of a port when a matching port group entry is present.
+       bridge link set dev $swp2 mcast_flood off
+
+       fwd_test_port_ip
+       fwd_test_port_l2
+
+       bridge link set dev $swp2 mcast_flood on
+}
+
+fwd_test()
+{
+       echo
+       log_info "# Forwarding tests"
+
+       # Forwarding according to MDB entries only takes place when the bridge
+       # detects that there is a valid querier in the network. Set the bridge
+       # as the querier and assign it a valid IPv6 link-local address to be
+       # used as the source address for MLD queries.
+       ip -6 address add fe80::1/64 nodad dev br0
+       ip link set dev br0 type bridge mcast_querier 1
+       # Wait the default Query Response Interval (10 seconds) for the bridge
+       # to determine that there are no other queriers in the network.
+       sleep 10
+
+       fwd_test_host
+       fwd_test_port
+
+       ip link set dev br0 type bridge mcast_querier 0
+       ip -6 address del fe80::1/64 dev br0
 }
 
-mdb_add_del_test()
+igmpv3_is_in_get()
 {
-       do_mdb_add_del $TEST_GROUP_MAC permanent
-       do_mdb_add_del $TEST_GROUP_IP4
-       do_mdb_add_del $TEST_GROUP_IP6
+       local igmpv3
+
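+       # Raw IGMPv3 Membership Report hex, fed to $MZ below via
+       # "-t ip proto=2,p=...". Each $(: ...) expands to nothing and only
+       # annotates the field; the checksum is precomputed for these exact
+       # addresses.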
+       igmpv3=$(:
+               )"22:"$(                        : Type - Membership Report
+               )"00:"$(                        : Reserved
+               )"2a:f8:"$(                     : Checksum
+               )"00:00:"$(                     : Reserved
+               )"00:01:"$(                     : Number of Group Records
+               )"01:"$(                        : Record Type - IS_IN
+               )"00:"$(                        : Aux Data Len
+               )"00:01:"$(                     : Number of Sources
+               )"ef:01:01:01:"$(               : Multicast Address - 239.1.1.1
+               )"c0:00:02:02"$(                : Source Address - 192.0.2.2
+               )
+
+       echo $igmpv3
+}
+
+ctrl_igmpv3_is_in_test()
+{
+       RET=0
+
+       # Add a permanent entry and check that it is not affected by the
+       # received IGMP packet.
+       bridge mdb add dev br0 port $swp1 grp 239.1.1.1 permanent vid 10 \
+               filter_mode include source_list 192.0.2.1
+
+       # IS_IN ( 192.0.2.2 )
+       $MZ $h1.10 -c 1 -A 192.0.2.1 -B 239.1.1.1 \
+               -t ip proto=2,p=$(igmpv3_is_in_get) -q
+
+       bridge -d mdb show dev br0 vid 10 | grep 239.1.1.1 | grep -q 192.0.2.2
+       check_fail $? "Permanent entry affected by IGMP packet"
+
+       # Replace the permanent entry with a temporary one and check that after
+       # processing the IGMP packet, a new source is added to the list along
+       # with a new forwarding entry.
+       bridge mdb replace dev br0 port $swp1 grp 239.1.1.1 temp vid 10 \
+               filter_mode include source_list 192.0.2.1
+
+       # IS_IN ( 192.0.2.2 )
+       $MZ $h1.10 -c 1 -A 192.0.2.1 -B 239.1.1.1 \
+               -t ip proto=2,p=$(igmpv3_is_in_get) -q
+
+       bridge -d mdb show dev br0 vid 10 | grep 239.1.1.1 | grep -v "src" | \
+               grep -q 192.0.2.2
+       check_err $? "Source not add to source list"
+
+       bridge -d mdb show dev br0 vid 10 | grep 239.1.1.1 | \
+               grep -q "src 192.0.2.2"
+       check_err $? "(S, G) entry not created for new source"
+
+       bridge mdb del dev br0 port $swp1 grp 239.1.1.1 vid 10
+
+       log_test "IGMPv3 MODE_IS_INCLUE tests"
+}
+
+mldv2_is_in_get()
+{
+       local hbh
+       local icmpv6
+
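+       # Raw Hop-by-Hop option plus MLDv2 Report hex, sent below with
+       # "hop=1,next=0" (Hop-by-Hop); the ICMPv6 checksum is precomputed
+       # for these exact addresses.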
+       hbh=$(:
+               )"3a:"$(                        : Next Header - ICMPv6
+               )"00:"$(                        : Hdr Ext Len
+               )"00:00:00:00:00:00:"$(         : Options and Padding
+               )
+
+       icmpv6=$(:
+               )"8f:"$(                        : Type - MLDv2 Report
+               )"00:"$(                        : Code
+               )"45:39:"$(                     : Checksum
+               )"00:00:"$(                     : Reserved
+               )"00:01:"$(                     : Number of Group Records
+               )"01:"$(                        : Record Type - IS_IN
+               )"00:"$(                        : Aux Data Len
+               )"00:01:"$(                     : Number of Sources
+               )"ff:0e:00:00:00:00:00:00:"$(   : Multicast address - ff0e::1
+               )"00:00:00:00:00:00:00:01:"$(   :
+               )"20:01:0d:b8:00:01:00:00:"$(   : Source Address - 2001:db8:1::2
+               )"00:00:00:00:00:00:00:02:"$(   :
+               )
+
+       echo ${hbh}${icmpv6}
+}
+
+ctrl_mldv2_is_in_test()
+{
+       RET=0
+
+       # Add a permanent entry and check that it is not affected by the
+       # received MLD packet.
+       bridge mdb add dev br0 port $swp1 grp ff0e::1 permanent vid 10 \
+               filter_mode include source_list 2001:db8:1::1
+
+       # IS_IN ( 2001:db8:1::2 )
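+       # (hop=1 sets the hop limit MLD requires; next=0 marks the payload as
+       # starting with the Hop-by-Hop extension header built above)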
+       $MZ -6 $h1.10 -c 1 -A fe80::1 -B ff0e::1 \
+               -t ip hop=1,next=0,p=$(mldv2_is_in_get) -q
+
+       bridge -d mdb show dev br0 vid 10 | grep ff0e::1 | \
+               grep -q 2001:db8:1::2
+       check_fail $? "Permanent entry affected by MLD packet"
+
+       # Replace the permanent entry with a temporary one and check that after
+       # processing the MLD packet, a new source is added to the list along
+       # with a new forwarding entry.
+       bridge mdb replace dev br0 port $swp1 grp ff0e::1 temp vid 10 \
+               filter_mode include source_list 2001:db8:1::1
+
+       # IS_IN ( 2001:db8:1::2 )
+       $MZ -6 $h1.10 -c 1 -A fe80::1 -B ff0e::1 \
+               -t ip hop=1,next=0,p=$(mldv2_is_in_get) -q
+
+       bridge -d mdb show dev br0 vid 10 | grep ff0e::1 | grep -v "src" | \
+               grep -q 2001:db8:1::2
+       check_err $? "Source not add to source list"
+
+       bridge -d mdb show dev br0 vid 10 | grep ff0e::1 | \
+               grep -q "src 2001:db8:1::2"
+       check_err $? "(S, G) entry not created for new source"
+
+       bridge mdb del dev br0 port $swp1 grp ff0e::1 vid 10
+
+       log_test "MLDv2 MODE_IS_INCLUDE tests"
+}
+
+ctrl_test()
+{
+       echo
+       log_info "# Control packets tests"
+
+       ctrl_igmpv3_is_in_test
+       ctrl_mldv2_is_in_test
 }
 
 trap cleanup EXIT
 
 setup_prepare
 setup_wait
-
 tests_run
 
 exit $EXIT_STATUS
diff --git a/tools/testing/selftests/net/forwarding/bridge_mdb_host.sh b/tools/testing/selftests/net/forwarding/bridge_mdb_host.sh
new file mode 100755 (executable)
index 0000000..b1ba687
--- /dev/null
@@ -0,0 +1,103 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# Verify that adding host mdb entries works as intended for all types of
+# multicast filters: IPv4, IPv6, and MAC.
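+#
+# A "host" entry is one whose destination port is the bridge device itself,
+# so matching traffic is delivered locally instead of being forwarded.
+# Illustrative invocation (mirroring what do_mdb_add_del() below runs):
+#
+#   bridge mdb add dev br0 port br0 grp 225.1.2.3 permanent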
+
+ALL_TESTS="mdb_add_del_test"
+NUM_NETIFS=2
+
+TEST_GROUP_IP4="225.1.2.3"
+TEST_GROUP_IP6="ff02::42"
+TEST_GROUP_MAC="01:00:01:c0:ff:ee"
+
+source lib.sh
+
+h1_create()
+{
+       simple_if_init $h1 192.0.2.1/24 2001:db8:1::1/64
+}
+
+h1_destroy()
+{
+       simple_if_fini $h1 192.0.2.1/24 2001:db8:1::1/64
+}
+
+switch_create()
+{
+       # Enable multicast filtering
+       ip link add dev br0 type bridge mcast_snooping 1
+
+       ip link set dev $swp1 master br0
+
+       ip link set dev br0 up
+       ip link set dev $swp1 up
+}
+
+switch_destroy()
+{
+       ip link set dev $swp1 down
+       ip link del dev br0
+}
+
+setup_prepare()
+{
+       h1=${NETIFS[p1]}
+       swp1=${NETIFS[p2]}
+
+       vrf_prepare
+
+       h1_create
+       switch_create
+}
+
+cleanup()
+{
+       pre_cleanup
+
+       switch_destroy
+       h1_destroy
+
+       vrf_cleanup
+}
+
+do_mdb_add_del()
+{
+       local group=$1
+       local flag=$2
+
+       RET=0
+       bridge mdb add dev br0 port br0 grp $group $flag 2>/dev/null
+       check_err $? "Failed adding $group to br0, port br0"
+
+       if [ -z "$flag" ]; then
+           flag="temp"
+       fi
+
+       bridge mdb show dev br0 | grep "$group" | grep -q "$flag"
+       check_err $? "$group not added with $flag flag"
+
+       bridge mdb del dev br0 port br0 grp $group 2>/dev/null
+       check_err $? "Failed deleting $group from br0, port br0"
+
+       bridge mdb show dev br0 | grep -q "$group"
+       check_err_fail 1 $? "$group still in mdb after delete"
+
+       log_test "MDB add/del group $group to bridge port br0"
+}
+
+mdb_add_del_test()
+{
+       do_mdb_add_del $TEST_GROUP_MAC permanent
+       do_mdb_add_del $TEST_GROUP_IP4
+       do_mdb_add_del $TEST_GROUP_IP6
+}
+
+trap cleanup EXIT
+
+setup_prepare
+setup_wait
+
+tests_run
+
+exit $EXIT_STATUS
index 0900c54..275491b 100755 (executable)
@@ -782,7 +782,7 @@ kci_test_ipsec_offload()
            tmpl proto esp src $srcip dst $dstip spi 9 \
            mode transport reqid 42
        check_err $?
-       ip x p add dir out src $dstip/24 dst $srcip/24 \
+       ip x p add dir in src $dstip/24 dst $srcip/24 \
            tmpl proto esp src $dstip dst $srcip spi 9 \
            mode transport reqid 42
        check_err $?
index 0a49907..da5bfd8 100755 (executable)
@@ -32,7 +32,7 @@ DEV="eth0"
 # This is determined by reading the RSS indirection table using ethtool.
 get_rss_cfg_num_rxqs() {
        echo $(ethtool -x "${DEV}" |
-               egrep [[:space:]]+[0-9]+:[[:space:]]+ |
+               grep -E '[[:space:]]+[0-9]+:[[:space:]]+' |
                cut -d: -f2- |
                awk '{$1=$1};1' |
                tr ' ' '\n' |
index b48e183..76645aa 100755 (executable)
@@ -35,6 +35,8 @@ cleanup() {
        for i in 1 2;do ip netns del nsrouter$i;done
 }
 
+trap cleanup EXIT
+
 ipv4() {
     echo -n 192.168.$1.2
 }
@@ -146,11 +148,17 @@ ip netns exec nsclient1 nft -f - <<EOF
 table inet filter {
        counter unknown { }
        counter related { }
+       counter redir4 { }
+       counter redir6 { }
        chain input {
                type filter hook input priority 0; policy accept;
-               meta l4proto { icmp, icmpv6 } ct state established,untracked accept
 
+               icmp type "redirect" ct state "related" counter name "redir4" accept
+               icmpv6 type "nd-redirect" ct state "related" counter name "redir6" accept
+
+               meta l4proto { icmp, icmpv6 } ct state established,untracked accept
                meta l4proto { icmp, icmpv6 } ct state "related" counter name "related" accept
+
                counter name "unknown" drop
        }
 }
@@ -279,5 +287,29 @@ else
        echo "ERROR: icmp error RELATED state test has failed"
 fi
 
-cleanup
+# Add 'bad' routes: the destinations are on the client's own subnets, but the
+# routes point at the gateway, so it is expected to answer with ICMP redirects.
+ip netns exec nsclient1 ip route add 192.168.1.42 via 192.168.1.1
+ip netns exec nsclient1 ip route add dead:1::42 via dead:1::1
+
+ip netns exec "nsclient1" ping -q -c 2 192.168.1.42 > /dev/null
+
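+# 112 bytes: 20-byte IPv4 header + 8-byte redirect header + the embedded
+# 84-byte original echo request (sizes assume ping's default 56-byte payload).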
+expect="packets 1 bytes 112"
+check_counter nsclient1 "redir4" "$expect"
+if [ $? -ne 0 ];then
+       ret=1
+fi
+
+ip netns exec "nsclient1" ping -c 1 dead:1::42 > /dev/null
+expect="packets 1 bytes 192"
+check_counter nsclient1 "redir6" "$expect"
+if [ $? -ne 0 ];then
+       ret=1
+fi
+
+if [ $ret -eq 0 ];then
+       echo "PASS: icmp redirects had RELATED state"
+else
+       echo "ERROR: icmp redirect RELATED state test has failed"
+fi
+
 exit $ret
index 26e193f..873a892 100644 (file)
@@ -150,7 +150,7 @@ do_preprocess()
        let lines=3
        out=`basename "$in"`"-slabs-by-loss"
        `cat "$in" | grep -A "$lines" 'Slabs sorted by loss' |\
-               egrep -iv '\-\-|Name|Slabs'\
+               grep -E -iv '\-\-|Name|Slabs'\
                | awk '{print $1" "$4+$2*$3" "$4}' > "$out"`
        if [ $? -eq 0 ]; then
                do_slabs_plotting "$out"
@@ -159,7 +159,7 @@ do_preprocess()
        let lines=3
        out=`basename "$in"`"-slabs-by-size"
        `cat "$in" | grep -A "$lines" 'Slabs sorted by size' |\
-               egrep -iv '\-\-|Name|Slabs'\
+               grep -E -iv '\-\-|Name|Slabs'\
                | awk '{print $1" "$4" "$4-$2*$3}' > "$out"`
        if [ $? -eq 0 ]; then
                do_slabs_plotting "$out"