Merge git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net
Author:     Jakub Kicinski <kuba@kernel.org>
AuthorDate: Fri, 11 Feb 2022 01:29:56 +0000 (17:29 -0800)
Commit:     Jakub Kicinski <kuba@kernel.org>
CommitDate: Fri, 11 Feb 2022 01:29:56 +0000 (17:29 -0800)
No conflicts.

Signed-off-by: Jakub Kicinski <kuba@kernel.org>
791 files changed:
Documentation/admin-guide/sysctl/net.rst
Documentation/bpf/btf.rst
Documentation/bpf/instruction-set.rst
Documentation/devicetree/bindings/net/cdns,macb.yaml
Documentation/devicetree/bindings/net/dsa/microchip,ksz.yaml
Documentation/devicetree/bindings/net/fsl-fman.txt
Documentation/devicetree/bindings/net/microchip,lan966x-switch.yaml
Documentation/devicetree/bindings/net/renesas,etheravb.yaml
Documentation/networking/ethtool-netlink.rst
Documentation/networking/mctp.rst
MAINTAINERS
arch/alpha/include/uapi/asm/socket.h
arch/arm64/boot/dts/xilinx/zynqmp.dtsi
arch/arm64/net/bpf_jit_comp.c
arch/mips/include/uapi/asm/socket.h
arch/parisc/include/uapi/asm/socket.h
arch/powerpc/net/bpf_jit_comp.c
arch/sparc/include/uapi/asm/socket.h
arch/sparc/net/bpf_jit_comp_64.c
arch/x86/Kconfig
arch/x86/include/asm/text-patching.h
arch/x86/kernel/alternative.c
arch/x86/net/bpf_jit_comp.c
drivers/bluetooth/btintel.c
drivers/bluetooth/btintel.h
drivers/bluetooth/btmrvl_debugfs.c
drivers/bluetooth/btmrvl_sdio.c
drivers/bluetooth/btmtk.h
drivers/bluetooth/btmtksdio.c
drivers/bluetooth/btrtl.c
drivers/bluetooth/btusb.c
drivers/bluetooth/hci_h5.c
drivers/bluetooth/hci_ll.c
drivers/bluetooth/hci_serdev.c
drivers/net/bonding/bond_alb.c
drivers/net/bonding/bond_main.c
drivers/net/bonding/bond_procfs.c
drivers/net/dsa/Kconfig
drivers/net/dsa/Makefile
drivers/net/dsa/b53/b53_common.c
drivers/net/dsa/bcm_sf2.c
drivers/net/dsa/microchip/ksz8795.c
drivers/net/dsa/microchip/ksz9477.c
drivers/net/dsa/microchip/ksz_common.c
drivers/net/dsa/microchip/ksz_common.h
drivers/net/dsa/mt7530.c
drivers/net/dsa/mv88e6xxx/chip.c
drivers/net/dsa/mv88e6xxx/chip.h
drivers/net/dsa/mv88e6xxx/global1.h
drivers/net/dsa/mv88e6xxx/global1_vtu.c
drivers/net/dsa/mv88e6xxx/global2.h
drivers/net/dsa/mv88e6xxx/global2_scratch.c
drivers/net/dsa/mv88e6xxx/port.c
drivers/net/dsa/mv88e6xxx/port.h
drivers/net/dsa/mv88e6xxx/serdes.c
drivers/net/dsa/mv88e6xxx/smi.c
drivers/net/dsa/qca/ar9331.c
drivers/net/dsa/qca8k.c
drivers/net/dsa/qca8k.h
drivers/net/dsa/realtek-smi-core.c [deleted file]
drivers/net/dsa/realtek/Kconfig [new file with mode: 0644]
drivers/net/dsa/realtek/Makefile [new file with mode: 0644]
drivers/net/dsa/realtek/realtek-mdio.c [new file with mode: 0644]
drivers/net/dsa/realtek/realtek-smi.c [new file with mode: 0644]
drivers/net/dsa/realtek/realtek.h [moved from drivers/net/dsa/realtek-smi-core.h with 55% similarity]
drivers/net/dsa/realtek/rtl8365mb.c [moved from drivers/net/dsa/rtl8365mb.c with 75% similarity]
drivers/net/dsa/realtek/rtl8366-core.c [moved from drivers/net/dsa/rtl8366.c with 61% similarity]
drivers/net/dsa/realtek/rtl8366rb.c [moved from drivers/net/dsa/rtl8366rb.c with 78% similarity]
drivers/net/dsa/xrs700x/xrs700x.c
drivers/net/ethernet/3com/typhoon.c
drivers/net/ethernet/agere/et131x.c
drivers/net/ethernet/amazon/ena/ena_netdev.c
drivers/net/ethernet/broadcom/bnx2x/bnx2x.h
drivers/net/ethernet/broadcom/bnxt/bnxt.c
drivers/net/ethernet/broadcom/bnxt/bnxt.h
drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c
drivers/net/ethernet/broadcom/bnxt/bnxt_hsi.h
drivers/net/ethernet/broadcom/bnxt/bnxt_ptp.c
drivers/net/ethernet/broadcom/bnxt/bnxt_ptp.h
drivers/net/ethernet/broadcom/genet/bcmgenet.c
drivers/net/ethernet/cadence/macb.h
drivers/net/ethernet/cadence/macb_main.c
drivers/net/ethernet/cavium/liquidio/lio_main.c
drivers/net/ethernet/cavium/thunder/thunder_bgx.c
drivers/net/ethernet/cortina/gemini.c
drivers/net/ethernet/dec/tulip/pnic.c
drivers/net/ethernet/dlink/sundance.c
drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c
drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.h
drivers/net/ethernet/freescale/dpaa2/dpaa2-ethtool.c
drivers/net/ethernet/freescale/dpaa2/dpaa2-mac.c
drivers/net/ethernet/freescale/enetc/enetc.h
drivers/net/ethernet/freescale/enetc/enetc_cbdr.c
drivers/net/ethernet/freescale/enetc/enetc_pf.c
drivers/net/ethernet/freescale/enetc/enetc_qos.c
drivers/net/ethernet/freescale/fec_main.c
drivers/net/ethernet/freescale/fec_ptp.c
drivers/net/ethernet/freescale/xgmac_mdio.c
drivers/net/ethernet/hisilicon/hns3/hnae3.h
drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
drivers/net/ethernet/hisilicon/hns3/hns3_enet.h
drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c
drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h
drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c
drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.h
drivers/net/ethernet/intel/e1000e/netdev.c
drivers/net/ethernet/intel/i40e/i40e.h
drivers/net/ethernet/intel/i40e/i40e_adminq.c
drivers/net/ethernet/intel/i40e/i40e_common.c
drivers/net/ethernet/intel/i40e/i40e_debugfs.c
drivers/net/ethernet/intel/i40e/i40e_ethtool.c
drivers/net/ethernet/intel/i40e/i40e_main.c
drivers/net/ethernet/intel/i40e/i40e_prototype.h
drivers/net/ethernet/intel/i40e/i40e_txrx.c
drivers/net/ethernet/intel/i40e/i40e_txrx.h
drivers/net/ethernet/intel/i40e/i40e_xsk.c
drivers/net/ethernet/intel/iavf/iavf_main.c
drivers/net/ethernet/intel/ice/Makefile
drivers/net/ethernet/intel/ice/ice.h
drivers/net/ethernet/intel/ice/ice_adminq_cmd.h
drivers/net/ethernet/intel/ice/ice_base.c
drivers/net/ethernet/intel/ice/ice_common.c
drivers/net/ethernet/intel/ice/ice_common.h
drivers/net/ethernet/intel/ice/ice_dcb_lib.c
drivers/net/ethernet/intel/ice/ice_eswitch.c
drivers/net/ethernet/intel/ice/ice_ethtool.c
drivers/net/ethernet/intel/ice/ice_flex_pipe.c
drivers/net/ethernet/intel/ice/ice_flex_pipe.h
drivers/net/ethernet/intel/ice/ice_flex_type.h
drivers/net/ethernet/intel/ice/ice_fltr.c
drivers/net/ethernet/intel/ice/ice_fltr.h
drivers/net/ethernet/intel/ice/ice_idc.c
drivers/net/ethernet/intel/ice/ice_lan_tx_rx.h
drivers/net/ethernet/intel/ice/ice_lib.c
drivers/net/ethernet/intel/ice/ice_lib.h
drivers/net/ethernet/intel/ice/ice_main.c
drivers/net/ethernet/intel/ice/ice_osdep.h
drivers/net/ethernet/intel/ice/ice_pf_vsi_vlan_ops.c [new file with mode: 0644]
drivers/net/ethernet/intel/ice/ice_pf_vsi_vlan_ops.h [new file with mode: 0644]
drivers/net/ethernet/intel/ice/ice_switch.c
drivers/net/ethernet/intel/ice/ice_switch.h
drivers/net/ethernet/intel/ice/ice_txrx.c
drivers/net/ethernet/intel/ice/ice_txrx.h
drivers/net/ethernet/intel/ice/ice_txrx_lib.c
drivers/net/ethernet/intel/ice/ice_txrx_lib.h
drivers/net/ethernet/intel/ice/ice_type.h
drivers/net/ethernet/intel/ice/ice_vf_vsi_vlan_ops.c [new file with mode: 0644]
drivers/net/ethernet/intel/ice/ice_vf_vsi_vlan_ops.h [new file with mode: 0644]
drivers/net/ethernet/intel/ice/ice_virtchnl_allowlist.c
drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c
drivers/net/ethernet/intel/ice/ice_virtchnl_pf.h
drivers/net/ethernet/intel/ice/ice_vlan.h [new file with mode: 0644]
drivers/net/ethernet/intel/ice/ice_vlan_mode.c [new file with mode: 0644]
drivers/net/ethernet/intel/ice/ice_vlan_mode.h [new file with mode: 0644]
drivers/net/ethernet/intel/ice/ice_vsi_vlan_lib.c [new file with mode: 0644]
drivers/net/ethernet/intel/ice/ice_vsi_vlan_lib.h [new file with mode: 0644]
drivers/net/ethernet/intel/ice/ice_vsi_vlan_ops.c [new file with mode: 0644]
drivers/net/ethernet/intel/ice/ice_vsi_vlan_ops.h [new file with mode: 0644]
drivers/net/ethernet/intel/ice/ice_xsk.c
drivers/net/ethernet/intel/ice/ice_xsk.h
drivers/net/ethernet/intel/igb/igb_ethtool.c
drivers/net/ethernet/intel/igb/igb_main.c
drivers/net/ethernet/intel/igbvf/netdev.c
drivers/net/ethernet/intel/igc/igc_main.c
drivers/net/ethernet/intel/ixgb/ixgb_main.c
drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c
drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
drivers/net/ethernet/marvell/mvneta.c
drivers/net/ethernet/marvell/octeontx2/af/cgx.c
drivers/net/ethernet/marvell/octeontx2/af/cgx.h
drivers/net/ethernet/marvell/octeontx2/af/lmac_common.h
drivers/net/ethernet/marvell/octeontx2/af/mbox.h
drivers/net/ethernet/marvell/octeontx2/af/rpm.c
drivers/net/ethernet/marvell/octeontx2/af/rpm.h
drivers/net/ethernet/marvell/octeontx2/af/rvu.h
drivers/net/ethernet/marvell/octeontx2/af/rvu_cgx.c
drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c
drivers/net/ethernet/marvell/octeontx2/nic/Makefile
drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c
drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.h
drivers/net/ethernet/marvell/octeontx2/nic/otx2_dcbnl.c [new file with mode: 0644]
drivers/net/ethernet/marvell/octeontx2/nic/otx2_ethtool.c
drivers/net/ethernet/marvell/octeontx2/nic/otx2_flows.c
drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c
drivers/net/ethernet/marvell/octeontx2/nic/otx2_tc.c
drivers/net/ethernet/marvell/octeontx2/nic/otx2_vf.c
drivers/net/ethernet/mediatek/mtk_star_emac.c
drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/accept.c
drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/act.h
drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/csum.c
drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/ct.c
drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/drop.c
drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/goto.c
drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/mark.c
drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/mirred.c
drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/mirred_nic.c
drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/mpls.c
drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/pedit.c
drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/pedit.h
drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/ptype.c
drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/redirect_ingress.c
drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/sample.c
drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/trap.c
drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/tun.c
drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/vlan.c
drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/vlan.h
drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/vlan_mangle.c
drivers/net/ethernet/mellanox/mlx5/core/en/tc/post_act.c
drivers/net/ethernet/mellanox/mlx5/core/en/tc/sample.c
drivers/net/ethernet/mellanox/mlx5/core/en/tc/sample.h
drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.c
drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.h
drivers/net/ethernet/mellanox/mlx5/core/en/tc_priv.h
drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_encap.c
drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_encap.h
drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
drivers/net/ethernet/mellanox/mlx5/core/en_tc.h
drivers/net/ethernet/mellanox/mlx5/core/esw/indir_table.c
drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads_termtbl.c
drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c
drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.h
drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
drivers/net/ethernet/mellanox/mlx5/core/fs_core.h
drivers/net/ethernet/mellanox/mlx5/core/steering/fs_dr.c
drivers/net/ethernet/mellanox/mlxsw/core.c
drivers/net/ethernet/mellanox/mlxsw/core.h
drivers/net/ethernet/mellanox/mlxsw/core_acl_flex_actions.c
drivers/net/ethernet/mellanox/mlxsw/core_acl_flex_actions.h
drivers/net/ethernet/mellanox/mlxsw/core_env.c
drivers/net/ethernet/mellanox/mlxsw/core_env.h
drivers/net/ethernet/mellanox/mlxsw/minimal.c
drivers/net/ethernet/mellanox/mlxsw/reg.h
drivers/net/ethernet/mellanox/mlxsw/resources.h
drivers/net/ethernet/mellanox/mlxsw/spectrum.c
drivers/net/ethernet/mellanox/mlxsw/spectrum.h
drivers/net/ethernet/mellanox/mlxsw/spectrum1_kvdl.c
drivers/net/ethernet/mellanox/mlxsw/spectrum2_acl_tcam.c
drivers/net/ethernet/mellanox/mlxsw/spectrum_acl.c
drivers/net/ethernet/mellanox/mlxsw/spectrum_ethtool.c
drivers/net/ethernet/mellanox/mlxsw/spectrum_flower.c
drivers/net/ethernet/mellanox/mlxsw/spectrum_ptp.c
drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c
drivers/net/ethernet/microchip/lan743x_ethtool.c
drivers/net/ethernet/microchip/lan966x/Makefile
drivers/net/ethernet/microchip/lan966x/lan966x_ethtool.c
drivers/net/ethernet/microchip/lan966x/lan966x_main.c
drivers/net/ethernet/microchip/lan966x/lan966x_main.h
drivers/net/ethernet/microchip/lan966x/lan966x_mdb.c
drivers/net/ethernet/microchip/lan966x/lan966x_phylink.c
drivers/net/ethernet/microchip/lan966x/lan966x_ptp.c [new file with mode: 0644]
drivers/net/ethernet/microchip/lan966x/lan966x_regs.h
drivers/net/ethernet/microchip/lan966x/lan966x_switchdev.c
drivers/net/ethernet/microchip/sparx5/sparx5_main.c
drivers/net/ethernet/microchip/sparx5/sparx5_phylink.c
drivers/net/ethernet/microsoft/mana/gdma_main.c
drivers/net/ethernet/microsoft/mana/mana.h
drivers/net/ethernet/microsoft/mana/mana_en.c
drivers/net/ethernet/microsoft/mana/mana_ethtool.c
drivers/net/ethernet/netronome/nfp/flower/tunnel_conf.c
drivers/net/ethernet/netronome/nfp/nfp_net_ctrl.h
drivers/net/ethernet/netronome/nfp/nfp_net_sriov.h
drivers/net/ethernet/netronome/nfp/nfp_port.h
drivers/net/ethernet/netronome/nfp/nfpcore/nfp_nsp.c
drivers/net/ethernet/pensando/ionic/ionic.h
drivers/net/ethernet/pensando/ionic/ionic_bus_pci.c
drivers/net/ethernet/pensando/ionic/ionic_dev.c
drivers/net/ethernet/pensando/ionic/ionic_dev.h
drivers/net/ethernet/pensando/ionic/ionic_lif.c
drivers/net/ethernet/pensando/ionic/ionic_lif.h
drivers/net/ethernet/pensando/ionic/ionic_main.c
drivers/net/ethernet/pensando/ionic/ionic_rx_filter.c
drivers/net/ethernet/pensando/ionic/ionic_txrx.c
drivers/net/ethernet/qlogic/qed/qed_dev.c
drivers/net/ethernet/qlogic/qed/qed_mcp.c
drivers/net/ethernet/qlogic/qed/qed_mcp.h
drivers/net/ethernet/realtek/r8169_main.c
drivers/net/ethernet/realtek/r8169_phy_config.c
drivers/net/ethernet/renesas/ravb_main.c
drivers/net/ethernet/renesas/sh_eth.c
drivers/net/ethernet/samsung/sxgbe/sxgbe_main.c
drivers/net/ethernet/sfc/ef10.c
drivers/net/ethernet/sfc/ef100_nic.c
drivers/net/ethernet/sfc/net_driver.h
drivers/net/ethernet/sfc/nic_common.h
drivers/net/ethernet/sfc/rx_common.c
drivers/net/ethernet/sfc/rx_common.h
drivers/net/ethernet/sfc/siena.c
drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c
drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c
drivers/net/ethernet/stmicro/stmmac/stmmac.h
drivers/net/ethernet/stmicro/stmmac/stmmac_hwtstamp.c
drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
drivers/net/ethernet/stmicro/stmmac/stmmac_ptp.c
drivers/net/ethernet/stmicro/stmmac/stmmac_selftests.c
drivers/net/ethernet/xilinx/xilinx_axienet.h
drivers/net/ethernet/xilinx/xilinx_axienet_main.c
drivers/net/fjes/fjes_main.c
drivers/net/hyperv/netvsc.c
drivers/net/ieee802154/atusb.c
drivers/net/ieee802154/mac802154_hwsim.c
drivers/net/ipa/gsi_trans.c
drivers/net/ipa/gsi_trans.h
drivers/net/ipa/ipa_data-v3.1.c
drivers/net/ipa/ipa_data-v3.5.1.c
drivers/net/ipa/ipa_data-v4.11.c
drivers/net/ipa/ipa_data-v4.2.c
drivers/net/ipa/ipa_data-v4.5.c
drivers/net/ipa/ipa_data-v4.9.c
drivers/net/ipa/ipa_data.h
drivers/net/ipa/ipa_endpoint.c
drivers/net/ipa/ipa_endpoint.h
drivers/net/mdio/mdio-xgene.c
drivers/net/pcs/pcs-xpcs.c
drivers/net/phy/aquantia_main.c
drivers/net/phy/at803x.c
drivers/net/phy/phy-core.c
drivers/net/phy/phylink.c
drivers/net/usb/Kconfig
drivers/net/usb/asix.h
drivers/net/usb/asix_common.c
drivers/net/usb/asix_devices.c
drivers/net/usb/cdc_mbim.c
drivers/net/usb/smsc95xx.c
drivers/nfc/st-nci/vendor_cmds.c
drivers/nfc/st21nfca/vendor_cmds.c
drivers/ptp/ptp_clock.c
drivers/ptp/ptp_pch.c
drivers/ptp/ptp_sysfs.c
drivers/ptp/ptp_vclock.c
drivers/soc/fsl/dpio/qbman-portal.c
include/linux/bpf-cgroup.h
include/linux/bpf.h
include/linux/bpf_verifier.h
include/linux/btf.h
include/linux/btf_ids.h
include/linux/compiler_types.h
include/linux/dsa/tag_qca.h [new file with mode: 0644]
include/linux/ethtool.h
include/linux/filter.h
include/linux/ipv6.h
include/linux/linkmode.h
include/linux/mii.h
include/linux/mlx5/mlx5_ifc.h
include/linux/net/intel/i40e_client.h
include/linux/net/intel/iidc.h
include/linux/netdevice.h
include/linux/netfilter.h
include/linux/netfilter/nf_conntrack_pptp.h
include/linux/netlink.h
include/linux/pcs/pcs-xpcs.h
include/linux/phy.h
include/linux/phylink.h
include/linux/ref_tracker.h
include/linux/skbuff.h
include/linux/skmsg.h
include/linux/sunrpc/svc_xprt.h
include/linux/sunrpc/xprt.h
include/linux/udp.h
include/linux/uio.h
include/net/ax25.h
include/net/bluetooth/hci_core.h
include/net/bluetooth/mgmt.h
include/net/bonding.h
include/net/cfg802154.h
include/net/dsa.h
include/net/gro.h
include/net/inet_connection_sock.h
include/net/inet_dscp.h [new file with mode: 0644]
include/net/inet_timewait_sock.h
include/net/ip.h
include/net/ip_fib.h
include/net/ipv6.h
include/net/ipv6_frag.h
include/net/mac802154.h
include/net/mctp.h
include/net/net_namespace.h
include/net/netfilter/nf_conntrack_acct.h
include/net/netfilter/nf_conntrack_bpf.h [new file with mode: 0644]
include/net/netfilter/nf_conntrack_ecache.h
include/net/netfilter/nf_conntrack_extend.h
include/net/netfilter/nf_conntrack_labels.h
include/net/netfilter/nf_conntrack_seqadj.h
include/net/netfilter/nf_conntrack_timeout.h
include/net/netfilter/nf_conntrack_timestamp.h
include/net/netfilter/nf_tables_core.h
include/net/netns/core.h
include/net/netns/ipv4.h
include/net/netns/ipv6.h
include/net/page_pool.h
include/net/pkt_cls.h
include/net/pkt_sched.h
include/net/request_sock.h
include/net/sch_generic.h
include/net/sock.h
include/net/tcp.h
include/net/udplite.h
include/net/xdp.h
include/net/xdp_sock_drv.h
include/net/xsk_buff_pool.h
include/trace/events/mctp.h
include/trace/events/skb.h
include/uapi/asm-generic/socket.h
include/uapi/linux/bpf.h
include/uapi/linux/ethtool_netlink.h
include/uapi/linux/ioam6_iptunnel.h
include/uapi/linux/mctp.h
include/uapi/linux/net_dropmon.h
include/uapi/linux/netfilter/nfnetlink_queue.h
include/uapi/linux/socket.h
init/Kconfig
init/main.c
kernel/bpf/arraymap.c
kernel/bpf/bpf_iter.c
kernel/bpf/btf.c
kernel/bpf/cgroup.c
kernel/bpf/core.c
kernel/bpf/cpumap.c
kernel/bpf/devmap.c
kernel/bpf/helpers.c
kernel/bpf/preload/Makefile
kernel/bpf/preload/iterators/Makefile
kernel/bpf/preload/iterators/iterators.c
kernel/bpf/preload/iterators/iterators.lskel.h [new file with mode: 0644]
kernel/bpf/preload/iterators/iterators.skel.h [deleted file]
kernel/bpf/syscall.c
kernel/bpf/trampoline.c
kernel/bpf/verifier.c
kernel/trace/bpf_trace.c
lib/Kconfig.debug
lib/ref_tracker.c
net/6lowpan/core.c
net/ax25/ax25_route.c
net/batman-adv/multicast.c
net/bluetooth/hci_conn.c
net/bluetooth/hci_core.c
net/bluetooth/hci_event.c
net/bluetooth/hci_sync.c
net/bluetooth/mgmt.c
net/bluetooth/msft.c
net/bpf/bpf_dummy_struct_ops.c
net/bpf/test_run.c
net/caif/caif_dev.c
net/can/gw.c
net/core/dev.c
net/core/drop_monitor.c
net/core/filter.c
net/core/gro.c
net/core/link_watch.c
net/core/net_namespace.c
net/core/page_pool.c
net/core/rtnetlink.c
net/core/sock.c
net/core/sock_map.c
net/core/sysctl_net_core.c
net/core/xdp.c
net/dccp/dccp.h
net/dccp/ipv4.c
net/dccp/ipv6.c
net/dccp/minisocks.c
net/dsa/dsa2.c
net/dsa/dsa_priv.h
net/dsa/slave.c
net/dsa/switch.c
net/dsa/tag_qca.c
net/ethtool/rings.c
net/hsr/hsr_debugfs.c
net/hsr/hsr_device.c
net/hsr/hsr_forward.c
net/hsr/hsr_framereg.c
net/hsr/hsr_framereg.h
net/hsr/hsr_main.h
net/hsr/hsr_netlink.c
net/ieee802154/6lowpan/core.c
net/ieee802154/nl-phy.c
net/ipv4/bpf_tcp_ca.c
net/ipv4/fib_frontend.c
net/ipv4/fib_lookup.h
net/ipv4/fib_rules.c
net/ipv4/fib_semantics.c
net/ipv4/fib_trie.c
net/ipv4/icmp.c
net/ipv4/inet_connection_sock.c
net/ipv4/inet_hashtables.c
net/ipv4/inet_timewait_sock.c
net/ipv4/ip_input.c
net/ipv4/ip_options.c
net/ipv4/ip_output.c
net/ipv4/ipmr.c
net/ipv4/netfilter/nf_nat_pptp.c
net/ipv4/nexthop.c
net/ipv4/proc.c
net/ipv4/route.c
net/ipv4/sysctl_net_ipv4.c
net/ipv4/tcp.c
net/ipv4/tcp_bbr.c
net/ipv4/tcp_cubic.c
net/ipv4/tcp_dctcp.c
net/ipv4/tcp_input.c
net/ipv4/tcp_ipv4.c
net/ipv4/tcp_minisocks.c
net/ipv4/tcp_output.c
net/ipv4/udp.c
net/ipv6/addrconf.c
net/ipv6/exthdrs.c
net/ipv6/fib6_rules.c
net/ipv6/icmp.c
net/ipv6/inet6_hashtables.c
net/ipv6/ioam6_iptunnel.c
net/ipv6/ip6_input.c
net/ipv6/ip6_offload.c
net/ipv6/ip6_output.c
net/ipv6/ip6_tunnel.c
net/ipv6/ip6mr.c
net/ipv6/ping.c
net/ipv6/tcp_ipv6.c
net/ipv6/udp.c
net/mctp/af_mctp.c
net/mctp/device.c
net/mctp/route.c
net/mctp/test/route-test.c
net/mptcp/options.c
net/mptcp/pm_netlink.c
net/netfilter/Makefile
net/netfilter/core.c
net/netfilter/nf_conntrack_acct.c
net/netfilter/nf_conntrack_bpf.c [new file with mode: 0644]
net/netfilter/nf_conntrack_core.c
net/netfilter/nf_conntrack_ecache.c
net/netfilter/nf_conntrack_extend.c
net/netfilter/nf_conntrack_helper.c
net/netfilter/nf_conntrack_labels.c
net/netfilter/nf_conntrack_netlink.c
net/netfilter/nf_conntrack_pptp.c
net/netfilter/nf_conntrack_proto_udp.c
net/netfilter/nf_conntrack_seqadj.c
net/netfilter/nf_conntrack_timeout.c
net/netfilter/nf_conntrack_timestamp.c
net/netfilter/nf_nat_core.c
net/netfilter/nf_synproxy_core.c
net/netfilter/nf_tables_core.c
net/netfilter/nfnetlink_cttimeout.c
net/netfilter/nfnetlink_queue.c
net/netfilter/nft_cmp.c
net/netfilter/nft_compat.c
net/netfilter/nft_exthdr.c
net/openvswitch/datapath.c
net/openvswitch/datapath.h
net/openvswitch/flow.c
net/sched/act_ct.c
net/sched/cls_api.c
net/smc/af_smc.c
net/smc/smc_tx.c
net/smc/smc_tx.h
net/sunrpc/auth_gss/auth_gss.c
net/sunrpc/svc_xprt.c
net/sunrpc/xprt.c
net/switchdev/switchdev.c
net/tipc/msg.h
net/tls/tls_sw.c
net/unix/af_unix.c
net/xdp/xsk.c
net/xdp/xsk_buff_pool.c
net/xdp/xsk_queue.h
samples/bpf/map_perf_test_user.c
samples/bpf/xdp1_user.c
samples/bpf/xdp_adjust_tail_user.c
samples/bpf/xdp_fwd_user.c
samples/bpf/xdp_redirect_cpu.bpf.c
samples/bpf/xdp_redirect_cpu_user.c
samples/bpf/xdp_redirect_map.bpf.c
samples/bpf/xdp_redirect_map_multi.bpf.c
samples/bpf/xdp_router_ipv4_user.c
samples/bpf/xdp_rxq_info_user.c
samples/bpf/xdp_sample_pkts_user.c
samples/bpf/xdp_sample_user.c
samples/bpf/xdp_sample_user.h
samples/bpf/xdp_tx_iptunnel_user.c
samples/bpf/xdpsock_ctrl_proc.c
samples/bpf/xdpsock_user.c
samples/bpf/xsk_fwd.c
scripts/bpf_doc.py
scripts/pahole-flags.sh
scripts/pahole-version.sh [new file with mode: 0755]
security/device_cgroup.c
tools/bpf/bpftool/btf.c
tools/bpf/bpftool/cgroup.c
tools/bpf/bpftool/common.c
tools/bpf/bpftool/feature.c
tools/bpf/bpftool/gen.c
tools/bpf/bpftool/link.c
tools/bpf/bpftool/main.c
tools/bpf/bpftool/main.h
tools/bpf/bpftool/map.c
tools/bpf/bpftool/net.c
tools/bpf/bpftool/pids.c
tools/bpf/bpftool/prog.c
tools/bpf/bpftool/struct_ops.c
tools/bpf/resolve_btfids/Makefile
tools/include/uapi/linux/bpf.h
tools/lib/bpf/Makefile
tools/lib/bpf/bpf.c
tools/lib/bpf/bpf.h
tools/lib/bpf/bpf_helpers.h
tools/lib/bpf/bpf_tracing.h
tools/lib/bpf/btf.c
tools/lib/bpf/btf.h
tools/lib/bpf/btf_dump.c
tools/lib/bpf/hashmap.c
tools/lib/bpf/libbpf.c
tools/lib/bpf/libbpf.h
tools/lib/bpf/libbpf.map
tools/lib/bpf/libbpf_internal.h
tools/lib/bpf/libbpf_legacy.h
tools/lib/bpf/netlink.c
tools/lib/bpf/skel_internal.h
tools/perf/tests/llvm.c
tools/perf/util/bpf-loader.c
tools/perf/util/bpf_map.c
tools/testing/selftests/bpf/Makefile
tools/testing/selftests/bpf/README.rst
tools/testing/selftests/bpf/benchs/bench_ringbufs.c
tools/testing/selftests/bpf/benchs/bench_trigger.c
tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c
tools/testing/selftests/bpf/config
tools/testing/selftests/bpf/prog_tests/atomics.c
tools/testing/selftests/bpf/prog_tests/attach_probe.c
tools/testing/selftests/bpf/prog_tests/bind_perm.c
tools/testing/selftests/bpf/prog_tests/bpf_cookie.c
tools/testing/selftests/bpf/prog_tests/bpf_iter.c
tools/testing/selftests/bpf/prog_tests/bpf_iter_setsockopt_unix.c [new file with mode: 0644]
tools/testing/selftests/bpf/prog_tests/bpf_mod_race.c [new file with mode: 0644]
tools/testing/selftests/bpf/prog_tests/bpf_nf.c [new file with mode: 0644]
tools/testing/selftests/bpf/prog_tests/btf.c
tools/testing/selftests/bpf/prog_tests/btf_tag.c
tools/testing/selftests/bpf/prog_tests/cgroup_attach_multi.c
tools/testing/selftests/bpf/prog_tests/cgroup_getset_retval.c [new file with mode: 0644]
tools/testing/selftests/bpf/prog_tests/check_mtu.c
tools/testing/selftests/bpf/prog_tests/cls_redirect.c
tools/testing/selftests/bpf/prog_tests/core_kern.c
tools/testing/selftests/bpf/prog_tests/core_kern_overflow.c [new file with mode: 0644]
tools/testing/selftests/bpf/prog_tests/dummy_st_ops.c
tools/testing/selftests/bpf/prog_tests/fentry_fexit.c
tools/testing/selftests/bpf/prog_tests/fentry_test.c
tools/testing/selftests/bpf/prog_tests/fexit_bpf2bpf.c
tools/testing/selftests/bpf/prog_tests/fexit_stress.c
tools/testing/selftests/bpf/prog_tests/fexit_test.c
tools/testing/selftests/bpf/prog_tests/flow_dissector.c
tools/testing/selftests/bpf/prog_tests/flow_dissector_load_bytes.c
tools/testing/selftests/bpf/prog_tests/for_each.c
tools/testing/selftests/bpf/prog_tests/get_func_args_test.c
tools/testing/selftests/bpf/prog_tests/get_func_ip_test.c
tools/testing/selftests/bpf/prog_tests/get_stackid_cannot_attach.c
tools/testing/selftests/bpf/prog_tests/global_data.c
tools/testing/selftests/bpf/prog_tests/global_data_init.c
tools/testing/selftests/bpf/prog_tests/global_func_args.c
tools/testing/selftests/bpf/prog_tests/kfree_skb.c
tools/testing/selftests/bpf/prog_tests/kfunc_call.c
tools/testing/selftests/bpf/prog_tests/ksyms_module.c
tools/testing/selftests/bpf/prog_tests/l4lb_all.c
tools/testing/selftests/bpf/prog_tests/log_buf.c
tools/testing/selftests/bpf/prog_tests/map_lock.c
tools/testing/selftests/bpf/prog_tests/map_ptr.c
tools/testing/selftests/bpf/prog_tests/modify_return.c
tools/testing/selftests/bpf/prog_tests/pkt_access.c
tools/testing/selftests/bpf/prog_tests/pkt_md_access.c
tools/testing/selftests/bpf/prog_tests/prog_run_opts.c [new file with mode: 0644]
tools/testing/selftests/bpf/prog_tests/prog_run_xattr.c [deleted file]
tools/testing/selftests/bpf/prog_tests/queue_stack_map.c
tools/testing/selftests/bpf/prog_tests/raw_tp_test_run.c
tools/testing/selftests/bpf/prog_tests/raw_tp_writable_test_run.c
tools/testing/selftests/bpf/prog_tests/signal_pending.c
tools/testing/selftests/bpf/prog_tests/skb_ctx.c
tools/testing/selftests/bpf/prog_tests/skb_helpers.c
tools/testing/selftests/bpf/prog_tests/sock_fields.c
tools/testing/selftests/bpf/prog_tests/sockmap_basic.c
tools/testing/selftests/bpf/prog_tests/sockmap_listen.c
tools/testing/selftests/bpf/prog_tests/sockopt_sk.c
tools/testing/selftests/bpf/prog_tests/spinlock.c
tools/testing/selftests/bpf/prog_tests/stacktrace_build_id_nmi.c
tools/testing/selftests/bpf/prog_tests/syscall.c
tools/testing/selftests/bpf/prog_tests/tailcalls.c
tools/testing/selftests/bpf/prog_tests/task_pt_regs.c
tools/testing/selftests/bpf/prog_tests/test_bpf_syscall_macro.c [new file with mode: 0644]
tools/testing/selftests/bpf/prog_tests/test_profiler.c
tools/testing/selftests/bpf/prog_tests/test_skb_pkt_end.c
tools/testing/selftests/bpf/prog_tests/timer.c
tools/testing/selftests/bpf/prog_tests/timer_mim.c
tools/testing/selftests/bpf/prog_tests/trace_ext.c
tools/testing/selftests/bpf/prog_tests/xdp.c
tools/testing/selftests/bpf/prog_tests/xdp_adjust_frags.c [new file with mode: 0644]
tools/testing/selftests/bpf/prog_tests/xdp_adjust_tail.c
tools/testing/selftests/bpf/prog_tests/xdp_attach.c
tools/testing/selftests/bpf/prog_tests/xdp_bpf2bpf.c
tools/testing/selftests/bpf/prog_tests/xdp_cpumap_attach.c
tools/testing/selftests/bpf/prog_tests/xdp_devmap_attach.c
tools/testing/selftests/bpf/prog_tests/xdp_info.c
tools/testing/selftests/bpf/prog_tests/xdp_link.c
tools/testing/selftests/bpf/prog_tests/xdp_noinline.c
tools/testing/selftests/bpf/prog_tests/xdp_perf.c
tools/testing/selftests/bpf/progs/bloom_filter_bench.c
tools/testing/selftests/bpf/progs/bloom_filter_map.c
tools/testing/selftests/bpf/progs/bpf_iter_setsockopt_unix.c [new file with mode: 0644]
tools/testing/selftests/bpf/progs/bpf_iter_task.c
tools/testing/selftests/bpf/progs/bpf_iter_unix.c
tools/testing/selftests/bpf/progs/bpf_loop.c
tools/testing/selftests/bpf/progs/bpf_loop_bench.c
tools/testing/selftests/bpf/progs/bpf_misc.h [new file with mode: 0644]
tools/testing/selftests/bpf/progs/bpf_mod_race.c [new file with mode: 0644]
tools/testing/selftests/bpf/progs/bpf_syscall_macro.c [new file with mode: 0644]
tools/testing/selftests/bpf/progs/bpf_tracing_net.h
tools/testing/selftests/bpf/progs/btf_type_tag_user.c [new file with mode: 0644]
tools/testing/selftests/bpf/progs/cgroup_getset_retval_getsockopt.c [new file with mode: 0644]
tools/testing/selftests/bpf/progs/cgroup_getset_retval_setsockopt.c [new file with mode: 0644]
tools/testing/selftests/bpf/progs/core_kern.c
tools/testing/selftests/bpf/progs/core_kern_overflow.c [new file with mode: 0644]
tools/testing/selftests/bpf/progs/fexit_sleep.c
tools/testing/selftests/bpf/progs/freplace_cls_redirect.c
tools/testing/selftests/bpf/progs/kfunc_call_race.c [new file with mode: 0644]
tools/testing/selftests/bpf/progs/kfunc_call_test.c
tools/testing/selftests/bpf/progs/ksym_race.c [new file with mode: 0644]
tools/testing/selftests/bpf/progs/perfbuf_bench.c
tools/testing/selftests/bpf/progs/ringbuf_bench.c
tools/testing/selftests/bpf/progs/sample_map_ret0.c
tools/testing/selftests/bpf/progs/sockmap_parse_prog.c
tools/testing/selftests/bpf/progs/sockopt_sk.c
tools/testing/selftests/bpf/progs/test_bpf_nf.c [new file with mode: 0644]
tools/testing/selftests/bpf/progs/test_btf_decl_tag.c [moved from tools/testing/selftests/bpf/progs/btf_decl_tag.c with 100% similarity]
tools/testing/selftests/bpf/progs/test_btf_haskv.c
tools/testing/selftests/bpf/progs/test_btf_newkv.c
tools/testing/selftests/bpf/progs/test_btf_nokv.c
tools/testing/selftests/bpf/progs/test_probe_user.c
tools/testing/selftests/bpf/progs/test_ringbuf.c
tools/testing/selftests/bpf/progs/test_sk_lookup.c
tools/testing/selftests/bpf/progs/test_skb_cgroup_id_kern.c
tools/testing/selftests/bpf/progs/test_sock_fields.c
tools/testing/selftests/bpf/progs/test_sockmap_progs_query.c [new file with mode: 0644]
tools/testing/selftests/bpf/progs/test_tc_edt.c
tools/testing/selftests/bpf/progs/test_tcp_check_syncookie_kern.c
tools/testing/selftests/bpf/progs/test_xdp_adjust_tail_grow.c
tools/testing/selftests/bpf/progs/test_xdp_adjust_tail_shrink.c
tools/testing/selftests/bpf/progs/test_xdp_bpf2bpf.c
tools/testing/selftests/bpf/progs/test_xdp_update_frags.c [new file with mode: 0644]
tools/testing/selftests/bpf/progs/test_xdp_with_cpumap_frags_helpers.c [new file with mode: 0644]
tools/testing/selftests/bpf/progs/test_xdp_with_cpumap_helpers.c
tools/testing/selftests/bpf/progs/test_xdp_with_devmap_frags_helpers.c [new file with mode: 0644]
tools/testing/selftests/bpf/progs/test_xdp_with_devmap_helpers.c
tools/testing/selftests/bpf/progs/trace_printk.c
tools/testing/selftests/bpf/progs/trace_vprintk.c
tools/testing/selftests/bpf/progs/trigger_bench.c
tools/testing/selftests/bpf/progs/xdp_redirect_multi_kern.c
tools/testing/selftests/bpf/test_lru_map.c
tools/testing/selftests/bpf/test_lwt_seg6local.sh
tools/testing/selftests/bpf/test_maps.c
tools/testing/selftests/bpf/test_tcp_check_syncookie.sh
tools/testing/selftests/bpf/test_verifier.c
tools/testing/selftests/bpf/test_xdp_meta.sh
tools/testing/selftests/bpf/test_xdp_redirect.sh
tools/testing/selftests/bpf/test_xdp_redirect_multi.sh
tools/testing/selftests/bpf/test_xdp_veth.sh
tools/testing/selftests/bpf/test_xdp_vlan.sh
tools/testing/selftests/bpf/trace_helpers.c
tools/testing/selftests/bpf/trace_helpers.h
tools/testing/selftests/bpf/verifier/calls.c
tools/testing/selftests/bpf/verifier/sock.c
tools/testing/selftests/bpf/xdp_redirect_multi.c
tools/testing/selftests/bpf/xdping.c
tools/testing/selftests/bpf/xdpxceiver.c
tools/testing/selftests/bpf/xdpxceiver.h
tools/testing/selftests/net/.gitignore
tools/testing/selftests/net/Makefile
tools/testing/selftests/net/cmsg_sender.c [new file with mode: 0644]
tools/testing/selftests/net/cmsg_so_mark.c [deleted file]
tools/testing/selftests/net/cmsg_so_mark.sh
tools/testing/selftests/net/cmsg_time.sh [new file with mode: 0755]
tools/testing/selftests/net/fib_rule_tests.sh
tools/testing/selftests/net/fib_tests.sh
tools/testing/selftests/net/forwarding/bridge_vlan_aware.sh
tools/testing/selftests/net/forwarding/bridge_vlan_unaware.sh
tools/testing/selftests/net/forwarding/fib_offload_lib.sh
tools/testing/selftests/net/forwarding/forwarding.config.sample
tools/testing/selftests/net/forwarding/lib.sh
tools/testing/selftests/net/forwarding/pedit_ip.sh [new file with mode: 0755]
tools/testing/selftests/net/mptcp/mptcp_join.sh
tools/testing/selftests/net/mptcp/pm_netlink.sh
tools/testing/selftests/net/mptcp/pm_nl_ctl.c
tools/testing/selftests/net/rtnetlink.sh
tools/testing/selftests/net/timestamping.c

index 4150f74..f86b5e1 100644 (file)
@@ -365,6 +365,15 @@ new netns has been created.
 
 Default : 0  (for compatibility reasons)
 
+txrehash
+--------
+
+Controls the default hash rethink behaviour on a listening socket when the
+SO_TXREHASH option is set to SOCK_TXREHASH_DEFAULT (i.e. not overridden by
+setsockopt()).
+
+If set to 1 (default), hash rethink is performed on the listening socket.
+If set to 0, hash rethink is not performed.
+
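As a sketch of the per-socket override this sysctl provides the default for
(the listening socket ``fd`` is assumed; 0 is taken to be
SOCK_TXREHASH_DISABLED per include/uapi/linux/socket.h)::

  #include <stdio.h>
  #include <sys/socket.h>

  #ifndef SO_TXREHASH
  #define SO_TXREHASH 74    /* include/uapi/asm-generic/socket.h */
  #endif

  static void disable_txrehash(int fd)
  {
          int val = 0;    /* SOCK_TXREHASH_DISABLED */

          if (setsockopt(fd, SOL_SOCKET, SO_TXREHASH, &val, sizeof(val)))
                  perror("setsockopt(SO_TXREHASH)");
  }
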
 2. /proc/sys/net/unix - Parameters for Unix domain sockets
 ----------------------------------------------------------
 
index 1ebf4c5..7940da9 100644 (file)
@@ -503,6 +503,19 @@ valid index (starting from 0) pointing to a member or an argument.
  * ``info.vlen``: 0
  * ``type``: the type with ``btf_type_tag`` attribute
 
+Currently, ``BTF_KIND_TYPE_TAG`` is only emitted for pointer types.
+It has the following btf type chain:
+::
+
+  ptr -> [type_tag]*
+      -> [const | volatile | restrict | typedef]*
+      -> base_type
+
+Basically, a pointer type points to zero or more
+type_tag entries, then zero or more const/volatile/restrict/typedef
+entries, and finally the base type. The base type is one of
+int, ptr, array, struct, union, enum, func_proto and float types.
+
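As a hedged illustration, with a clang new enough to support the attribute
(a toolchain assumption), such a chain is produced by::

  /* emits BTF_KIND_TYPE_TAG "user" in the type chain of p:
   * ptr -> type_tag("user") -> const -> int */
  #define __user __attribute__((btf_type_tag("user")))

  const int __user *p;
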
 3. BTF Kernel API
 =================
 
@@ -565,18 +578,15 @@ A map can be created with ``btf_fd`` and specified key/value type id.::
 In libbpf, the map can be defined with extra annotation like below:
 ::
 
-    struct bpf_map_def SEC("maps") btf_map = {
-        .type = BPF_MAP_TYPE_ARRAY,
-        .key_size = sizeof(int),
-        .value_size = sizeof(struct ipv_counts),
-        .max_entries = 4,
-    };
-    BPF_ANNOTATE_KV_PAIR(btf_map, int, struct ipv_counts);
+    struct {
+        __uint(type, BPF_MAP_TYPE_ARRAY);
+        __type(key, int);
+        __type(value, struct ipv_counts);
+        __uint(max_entries, 4);
+    } btf_map SEC(".maps");
 
-Here, the parameters for macro BPF_ANNOTATE_KV_PAIR are map name, key and
-value types for the map. During ELF parsing, libbpf is able to extract
-key/value type_id's and assign them to BPF_MAP_CREATE attributes
-automatically.
+During ELF parsing, libbpf is able to extract key/value type_id's and assign
+them to BPF_MAP_CREATE attributes automatically.
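
A sketch of how a BPF program then uses the annotated map; the ``v4`` member
of ``struct ipv_counts`` is an assumption for illustration::

  #include <linux/bpf.h>
  #include <bpf/bpf_helpers.h>

  SEC("socket")
  int count_packets(struct __sk_buff *skb)
  {
          int key = 0;
          struct ipv_counts *counts;

          counts = bpf_map_lookup_elem(&btf_map, &key);
          if (!counts)
                  return 0;
          counts->v4++;    /* assumed member; BTF makes it pretty-printable */
          return 0;
  }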
 
 .. _BPF_Prog_Load:
 
@@ -824,13 +834,12 @@ structure has bitfields. For example, for the following map,::
            ___A b1:4;
            enum A b2:4;
       };
-      struct bpf_map_def SEC("maps") tmpmap = {
-           .type = BPF_MAP_TYPE_ARRAY,
-           .key_size = sizeof(__u32),
-           .value_size = sizeof(struct tmp_t),
-           .max_entries = 1,
-      };
-      BPF_ANNOTATE_KV_PAIR(tmpmap, int, struct tmp_t);
+      struct {
+           __uint(type, BPF_MAP_TYPE_ARRAY);
+           __type(key, int);
+           __type(value, struct tmp_t);
+           __uint(max_entries, 1);
+      } tmpmap SEC(".maps");
 
 bpftool is able to pretty print like below:
 ::
index 3704836..5300837 100644 (file)
@@ -22,7 +22,13 @@ necessary across calls.
 Instruction encoding
 ====================
 
-eBPF uses 64-bit instructions with the following encoding:
+eBPF has two instruction encodings:
+
+ * the basic instruction encoding, which uses 64 bits to encode an instruction
+ * the wide instruction encoding, which appends a second 64-bit immediate value
+   (imm64) after the basic instruction for a total of 128 bits.
+
+The basic instruction encoding looks as follows:
 
  =============  =======  ===============  ====================  ============
  32 bits (MSB)  16 bits  4 bits           4 bits                8 bits (LSB)
@@ -82,9 +88,9 @@ BPF_ALU uses 32-bit wide operands while BPF_ALU64 uses 64-bit wide operands for
 otherwise identical operations.
 The code field encodes the operation as below:
 
-  ========  =====  ==========================
+  ========  =====  =================================================
   code      value  description
-  ========  =====  ==========================
+  ========  =====  =================================================
   BPF_ADD   0x00   dst += src
   BPF_SUB   0x10   dst -= src
   BPF_MUL   0x20   dst \*= src
@@ -98,8 +104,8 @@ The code field encodes the operation as below:
   BPF_XOR   0xa0   dst ^= src
   BPF_MOV   0xb0   dst = src
   BPF_ARSH  0xc0   sign extending shift right
-  BPF_END   0xd0   endianness conversion
-  ========  =====  ==========================
+  BPF_END   0xd0   byte swap operations (see separate section below)
+  ========  =====  =================================================
 
 BPF_ADD | BPF_X | BPF_ALU means::
 
@@ -118,6 +124,42 @@ BPF_XOR | BPF_K | BPF_ALU64 means::
   src_reg = src_reg ^ imm32
 
 
+Byte swap instructions
+----------------------
+
+The byte swap instructions use an instruction class of ``BPF_ALU`` and a 4-bit
+code field of ``BPF_END``.
+
+The byte swap instructions operate on the destination register
+only and do not use a separate source register or immediate value.
+
+The 1-bit source operand field in the opcode is used to select what byte
+order the operation converts from or to:
+
+  =========  =====  =================================================
+  source     value  description
+  =========  =====  =================================================
+  BPF_TO_LE  0x00   convert between host byte order and little endian
+  BPF_TO_BE  0x08   convert between host byte order and big endian
+  =========  =====  =================================================
+
+The imm field encodes the width of the swap operations.  The following widths
+are supported: 16, 32 and 64.
+
+Examples:
+
+``BPF_ALU | BPF_TO_LE | BPF_END`` with imm = 16 means::
+
+  dst_reg = htole16(dst_reg)
+
+``BPF_ALU | BPF_TO_BE | BPF_END`` with imm = 64 means::
+
+  dst_reg = htobe64(dst_reg)
+
+``BPF_FROM_LE`` and ``BPF_FROM_BE`` exist as aliases for ``BPF_TO_LE`` and
+``BPF_TO_BE`` respectively.
+
+
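As a sketch (not part of the patch), the 64-bit big-endian swap above encoded
as a raw ``struct bpf_insn``, using the uapi constants from ``linux/bpf.h``;
the register choice is arbitrary::

  #include <linux/bpf.h>

  /* dst_reg = htobe64(dst_reg): BPF_ALU | BPF_END with the BPF_TO_BE
   * source bit set and a width of 64 in the imm field */
  struct bpf_insn bswap64 = {
          .code    = BPF_ALU | BPF_END | BPF_TO_BE,
          .dst_reg = BPF_REG_1,
          .imm     = 64,
  };
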
 Jump instructions
 -----------------
 
@@ -176,63 +218,96 @@ The mode modifier is one of:
   =============  =====  ====================================
   mode modifier  value  description
   =============  =====  ====================================
-  BPF_IMM        0x00   used for 64-bit mov
-  BPF_ABS        0x20   legacy BPF packet access
-  BPF_IND        0x40   legacy BPF packet access
-  BPF_MEM        0x60   all normal load and store operations
+  BPF_IMM        0x00   64-bit immediate instructions
+  BPF_ABS        0x20   legacy BPF packet access (absolute)
+  BPF_IND        0x40   legacy BPF packet access (indirect)
+  BPF_MEM        0x60   regular load and store operations
   BPF_ATOMIC     0xc0   atomic operations
   =============  =====  ====================================
 
-BPF_MEM | <size> | BPF_STX means::
+
+Regular load and store operations
+---------------------------------
+
+The ``BPF_MEM`` mode modifier is used to encode regular load and store
+instructions that transfer data between a register and memory.
+
+``BPF_MEM | <size> | BPF_STX`` means::
 
   *(size *) (dst_reg + off) = src_reg
 
-BPF_MEM | <size> | BPF_ST means::
+``BPF_MEM | <size> | BPF_ST`` means::
 
   *(size *) (dst_reg + off) = imm32
 
-BPF_MEM | <size> | BPF_LDX means::
+``BPF_MEM | <size> | BPF_LDX`` means::
 
   dst_reg = *(size *) (src_reg + off)
 
-Where size is one of: BPF_B or BPF_H or BPF_W or BPF_DW.
+Where size is one of: ``BPF_B``, ``BPF_H``, ``BPF_W``, or ``BPF_DW``.
 
 Atomic operations
 -----------------
 
-eBPF includes atomic operations, which use the immediate field for extra
-encoding::
+Atomic operations are operations that operate on memory and can not be
+interrupted or corrupted by other access to the same memory region, whether
+by other eBPF programs or by means outside of this specification.
+
+All atomic operations supported by eBPF are encoded as store operations
+that use the ``BPF_ATOMIC`` mode modifier as follows:
+
+  * ``BPF_ATOMIC | BPF_W | BPF_STX`` for 32-bit operations
+  * ``BPF_ATOMIC | BPF_DW | BPF_STX`` for 64-bit operations
+  * 8-bit and 16-bit wide atomic operations are not supported.
 
-   .imm = BPF_ADD, .code = BPF_ATOMIC | BPF_W  | BPF_STX: lock xadd *(u32 *)(dst_reg + off16) += src_reg
-   .imm = BPF_ADD, .code = BPF_ATOMIC | BPF_DW | BPF_STX: lock xadd *(u64 *)(dst_reg + off16) += src_reg
+The imm field is used to encode the actual atomic operation.
+Simple atomic operations use a subset of the values defined to encode
+arithmetic operations in the imm field to encode the atomic operation:
 
-The basic atomic operations supported are::
+  ========  =====  ===========
+  imm       value  description
+  ========  =====  ===========
+  BPF_ADD   0x00   atomic add
+  BPF_OR    0x40   atomic or
+  BPF_AND   0x50   atomic and
+  BPF_XOR   0xa0   atomic xor
+  ========  =====  ===========
 
-    BPF_ADD
-    BPF_AND
-    BPF_OR
-    BPF_XOR
 
-Each having equivalent semantics with the ``BPF_ADD`` example, that is: the
-memory location addresed by ``dst_reg + off`` is atomically modified, with
-``src_reg`` as the other operand. If the ``BPF_FETCH`` flag is set in the
-immediate, then these operations also overwrite ``src_reg`` with the
-value that was in memory before it was modified.
+``BPF_ATOMIC | BPF_W  | BPF_STX`` with imm = BPF_ADD means::
 
-The more special operations are::
+  *(u32 *)(dst_reg + off16) += src_reg
 
-    BPF_XCHG
+``BPF_ATOMIC | BPF_DW | BPF_STX`` with imm = BPF_ADD means::
 
-This atomically exchanges ``src_reg`` with the value addressed by ``dst_reg +
-off``. ::
+  *(u64 *)(dst_reg + off16) += src_reg
 
-    BPF_CMPXCHG
+``BPF_XADD`` is a deprecated name for ``BPF_ATOMIC | BPF_ADD``.
 
-This atomically compares the value addressed by ``dst_reg + off`` with
-``R0``. If they match it is replaced with ``src_reg``. In either case, the
-value that was there before is zero-extended and loaded back to ``R0``.
+In addition to the simple atomic operations, there is also a modifier and
+two complex atomic operations:
 
-Note that 1 and 2 byte atomic operations are not supported.
+  ===========  ================  ===========================
+  imm          value             description
+  ===========  ================  ===========================
+  BPF_FETCH    0x01              modifier: return old value
+  BPF_XCHG     0xe0 | BPF_FETCH  atomic exchange
+  BPF_CMPXCHG  0xf0 | BPF_FETCH  atomic compare and exchange
+  ===========  ================  ===========================
+
+The ``BPF_FETCH`` modifier is optional for simple atomic operations, and
+always set for the complex atomic operations.  If the ``BPF_FETCH`` flag
+is set, then the operation also overwrites ``src_reg`` with the value that
+was in memory before it was modified.
+
+The ``BPF_XCHG`` operation atomically exchanges ``src_reg`` with the value
+addressed by ``dst_reg + off``.
+
+The ``BPF_CMPXCHG`` operation atomically compares the value addressed by
+``dst_reg + off`` with ``R0``. If they match, the value addressed by
+``dst_reg + off`` is replaced with ``src_reg``. In either case, the
+value that was at ``dst_reg + off`` before the operation is zero-extended
+and loaded back to ``R0``.
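
As a sketch, the compare-and-exchange encoding spelled out as a raw
``struct bpf_insn`` (uapi ``linux/bpf.h`` constants; registers and offset
are arbitrary)::

  #include <linux/bpf.h>

  /* BPF_ATOMIC | BPF_DW | BPF_STX with imm = BPF_CMPXCHG:
   * if (*(u64 *)(R1 + 0) == R0) *(u64 *)(R1 + 0) = R2;
   * R0 always receives the value that was in memory. */
  struct bpf_insn cmpxchg = {
          .code    = BPF_STX | BPF_DW | BPF_ATOMIC,
          .dst_reg = BPF_REG_1,
          .src_reg = BPF_REG_2,
          .off     = 0,
          .imm     = BPF_CMPXCHG,
  };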
 
 Clang can generate atomic instructions by default when ``-mcpu=v3`` is
 enabled. If a lower version for ``-mcpu`` is set, the only atomic instruction
@@ -240,40 +315,52 @@ Clang can generate is ``BPF_ADD`` *without* ``BPF_FETCH``. If you need to enable
 the atomics features, while keeping a lower ``-mcpu`` version, you can use
 ``-Xclang -target-feature -Xclang +alu32``.
 
-You may encounter ``BPF_XADD`` - this is a legacy name for ``BPF_ATOMIC``,
-referring to the exclusive-add operation encoded when the immediate field is
-zero.
+64-bit immediate instructions
+-----------------------------
 
-16-byte instructions
---------------------
+Instructions with the ``BPF_IMM`` mode modifier use the wide instruction
+encoding for an extra imm64 value.
 
-eBPF has one 16-byte instruction: ``BPF_LD | BPF_DW | BPF_IMM`` which consists
-of two consecutive ``struct bpf_insn`` 8-byte blocks and interpreted as single
-instruction that loads 64-bit immediate value into a dst_reg.
+There is currently only one such instruction.
 
-Packet access instructions
---------------------------
+``BPF_LD | BPF_DW | BPF_IMM`` means::
 
-eBPF has two non-generic instructions: (BPF_ABS | <size> | BPF_LD) and
-(BPF_IND | <size> | BPF_LD) which are used to access packet data.
+  dst_reg = imm64
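
The kernel's insn helper macros hide the two-slot encoding; a sketch using
``BPF_LD_IMM64()`` from the kernel's ``include/linux/filter.h`` (mirrored for
userspace in ``tools/include/linux/filter.h``)::

  #include <linux/filter.h>

  /* BPF_LD_IMM64() expands to two consecutive struct bpf_insn slots;
   * the second slot carries the upper 32 bits of the immediate */
  struct bpf_insn insns[] = {
          BPF_LD_IMM64(BPF_REG_1, 0x0123456789abcdefULL),
          BPF_EXIT_INSN(),
  };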
 
-They had to be carried over from classic BPF to have strong performance of
-socket filters running in eBPF interpreter. These instructions can only
-be used when interpreter context is a pointer to ``struct sk_buff`` and
-have seven implicit operands. Register R6 is an implicit input that must
-contain pointer to sk_buff. Register R0 is an implicit output which contains
-the data fetched from the packet. Registers R1-R5 are scratch registers
-and must not be used to store the data across BPF_ABS | BPF_LD or
-BPF_IND | BPF_LD instructions.
 
-These instructions have implicit program exit condition as well. When
-eBPF program is trying to access the data beyond the packet boundary,
-the interpreter will abort the execution of the program. JIT compilers
-therefore must preserve this property. src_reg and imm32 fields are
-explicit inputs to these instructions.
+Legacy BPF Packet access instructions
+-------------------------------------
 
-For example, BPF_IND | BPF_W | BPF_LD means::
+eBPF has special instructions for access to packet data that have been
+carried over from classic BPF to retain the performance of legacy socket
+filters running in the eBPF interpreter.
 
-  R0 = ntohl(*(u32 *) (((struct sk_buff *) R6)->data + src_reg + imm32))
+The instructions come in two forms: ``BPF_ABS | <size> | BPF_LD`` and
+``BPF_IND | <size> | BPF_LD``.
 
-and R1 - R5 are clobbered.
+These instructions are used to access packet data and can only be used when
+the program context is a pointer to a networking packet.  ``BPF_ABS``
+accesses packet data at an absolute offset specified by the immediate data
+and ``BPF_IND`` accesses packet data at an offset that includes the value of
+a register in addition to the immediate data.
+
+These instructions have seven implicit operands:
+
+ * Register R6 is an implicit input that must contain a pointer to a
+   struct sk_buff.
+ * Register R0 is an implicit output which contains the data fetched from
+   the packet.
+ * Registers R1-R5 are scratch registers that are clobbered after a call to
+   ``BPF_ABS | BPF_LD`` or ``BPF_IND | BPF_LD`` instructions.
+
+These instructions have an implicit program exit condition as well. When an
+eBPF program tries to access data beyond the packet boundary, the
+program execution will be aborted.
+
+``BPF_ABS | BPF_W | BPF_LD`` means::
+
+  R0 = ntohl(*(u32 *) (((struct sk_buff *) R6)->data + imm32))
+
+``BPF_IND | BPF_W | BPF_LD`` means::
+
+  R0 = ntohl(*(u32 *) (((struct sk_buff *) R6)->data + src_reg + imm32))
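
As a sketch, the insn-macro spellings of the two forms, again from
``include/linux/filter.h`` (offset 16 and register R7 are arbitrary)::

  #include <linux/filter.h>

  struct bpf_insn pkt_loads[] = {
          /* R0 = ntohl(*(u32 *)(skb->data + 16)) */
          BPF_LD_ABS(BPF_W, 16),
          /* R0 = ntohl(*(u32 *)(skb->data + R7 + 16)) */
          BPF_LD_IND(BPF_W, BPF_REG_7, 16),
  };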
index 8dd06db..6cd3d85 100644 (file)
@@ -81,6 +81,25 @@ properties:
 
   phy-handle: true
 
+  phys:
+    maxItems: 1
+
+  phy-names:
+    const: sgmii-phy
+    description:
+      Required with ZynqMP SoC when in SGMII mode.
+      Should reference PS-GTR generic PHY device for this controller
+      instance. See ZynqMP example.
+
+  resets:
+    maxItems: 1
+    description:
+      Recommended with ZynqMP, specify reset control for this
+      controller instance with zynqmp-reset driver.
+
+  reset-names:
+    maxItems: 1
+
   fixed-link: true
 
   iommus:
@@ -157,3 +176,40 @@ examples:
                     reset-gpios = <&pioE 6 1>;
             };
     };
+
+  - |
+    #include <dt-bindings/clock/xlnx-zynqmp-clk.h>
+    #include <dt-bindings/power/xlnx-zynqmp-power.h>
+    #include <dt-bindings/reset/xlnx-zynqmp-resets.h>
+    #include <dt-bindings/phy/phy.h>
+
+    bus {
+            #address-cells = <2>;
+            #size-cells = <2>;
+            gem1: ethernet@ff0c0000 {
+                    compatible = "cdns,zynqmp-gem", "cdns,gem";
+                    interrupt-parent = <&gic>;
+                    interrupts = <0 59 4>, <0 59 4>;
+                    reg = <0x0 0xff0c0000 0x0 0x1000>;
+                    clocks = <&zynqmp_clk LPD_LSBUS>, <&zynqmp_clk GEM1_REF>,
+                             <&zynqmp_clk GEM1_TX>, <&zynqmp_clk GEM1_RX>,
+                             <&zynqmp_clk GEM_TSU>;
+                    clock-names = "pclk", "hclk", "tx_clk", "rx_clk", "tsu_clk";
+                    #address-cells = <1>;
+                    #size-cells = <0>;
+                    #stream-id-cells = <1>;
+                    iommus = <&smmu 0x875>;
+                    power-domains = <&zynqmp_firmware PD_ETH_1>;
+                    resets = <&zynqmp_reset ZYNQMP_RESET_GEM1>;
+                    reset-names = "gem1_rst";
+                    status = "okay";
+                    phy-mode = "sgmii";
+                    phy-names = "sgmii-phy";
+                    phys = <&psgtr 1 PHY_TYPE_SGMII 1 1>;
+                    fixed-link {
+                            speed = <1000>;
+                            full-duplex;
+                            pause;
+                    };
+            };
+    };
index 84985f5..1841520 100644 (file)
@@ -42,6 +42,12 @@ properties:
     description:
       Set if the output SYNCLKO frequency should be set to 125MHz instead of 25MHz.
 
+  microchip,synclko-disable:
+    $ref: /schemas/types.yaml#/definitions/flag
+    description:
+      Set if the output SYNCLKO clock should be disabled. Do not mix with
+      microchip,synclko-125.
+
 required:
   - compatible
   - reg
index 020337f..801efc7 100644 (file)
@@ -388,14 +388,24 @@ PROPERTIES
                Value type: <prop-encoded-array>
                Definition: A standard property.
 
-- bus-frequency
+- clocks
+               Usage: optional
+               Value type: <phandle>
+               Definition: A reference to the input clock of the controller
+               from which the MDC frequency is derived.
+
+- clock-frequency
                Usage: optional
                Value type: <u32>
-               Definition: Specifies the external MDIO bus clock speed to
-               be used, if different from the standard 2.5 MHz.
-               This may be due to the standard speed being unsupported (e.g.
-               due to a hardware problem), or to advertise that all relevant
-               components in the system support a faster speed.
+               Definition: Specifies the external MDC frequency, in Hertz, to
+               be used. Requires that the input clock is specified in the
+               "clocks" property. See also: mdio.yaml.
+
+- suppress-preamble
+               Usage: optional
+               Value type: <boolean>
+               Definition: Disable generation of preamble bits. See also:
+               mdio.yaml.
 
 - interrupts
                Usage: required for external MDIO
index e79e4e1..1381276 100644 (file)
@@ -38,6 +38,7 @@ properties:
       - description: register based extraction
       - description: frame dma based extraction
       - description: analyzer interrupt
+      - description: ptp interrupt
 
   interrupt-names:
     minItems: 1
@@ -45,6 +46,7 @@ properties:
       - const: xtr
       - const: fdma
       - const: ana
+      - const: ptp
 
   resets:
     items:
index bda8210..ee2ccac 100644 (file)
@@ -45,8 +45,10 @@ properties:
 
       - items:
           - enum:
+              - renesas,r9a07g043-gbeth # RZ/G2UL
               - renesas,r9a07g044-gbeth # RZ/G2{L,LC}
-          - const: renesas,rzg2l-gbeth  # RZ/G2L
+              - renesas,r9a07g054-gbeth # RZ/V2L
+          - const: renesas,rzg2l-gbeth  # RZ/{G2L,G2UL,V2L} family
 
   reg: true
 
index 9d98e05..cae28af 100644 (file)
@@ -860,8 +860,16 @@ Kernel response contents:
   ``ETHTOOL_A_RINGS_RX_JUMBO``          u32     size of RX jumbo ring
   ``ETHTOOL_A_RINGS_TX``                u32     size of TX ring
   ``ETHTOOL_A_RINGS_RX_BUF_LEN``        u32     size of buffers on the ring
+  ``ETHTOOL_A_RINGS_TCP_DATA_SPLIT``    u8      TCP header / data split
   ====================================  ======  ===========================
 
+``ETHTOOL_A_RINGS_TCP_DATA_SPLIT`` indicates whether the device is usable with
+page-flipping TCP zero-copy receive (``getsockopt(TCP_ZEROCOPY_RECEIVE)``).
+If enabled, the device is configured to place frame headers and data into
+separate buffers. The device configuration must make it possible to receive
+full memory pages of data, for example because the MTU is high enough or
+through HW-GRO.
+
 
 RINGS_SET
 =========
index 46f74bf..c628cb5 100644 (file)
@@ -212,6 +212,54 @@ remote address is already known, or the message does not require a reply.
 Like the send calls, sockets will only receive responses to requests they have
 sent (TO=1) and may only respond (TO=0) to requests they have received.
 
+``ioctl(SIOCMCTPALLOCTAG)`` and ``ioctl(SIOCMCTPDROPTAG)``
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+These ioctls give applications more control over MCTP message tags, by
+allocating (and dropping) tag values explicitly, rather than having the kernel
+automatically allocate a per-message tag at ``sendmsg()`` time.
+
+In general, you will only need to use these ioctls if your MCTP protocol does
+not fit the usual request/response model: for example, if you need to persist
+tags across multiple requests, or if a request may generate more than one
+response. In these cases, the ioctls allow you to decouple the tag allocation
+(and release) from individual message send and receive operations.
+
+Both ioctls are passed a pointer to a ``struct mctp_ioc_tag_ctl``:
+
+.. code-block:: C
+
+    struct mctp_ioc_tag_ctl {
+        mctp_eid_t      peer_addr;
+        __u8            tag;
+        __u16           flags;
+    };
+
+``SIOCMCTPALLOCTAG`` allocates a tag for a specific peer, which an application
+can use in future ``sendmsg()`` calls. The application populates the
+``peer_addr`` member with the remote EID. Other fields must be zero.
+
+On return, the ``tag`` member will be populated with the allocated tag value.
+The allocated tag will have the following tag bits set:
+
+ - ``MCTP_TAG_OWNER``: it only makes sense to allocate tags if you're the tag
+   owner
+
+ - ``MCTP_TAG_PREALLOC``: to indicate to ``sendmsg()`` that this is a
+   preallocated tag.
+
+ - ... and the actual tag value, within the least-significant three bits
+   (``MCTP_TAG_MASK``). Note that zero is a valid tag value.
+
+The tag value should be used as-is for the ``smctp_tag`` member of ``struct
+sockaddr_mctp``.
+
+``SIOCMCTPDROPTAG`` releases a tag that has been previously allocated by a
+``SIOCMCTPALLOCTAG`` ioctl. The ``peer_addr`` must be the same as used for the
+allocation, and the ``tag`` value must match exactly the tag returned from the
+allocation (including the ``MCTP_TAG_OWNER`` and ``MCTP_TAG_PREALLOC`` bits).
+The ``flags`` field must be zero.
+
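A short usage sketch tying the two ioctls together; the socket ``sd`` and the
``struct sockaddr_mctp`` fill (``addr``, ``buf``, ``len``) are assumed, and
peer EID 8 is arbitrary:

.. code-block:: C

    #include <err.h>
    #include <sys/ioctl.h>
    #include <linux/mctp.h>

    struct mctp_ioc_tag_ctl ctl = { .peer_addr = 8 };

    if (ioctl(sd, SIOCMCTPALLOCTAG, &ctl) < 0)
            err(1, "SIOCMCTPALLOCTAG");

    /* ctl.tag carries MCTP_TAG_OWNER | MCTP_TAG_PREALLOC | tag value */
    addr.smctp_tag = ctl.tag;
    sendto(sd, buf, len, 0, (struct sockaddr *)&addr, sizeof(addr));

    /* the same peer_addr and exact tag value release the allocation */
    if (ioctl(sd, SIOCMCTPDROPTAG, &ctl) < 0)
            err(1, "SIOCMCTPDROPTAG");
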
 Kernel internals
 ================
 
index 40ff05f..25eb309 100644 (file)
@@ -3523,6 +3523,8 @@ F:        net/sched/act_bpf.c
 F:     net/sched/cls_bpf.c
 F:     samples/bpf/
 F:     scripts/bpf_doc.py
+F:     scripts/pahole-flags.sh
+F:     scripts/pahole-version.sh
 F:     tools/bpf/
 F:     tools/lib/bpf/
 F:     tools/testing/selftests/bpf/
@@ -16357,8 +16359,7 @@ REALTEK RTL83xx SMI DSA ROUTER CHIPS
 M:     Linus Walleij <linus.walleij@linaro.org>
 S:     Maintained
 F:     Documentation/devicetree/bindings/net/dsa/realtek-smi.txt
-F:     drivers/net/dsa/realtek-smi*
-F:     drivers/net/dsa/rtl83*
+F:     drivers/net/dsa/realtek/*
 
 REALTEK WIRELESS DRIVER (rtlwifi family)
 M:     Ping-Ke Shih <pkshih@realtek.com>
index 284d287..7d81535 100644 (file)
 
 #define SO_RESERVE_MEM         73
 
+#define SO_TXREHASH            74
+
 #if !defined(__KERNEL__)
 
 #if __BITS_PER_LONG == 64
index 74e6644..9bec3ba 100644 (file)
                        #stream-id-cells = <1>;
                        iommus = <&smmu 0x874>;
                        power-domains = <&zynqmp_firmware PD_ETH_0>;
+                       resets = <&zynqmp_reset ZYNQMP_RESET_GEM0>;
+                       reset-names = "gem0_rst";
                };
 
                gem1: ethernet@ff0c0000 {
                        #stream-id-cells = <1>;
                        iommus = <&smmu 0x875>;
                        power-domains = <&zynqmp_firmware PD_ETH_1>;
+                       resets = <&zynqmp_reset ZYNQMP_RESET_GEM1>;
+                       reset-names = "gem1_rst";
                };
 
                gem2: ethernet@ff0d0000 {
                        #stream-id-cells = <1>;
                        iommus = <&smmu 0x876>;
                        power-domains = <&zynqmp_firmware PD_ETH_2>;
+                       resets = <&zynqmp_reset ZYNQMP_RESET_GEM2>;
+                       reset-names = "gem2_rst";
                };
 
                gem3: ethernet@ff0e0000 {
                        #stream-id-cells = <1>;
                        iommus = <&smmu 0x877>;
                        power-domains = <&zynqmp_firmware PD_ETH_3>;
+                       resets = <&zynqmp_reset ZYNQMP_RESET_GEM3>;
+                       reset-names = "gem3_rst";
                };
 
                gpio: gpio@ff0a0000 {
index e96d4d8..74f9a9b 100644 (file)
@@ -1143,6 +1143,11 @@ out:
        return prog;
 }
 
+bool bpf_jit_supports_kfunc_call(void)
+{
+       return true;
+}
+
 u64 bpf_jit_alloc_exec_limit(void)
 {
        return VMALLOC_END - VMALLOC_START;
index 24e0efb..1d55e57 100644 (file)
 
 #define SO_RESERVE_MEM         73
 
+#define SO_TXREHASH            74
+
 #if !defined(__KERNEL__)
 
 #if __BITS_PER_LONG == 64
index 845ddc6..654061e 100644 (file)
 
 #define SO_RESERVE_MEM         0x4047
 
+#define SO_TXREHASH            0x4048
+
 #if !defined(__KERNEL__)
 
 #if __BITS_PER_LONG == 64
index 56dd1f4..a4f4d34 100644 (file)
@@ -264,7 +264,7 @@ skip_codegen_passes:
        fp->jited = 1;
        fp->jited_len = proglen + FUNCTION_DESCR_SIZE;
 
-       bpf_flush_icache(bpf_hdr, (u8 *)bpf_hdr + (bpf_hdr->pages * PAGE_SIZE));
+       bpf_flush_icache(bpf_hdr, (u8 *)bpf_hdr + bpf_hdr->size);
        if (!fp->is_func || extra_pass) {
                bpf_jit_binary_lock_ro(bpf_hdr);
                bpf_prog_fill_jited_linfo(fp, addrs);
index 2672dd0..666f81e 100644 (file)
 
 #define SO_RESERVE_MEM           0x0052
 
+#define SO_TXREHASH              0x0053
+
 
 #if !defined(__KERNEL__)
 
index b1e3878..fa0759b 100644 (file)
@@ -1599,7 +1599,7 @@ skip_init_ctx:
        if (bpf_jit_enable > 1)
                bpf_jit_dump(prog->len, image_size, pass, ctx.image);
 
-       bpf_flush_icache(header, (u8 *)header + (header->pages * PAGE_SIZE));
+       bpf_flush_icache(header, (u8 *)header + header->size);
 
        if (!prog->is_func || extra_pass) {
                bpf_jit_binary_lock_ro(header);
index 9f5bd41..995f2dc 100644 (file)
@@ -158,6 +158,7 @@ config X86
        select HAVE_ALIGNED_STRUCT_PAGE         if SLUB
        select HAVE_ARCH_AUDITSYSCALL
        select HAVE_ARCH_HUGE_VMAP              if X86_64 || X86_PAE
+       select HAVE_ARCH_HUGE_VMALLOC           if HAVE_ARCH_HUGE_VMAP
        select HAVE_ARCH_JUMP_LABEL
        select HAVE_ARCH_JUMP_LABEL_RELATIVE
        select HAVE_ARCH_KASAN                  if X86_64
index b742178..4cc18ba 100644 (file)
@@ -44,6 +44,7 @@ extern void text_poke_early(void *addr, const void *opcode, size_t len);
 extern void *text_poke(void *addr, const void *opcode, size_t len);
 extern void text_poke_sync(void);
 extern void *text_poke_kgdb(void *addr, const void *opcode, size_t len);
+extern void *text_poke_copy(void *addr, const void *opcode, size_t len);
 extern int poke_int3_handler(struct pt_regs *regs);
 extern void text_poke_bp(void *addr, const void *opcode, size_t len, const void *emulate);
 
index 5007c3f..018b61f 100644 (file)
@@ -1102,6 +1102,40 @@ void *text_poke_kgdb(void *addr, const void *opcode, size_t len)
        return __text_poke(addr, opcode, len);
 }
 
+/**
+ * text_poke_copy - Copy instructions into (an unused part of) RX memory
+ * @addr: address to modify
+ * @opcode: source of the copy
+ * @len: length to copy, could be more than 2x PAGE_SIZE
+ *
+ * Not safe against concurrent execution; useful for JITs to dump
+ * new code blocks into unused regions of RX memory. Can be used in
+ * conjunction with synchronize_rcu_tasks() to wait for existing
+ * execution to quiesce after having made sure no existing function
+ * pointers are live.
+ */
+void *text_poke_copy(void *addr, const void *opcode, size_t len)
+{
+       unsigned long start = (unsigned long)addr;
+       size_t patched = 0;
+
+       if (WARN_ON_ONCE(core_kernel_text(start)))
+               return NULL;
+
+       mutex_lock(&text_mutex);
+       while (patched < len) {
+               unsigned long ptr = start + patched;
+               size_t s;
+
+               s = min_t(size_t, PAGE_SIZE * 2 - offset_in_page(ptr), len - patched);
+
+               __text_poke((void *)ptr, opcode + patched, s);
+               patched += s;
+       }
+       mutex_unlock(&text_mutex);
+       return addr;
+}
+
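+/*
+ * Illustrative sketch, not part of this patch (insns and rx_addr are
+ * hypothetical): a JIT typically assembles into a writable scratch
+ * buffer, then publishes the result into unused RX memory:
+ *
+ *	u8 insns[128];
+ *	... emit instructions into insns ...
+ *	if (!text_poke_copy(rx_addr, insns, sizeof(insns)))
+ *		return -EINVAL;
+ */
+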
 static void do_sync_core(void *info)
 {
        sync_core();
index 2b1e266..c7db0fe 100644 (file)
@@ -330,8 +330,7 @@ static int emit_jump(u8 **pprog, void *func, void *ip)
 }
 
 static int __bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t,
-                               void *old_addr, void *new_addr,
-                               const bool text_live)
+                               void *old_addr, void *new_addr)
 {
        const u8 *nop_insn = x86_nops[5];
        u8 old_insn[X86_PATCH_SIZE];
@@ -365,10 +364,7 @@ static int __bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t,
                goto out;
        ret = 1;
        if (memcmp(ip, new_insn, X86_PATCH_SIZE)) {
-               if (text_live)
-                       text_poke_bp(ip, new_insn, X86_PATCH_SIZE, NULL);
-               else
-                       memcpy(ip, new_insn, X86_PATCH_SIZE);
+               text_poke_bp(ip, new_insn, X86_PATCH_SIZE, NULL);
                ret = 0;
        }
 out:
@@ -384,7 +380,7 @@ int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t,
                /* BPF poking in modules is not supported */
                return -EINVAL;
 
-       return __bpf_arch_text_poke(ip, t, old_addr, new_addr, true);
+       return __bpf_arch_text_poke(ip, t, old_addr, new_addr);
 }
 
 #define EMIT_LFENCE()  EMIT3(0x0F, 0xAE, 0xE8)
@@ -558,24 +554,15 @@ static void bpf_tail_call_direct_fixup(struct bpf_prog *prog)
                mutex_lock(&array->aux->poke_mutex);
                target = array->ptrs[poke->tail_call.key];
                if (target) {
-                       /* Plain memcpy is used when image is not live yet
-                        * and still not locked as read-only. Once poke
-                        * location is active (poke->tailcall_target_stable),
-                        * any parallel bpf_arch_text_poke() might occur
-                        * still on the read-write image until we finally
-                        * locked it as read-only. Both modifications on
-                        * the given image are under text_mutex to avoid
-                        * interference.
-                        */
                        ret = __bpf_arch_text_poke(poke->tailcall_target,
                                                   BPF_MOD_JUMP, NULL,
                                                   (u8 *)target->bpf_func +
-                                                  poke->adj_off, false);
+                                                  poke->adj_off);
                        BUG_ON(ret < 0);
                        ret = __bpf_arch_text_poke(poke->tailcall_bypass,
                                                   BPF_MOD_JUMP,
                                                   (u8 *)poke->tailcall_target +
-                                                  X86_PATCH_SIZE, NULL, false);
+                                                  X86_PATCH_SIZE, NULL);
                        BUG_ON(ret < 0);
                }
                WRITE_ONCE(poke->tailcall_target_stable, true);
@@ -787,7 +774,6 @@ static int emit_atomic(u8 **pprog, u8 atomic_op,
        /* emit opcode */
        switch (atomic_op) {
        case BPF_ADD:
-       case BPF_SUB:
        case BPF_AND:
        case BPF_OR:
        case BPF_XOR:
@@ -867,7 +853,7 @@ static void emit_nops(u8 **pprog, int len)
 
 #define INSN_SZ_DIFF (((addrs[i] - addrs[i - 1]) - (prog - temp)))
 
-static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
+static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image,
                  int oldproglen, struct jit_context *ctx, bool jmp_padding)
 {
        bool tail_call_reachable = bpf_prog->aux->tail_call_reachable;
@@ -894,8 +880,8 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
        push_callee_regs(&prog, callee_regs_used);
 
        ilen = prog - temp;
-       if (image)
-               memcpy(image + proglen, temp, ilen);
+       if (rw_image)
+               memcpy(rw_image + proglen, temp, ilen);
        proglen += ilen;
        addrs[0] = proglen;
        prog = temp;
@@ -1324,6 +1310,9 @@ st:                       if (is_imm8(insn->off))
                                        pr_err("extable->insn doesn't fit into 32-bit\n");
                                        return -EFAULT;
                                }
+                               /* switch ex to rw buffer for writes */
+                               ex = (void *)rw_image + ((void *)ex - (void *)image);
+
                                ex->insn = delta;
 
                                ex->data = EX_TYPE_BPF;
@@ -1706,7 +1695,7 @@ emit_jmp:
                                pr_err("bpf_jit: fatal error\n");
                                return -EFAULT;
                        }
-                       memcpy(image + proglen, temp, ilen);
+                       memcpy(rw_image + proglen, temp, ilen);
                }
                proglen += ilen;
                addrs[i] = proglen;
@@ -2247,6 +2236,7 @@ int arch_prepare_bpf_dispatcher(void *image, s64 *funcs, int num_funcs)
 }
 
 struct x64_jit_data {
+       struct bpf_binary_header *rw_header;
        struct bpf_binary_header *header;
        int *addrs;
        u8 *image;
@@ -2259,6 +2249,7 @@ struct x64_jit_data {
 
 struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
 {
+       struct bpf_binary_header *rw_header = NULL;
        struct bpf_binary_header *header = NULL;
        struct bpf_prog *tmp, *orig_prog = prog;
        struct x64_jit_data *jit_data;
@@ -2267,6 +2258,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
        bool tmp_blinded = false;
        bool extra_pass = false;
        bool padding = false;
+       u8 *rw_image = NULL;
        u8 *image = NULL;
        int *addrs;
        int pass;
@@ -2302,6 +2294,8 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
                oldproglen = jit_data->proglen;
                image = jit_data->image;
                header = jit_data->header;
+               rw_header = jit_data->rw_header;
+               rw_image = (void *)rw_header + ((void *)image - (void *)header);
                extra_pass = true;
                padding = true;
                goto skip_init_addrs;
@@ -2332,12 +2326,12 @@ skip_init_addrs:
        for (pass = 0; pass < MAX_PASSES || image; pass++) {
                if (!padding && pass >= PADDING_PASSES)
                        padding = true;
-               proglen = do_jit(prog, addrs, image, oldproglen, &ctx, padding);
+               proglen = do_jit(prog, addrs, image, rw_image, oldproglen, &ctx, padding);
                if (proglen <= 0) {
 out_image:
                        image = NULL;
                        if (header)
-                               bpf_jit_binary_free(header);
+                               bpf_jit_binary_pack_free(header, rw_header);
                        prog = orig_prog;
                        goto out_addrs;
                }
@@ -2361,8 +2355,9 @@ out_image:
                                sizeof(struct exception_table_entry);
 
                        /* allocate module memory for x86 insns and extable */
-                       header = bpf_jit_binary_alloc(roundup(proglen, align) + extable_size,
-                                                     &image, align, jit_fill_hole);
+                       header = bpf_jit_binary_pack_alloc(roundup(proglen, align) + extable_size,
+                                                          &image, align, &rw_header, &rw_image,
+                                                          jit_fill_hole);
                        if (!header) {
                                prog = orig_prog;
                                goto out_addrs;
@@ -2378,14 +2373,26 @@ out_image:
 
        if (image) {
                if (!prog->is_func || extra_pass) {
+                       /*
+                        * bpf_jit_binary_pack_finalize fails in two scenarios:
+                        *   1) header is not pointing to proper module memory;
+                        *   2) the arch doesn't support bpf_arch_text_copy().
+                        *
+                        * Both cases are serious bugs and justify WARN_ON.
+                        */
+                       if (WARN_ON(bpf_jit_binary_pack_finalize(prog, header, rw_header))) {
+                               prog = orig_prog;
+                               goto out_addrs;
+                       }
+
                        bpf_tail_call_direct_fixup(prog);
-                       bpf_jit_binary_lock_ro(header);
                } else {
                        jit_data->addrs = addrs;
                        jit_data->ctx = ctx;
                        jit_data->proglen = proglen;
                        jit_data->image = image;
                        jit_data->header = header;
+                       jit_data->rw_header = rw_header;
                }
                prog->bpf_func = (void *)image;
                prog->jited = 1;
@@ -2413,3 +2420,10 @@ bool bpf_jit_supports_kfunc_call(void)
 {
        return true;
 }
+
+void *bpf_arch_text_copy(void *dst, void *src, size_t len)
+{
+       if (text_poke_copy(dst, src, len) == NULL)
+               return ERR_PTR(-EINVAL);
+       return dst;
+}
index 1a4f8b2..06514ed 100644 (file)
@@ -2428,10 +2428,15 @@ static int btintel_setup_combined(struct hci_dev *hdev)
 
                        /* Apply the device specific HCI quirks
                         *
-                        * WBS for SdP - SdP and Stp have a same hw_varaint but
-                        * different fw_variant
+                        * WBS for SdP - among the legacy ROM products, only
+                        * SdP supports WBS, but the version information alone
+                        * is not enough to tell here because StP2 and SdP have
+                        * the same hw_variant and fw_variant. So this flag is
+                        * set by the transport driver (btusb) based on the HW
+                        * info (idProduct).
                         */
-                       if (ver.hw_variant == 0x08 && ver.fw_variant == 0x22)
+                       if (!btintel_test_flag(hdev,
+                                              INTEL_ROM_LEGACY_NO_WBS_SUPPORT))
                                set_bit(HCI_QUIRK_WIDEBAND_SPEECH_SUPPORTED,
                                        &hdev->quirks);
 
index c9b24e9..e0060e5 100644 (file)
@@ -152,6 +152,7 @@ enum {
        INTEL_BROKEN_INITIAL_NCMD,
        INTEL_BROKEN_SHUTDOWN_LED,
        INTEL_ROM_LEGACY,
+       INTEL_ROM_LEGACY_NO_WBS_SUPPORT,
 
        __INTEL_NUM_FLAGS,
 };
index c486757..db35b91 100644 (file)
@@ -1,4 +1,4 @@
-/**
+/*
  * Marvell Bluetooth driver: debugfs related functions
  *
  * Copyright (C) 2009, Marvell International Ltd.
index 68378b4..b8ef66f 100644 (file)
@@ -1,4 +1,4 @@
-/**
+/*
  * Marvell BT-over-SDIO driver: SDIO interface related functions.
  *
  * Copyright (C) 2009, Marvell International Ltd.
index 6e7b0c7..fb76d97 100644 (file)
@@ -7,8 +7,12 @@
 
 #define HCI_WMT_MAX_EVENT_SIZE         64
 
+#define BTMTK_WMT_REG_WRITE 0x1
 #define BTMTK_WMT_REG_READ 0x2
 
+#define MT7921_PINMUX_0 0x70005050
+#define MT7921_PINMUX_1 0x70005054
+
 enum {
        BTMTK_WMT_PATCH_DWNLD = 0x1,
        BTMTK_WMT_TEST = 0x2,
@@ -68,6 +72,37 @@ struct btmtk_tci_sleep {
        u8 time_compensation;
 } __packed;
 
+struct btmtk_wakeon {
+       u8 mode;
+       u8 gpo;
+       u8 active_high;
+       __le16 enable_delay;
+       __le16 wakeup_delay;
+} __packed;
+
+struct btmtk_sco {
+       u8 clock_config;
+       u8 transmit_format_config;
+       u8 channel_format_config;
+       u8 channel_select_config;
+} __packed;
+
+struct reg_read_cmd {
+       u8 type;
+       u8 rsv;
+       u8 num;
+       __le32 addr;
+} __packed;
+
+struct reg_write_cmd {
+       u8 type;
+       u8 rsv;
+       u8 num;
+       __le32 addr;
+       __le32 data;
+       __le32 mask;
+} __packed;
+
 struct btmtk_hci_wmt_params {
        u8 op;
        u8 flag;
index b5ea8d3..8be763a 100644 (file)
 
 #define VERSION "0.1"
 
-#define MTKBTSDIO_AUTOSUSPEND_DELAY    8000
+#define MTKBTSDIO_AUTOSUSPEND_DELAY    1000
 
-static bool enable_autosuspend;
+static bool enable_autosuspend = true;
 
 struct btmtksdio_data {
        const char *fwname;
        u16 chipid;
+       bool lp_mbox_supported;
 };
 
 static const struct btmtksdio_data mt7663_data = {
        .fwname = FIRMWARE_MT7663,
        .chipid = 0x7663,
+       .lp_mbox_supported = false,
 };
 
 static const struct btmtksdio_data mt7668_data = {
        .fwname = FIRMWARE_MT7668,
        .chipid = 0x7668,
+       .lp_mbox_supported = false,
 };
 
 static const struct btmtksdio_data mt7921_data = {
        .fwname = FIRMWARE_MT7961,
        .chipid = 0x7921,
+       .lp_mbox_supported = true,
 };
 
 static const struct sdio_device_id btmtksdio_table[] = {
@@ -87,8 +91,17 @@ MODULE_DEVICE_TABLE(sdio, btmtksdio_table);
 #define RX_DONE_INT            BIT(1)
 #define TX_EMPTY               BIT(2)
 #define TX_FIFO_OVERFLOW       BIT(8)
+#define FW_MAILBOX_INT         BIT(15)
+#define INT_MASK               GENMASK(15, 0)
 #define RX_PKT_LEN             GENMASK(31, 16)
 
+#define MTK_REG_CSICR          0xc0
+#define CSICR_CLR_MBOX_ACK     BIT(0)
+#define MTK_REG_PH2DSM0R       0xc4
+#define PH2DSM0R_DRIVER_OWN    BIT(0)
+#define MTK_REG_PD2HRM0R       0xdc
+#define PD2HRM0R_DRV_OWN       BIT(0)
+
 #define MTK_REG_CTDR           0x18
 
 #define MTK_REG_CRDR           0x1c
@@ -100,6 +113,7 @@ MODULE_DEVICE_TABLE(sdio, btmtksdio_table);
 #define BTMTKSDIO_TX_WAIT_VND_EVT      1
 #define BTMTKSDIO_HW_TX_READY          2
 #define BTMTKSDIO_FUNC_ENABLED         3
+#define BTMTKSDIO_PATCH_ENABLED                4
 
 struct mtkbtsdio_hdr {
        __le16  len;
@@ -278,6 +292,78 @@ static u32 btmtksdio_drv_own_query(struct btmtksdio_dev *bdev)
        return sdio_readl(bdev->func, MTK_REG_CHLPCR, NULL);
 }
 
+static u32 btmtksdio_drv_own_query_79xx(struct btmtksdio_dev *bdev)
+{
+       return sdio_readl(bdev->func, MTK_REG_PD2HRM0R, NULL);
+}
+
+static int btmtksdio_fw_pmctrl(struct btmtksdio_dev *bdev)
+{
+       u32 status;
+       int err;
+
+       sdio_claim_host(bdev->func);
+
+       if (bdev->data->lp_mbox_supported &&
+           test_bit(BTMTKSDIO_PATCH_ENABLED, &bdev->tx_state)) {
+               sdio_writel(bdev->func, CSICR_CLR_MBOX_ACK, MTK_REG_CSICR,
+                           &err);
+               err = readx_poll_timeout(btmtksdio_drv_own_query_79xx, bdev,
+                                        status, !(status & PD2HRM0R_DRV_OWN),
+                                        2000, 1000000);
+               if (err < 0) {
+                       bt_dev_err(bdev->hdev, "mailbox ACK not cleared");
+                       goto out;
+               }
+       }
+
+       /* Return ownership to the device */
+       sdio_writel(bdev->func, C_FW_OWN_REQ_SET, MTK_REG_CHLPCR, &err);
+       if (err < 0)
+               goto out;
+
+       err = readx_poll_timeout(btmtksdio_drv_own_query, bdev, status,
+                                !(status & C_COM_DRV_OWN), 2000, 1000000);
+
+out:
+       sdio_release_host(bdev->func);
+
+       if (err < 0)
+               bt_dev_err(bdev->hdev, "Cannot return ownership to device");
+
+       return err;
+}
+
+static int btmtksdio_drv_pmctrl(struct btmtksdio_dev *bdev)
+{
+       u32 status;
+       int err;
+
+       sdio_claim_host(bdev->func);
+
+       /* Get ownership from the device */
+       sdio_writel(bdev->func, C_FW_OWN_REQ_CLR, MTK_REG_CHLPCR, &err);
+       if (err < 0)
+               goto out;
+
+       err = readx_poll_timeout(btmtksdio_drv_own_query, bdev, status,
+                                status & C_COM_DRV_OWN, 2000, 1000000);
+
+       if (!err && bdev->data->lp_mbox_supported &&
+           test_bit(BTMTKSDIO_PATCH_ENABLED, &bdev->tx_state))
+               err = readx_poll_timeout(btmtksdio_drv_own_query_79xx, bdev,
+                                        status, status & PD2HRM0R_DRV_OWN,
+                                        2000, 1000000);
+
+out:
+       sdio_release_host(bdev->func);
+
+       if (err < 0)
+               bt_dev_err(bdev->hdev, "Cannot get ownership from device");
+
+       return err;
+}
+
 static int btmtksdio_recv_event(struct hci_dev *hdev, struct sk_buff *skb)
 {
        struct btmtksdio_dev *bdev = hci_get_drvdata(hdev);
@@ -480,6 +566,13 @@ static void btmtksdio_txrx_work(struct work_struct *work)
                 * FIFO.
                 */
                sdio_writel(bdev->func, int_status, MTK_REG_CHISR, NULL);
+               int_status &= INT_MASK;
+
+               if ((int_status & FW_MAILBOX_INT) &&
+                   bdev->data->chipid == 0x7921) {
+                       sdio_writel(bdev->func, PH2DSM0R_DRIVER_OWN,
+                                   MTK_REG_PH2DSM0R, NULL);
+               }
 
                if (int_status & FW_OWN_BACK_INT)
                        bt_dev_dbg(bdev->hdev, "Get fw own back");
@@ -531,7 +624,7 @@ static void btmtksdio_interrupt(struct sdio_func *func)
 static int btmtksdio_open(struct hci_dev *hdev)
 {
        struct btmtksdio_dev *bdev = hci_get_drvdata(hdev);
-       u32 status, val;
+       u32 val;
        int err;
 
        sdio_claim_host(bdev->func);
@@ -542,18 +635,10 @@ static int btmtksdio_open(struct hci_dev *hdev)
 
        set_bit(BTMTKSDIO_FUNC_ENABLED, &bdev->tx_state);
 
-       /* Get ownership from the device */
-       sdio_writel(bdev->func, C_FW_OWN_REQ_CLR, MTK_REG_CHLPCR, &err);
+       err = btmtksdio_drv_pmctrl(bdev);
        if (err < 0)
                goto err_disable_func;
 
-       err = readx_poll_timeout(btmtksdio_drv_own_query, bdev, status,
-                                status & C_COM_DRV_OWN, 2000, 1000000);
-       if (err < 0) {
-               bt_dev_err(bdev->hdev, "Cannot get ownership from device");
-               goto err_disable_func;
-       }
-
        /* Disable interrupt & mask out all interrupt sources */
        sdio_writel(bdev->func, C_INT_EN_CLR, MTK_REG_CHLPCR, &err);
        if (err < 0)
@@ -623,8 +708,6 @@ err_release_host:
 static int btmtksdio_close(struct hci_dev *hdev)
 {
        struct btmtksdio_dev *bdev = hci_get_drvdata(hdev);
-       u32 status;
-       int err;
 
        sdio_claim_host(bdev->func);
 
@@ -635,13 +718,7 @@ static int btmtksdio_close(struct hci_dev *hdev)
 
        cancel_work_sync(&bdev->txrx_work);
 
-       /* Return ownership to the device */
-       sdio_writel(bdev->func, C_FW_OWN_REQ_SET, MTK_REG_CHLPCR, NULL);
-
-       err = readx_poll_timeout(btmtksdio_drv_own_query, bdev, status,
-                                !(status & C_COM_DRV_OWN), 2000, 1000000);
-       if (err < 0)
-               bt_dev_err(bdev->hdev, "Cannot return ownership to device");
+       btmtksdio_fw_pmctrl(bdev);
 
        clear_bit(BTMTKSDIO_FUNC_ENABLED, &bdev->tx_state);
        sdio_disable_func(bdev->func);
@@ -686,6 +763,7 @@ static int btmtksdio_func_query(struct hci_dev *hdev)
 
 static int mt76xx_setup(struct hci_dev *hdev, const char *fwname)
 {
+       struct btmtksdio_dev *bdev = hci_get_drvdata(hdev);
        struct btmtk_hci_wmt_params wmt_params;
        struct btmtk_tci_sleep tci_sleep;
        struct sk_buff *skb;
@@ -746,6 +824,8 @@ ignore_setup_fw:
                return err;
        }
 
+       set_bit(BTMTKSDIO_PATCH_ENABLED, &bdev->tx_state);
+
 ignore_func_on:
        /* Apply the low power environment setup */
        tci_sleep.mode = 0x5;
@@ -768,6 +848,7 @@ ignore_func_on:
 
 static int mt79xx_setup(struct hci_dev *hdev, const char *fwname)
 {
+       struct btmtksdio_dev *bdev = hci_get_drvdata(hdev);
        struct btmtk_hci_wmt_params wmt_params;
        u8 param = 0x1;
        int err;
@@ -793,19 +874,15 @@ static int mt79xx_setup(struct hci_dev *hdev, const char *fwname)
 
        hci_set_msft_opcode(hdev, 0xFD30);
        hci_set_aosp_capable(hdev);
+       set_bit(BTMTKSDIO_PATCH_ENABLED, &bdev->tx_state);
 
        return err;
 }
 
-static int btsdio_mtk_reg_read(struct hci_dev *hdev, u32 reg, u32 *val)
+static int btmtksdio_mtk_reg_read(struct hci_dev *hdev, u32 reg, u32 *val)
 {
        struct btmtk_hci_wmt_params wmt_params;
-       struct reg_read_cmd {
-               u8 type;
-               u8 rsv;
-               u8 num;
-               __le32 addr;
-       } __packed reg_read = {
+       struct reg_read_cmd reg_read = {
                .type = 1,
                .num = 1,
        };
@@ -821,7 +898,7 @@ static int btsdio_mtk_reg_read(struct hci_dev *hdev, u32 reg, u32 *val)
 
        err = mtk_hci_wmt_sync(hdev, &wmt_params);
        if (err < 0) {
-               bt_dev_err(hdev, "Failed to read reg(%d)", err);
+               bt_dev_err(hdev, "Failed to read reg (%d)", err);
                return err;
        }
 
@@ -830,6 +907,66 @@ static int btsdio_mtk_reg_read(struct hci_dev *hdev, u32 reg, u32 *val)
        return err;
 }
 
+static int btmtksdio_mtk_reg_write(struct hci_dev *hdev, u32 reg, u32 val, u32 mask)
+{
+       struct btmtk_hci_wmt_params wmt_params;
+       const struct reg_write_cmd reg_write = {
+               .type = 1,
+               .num = 1,
+               .addr = cpu_to_le32(reg),
+               .data = cpu_to_le32(val),
+               .mask = cpu_to_le32(mask),
+       };
+       int err, status;
+
+       wmt_params.op = BTMTK_WMT_REGISTER;
+       wmt_params.flag = BTMTK_WMT_REG_WRITE;
+       wmt_params.dlen = sizeof(reg_write);
+       wmt_params.data = &reg_write;
+       wmt_params.status = &status;
+
+       err = mtk_hci_wmt_sync(hdev, &wmt_params);
+       if (err < 0)
+               bt_dev_err(hdev, "Failed to write reg (%d)", err);
+
+       return err;
+}
+
+static int btmtksdio_sco_setting(struct hci_dev *hdev)
+{
+       const struct btmtk_sco sco_setting = {
+               .clock_config = 0x49,
+               .channel_format_config = 0x80,
+       };
+       struct sk_buff *skb;
+       u32 val;
+       int err;
+
+       /* Enable SCO over I2S/PCM for MediaTek chipset */
+       skb = __hci_cmd_sync(hdev, 0xfc72, sizeof(sco_setting),
+                            &sco_setting, HCI_CMD_TIMEOUT);
+       if (IS_ERR(skb))
+               return PTR_ERR(skb);
+
+       kfree_skb(skb);
+
+       err = btmtksdio_mtk_reg_read(hdev, MT7921_PINMUX_0, &val);
+       if (err < 0)
+               return err;
+
+       val |= 0x11000000;
+       err = btmtksdio_mtk_reg_write(hdev, MT7921_PINMUX_0, val, ~0);
+       if (err < 0)
+               return err;
+
+       err = btmtksdio_mtk_reg_read(hdev, MT7921_PINMUX_1, &val);
+       if (err < 0)
+               return err;
+
+       val |= 0x00000101;
+       return btmtksdio_mtk_reg_write(hdev, MT7921_PINMUX_1, val, ~0);
+}
+
 static int btmtksdio_setup(struct hci_dev *hdev)
 {
        struct btmtksdio_dev *bdev = hci_get_drvdata(hdev);
@@ -844,13 +981,13 @@ static int btmtksdio_setup(struct hci_dev *hdev)
 
        switch (bdev->data->chipid) {
        case 0x7921:
-               err = btsdio_mtk_reg_read(hdev, 0x70010200, &dev_id);
+               err = btmtksdio_mtk_reg_read(hdev, 0x70010200, &dev_id);
                if (err < 0) {
                        bt_dev_err(hdev, "Failed to get device id (%d)", err);
                        return err;
                }
 
-               err = btsdio_mtk_reg_read(hdev, 0x80021004, &fw_version);
+               err = btmtksdio_mtk_reg_read(hdev, 0x80021004, &fw_version);
                if (err < 0) {
                        bt_dev_err(hdev, "Failed to get fw version (%d)", err);
                        return err;
@@ -862,6 +999,22 @@ static int btmtksdio_setup(struct hci_dev *hdev)
                err = mt79xx_setup(hdev, fwname);
                if (err < 0)
                        return err;
+
+               err = btmtksdio_fw_pmctrl(bdev);
+               if (err < 0)
+                       return err;
+
+               err = btmtksdio_drv_pmctrl(bdev);
+               if (err < 0)
+                       return err;
+
+               /* Enable SCO over I2S/PCM */
+               err = btmtksdio_sco_setting(hdev);
+               if (err < 0) {
+                       bt_dev_err(hdev, "Failed to enable SCO setting (%d)", err);
+                       return err;
+               }
+
                break;
        case 0x7663:
        case 0x7668:
@@ -958,6 +1111,32 @@ static int btmtksdio_send_frame(struct hci_dev *hdev, struct sk_buff *skb)
        return 0;
 }
 
+static bool btmtksdio_sdio_wakeup(struct hci_dev *hdev)
+{
+       struct btmtksdio_dev *bdev = hci_get_drvdata(hdev);
+       bool may_wakeup = device_may_wakeup(bdev->dev);
+       const struct btmtk_wakeon bt_awake = {
+               .mode = 0x1,
+               .gpo = 0,
+               .active_high = 0x1,
+               .enable_delay = cpu_to_le16(0xc80),
+               .wakeup_delay = cpu_to_le16(0x20),
+       };
+
+       if (may_wakeup && bdev->data->chipid == 0x7921) {
+               struct sk_buff *skb;
+
+               skb = __hci_cmd_sync(hdev, 0xfc27, sizeof(bt_awake),
+                                    &bt_awake, HCI_CMD_TIMEOUT);
+               if (IS_ERR(skb))
+                       may_wakeup = false;
+
+               kfree_skb(skb);
+       }
+
+       return may_wakeup;
+}
+
 static int btmtksdio_probe(struct sdio_func *func,
                           const struct sdio_device_id *id)
 {
@@ -997,6 +1176,7 @@ static int btmtksdio_probe(struct sdio_func *func,
        hdev->setup    = btmtksdio_setup;
        hdev->shutdown = btmtksdio_shutdown;
        hdev->send     = btmtksdio_send_frame;
+       hdev->wakeup   = btmtksdio_sdio_wakeup;
        hdev->set_bdaddr = btmtk_set_bdaddr;
 
        SET_HCIDEV_DEV(hdev, &func->dev);
@@ -1032,7 +1212,11 @@ static int btmtksdio_probe(struct sdio_func *func,
         */
        pm_runtime_put_noidle(bdev->dev);
 
-       return 0;
+       err = device_init_wakeup(bdev->dev, true);
+       if (err)
+               bt_dev_err(hdev, "failed to initialize device wakeup");
+
+       return err;
 }
 
 static void btmtksdio_remove(struct sdio_func *func)
@@ -1058,7 +1242,6 @@ static int btmtksdio_runtime_suspend(struct device *dev)
 {
        struct sdio_func *func = dev_to_sdio_func(dev);
        struct btmtksdio_dev *bdev;
-       u32 status;
        int err;
 
        bdev = sdio_get_drvdata(func);
@@ -1070,18 +1253,9 @@ static int btmtksdio_runtime_suspend(struct device *dev)
 
        sdio_set_host_pm_flags(func, MMC_PM_KEEP_POWER);
 
-       sdio_claim_host(bdev->func);
+       err = btmtksdio_fw_pmctrl(bdev);
 
-       sdio_writel(bdev->func, C_FW_OWN_REQ_SET, MTK_REG_CHLPCR, &err);
-       if (err < 0)
-               goto out;
-
-       err = readx_poll_timeout(btmtksdio_drv_own_query, bdev, status,
-                                !(status & C_COM_DRV_OWN), 2000, 1000000);
-out:
-       bt_dev_info(bdev->hdev, "status (%d) return ownership to device", err);
-
-       sdio_release_host(bdev->func);
+       bt_dev_dbg(bdev->hdev, "status (%d) return ownership to device", err);
 
        return err;
 }
@@ -1090,7 +1264,6 @@ static int btmtksdio_runtime_resume(struct device *dev)
 {
        struct sdio_func *func = dev_to_sdio_func(dev);
        struct btmtksdio_dev *bdev;
-       u32 status;
        int err;
 
        bdev = sdio_get_drvdata(func);
@@ -1100,18 +1273,9 @@ static int btmtksdio_runtime_resume(struct device *dev)
        if (!test_bit(BTMTKSDIO_FUNC_ENABLED, &bdev->tx_state))
                return 0;
 
-       sdio_claim_host(bdev->func);
+       err = btmtksdio_drv_pmctrl(bdev);
 
-       sdio_writel(bdev->func, C_FW_OWN_REQ_CLR, MTK_REG_CHLPCR, &err);
-       if (err < 0)
-               goto out;
-
-       err = readx_poll_timeout(btmtksdio_drv_own_query, bdev, status,
-                                status & C_COM_DRV_OWN, 2000, 1000000);
-out:
-       bt_dev_info(bdev->hdev, "status (%d) get ownership from device", err);
-
-       sdio_release_host(bdev->func);
+       bt_dev_dbg(bdev->hdev, "status (%d) get ownership from device", err);
 
        return err;
 }
index c2bdd1e..c2030f7 100644 (file)
@@ -149,6 +149,14 @@ static const struct id_table ic_id_table[] = {
          .cfg_name = "rtl_bt/rtl8761bu_config" },
 
        /* 8822C with UART interface */
+       { IC_INFO(RTL_ROM_LMP_8822B, 0xc, 0x8, HCI_UART),
+         .config_needed = true,
+         .has_rom_version = true,
+         .has_msft_ext = true,
+         .fw_name  = "rtl_bt/rtl8822cs_fw.bin",
+         .cfg_name = "rtl_bt/rtl8822cs_config" },
+
+       /* 8822C with UART interface */
        { IC_INFO(RTL_ROM_LMP_8822B, 0xc, 0xa, HCI_UART),
          .config_needed = true,
          .has_rom_version = true,
index c30d131..aefa0ee 100644 (file)
@@ -62,6 +62,7 @@ static struct usb_driver btusb_driver;
 #define BTUSB_QCA_WCN6855      0x1000000
 #define BTUSB_INTEL_BROKEN_SHUTDOWN_LED        0x2000000
 #define BTUSB_INTEL_BROKEN_INITIAL_NCMD 0x4000000
+#define BTUSB_INTEL_NO_WBS_SUPPORT     0x8000000
 
 static const struct usb_device_id btusb_table[] = {
        /* Generic Bluetooth USB device */
@@ -385,9 +386,11 @@ static const struct usb_device_id blacklist_table[] = {
        { USB_DEVICE(0x8087, 0x0033), .driver_info = BTUSB_INTEL_COMBINED },
        { USB_DEVICE(0x8087, 0x07da), .driver_info = BTUSB_CSR },
        { USB_DEVICE(0x8087, 0x07dc), .driver_info = BTUSB_INTEL_COMBINED |
+                                                    BTUSB_INTEL_NO_WBS_SUPPORT |
                                                     BTUSB_INTEL_BROKEN_INITIAL_NCMD |
                                                     BTUSB_INTEL_BROKEN_SHUTDOWN_LED },
        { USB_DEVICE(0x8087, 0x0a2a), .driver_info = BTUSB_INTEL_COMBINED |
+                                                    BTUSB_INTEL_NO_WBS_SUPPORT |
                                                     BTUSB_INTEL_BROKEN_SHUTDOWN_LED },
        { USB_DEVICE(0x8087, 0x0a2b), .driver_info = BTUSB_INTEL_COMBINED },
        { USB_DEVICE(0x8087, 0x0aa7), .driver_info = BTUSB_INTEL_COMBINED |
@@ -405,6 +408,8 @@ static const struct usb_device_id blacklist_table[] = {
                                                     BTUSB_WIDEBAND_SPEECH },
 
        /* Realtek 8852AE Bluetooth devices */
+       { USB_DEVICE(0x0bda, 0x2852), .driver_info = BTUSB_REALTEK |
+                                                    BTUSB_WIDEBAND_SPEECH },
        { USB_DEVICE(0x0bda, 0xc852), .driver_info = BTUSB_REALTEK |
                                                     BTUSB_WIDEBAND_SPEECH },
        { USB_DEVICE(0x0bda, 0x385a), .driver_info = BTUSB_REALTEK |
@@ -2057,10 +2062,10 @@ static int btusb_setup_csr(struct hci_dev *hdev)
                 * These controllers are really messed-up.
                 *
                 * 1. Their bulk RX endpoint will never report any data unless
-                * the device was suspended at least once (yes, really).
+                *    the device was suspended at least once (yes, really).
                 * 2. They will not wakeup when autosuspended and receiving data
-                * on their bulk RX endpoint from e.g. a keyboard or mouse
-                * (IOW remote-wakeup support is broken for the bulk endpoint).
+                *    on their bulk RX endpoint from e.g. a keyboard or mouse
+                *    (IOW remote-wakeup support is broken for the bulk endpoint).
                 *
                 * To fix 1. enable runtime-suspend, force-suspend the
                 * HCI and then wake it up by disabling runtime-suspend.
@@ -3737,6 +3742,9 @@ static int btusb_probe(struct usb_interface *intf,
                hdev->send = btusb_send_frame_intel;
                hdev->cmd_timeout = btusb_intel_cmd_timeout;
 
+               if (id->driver_info & BTUSB_INTEL_NO_WBS_SUPPORT)
+                       btintel_set_flag(hdev, INTEL_ROM_LEGACY_NO_WBS_SUPPORT);
+
                if (id->driver_info & BTUSB_INTEL_BROKEN_INITIAL_NCMD)
                        btintel_set_flag(hdev, INTEL_BROKEN_INITIAL_NCMD);
 
index 34286ff..fdf504b 100644 (file)
@@ -966,6 +966,11 @@ static void h5_btrtl_open(struct h5 *h5)
                pm_runtime_enable(&h5->hu->serdev->dev);
        }
 
+       /* The controller needs a reset to start up */
+       gpiod_set_value_cansleep(h5->enable_gpio, 0);
+       gpiod_set_value_cansleep(h5->device_wake_gpio, 0);
+       msleep(100);
+
        /* The controller needs up to 500ms to wakeup */
        gpiod_set_value_cansleep(h5->enable_gpio, 1);
        gpiod_set_value_cansleep(h5->device_wake_gpio, 1);
index eb1e736..4eb420a 100644 (file)
@@ -509,7 +509,7 @@ static int send_command_from_firmware(struct ll_device *lldev,
        return 0;
 }
 
-/**
+/*
  * download_firmware -
  *     internal function which parses through the .bts firmware
  *     script file; interprets only SEND and DELAY actions as of now
index 3b00d82..4cda890 100644 (file)
@@ -305,6 +305,8 @@ int hci_uart_register_device(struct hci_uart *hu,
        if (err)
                return err;
 
+       percpu_init_rwsem(&hu->proto_lock);
+
        err = p->open(hu);
        if (err)
                goto err_open;
@@ -327,7 +329,6 @@ int hci_uart_register_device(struct hci_uart *hu,
 
        INIT_WORK(&hu->init_ready, hci_uart_init_work);
        INIT_WORK(&hu->write_work, hci_uart_write_work);
-       percpu_init_rwsem(&hu->proto_lock);
 
        /* Only when vendor specific setup callback is provided, consider
         * the manufacturer information valid. This avoids filling in the
index 533e476..303c8d3 100644 (file)
@@ -19,6 +19,7 @@
 #include <linux/in.h>
 #include <net/arp.h>
 #include <net/ipv6.h>
+#include <net/ndisc.h>
 #include <asm/byteorder.h>
 #include <net/bonding.h>
 #include <net/bond_alb.h>
@@ -1269,6 +1270,27 @@ unwind:
        return res;
 }
 
+/* determine if the packet is NA or NS */
+static bool alb_determine_nd(struct sk_buff *skb, struct bonding *bond)
+{
+       struct ipv6hdr *ip6hdr;
+       struct icmp6hdr *hdr;
+
+       if (!pskb_network_may_pull(skb, sizeof(*ip6hdr)))
+               return true;
+
+       ip6hdr = ipv6_hdr(skb);
+       if (ip6hdr->nexthdr != IPPROTO_ICMPV6)
+               return false;
+
+       if (!pskb_network_may_pull(skb, sizeof(*ip6hdr) + sizeof(*hdr)))
+               return true;
+
+       hdr = icmp6_hdr(skb);
+       return hdr->icmp6_type == NDISC_NEIGHBOUR_ADVERTISEMENT ||
+               hdr->icmp6_type == NDISC_NEIGHBOUR_SOLICITATION;
+}
+
 /************************ exported alb functions ************************/
 
 int bond_alb_initialize(struct bonding *bond, int rlb_enabled)
@@ -1348,8 +1370,11 @@ struct slave *bond_xmit_tlb_slave_get(struct bonding *bond,
        /* Do not TX balance any multicast or broadcast */
        if (!is_multicast_ether_addr(eth_data->h_dest)) {
                switch (skb->protocol) {
-               case htons(ETH_P_IP):
                case htons(ETH_P_IPV6):
+                       if (alb_determine_nd(skb, bond))
+                               break;
+                       fallthrough;
+               case htons(ETH_P_IP):
                        hash_index = bond_xmit_hash(bond, skb);
                        if (bond->params.tlb_dynamic_lb) {
                                tx_slave = tlb_choose_channel(bond,
@@ -1432,10 +1457,12 @@ struct slave *bond_xmit_alb_slave_get(struct bonding *bond,
                        break;
                }
 
-               if (!pskb_network_may_pull(skb, sizeof(*ip6hdr))) {
+               if (alb_determine_nd(skb, bond)) {
                        do_tx_balance = false;
                        break;
                }
+
+               /* The IPv6 header is pulled by alb_determine_nd */
                /* Additionally, DAD probes should not be tx-balanced as that
                 * will lead to false positives for duplicate addresses and
                 * prevent address configuration from working.
index 238b56d..617c2bf 100644 (file)
@@ -6048,27 +6048,38 @@ static int __net_init bond_net_init(struct net *net)
        return 0;
 }
 
-static void __net_exit bond_net_exit(struct net *net)
+static void __net_exit bond_net_exit_batch(struct list_head *net_list)
 {
-       struct bond_net *bn = net_generic(net, bond_net_id);
-       struct bonding *bond, *tmp_bond;
+       struct bond_net *bn;
+       struct net *net;
        LIST_HEAD(list);
 
-       bond_destroy_sysfs(bn);
+       list_for_each_entry(net, net_list, exit_list) {
+               bn = net_generic(net, bond_net_id);
+               bond_destroy_sysfs(bn);
+       }
 
        /* Kill off any bonds created after unregistering bond rtnl ops */
        rtnl_lock();
-       list_for_each_entry_safe(bond, tmp_bond, &bn->dev_list, bond_list)
-               unregister_netdevice_queue(bond->dev, &list);
+       list_for_each_entry(net, net_list, exit_list) {
+               struct bonding *bond, *tmp_bond;
+
+               bn = net_generic(net, bond_net_id);
+               list_for_each_entry_safe(bond, tmp_bond, &bn->dev_list, bond_list)
+                       unregister_netdevice_queue(bond->dev, &list);
+       }
        unregister_netdevice_many(&list);
        rtnl_unlock();
 
-       bond_destroy_proc_dir(bn);
+       list_for_each_entry(net, net_list, exit_list) {
+               bn = net_generic(net, bond_net_id);
+               bond_destroy_proc_dir(bn);
+       }
 }
 
 static struct pernet_operations bond_net_ops = {
        .init = bond_net_init,
-       .exit = bond_net_exit,
+       .exit_batch = bond_net_exit_batch,
        .id   = &bond_net_id,
        .size = sizeof(struct bond_net),
 };
index 46b150e..cfe37be 100644 (file)
@@ -307,7 +307,6 @@ void __net_init bond_create_proc_dir(struct bond_net *bn)
 }
 
 /* Destroy the bonding directory under /proc/net, if empty.
- * Caller must hold rtnl_lock.
  */
 void __net_exit bond_destroy_proc_dir(struct bond_net *bn)
 {
index c0c9144..8d51c10 100644 (file)
@@ -68,17 +68,7 @@ config NET_DSA_QCA8K
          This enables support for the Qualcomm Atheros QCA8K Ethernet
          switch chips.
 
-config NET_DSA_REALTEK_SMI
-       tristate "Realtek SMI Ethernet switch family support"
-       select NET_DSA_TAG_RTL4_A
-       select NET_DSA_TAG_RTL8_4
-       select FIXED_PHY
-       select IRQ_DOMAIN
-       select REALTEK_PHY
-       select REGMAP
-       help
-         This enables support for the Realtek SMI-based switch
-         chips, currently only RTL8366RB.
+source "drivers/net/dsa/realtek/Kconfig"
 
 config NET_DSA_SMSC_LAN9303
        tristate
index 8da1569..e73838c 100644 (file)
@@ -9,8 +9,6 @@ obj-$(CONFIG_NET_DSA_LANTIQ_GSWIP) += lantiq_gswip.o
 obj-$(CONFIG_NET_DSA_MT7530)   += mt7530.o
 obj-$(CONFIG_NET_DSA_MV88E6060) += mv88e6060.o
 obj-$(CONFIG_NET_DSA_QCA8K)    += qca8k.o
-obj-$(CONFIG_NET_DSA_REALTEK_SMI) += realtek-smi.o
-realtek-smi-objs               := realtek-smi-core.o rtl8366.o rtl8366rb.o rtl8365mb.o
 obj-$(CONFIG_NET_DSA_SMSC_LAN9303) += lan9303-core.o
 obj-$(CONFIG_NET_DSA_SMSC_LAN9303_I2C) += lan9303_i2c.o
 obj-$(CONFIG_NET_DSA_SMSC_LAN9303_MDIO) += lan9303_mdio.o
@@ -23,5 +21,6 @@ obj-y                         += microchip/
 obj-y                          += mv88e6xxx/
 obj-y                          += ocelot/
 obj-y                          += qca/
+obj-y                          += realtek/
 obj-y                          += sja1105/
 obj-y                          += xrs700x/
index 3867f3d..a3b9899 100644 (file)
@@ -2186,7 +2186,7 @@ int b53_eee_init(struct dsa_switch *ds, int port, struct phy_device *phy)
 {
        int ret;
 
-       ret = phy_init_eee(phy, 0);
+       ret = phy_init_eee(phy, false);
        if (ret)
                return 0;
 
index 6afb5db..cf82b1f 100644 (file)
@@ -712,49 +712,25 @@ static u32 bcm_sf2_sw_get_phy_flags(struct dsa_switch *ds, int port)
                       PHY_BRCM_IDDQ_SUSPEND;
 }
 
-static void bcm_sf2_sw_validate(struct dsa_switch *ds, int port,
-                               unsigned long *supported,
-                               struct phylink_link_state *state)
+static void bcm_sf2_sw_get_caps(struct dsa_switch *ds, int port,
+                               struct phylink_config *config)
 {
+       unsigned long *interfaces = config->supported_interfaces;
        struct bcm_sf2_priv *priv = bcm_sf2_to_priv(ds);
-       __ETHTOOL_DECLARE_LINK_MODE_MASK(mask) = { 0, };
-
-       if (!phy_interface_mode_is_rgmii(state->interface) &&
-           state->interface != PHY_INTERFACE_MODE_MII &&
-           state->interface != PHY_INTERFACE_MODE_REVMII &&
-           state->interface != PHY_INTERFACE_MODE_GMII &&
-           state->interface != PHY_INTERFACE_MODE_INTERNAL &&
-           state->interface != PHY_INTERFACE_MODE_MOCA) {
-               linkmode_zero(supported);
-               if (port != core_readl(priv, CORE_IMP0_PRT_ID))
-                       dev_err(ds->dev,
-                               "Unsupported interface: %d for port %d\n",
-                               state->interface, port);
-               return;
-       }
-
-       /* Allow all the expected bits */
-       phylink_set(mask, Autoneg);
-       phylink_set_port_modes(mask);
-       phylink_set(mask, Pause);
-       phylink_set(mask, Asym_Pause);
 
-       /* With the exclusion of MII and Reverse MII, we support Gigabit,
-        * including Half duplex
-        */
-       if (state->interface != PHY_INTERFACE_MODE_MII &&
-           state->interface != PHY_INTERFACE_MODE_REVMII) {
-               phylink_set(mask, 1000baseT_Full);
-               phylink_set(mask, 1000baseT_Half);
+       if (priv->int_phy_mask & BIT(port)) {
+               __set_bit(PHY_INTERFACE_MODE_INTERNAL, interfaces);
+       } else if (priv->moca_port == port) {
+               __set_bit(PHY_INTERFACE_MODE_MOCA, interfaces);
+       } else {
+               __set_bit(PHY_INTERFACE_MODE_MII, interfaces);
+               __set_bit(PHY_INTERFACE_MODE_REVMII, interfaces);
+               __set_bit(PHY_INTERFACE_MODE_GMII, interfaces);
+               phy_interface_set_rgmii(interfaces);
        }
 
-       phylink_set(mask, 10baseT_Half);
-       phylink_set(mask, 10baseT_Full);
-       phylink_set(mask, 100baseT_Half);
-       phylink_set(mask, 100baseT_Full);
-
-       linkmode_and(supported, supported, mask);
-       linkmode_and(state->advertising, state->advertising, mask);
+       config->mac_capabilities = MAC_ASYM_PAUSE | MAC_SYM_PAUSE |
+               MAC_10 | MAC_100 | MAC_1000;
 }
 
 static void bcm_sf2_sw_mac_config(struct dsa_switch *ds, int port,
@@ -1221,7 +1197,7 @@ static const struct dsa_switch_ops bcm_sf2_ops = {
        .get_sset_count         = bcm_sf2_sw_get_sset_count,
        .get_ethtool_phy_stats  = b53_get_ethtool_phy_stats,
        .get_phy_flags          = bcm_sf2_sw_get_phy_flags,
-       .phylink_validate       = bcm_sf2_sw_validate,
+       .phylink_get_caps       = bcm_sf2_sw_get_caps,
        .phylink_mac_config     = bcm_sf2_sw_mac_config,
        .phylink_mac_link_down  = bcm_sf2_sw_mac_link_down,
        .phylink_mac_link_up    = bcm_sf2_sw_mac_link_up,
index 991b9c6..5dc9899 100644 (file)
@@ -1461,27 +1461,22 @@ static int ksz8_setup(struct dsa_switch *ds)
        return 0;
 }
 
-static void ksz8_validate(struct dsa_switch *ds, int port,
-                         unsigned long *supported,
-                         struct phylink_link_state *state)
+static void ksz8_get_caps(struct dsa_switch *ds, int port,
+                         struct phylink_config *config)
 {
-       __ETHTOOL_DECLARE_LINK_MODE_MASK(mask) = { 0, };
        struct ksz_device *dev = ds->priv;
 
        if (port == dev->cpu_port) {
-               if (state->interface != PHY_INTERFACE_MODE_RMII &&
-                   state->interface != PHY_INTERFACE_MODE_MII &&
-                   state->interface != PHY_INTERFACE_MODE_NA)
-                       goto unsupported;
+               __set_bit(PHY_INTERFACE_MODE_RMII,
+                         config->supported_interfaces);
+               __set_bit(PHY_INTERFACE_MODE_MII,
+                         config->supported_interfaces);
        } else {
-               if (state->interface != PHY_INTERFACE_MODE_INTERNAL &&
-                   state->interface != PHY_INTERFACE_MODE_NA)
-                       goto unsupported;
+               __set_bit(PHY_INTERFACE_MODE_INTERNAL,
+                         config->supported_interfaces);
        }
 
-       /* Allow all the expected bits */
-       phylink_set_port_modes(mask);
-       phylink_set(mask, Autoneg);
+       config->mac_capabilities = MAC_10 | MAC_100;
 
        /* Silicon Errata Sheet (DS80000830A):
         * "Port 1 does not respond to received flow control PAUSE frames"
@@ -1489,27 +1484,11 @@ static void ksz8_validate(struct dsa_switch *ds, int port,
         * switches.
         */
        if (!ksz_is_ksz88x3(dev) || port)
-               phylink_set(mask, Pause);
+               config->mac_capabilities |= MAC_SYM_PAUSE;
 
        /* Asym pause is not supported on KSZ8863 and KSZ8873 */
        if (!ksz_is_ksz88x3(dev))
-               phylink_set(mask, Asym_Pause);
-
-       /* 10M and 100M are only supported */
-       phylink_set(mask, 10baseT_Half);
-       phylink_set(mask, 10baseT_Full);
-       phylink_set(mask, 100baseT_Half);
-       phylink_set(mask, 100baseT_Full);
-
-       linkmode_and(supported, supported, mask);
-       linkmode_and(state->advertising, state->advertising, mask);
-
-       return;
-
-unsupported:
-       linkmode_zero(supported);
-       dev_err(ds->dev, "Unsupported interface: %s, port: %d\n",
-               phy_modes(state->interface), port);
+               config->mac_capabilities |= MAC_ASYM_PAUSE;
 }
 
 static const struct dsa_switch_ops ksz8_switch_ops = {
@@ -1518,7 +1497,7 @@ static const struct dsa_switch_ops ksz8_switch_ops = {
        .setup                  = ksz8_setup,
        .phy_read               = ksz_phy_read16,
        .phy_write              = ksz_phy_write16,
-       .phylink_validate       = ksz8_validate,
+       .phylink_get_caps       = ksz8_get_caps,
        .phylink_mac_link_down  = ksz_mac_link_down,
        .port_enable            = ksz_enable_port,
        .get_strings            = ksz8_get_strings,
index 353b5f9..a85d990 100644 (file)
@@ -222,9 +222,12 @@ static int ksz9477_reset_switch(struct ksz_device *dev)
                           (BROADCAST_STORM_VALUE *
                           BROADCAST_STORM_PROT_RATE) / 100);
 
-       if (dev->synclko_125)
-               ksz_write8(dev, REG_SW_GLOBAL_OUTPUT_CTRL__1,
-                          SW_ENABLE_REFCLKO | SW_REFCLKO_IS_125MHZ);
+       data8 = SW_ENABLE_REFCLKO;
+       if (dev->synclko_disable)
+               data8 = 0;
+       else if (dev->synclko_125)
+               data8 = SW_ENABLE_REFCLKO | SW_REFCLKO_IS_125MHZ;
+       ksz_write8(dev, REG_SW_GLOBAL_OUTPUT_CTRL__1, data8);
 
        return 0;
 }
index 55dbda0..7e33ec7 100644 (file)
@@ -434,6 +434,12 @@ int ksz_switch_register(struct ksz_device *dev,
                        }
                dev->synclko_125 = of_property_read_bool(dev->dev->of_node,
                                                         "microchip,synclko-125");
+               dev->synclko_disable = of_property_read_bool(dev->dev->of_node,
+                                                            "microchip,synclko-disable");
+               if (dev->synclko_125 && dev->synclko_disable) {
+                       dev_err(dev->dev, "inconsistent synclko settings\n");
+                       return -EINVAL;
+               }
        }
 
        ret = dsa_register_switch(dev->ds);
index df8ae59..3db63f6 100644 (file)
@@ -75,6 +75,7 @@ struct ksz_device {
        u32 regs_size;
        bool phy_errata_9477;
        bool synclko_125;
+       bool synclko_disable;
 
        struct vlan_table *vlan_cache;
 
index ff3c267..f74f25f 100644 (file)
@@ -2846,7 +2846,7 @@ static void mt753x_phylink_mac_link_up(struct dsa_switch *ds, int port,
                        mcr |= PMCR_RX_FC_EN;
        }
 
-       if (mode == MLO_AN_PHY && phydev && phy_init_eee(phydev, 0) >= 0) {
+       if (mode == MLO_AN_PHY && phydev && phy_init_eee(phydev, false) >= 0) {
                switch (speed) {
                case SPEED_1000:
                        mcr |= PMCR_FORCE_EEE1G;
index 8530dbe..5344d0c 100644 (file)
@@ -86,12 +86,16 @@ int mv88e6xxx_write(struct mv88e6xxx_chip *chip, int addr, int reg, u16 val)
 int mv88e6xxx_wait_mask(struct mv88e6xxx_chip *chip, int addr, int reg,
                        u16 mask, u16 val)
 {
+       const unsigned long timeout = jiffies + msecs_to_jiffies(50);
        u16 data;
        int err;
        int i;
 
-       /* There's no bus specific operation to wait for a mask */
-       for (i = 0; i < 16; i++) {
+       /* There's no bus specific operation to wait for a mask. Even
+        * if the initial poll takes longer than 50ms, always do at
+        * least one more attempt.
+        */
+       for (i = 0; time_before(jiffies, timeout) || (i < 2); i++) {
                err = mv88e6xxx_read(chip, addr, reg, &data);
                if (err)
                        return err;
@@ -99,7 +103,10 @@ int mv88e6xxx_wait_mask(struct mv88e6xxx_chip *chip, int addr, int reg,
                if ((data & mask) == val)
                        return 0;
 
-               usleep_range(1000, 2000);
+               if (i < 2)
+                       cpu_relax();
+               else
+                       usleep_range(1000, 2000);
        }
 
        dev_err(chip->dev, "Timeout while waiting for switch\n");
@@ -563,133 +570,249 @@ static int mv88e6xxx_serdes_pcs_link_up(struct mv88e6xxx_chip *chip, int port,
        return 0;
 }
 
-static void mv88e6065_phylink_validate(struct mv88e6xxx_chip *chip, int port,
-                                      unsigned long *mask,
-                                      struct phylink_link_state *state)
+static const u8 mv88e6185_phy_interface_modes[] = {
+       [MV88E6185_PORT_STS_CMODE_GMII_FD]       = PHY_INTERFACE_MODE_GMII,
+       [MV88E6185_PORT_STS_CMODE_MII_100_FD_PS] = PHY_INTERFACE_MODE_MII,
+       [MV88E6185_PORT_STS_CMODE_MII_100]       = PHY_INTERFACE_MODE_MII,
+       [MV88E6185_PORT_STS_CMODE_MII_10]        = PHY_INTERFACE_MODE_MII,
+       [MV88E6185_PORT_STS_CMODE_SERDES]        = PHY_INTERFACE_MODE_1000BASEX,
+       [MV88E6185_PORT_STS_CMODE_1000BASE_X]    = PHY_INTERFACE_MODE_1000BASEX,
+       [MV88E6185_PORT_STS_CMODE_PHY]           = PHY_INTERFACE_MODE_SGMII,
+};
+
+static void mv88e6185_phylink_get_caps(struct mv88e6xxx_chip *chip, int port,
+                                      struct phylink_config *config)
 {
-       if (!phy_interface_mode_is_8023z(state->interface)) {
-               /* 10M and 100M are only supported in non-802.3z mode */
-               phylink_set(mask, 10baseT_Half);
-               phylink_set(mask, 10baseT_Full);
-               phylink_set(mask, 100baseT_Half);
-               phylink_set(mask, 100baseT_Full);
-       }
+       u8 cmode = chip->ports[port].cmode;
+
+       if (cmode < ARRAY_SIZE(mv88e6185_phy_interface_modes) &&
+           mv88e6185_phy_interface_modes[cmode])
+               __set_bit(mv88e6185_phy_interface_modes[cmode],
+                         config->supported_interfaces);
+
+       config->mac_capabilities = MAC_SYM_PAUSE | MAC_10 | MAC_100 |
+                                  MAC_1000FD;
 }
 
-static void mv88e6185_phylink_validate(struct mv88e6xxx_chip *chip, int port,
-                                      unsigned long *mask,
-                                      struct phylink_link_state *state)
-{
-       /* FIXME: if the port is in 1000Base-X mode, then it only supports
-        * 1000M FD speeds.  In this case, CMODE will indicate 5.
+static const u8 mv88e6xxx_phy_interface_modes[] = {
+       [MV88E6XXX_PORT_STS_CMODE_MII_PHY]      = PHY_INTERFACE_MODE_MII,
+       [MV88E6XXX_PORT_STS_CMODE_MII]          = PHY_INTERFACE_MODE_MII,
+       [MV88E6XXX_PORT_STS_CMODE_GMII]         = PHY_INTERFACE_MODE_GMII,
+       [MV88E6XXX_PORT_STS_CMODE_RMII_PHY]     = PHY_INTERFACE_MODE_RMII,
+       [MV88E6XXX_PORT_STS_CMODE_RMII]         = PHY_INTERFACE_MODE_RMII,
+       [MV88E6XXX_PORT_STS_CMODE_100BASEX]     = PHY_INTERFACE_MODE_100BASEX,
+       [MV88E6XXX_PORT_STS_CMODE_1000BASEX]    = PHY_INTERFACE_MODE_1000BASEX,
+       [MV88E6XXX_PORT_STS_CMODE_SGMII]        = PHY_INTERFACE_MODE_SGMII,
+       /* higher interface modes are not needed here, since ports supporting
+        * them are writable, and so the supported interfaces are filled in the
+        * corresponding .phylink_get_caps() implementation below
         */
-       phylink_set(mask, 1000baseT_Full);
-       phylink_set(mask, 1000baseX_Full);
+};
 
-       mv88e6065_phylink_validate(chip, port, mask, state);
+static void mv88e6xxx_translate_cmode(u8 cmode, unsigned long *supported)
+{
+       if (cmode < ARRAY_SIZE(mv88e6xxx_phy_interface_modes) &&
+           mv88e6xxx_phy_interface_modes[cmode])
+               __set_bit(mv88e6xxx_phy_interface_modes[cmode], supported);
+       else if (cmode == MV88E6XXX_PORT_STS_CMODE_RGMII)
+               phy_interface_set_rgmii(supported);
 }
 
-static void mv88e6341_phylink_validate(struct mv88e6xxx_chip *chip, int port,
-                                      unsigned long *mask,
-                                      struct phylink_link_state *state)
+static void mv88e6250_phylink_get_caps(struct mv88e6xxx_chip *chip, int port,
+                                      struct phylink_config *config)
 {
-       if (port >= 5)
-               phylink_set(mask, 2500baseX_Full);
+       unsigned long *supported = config->supported_interfaces;
 
-       /* No ethtool bits for 200Mbps */
-       phylink_set(mask, 1000baseT_Full);
-       phylink_set(mask, 1000baseX_Full);
+       /* Translate the default cmode */
+       mv88e6xxx_translate_cmode(chip->ports[port].cmode, supported);
 
-       mv88e6065_phylink_validate(chip, port, mask, state);
+       config->mac_capabilities = MAC_SYM_PAUSE | MAC_10 | MAC_100;
 }
 
-static void mv88e6352_phylink_validate(struct mv88e6xxx_chip *chip, int port,
-                                      unsigned long *mask,
-                                      struct phylink_link_state *state)
+static int mv88e6352_get_port4_serdes_cmode(struct mv88e6xxx_chip *chip)
 {
-       /* No ethtool bits for 200Mbps */
-       phylink_set(mask, 1000baseT_Full);
-       phylink_set(mask, 1000baseX_Full);
+       u16 reg, val;
+       int err;
+
+       err = mv88e6xxx_port_read(chip, 4, MV88E6XXX_PORT_STS, &reg);
+       if (err)
+               return err;
+
+       /* If PHY_DETECT is zero, then we are not in auto-media mode */
+       if (!(reg & MV88E6XXX_PORT_STS_PHY_DETECT))
+               return 0xf;
+
+       val = reg & ~MV88E6XXX_PORT_STS_PHY_DETECT;
+       err = mv88e6xxx_port_write(chip, 4, MV88E6XXX_PORT_STS, val);
+       if (err)
+               return err;
+
+       err = mv88e6xxx_port_read(chip, 4, MV88E6XXX_PORT_STS, &val);
+       if (err)
+               return err;
+
+       /* Restore PHY_DETECT value */
+       err = mv88e6xxx_port_write(chip, 4, MV88E6XXX_PORT_STS, reg);
+       if (err)
+               return err;
 
-       mv88e6065_phylink_validate(chip, port, mask, state);
+       return val & MV88E6XXX_PORT_STS_CMODE_MASK;
 }
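
The serdes cmode on port 4 is only observable while PHY_DETECT is cleared, so the helper saves the status register, clears the bit, reads back, and restores the original value. A mocked sketch of that save/modify/restore probe, with reg_read()/reg_write() as stand-ins for the port register accessors:

#include <stdint.h>
#include <stdio.h>

#define PHY_DETECT  0x1000
#define CMODE_MASK  0x000f

static uint16_t port_sts = PHY_DETECT | 0x9;    /* fake hardware register */

static int reg_read(uint16_t *val)  { *val = port_sts; return 0; }
static int reg_write(uint16_t val)  { port_sts = val;  return 0; }

/* Returns the serdes cmode, or 0xf if the port is not in automedia mode */
static int get_serdes_cmode(void)
{
        uint16_t saved, val;

        if (reg_read(&saved))
                return -1;
        if (!(saved & PHY_DETECT))
                return 0xf;

        reg_write(saved & ~PHY_DETECT);  /* expose the serdes cmode */
        reg_read(&val);
        reg_write(saved);                /* restore PHY_DETECT */

        return val & CMODE_MASK;
}

int main(void)
{
        printf("cmode = 0x%x\n", get_serdes_cmode());
        return 0;
}
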
 
-static void mv88e6390_phylink_validate(struct mv88e6xxx_chip *chip, int port,
-                                      unsigned long *mask,
-                                      struct phylink_link_state *state)
+static void mv88e6352_phylink_get_caps(struct mv88e6xxx_chip *chip, int port,
+                                      struct phylink_config *config)
 {
-       if (port >= 9) {
-               phylink_set(mask, 2500baseX_Full);
-               phylink_set(mask, 2500baseT_Full);
+       unsigned long *supported = config->supported_interfaces;
+       int err, cmode;
+
+       /* Translate the default cmode */
+       mv88e6xxx_translate_cmode(chip->ports[port].cmode, supported);
+
+       config->mac_capabilities = MAC_SYM_PAUSE | MAC_10 | MAC_100 |
+                                  MAC_1000FD;
+
+       /* Port 4 supports automedia if the serdes is associated with it. */
+       if (port == 4) {
+               mv88e6xxx_reg_lock(chip);
+               err = mv88e6352_g2_scratch_port_has_serdes(chip, port);
+               if (err < 0)
+                       dev_err(chip->dev, "p%d: failed to read scratch\n",
+                               port);
+               if (err <= 0)
+                       goto unlock;
+
+               cmode = mv88e6352_get_port4_serdes_cmode(chip);
+               if (cmode < 0)
+                       dev_err(chip->dev, "p%d: failed to read serdes cmode\n",
+                               port);
+               else
+                       mv88e6xxx_translate_cmode(cmode, supported);
+unlock:
+               mv88e6xxx_reg_unlock(chip);
        }
+}
+
+static void mv88e6341_phylink_get_caps(struct mv88e6xxx_chip *chip, int port,
+                                      struct phylink_config *config)
+{
+       unsigned long *supported = config->supported_interfaces;
+
+       /* Translate the default cmode */
+       mv88e6xxx_translate_cmode(chip->ports[port].cmode, supported);
 
        /* No ethtool bits for 200Mbps */
-       phylink_set(mask, 1000baseT_Full);
-       phylink_set(mask, 1000baseX_Full);
+       config->mac_capabilities = MAC_SYM_PAUSE | MAC_10 | MAC_100 |
+                                  MAC_1000FD;
+
+       /* The C_Mode field is programmable on port 5 */
+       if (port == 5) {
+               __set_bit(PHY_INTERFACE_MODE_SGMII, supported);
+               __set_bit(PHY_INTERFACE_MODE_1000BASEX, supported);
+               __set_bit(PHY_INTERFACE_MODE_2500BASEX, supported);
 
-       mv88e6065_phylink_validate(chip, port, mask, state);
+               config->mac_capabilities |= MAC_2500FD;
+       }
 }
 
-static void mv88e6390x_phylink_validate(struct mv88e6xxx_chip *chip, int port,
-                                       unsigned long *mask,
-                                       struct phylink_link_state *state)
+static void mv88e6390_phylink_get_caps(struct mv88e6xxx_chip *chip, int port,
+                                      struct phylink_config *config)
 {
-       if (port >= 9) {
-               phylink_set(mask, 10000baseT_Full);
-               phylink_set(mask, 10000baseKR_Full);
+       unsigned long *supported = config->supported_interfaces;
+
+       /* Translate the default cmode */
+       mv88e6xxx_translate_cmode(chip->ports[port].cmode, supported);
+
+       /* No ethtool bits for 200Mbps */
+       config->mac_capabilities = MAC_SYM_PAUSE | MAC_10 | MAC_100 |
+                                  MAC_1000FD;
+
+       /* The C_Mode field is programmable on ports 9 and 10 */
+       if (port == 9 || port == 10) {
+               __set_bit(PHY_INTERFACE_MODE_SGMII, supported);
+               __set_bit(PHY_INTERFACE_MODE_1000BASEX, supported);
+               __set_bit(PHY_INTERFACE_MODE_2500BASEX, supported);
+
+               config->mac_capabilities |= MAC_2500FD;
        }
+}
+
+static void mv88e6390x_phylink_get_caps(struct mv88e6xxx_chip *chip, int port,
+                                       struct phylink_config *config)
+{
+       unsigned long *supported = config->supported_interfaces;
 
-       mv88e6390_phylink_validate(chip, port, mask, state);
+       mv88e6390_phylink_get_caps(chip, port, config);
+
+       /* For the 6x90X, ports 2-7 can be in automedia mode.
+        * (Note that 6x90 doesn't support RXAUI or XAUI).
+        *
+        * Port 2 can also support 1000BASE-X in automedia mode if port 9 is
+        * configured for 1000BASE-X, SGMII or 2500BASE-X.
+        * Ports 3-4 can also support 1000BASE-X in automedia mode if port 9 is
+        * configured for RXAUI, 1000BASE-X, SGMII or 2500BASE-X.
+        *
+        * Port 5 can also support 1000BASE-X in automedia mode if port 10 is
+        * configured for 1000BASE-X, SGMII or 2500BASE-X.
+        * Ports 6-7 can also support 1000BASE-X in automedia mode if port 10 is
+        * configured for RXAUI, 1000BASE-X, SGMII or 2500BASE-X.
+        *
+        * For now, be permissive (as the old code was) and allow 1000BASE-X
+        * on ports 2..7.
+        */
+       if (port >= 2 && port <= 7)
+               __set_bit(PHY_INTERFACE_MODE_1000BASEX, supported);
+
+       /* The C_Mode field can also be programmed for 10G speeds */
+       if (port == 9 || port == 10) {
+               __set_bit(PHY_INTERFACE_MODE_XAUI, supported);
+               __set_bit(PHY_INTERFACE_MODE_RXAUI, supported);
+
+               config->mac_capabilities |= MAC_10000FD;
+       }
 }
 
-static void mv88e6393x_phylink_validate(struct mv88e6xxx_chip *chip, int port,
-                                       unsigned long *mask,
-                                       struct phylink_link_state *state)
+static void mv88e6393x_phylink_get_caps(struct mv88e6xxx_chip *chip, int port,
+                                       struct phylink_config *config)
 {
+       unsigned long *supported = config->supported_interfaces;
        bool is_6191x =
                chip->info->prod_num == MV88E6XXX_PORT_SWITCH_ID_PROD_6191X;
 
-       if (((port == 0 || port == 9) && !is_6191x) || port == 10) {
-               phylink_set(mask, 10000baseT_Full);
-               phylink_set(mask, 10000baseKR_Full);
-               phylink_set(mask, 10000baseCR_Full);
-               phylink_set(mask, 10000baseSR_Full);
-               phylink_set(mask, 10000baseLR_Full);
-               phylink_set(mask, 10000baseLRM_Full);
-               phylink_set(mask, 10000baseER_Full);
-               phylink_set(mask, 5000baseT_Full);
-               phylink_set(mask, 2500baseX_Full);
-               phylink_set(mask, 2500baseT_Full);
-       }
+       mv88e6xxx_translate_cmode(chip->ports[port].cmode, supported);
+
+       config->mac_capabilities = MAC_SYM_PAUSE | MAC_10 | MAC_100 |
+                                  MAC_1000FD;
 
-       phylink_set(mask, 1000baseT_Full);
-       phylink_set(mask, 1000baseX_Full);
+       /* The C_Mode field can be programmed for ports 0, 9 and 10 */
+       if (port == 0 || port == 9 || port == 10) {
+               __set_bit(PHY_INTERFACE_MODE_SGMII, supported);
+               __set_bit(PHY_INTERFACE_MODE_1000BASEX, supported);
 
-       mv88e6065_phylink_validate(chip, port, mask, state);
+               /* 6191X supports >1G modes only on port 10 */
+               if (!is_6191x || port == 10) {
+                       __set_bit(PHY_INTERFACE_MODE_2500BASEX, supported);
+                       __set_bit(PHY_INTERFACE_MODE_5GBASER, supported);
+                       __set_bit(PHY_INTERFACE_MODE_10GBASER, supported);
+                       /* FIXME: USXGMII is not supported yet */
+                       /* __set_bit(PHY_INTERFACE_MODE_USXGMII, supported); */
+
+                       config->mac_capabilities |= MAC_2500FD | MAC_5000FD |
+                               MAC_10000FD;
+               }
+       }
 }
 
-static void mv88e6xxx_validate(struct dsa_switch *ds, int port,
-                              unsigned long *supported,
-                              struct phylink_link_state *state)
+static void mv88e6xxx_get_caps(struct dsa_switch *ds, int port,
+                              struct phylink_config *config)
 {
-       __ETHTOOL_DECLARE_LINK_MODE_MASK(mask) = { 0, };
        struct mv88e6xxx_chip *chip = ds->priv;
 
-       /* Allow all the expected bits */
-       phylink_set(mask, Autoneg);
-       phylink_set(mask, Pause);
-       phylink_set_port_modes(mask);
+       chip->info->ops->phylink_get_caps(chip, port, config);
 
-       if (chip->info->ops->phylink_validate)
-               chip->info->ops->phylink_validate(chip, port, mask, state);
-
-       linkmode_and(supported, supported, mask);
-       linkmode_and(state->advertising, state->advertising, mask);
-
-       /* We can only operate at 2500BaseX or 1000BaseX.  If requested
-        * to advertise both, only report advertising at 2500BaseX.
-        */
-       phylink_helper_basex_speed(state);
+       /* Internal ports need GMII for PHYLIB */
+       if (mv88e6xxx_phy_is_internal(ds, port))
+               __set_bit(PHY_INTERFACE_MODE_GMII,
+                         config->supported_interfaces);
 }
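
With the conversion complete, phylink_get_caps is a mandatory chip op: the DSA-level wrapper dispatches to it unconditionally and then layers the GMII requirement for internal PHYs on top. A minimal sketch of that dispatch-then-augment shape; all names here are illustrative:

#include <stdbool.h>
#include <stdio.h>

struct caps { unsigned int interfaces; };       /* toy capability set */

struct chip_ops {
        void (*get_caps)(int port, struct caps *c);     /* mandatory hook */
};

static void chip_a_get_caps(int port, struct caps *c)
{
        c->interfaces |= 1 << 0;                /* e.g. SGMII */
}

static const struct chip_ops chip_a = { .get_caps = chip_a_get_caps };

static bool port_is_internal(int port) { return port < 4; }

static void get_caps(const struct chip_ops *ops, int port, struct caps *c)
{
        ops->get_caps(port, c);         /* per-chip capabilities first */
        if (port_is_internal(port))
                c->interfaces |= 1 << 1; /* e.g. GMII for internal PHYs */
}

int main(void)
{
        struct caps c = { 0 };
        get_caps(&chip_a, 2, &c);
        printf("interfaces = 0x%x\n", c.interfaces);    /* 0x3 */
        return 0;
}
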
 
 static void mv88e6xxx_mac_config(struct dsa_switch *ds, int port,
@@ -1283,8 +1406,15 @@ static u16 mv88e6xxx_port_vlan(struct mv88e6xxx_chip *chip, int dev, int port)
 
        pvlan = 0;
 
-       /* Frames from user ports can egress any local DSA links and CPU ports,
-        * as well as any local member of their bridge group.
+       /* Frames from standalone user ports can only egress on the
+        * upstream port.
+        */
+       if (!dsa_port_bridge_dev_get(dp))
+               return BIT(dsa_switch_upstream_port(ds));
+
+       /* Frames from bridged user ports can egress any local DSA
+        * links and CPU ports, as well as any local member of their
+        * bridge group.
         */
        dsa_switch_for_each_port(other_dp, ds)
                if (other_dp->type == DSA_PORT_TYPE_CPU ||
@@ -1616,21 +1746,11 @@ static int mv88e6xxx_fid_map_vlan(struct mv88e6xxx_chip *chip,
 
 int mv88e6xxx_fid_map(struct mv88e6xxx_chip *chip, unsigned long *fid_bitmap)
 {
-       int i, err;
-       u16 fid;
-
        bitmap_zero(fid_bitmap, MV88E6XXX_N_FID);
 
-       /* Set every FID bit used by the (un)bridged ports */
-       for (i = 0; i < mv88e6xxx_num_ports(chip); ++i) {
-               err = mv88e6xxx_port_get_fid(chip, i, &fid);
-               if (err)
-                       return err;
-
-               set_bit(fid, fid_bitmap);
-       }
-
-       /* Set every FID bit used by the VLAN entries */
+       /* Every FID has an associated VID, so walking the VTU
+        * will discover the full set of FIDs in use.
+        */
        return mv88e6xxx_vtu_walk(chip, mv88e6xxx_fid_map_vlan, fid_bitmap);
 }
 
@@ -1643,10 +1763,7 @@ static int mv88e6xxx_atu_new(struct mv88e6xxx_chip *chip, u16 *fid)
        if (err)
                return err;
 
-       /* The reset value 0x000 is used to indicate that multiple address
-        * databases are not needed. Return the next positive available.
-        */
-       *fid = find_next_zero_bit(fid_bitmap, MV88E6XXX_N_FID, 1);
+       *fid = find_first_zero_bit(fid_bitmap, MV88E6XXX_N_FID);
        if (unlikely(*fid >= mv88e6xxx_num_databases(chip)))
                return -ENOSPC;
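
Since the VTU walk now discovers every FID in use, including MV88E6XXX_FID_STANDALONE once the standalone VID is loaded, the allocator can simply take the first clear bit. A userspace sketch of bitmap-based ID allocation under that assumption, with a 64-entry bitmap standing in for the FID space:

#include <stdint.h>
#include <stdio.h>

#define N_IDS 64

static uint64_t used;                   /* bit n set => ID n in use */

static int alloc_id(void)
{
        int id;

        for (id = 0; id < N_IDS; id++)  /* find_first_zero_bit() analogue */
                if (!(used & (1ULL << id)))
                        break;
        if (id >= N_IDS)
                return -1;              /* -ENOSPC analogue */

        used |= 1ULL << id;
        return id;
}

int main(void)
{
        used = 0x7;             /* IDs 0..2 already discovered in use */
        printf("new id = %d\n", alloc_id());    /* prints 3 */
        return 0;
}
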
 
@@ -2138,6 +2255,9 @@ static int mv88e6xxx_port_vlan_join(struct mv88e6xxx_chip *chip, int port,
        if (!vlan.valid) {
                memset(&vlan, 0, sizeof(vlan));
 
+               if (vid == MV88E6XXX_VID_STANDALONE)
+                       vlan.policy = true;
+
                err = mv88e6xxx_atu_new(chip, &vlan.fid);
                if (err)
                        return err;
@@ -2480,6 +2600,10 @@ static int mv88e6xxx_port_bridge_join(struct dsa_switch *ds, int port,
        if (err)
                goto unlock;
 
+       err = mv88e6xxx_port_set_map_da(chip, port, true);
+       if (err)
+               goto unlock;
+
        err = mv88e6xxx_port_commit_pvid(chip, port);
        if (err)
                goto unlock;
@@ -2514,6 +2638,12 @@ static void mv88e6xxx_port_bridge_leave(struct dsa_switch *ds, int port,
            mv88e6xxx_port_vlan_map(chip, port))
                dev_err(ds->dev, "failed to remap in-chip Port VLAN\n");
 
+       err = mv88e6xxx_port_set_map_da(chip, port, false);
+       if (err)
+               dev_err(ds->dev,
+                       "port %d failed to restore map-DA: %pe\n",
+                       port, ERR_PTR(err));
+
        err = mv88e6xxx_port_commit_pvid(chip, port);
        if (err)
                dev_err(ds->dev,
@@ -2911,12 +3041,13 @@ static int mv88e6xxx_setup_port(struct mv88e6xxx_chip *chip, int port)
                return err;
 
        /* Port Control 2: don't force a good FCS, set the MTU size to
-        * 10222 bytes, disable 802.1q tags checking, don't discard tagged or
-        * untagged frames on this port, do a destination address lookup on all
-        * received packets as usual, disable ARP mirroring and don't send a
-        * copy of all transmitted/received frames on this port to the CPU.
+        * 10222 bytes, disable 802.1q tags checking, don't discard
+        * tagged or untagged frames on this port, skip destination
+        * address lookup on user ports, disable ARP mirroring and don't
+        * send a copy of all transmitted/received frames on this port
+        * to the CPU.
         */
-       err = mv88e6xxx_port_set_map_da(chip, port);
+       err = mv88e6xxx_port_set_map_da(chip, port, !dsa_is_user_port(ds, port));
        if (err)
                return err;
 
@@ -2924,8 +3055,44 @@ static int mv88e6xxx_setup_port(struct mv88e6xxx_chip *chip, int port)
        if (err)
                return err;
 
+       /* On chips that support it, set all downstream DSA ports'
+        * VLAN policy to TRAP. In combination with loading
+        * MV88E6XXX_VID_STANDALONE as a policy entry in the VTU, this
+        * provides a better isolation barrier between standalone
+        * ports, as the ATU is bypassed on any intermediate switches
+        * between the incoming port and the CPU.
+        */
+       if (dsa_is_downstream_port(ds, port) &&
+           chip->info->ops->port_set_policy) {
+               err = chip->info->ops->port_set_policy(chip, port,
+                                               MV88E6XXX_POLICY_MAPPING_VTU,
+                                               MV88E6XXX_POLICY_ACTION_TRAP);
+               if (err)
+                       return err;
+       }
+
+       /* User ports start out in standalone mode and 802.1Q is
+        * therefore disabled. On DSA ports, all valid VIDs are always
+        * loaded in the VTU - therefore, enable 802.1Q in order to take
+        * advantage of VLAN policy on chips that support it.
+        */
        err = mv88e6xxx_port_set_8021q_mode(chip, port,
-                               MV88E6XXX_PORT_CTL2_8021Q_MODE_DISABLED);
+                               dsa_is_user_port(ds, port) ?
+                               MV88E6XXX_PORT_CTL2_8021Q_MODE_DISABLED :
+                               MV88E6XXX_PORT_CTL2_8021Q_MODE_SECURE);
+       if (err)
+               return err;
+
+       /* Bind MV88E6XXX_VID_STANDALONE to MV88E6XXX_FID_STANDALONE by
+        * virtue of the fact that mv88e6xxx_atu_new() will pick it as
+        * the first free FID. This will be used as the private PVID for
+        * unbridged ports. Shared (DSA and CPU) ports must also be
+        * members of this VID, in order to trap all frames assigned to
+        * it to the CPU.
+        */
+       err = mv88e6xxx_port_vlan_join(chip, port, MV88E6XXX_VID_STANDALONE,
+                                      MV88E6XXX_G1_VTU_DATA_MEMBER_TAG_UNMODIFIED,
+                                      false);
        if (err)
                return err;
 
@@ -2938,7 +3105,7 @@ static int mv88e6xxx_setup_port(struct mv88e6xxx_chip *chip, int port)
         * relying on their port default FID.
         */
        err = mv88e6xxx_port_vlan_join(chip, port, MV88E6XXX_VID_BRIDGED,
-                                      MV88E6XXX_G1_VTU_DATA_MEMBER_TAG_UNTAGGED,
+                                      MV88E6XXX_G1_VTU_DATA_MEMBER_TAG_UNMODIFIED,
                                       false);
        if (err)
                return err;
@@ -3582,7 +3749,7 @@ static const struct mv88e6xxx_ops mv88e6085_ops = {
        .rmu_disable = mv88e6085_g1_rmu_disable,
        .vtu_getnext = mv88e6352_g1_vtu_getnext,
        .vtu_loadpurge = mv88e6352_g1_vtu_loadpurge,
-       .phylink_validate = mv88e6185_phylink_validate,
+       .phylink_get_caps = mv88e6185_phylink_get_caps,
        .set_max_frame_size = mv88e6185_g1_set_max_frame_size,
 };
 
@@ -3616,7 +3783,7 @@ static const struct mv88e6xxx_ops mv88e6095_ops = {
        .reset = mv88e6185_g1_reset,
        .vtu_getnext = mv88e6185_g1_vtu_getnext,
        .vtu_loadpurge = mv88e6185_g1_vtu_loadpurge,
-       .phylink_validate = mv88e6185_phylink_validate,
+       .phylink_get_caps = mv88e6185_phylink_get_caps,
        .set_max_frame_size = mv88e6185_g1_set_max_frame_size,
 };
 
@@ -3632,6 +3799,7 @@ static const struct mv88e6xxx_ops mv88e6097_ops = {
        .port_sync_link = mv88e6185_port_sync_link,
        .port_set_speed_duplex = mv88e6185_port_set_speed_duplex,
        .port_tag_remap = mv88e6095_port_tag_remap,
+       .port_set_policy = mv88e6352_port_set_policy,
        .port_set_frame_mode = mv88e6351_port_set_frame_mode,
        .port_set_ucast_flood = mv88e6352_port_set_ucast_flood,
        .port_set_mcast_flood = mv88e6352_port_set_mcast_flood,
@@ -3662,7 +3830,7 @@ static const struct mv88e6xxx_ops mv88e6097_ops = {
        .rmu_disable = mv88e6085_g1_rmu_disable,
        .vtu_getnext = mv88e6352_g1_vtu_getnext,
        .vtu_loadpurge = mv88e6352_g1_vtu_loadpurge,
-       .phylink_validate = mv88e6185_phylink_validate,
+       .phylink_get_caps = mv88e6185_phylink_get_caps,
        .set_max_frame_size = mv88e6185_g1_set_max_frame_size,
 };
 
@@ -3699,7 +3867,7 @@ static const struct mv88e6xxx_ops mv88e6123_ops = {
        .atu_set_hash = mv88e6165_g1_atu_set_hash,
        .vtu_getnext = mv88e6352_g1_vtu_getnext,
        .vtu_loadpurge = mv88e6352_g1_vtu_loadpurge,
-       .phylink_validate = mv88e6185_phylink_validate,
+       .phylink_get_caps = mv88e6185_phylink_get_caps,
        .set_max_frame_size = mv88e6185_g1_set_max_frame_size,
 };
 
@@ -3740,7 +3908,7 @@ static const struct mv88e6xxx_ops mv88e6131_ops = {
        .reset = mv88e6185_g1_reset,
        .vtu_getnext = mv88e6185_g1_vtu_getnext,
        .vtu_loadpurge = mv88e6185_g1_vtu_loadpurge,
-       .phylink_validate = mv88e6185_phylink_validate,
+       .phylink_get_caps = mv88e6185_phylink_get_caps,
 };
 
 static const struct mv88e6xxx_ops mv88e6141_ops = {
@@ -3804,7 +3972,7 @@ static const struct mv88e6xxx_ops mv88e6141_ops = {
        .serdes_get_stats = mv88e6390_serdes_get_stats,
        .serdes_get_regs_len = mv88e6390_serdes_get_regs_len,
        .serdes_get_regs = mv88e6390_serdes_get_regs,
-       .phylink_validate = mv88e6341_phylink_validate,
+       .phylink_get_caps = mv88e6341_phylink_get_caps,
 };
 
 static const struct mv88e6xxx_ops mv88e6161_ops = {
@@ -3846,7 +4014,7 @@ static const struct mv88e6xxx_ops mv88e6161_ops = {
        .vtu_loadpurge = mv88e6352_g1_vtu_loadpurge,
        .avb_ops = &mv88e6165_avb_ops,
        .ptp_ops = &mv88e6165_ptp_ops,
-       .phylink_validate = mv88e6185_phylink_validate,
+       .phylink_get_caps = mv88e6185_phylink_get_caps,
        .set_max_frame_size = mv88e6185_g1_set_max_frame_size,
 };
 
@@ -3882,7 +4050,7 @@ static const struct mv88e6xxx_ops mv88e6165_ops = {
        .vtu_loadpurge = mv88e6352_g1_vtu_loadpurge,
        .avb_ops = &mv88e6165_avb_ops,
        .ptp_ops = &mv88e6165_ptp_ops,
-       .phylink_validate = mv88e6185_phylink_validate,
+       .phylink_get_caps = mv88e6185_phylink_get_caps,
 };
 
 static const struct mv88e6xxx_ops mv88e6171_ops = {
@@ -3924,7 +4092,7 @@ static const struct mv88e6xxx_ops mv88e6171_ops = {
        .atu_set_hash = mv88e6165_g1_atu_set_hash,
        .vtu_getnext = mv88e6352_g1_vtu_getnext,
        .vtu_loadpurge = mv88e6352_g1_vtu_loadpurge,
-       .phylink_validate = mv88e6185_phylink_validate,
+       .phylink_get_caps = mv88e6185_phylink_get_caps,
 };
 
 static const struct mv88e6xxx_ops mv88e6172_ops = {
@@ -3979,7 +4147,7 @@ static const struct mv88e6xxx_ops mv88e6172_ops = {
        .serdes_get_regs_len = mv88e6352_serdes_get_regs_len,
        .serdes_get_regs = mv88e6352_serdes_get_regs,
        .gpio_ops = &mv88e6352_gpio_ops,
-       .phylink_validate = mv88e6352_phylink_validate,
+       .phylink_get_caps = mv88e6352_phylink_get_caps,
 };
 
 static const struct mv88e6xxx_ops mv88e6175_ops = {
@@ -4021,7 +4189,7 @@ static const struct mv88e6xxx_ops mv88e6175_ops = {
        .atu_set_hash = mv88e6165_g1_atu_set_hash,
        .vtu_getnext = mv88e6352_g1_vtu_getnext,
        .vtu_loadpurge = mv88e6352_g1_vtu_loadpurge,
-       .phylink_validate = mv88e6185_phylink_validate,
+       .phylink_get_caps = mv88e6185_phylink_get_caps,
 };
 
 static const struct mv88e6xxx_ops mv88e6176_ops = {
@@ -4079,7 +4247,7 @@ static const struct mv88e6xxx_ops mv88e6176_ops = {
        .serdes_get_regs_len = mv88e6352_serdes_get_regs_len,
        .serdes_get_regs = mv88e6352_serdes_get_regs,
        .gpio_ops = &mv88e6352_gpio_ops,
-       .phylink_validate = mv88e6352_phylink_validate,
+       .phylink_get_caps = mv88e6352_phylink_get_caps,
 };
 
 static const struct mv88e6xxx_ops mv88e6185_ops = {
@@ -4118,7 +4286,7 @@ static const struct mv88e6xxx_ops mv88e6185_ops = {
        .reset = mv88e6185_g1_reset,
        .vtu_getnext = mv88e6185_g1_vtu_getnext,
        .vtu_loadpurge = mv88e6185_g1_vtu_loadpurge,
-       .phylink_validate = mv88e6185_phylink_validate,
+       .phylink_get_caps = mv88e6185_phylink_get_caps,
        .set_max_frame_size = mv88e6185_g1_set_max_frame_size,
 };
 
@@ -4180,7 +4348,7 @@ static const struct mv88e6xxx_ops mv88e6190_ops = {
        .serdes_get_regs_len = mv88e6390_serdes_get_regs_len,
        .serdes_get_regs = mv88e6390_serdes_get_regs,
        .gpio_ops = &mv88e6352_gpio_ops,
-       .phylink_validate = mv88e6390_phylink_validate,
+       .phylink_get_caps = mv88e6390_phylink_get_caps,
 };
 
 static const struct mv88e6xxx_ops mv88e6190x_ops = {
@@ -4241,7 +4409,7 @@ static const struct mv88e6xxx_ops mv88e6190x_ops = {
        .serdes_get_regs_len = mv88e6390_serdes_get_regs_len,
        .serdes_get_regs = mv88e6390_serdes_get_regs,
        .gpio_ops = &mv88e6352_gpio_ops,
-       .phylink_validate = mv88e6390x_phylink_validate,
+       .phylink_get_caps = mv88e6390x_phylink_get_caps,
 };
 
 static const struct mv88e6xxx_ops mv88e6191_ops = {
@@ -4301,7 +4469,7 @@ static const struct mv88e6xxx_ops mv88e6191_ops = {
        .serdes_get_regs = mv88e6390_serdes_get_regs,
        .avb_ops = &mv88e6390_avb_ops,
        .ptp_ops = &mv88e6352_ptp_ops,
-       .phylink_validate = mv88e6390_phylink_validate,
+       .phylink_get_caps = mv88e6390_phylink_get_caps,
 };
 
 static const struct mv88e6xxx_ops mv88e6240_ops = {
@@ -4361,7 +4529,7 @@ static const struct mv88e6xxx_ops mv88e6240_ops = {
        .gpio_ops = &mv88e6352_gpio_ops,
        .avb_ops = &mv88e6352_avb_ops,
        .ptp_ops = &mv88e6352_ptp_ops,
-       .phylink_validate = mv88e6352_phylink_validate,
+       .phylink_get_caps = mv88e6352_phylink_get_caps,
 };
 
 static const struct mv88e6xxx_ops mv88e6250_ops = {
@@ -4401,7 +4569,7 @@ static const struct mv88e6xxx_ops mv88e6250_ops = {
        .vtu_loadpurge = mv88e6185_g1_vtu_loadpurge,
        .avb_ops = &mv88e6352_avb_ops,
        .ptp_ops = &mv88e6250_ptp_ops,
-       .phylink_validate = mv88e6065_phylink_validate,
+       .phylink_get_caps = mv88e6250_phylink_get_caps,
 };
 
 static const struct mv88e6xxx_ops mv88e6290_ops = {
@@ -4463,7 +4631,7 @@ static const struct mv88e6xxx_ops mv88e6290_ops = {
        .gpio_ops = &mv88e6352_gpio_ops,
        .avb_ops = &mv88e6390_avb_ops,
        .ptp_ops = &mv88e6352_ptp_ops,
-       .phylink_validate = mv88e6390_phylink_validate,
+       .phylink_get_caps = mv88e6390_phylink_get_caps,
 };
 
 static const struct mv88e6xxx_ops mv88e6320_ops = {
@@ -4507,7 +4675,7 @@ static const struct mv88e6xxx_ops mv88e6320_ops = {
        .gpio_ops = &mv88e6352_gpio_ops,
        .avb_ops = &mv88e6352_avb_ops,
        .ptp_ops = &mv88e6352_ptp_ops,
-       .phylink_validate = mv88e6185_phylink_validate,
+       .phylink_get_caps = mv88e6185_phylink_get_caps,
 };
 
 static const struct mv88e6xxx_ops mv88e6321_ops = {
@@ -4549,7 +4717,7 @@ static const struct mv88e6xxx_ops mv88e6321_ops = {
        .gpio_ops = &mv88e6352_gpio_ops,
        .avb_ops = &mv88e6352_avb_ops,
        .ptp_ops = &mv88e6352_ptp_ops,
-       .phylink_validate = mv88e6185_phylink_validate,
+       .phylink_get_caps = mv88e6185_phylink_get_caps,
 };
 
 static const struct mv88e6xxx_ops mv88e6341_ops = {
@@ -4615,7 +4783,7 @@ static const struct mv88e6xxx_ops mv88e6341_ops = {
        .serdes_get_stats = mv88e6390_serdes_get_stats,
        .serdes_get_regs_len = mv88e6390_serdes_get_regs_len,
        .serdes_get_regs = mv88e6390_serdes_get_regs,
-       .phylink_validate = mv88e6341_phylink_validate,
+       .phylink_get_caps = mv88e6341_phylink_get_caps,
 };
 
 static const struct mv88e6xxx_ops mv88e6350_ops = {
@@ -4657,7 +4825,7 @@ static const struct mv88e6xxx_ops mv88e6350_ops = {
        .atu_set_hash = mv88e6165_g1_atu_set_hash,
        .vtu_getnext = mv88e6352_g1_vtu_getnext,
        .vtu_loadpurge = mv88e6352_g1_vtu_loadpurge,
-       .phylink_validate = mv88e6185_phylink_validate,
+       .phylink_get_caps = mv88e6185_phylink_get_caps,
 };
 
 static const struct mv88e6xxx_ops mv88e6351_ops = {
@@ -4701,7 +4869,7 @@ static const struct mv88e6xxx_ops mv88e6351_ops = {
        .vtu_loadpurge = mv88e6352_g1_vtu_loadpurge,
        .avb_ops = &mv88e6352_avb_ops,
        .ptp_ops = &mv88e6352_ptp_ops,
-       .phylink_validate = mv88e6185_phylink_validate,
+       .phylink_get_caps = mv88e6185_phylink_get_caps,
 };
 
 static const struct mv88e6xxx_ops mv88e6352_ops = {
@@ -4764,7 +4932,7 @@ static const struct mv88e6xxx_ops mv88e6352_ops = {
        .serdes_get_stats = mv88e6352_serdes_get_stats,
        .serdes_get_regs_len = mv88e6352_serdes_get_regs_len,
        .serdes_get_regs = mv88e6352_serdes_get_regs,
-       .phylink_validate = mv88e6352_phylink_validate,
+       .phylink_get_caps = mv88e6352_phylink_get_caps,
 };
 
 static const struct mv88e6xxx_ops mv88e6390_ops = {
@@ -4829,7 +4997,7 @@ static const struct mv88e6xxx_ops mv88e6390_ops = {
        .serdes_get_stats = mv88e6390_serdes_get_stats,
        .serdes_get_regs_len = mv88e6390_serdes_get_regs_len,
        .serdes_get_regs = mv88e6390_serdes_get_regs,
-       .phylink_validate = mv88e6390_phylink_validate,
+       .phylink_get_caps = mv88e6390_phylink_get_caps,
 };
 
 static const struct mv88e6xxx_ops mv88e6390x_ops = {
@@ -4893,7 +5061,7 @@ static const struct mv88e6xxx_ops mv88e6390x_ops = {
        .gpio_ops = &mv88e6352_gpio_ops,
        .avb_ops = &mv88e6390_avb_ops,
        .ptp_ops = &mv88e6352_ptp_ops,
-       .phylink_validate = mv88e6390x_phylink_validate,
+       .phylink_get_caps = mv88e6390x_phylink_get_caps,
 };
 
 static const struct mv88e6xxx_ops mv88e6393x_ops = {
@@ -4957,7 +5125,7 @@ static const struct mv88e6xxx_ops mv88e6393x_ops = {
        .gpio_ops = &mv88e6352_gpio_ops,
        .avb_ops = &mv88e6390_avb_ops,
        .ptp_ops = &mv88e6352_ptp_ops,
-       .phylink_validate = mv88e6393x_phylink_validate,
+       .phylink_get_caps = mv88e6393x_phylink_get_caps,
 };
 
 static const struct mv88e6xxx_info mv88e6xxx_table[] = {
@@ -6226,7 +6394,7 @@ static const struct dsa_switch_ops mv88e6xxx_switch_ops = {
        .teardown               = mv88e6xxx_teardown,
        .port_setup             = mv88e6xxx_port_setup,
        .port_teardown          = mv88e6xxx_port_teardown,
-       .phylink_validate       = mv88e6xxx_validate,
+       .phylink_get_caps       = mv88e6xxx_get_caps,
        .phylink_mac_link_state = mv88e6xxx_serdes_pcs_get_state,
        .phylink_mac_config     = mv88e6xxx_mac_config,
        .phylink_mac_an_restart = mv88e6xxx_serdes_pcs_an_restart,
index 8271b8a..12aa637 100644 (file)
@@ -179,6 +179,7 @@ struct mv88e6xxx_vtu_entry {
        u16     fid;
        u8      sid;
        bool    valid;
+       bool    policy;
        u8      member[DSA_MAX_PORTS];
        u8      state[DSA_MAX_PORTS];
 };
@@ -392,6 +393,7 @@ struct mv88e6xxx_chip {
 struct mv88e6xxx_bus_ops {
        int (*read)(struct mv88e6xxx_chip *chip, int addr, int reg, u16 *val);
        int (*write)(struct mv88e6xxx_chip *chip, int addr, int reg, u16 val);
+       int (*init)(struct mv88e6xxx_chip *chip);
 };
 
 struct mv88e6xxx_mdio_bus {
@@ -609,9 +611,8 @@ struct mv88e6xxx_ops {
        const struct mv88e6xxx_ptp_ops *ptp_ops;
 
        /* Phylink */
-       void (*phylink_validate)(struct mv88e6xxx_chip *chip, int port,
-                                unsigned long *mask,
-                                struct phylink_link_state *state);
+       void (*phylink_get_caps)(struct mv88e6xxx_chip *chip, int port,
+                                struct phylink_config *config);
 
        /* Max Frame Size */
        int (*set_max_frame_size)(struct mv88e6xxx_chip *chip, int mtu);
index 4f3dbb0..2c1607c 100644 (file)
@@ -46,6 +46,7 @@
 
 /* Offset 0x02: VTU FID Register */
 #define MV88E6352_G1_VTU_FID           0x02
+#define MV88E6352_G1_VTU_FID_VID_POLICY        0x1000
 #define MV88E6352_G1_VTU_FID_MASK      0x0fff
 
 /* Offset 0x03: VTU SID Register */
index ae12c98..b1bd927 100644 (file)
@@ -27,7 +27,7 @@ static int mv88e6xxx_g1_vtu_fid_read(struct mv88e6xxx_chip *chip,
                return err;
 
        entry->fid = val & MV88E6352_G1_VTU_FID_MASK;
-
+       entry->policy = !!(val & MV88E6352_G1_VTU_FID_VID_POLICY);
        return 0;
 }
 
@@ -36,6 +36,9 @@ static int mv88e6xxx_g1_vtu_fid_write(struct mv88e6xxx_chip *chip,
 {
        u16 val = entry->fid & MV88E6352_G1_VTU_FID_MASK;
 
+       if (entry->policy)
+               val |= MV88E6352_G1_VTU_FID_VID_POLICY;
+
        return mv88e6xxx_g1_write(chip, MV88E6352_G1_VTU_FID, val);
 }
 
index f3e2757..807aeaa 100644 (file)
 #define MV88E6352_G2_SCRATCH_CONFIG_DATA1_NO_CPU       BIT(2)
 #define MV88E6352_G2_SCRATCH_CONFIG_DATA2      0x72
 #define MV88E6352_G2_SCRATCH_CONFIG_DATA2_P0_MODE_MASK 0x3
+#define MV88E6352_G2_SCRATCH_CONFIG_DATA3      0x73
+#define MV88E6352_G2_SCRATCH_CONFIG_DATA3_S_SEL                BIT(1)
 
 #define MV88E6352_G2_SCRATCH_GPIO_PCTL_GPIO    0
 #define MV88E6352_G2_SCRATCH_GPIO_PCTL_TRIG    1
@@ -370,6 +372,7 @@ extern const struct mv88e6xxx_gpio_ops mv88e6352_gpio_ops;
 
 int mv88e6xxx_g2_scratch_gpio_set_smi(struct mv88e6xxx_chip *chip,
                                      bool external);
+int mv88e6352_g2_scratch_port_has_serdes(struct mv88e6xxx_chip *chip, int port);
 int mv88e6xxx_g2_atu_stats_set(struct mv88e6xxx_chip *chip, u16 kind, u16 bin);
 int mv88e6xxx_g2_atu_stats_get(struct mv88e6xxx_chip *chip, u16 *stats);
 
index eda7100..a9d6e40 100644 (file)
@@ -289,3 +289,31 @@ int mv88e6xxx_g2_scratch_gpio_set_smi(struct mv88e6xxx_chip *chip,
 
        return mv88e6xxx_g2_scratch_write(chip, misc_cfg, val);
 }
+
+/**
+ * mv88e6352_g2_scratch_port_has_serdes - indicate if a port can have a serdes
+ * @chip: chip private data
+ * @port: port number to check for serdes
+ *
+ * Indicates whether the port may have a serdes attached according to the
+ * pin strapping. Returns negative error number, 0 if the port is not
+ * configured to have a serdes, and 1 if the port is configured to have a
+ * serdes attached.
+ */
+int mv88e6352_g2_scratch_port_has_serdes(struct mv88e6xxx_chip *chip, int port)
+{
+       u8 config3, p;
+       int err;
+
+       err = mv88e6xxx_g2_scratch_read(chip, MV88E6352_G2_SCRATCH_CONFIG_DATA3,
+                                       &config3);
+       if (err)
+               return err;
+
+       if (config3 & MV88E6352_G2_SCRATCH_CONFIG_DATA3_S_SEL)
+               p = 5;
+       else
+               p = 4;
+
+       return port == p;
+}
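
Callers treat this helper's result as a tri-state: a negative errno, 0 for no serdes, 1 for a serdes present, which is why the serdes stubs below use `if (err <= 0) return err;`. A small sketch of consuming such a tri-state return; has_feature() and the values are stand-ins:

#include <stdio.h>

/* Tri-state: <0 error, 0 feature absent, 1 feature present */
static int has_feature(int port)
{
        if (port > 10)
                return -22;     /* -EINVAL analogue */
        return port == 4;
}

static int feature_count(int port)
{
        int err = has_feature(port);

        /* Propagate errors and "absent" (0) alike; only 1 falls through */
        if (err <= 0)
                return err;

        return 8;               /* e.g. number of stat strings */
}

int main(void)
{
        printf("port 4: %d\n", feature_count(4));       /* 8 */
        printf("port 5: %d\n", feature_count(5));       /* 0 */
        printf("port 11: %d\n", feature_count(11));     /* -22 */
        return 0;
}
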
index ab41619..ceb4501 100644 (file)
@@ -1278,7 +1278,7 @@ int mv88e6xxx_port_drop_untagged(struct mv88e6xxx_chip *chip, int port,
        return mv88e6xxx_port_write(chip, port, MV88E6XXX_PORT_CTL2, new);
 }
 
-int mv88e6xxx_port_set_map_da(struct mv88e6xxx_chip *chip, int port)
+int mv88e6xxx_port_set_map_da(struct mv88e6xxx_chip *chip, int port, bool map)
 {
        u16 reg;
        int err;
@@ -1287,7 +1287,10 @@ int mv88e6xxx_port_set_map_da(struct mv88e6xxx_chip *chip, int port)
        if (err)
                return err;
 
-       reg |= MV88E6XXX_PORT_CTL2_MAP_DA;
+       if (map)
+               reg |= MV88E6XXX_PORT_CTL2_MAP_DA;
+       else
+               reg &= ~MV88E6XXX_PORT_CTL2_MAP_DA;
 
        return mv88e6xxx_port_write(chip, port, MV88E6XXX_PORT_CTL2, reg);
 }
index 03382b6..3a13db2 100644 (file)
 #define MV88E6XXX_PORT_STS_TX_PAUSED           0x0020
 #define MV88E6XXX_PORT_STS_FLOW_CTL            0x0010
 #define MV88E6XXX_PORT_STS_CMODE_MASK          0x000f
+#define MV88E6XXX_PORT_STS_CMODE_MII_PHY       0x0001
+#define MV88E6XXX_PORT_STS_CMODE_MII           0x0002
+#define MV88E6XXX_PORT_STS_CMODE_GMII          0x0003
+#define MV88E6XXX_PORT_STS_CMODE_RMII_PHY      0x0004
+#define MV88E6XXX_PORT_STS_CMODE_RMII          0x0005
 #define MV88E6XXX_PORT_STS_CMODE_RGMII         0x0007
 #define MV88E6XXX_PORT_STS_CMODE_100BASEX      0x0008
 #define MV88E6XXX_PORT_STS_CMODE_1000BASEX     0x0009
@@ -425,7 +430,7 @@ int mv88e6185_port_get_cmode(struct mv88e6xxx_chip *chip, int port, u8 *cmode);
 int mv88e6352_port_get_cmode(struct mv88e6xxx_chip *chip, int port, u8 *cmode);
 int mv88e6xxx_port_drop_untagged(struct mv88e6xxx_chip *chip, int port,
                                 bool drop_untagged);
-int mv88e6xxx_port_set_map_da(struct mv88e6xxx_chip *chip, int port);
+int mv88e6xxx_port_set_map_da(struct mv88e6xxx_chip *chip, int port, bool map);
 int mv88e6095_port_set_upstream_port(struct mv88e6xxx_chip *chip, int port,
                                     int upstream_port);
 int mv88e6xxx_port_set_mirror(struct mv88e6xxx_chip *chip, int port,
index 2b05ead..6a177bf 100644 (file)
@@ -272,14 +272,6 @@ int mv88e6352_serdes_get_lane(struct mv88e6xxx_chip *chip, int port)
        return lane;
 }
 
-static bool mv88e6352_port_has_serdes(struct mv88e6xxx_chip *chip, int port)
-{
-       if (mv88e6xxx_serdes_get_lane(chip, port) >= 0)
-               return true;
-
-       return false;
-}
-
 struct mv88e6352_serdes_hw_stat {
        char string[ETH_GSTRING_LEN];
        int sizeof_stat;
@@ -293,20 +285,24 @@ static struct mv88e6352_serdes_hw_stat mv88e6352_serdes_hw_stats[] = {
 
 int mv88e6352_serdes_get_sset_count(struct mv88e6xxx_chip *chip, int port)
 {
-       if (mv88e6352_port_has_serdes(chip, port))
-               return ARRAY_SIZE(mv88e6352_serdes_hw_stats);
+       int err;
 
-       return 0;
+       err = mv88e6352_g2_scratch_port_has_serdes(chip, port);
+       if (err <= 0)
+               return err;
+
+       return ARRAY_SIZE(mv88e6352_serdes_hw_stats);
 }
 
 int mv88e6352_serdes_get_strings(struct mv88e6xxx_chip *chip,
                                 int port, uint8_t *data)
 {
        struct mv88e6352_serdes_hw_stat *stat;
-       int i;
+       int err, i;
 
-       if (!mv88e6352_port_has_serdes(chip, port))
-               return 0;
+       err = mv88e6352_g2_scratch_port_has_serdes(chip, port);
+       if (err <= 0)
+               return err;
 
        for (i = 0; i < ARRAY_SIZE(mv88e6352_serdes_hw_stats); i++) {
                stat = &mv88e6352_serdes_hw_stats[i];
@@ -348,11 +344,12 @@ int mv88e6352_serdes_get_stats(struct mv88e6xxx_chip *chip, int port,
 {
        struct mv88e6xxx_port *mv88e6xxx_port = &chip->ports[port];
        struct mv88e6352_serdes_hw_stat *stat;
+       int i, err;
        u64 value;
-       int i;
 
-       if (!mv88e6352_port_has_serdes(chip, port))
-               return 0;
+       err = mv88e6352_g2_scratch_port_has_serdes(chip, port);
+       if (err <= 0)
+               return err;
 
        BUILD_BUG_ON(ARRAY_SIZE(mv88e6352_serdes_hw_stats) >
                     ARRAY_SIZE(mv88e6xxx_port->serdes_stats));
@@ -419,8 +416,13 @@ unsigned int mv88e6352_serdes_irq_mapping(struct mv88e6xxx_chip *chip, int port)
 
 int mv88e6352_serdes_get_regs_len(struct mv88e6xxx_chip *chip, int port)
 {
-       if (!mv88e6352_port_has_serdes(chip, port))
-               return 0;
+       int err;
+
+       mv88e6xxx_reg_lock(chip);
+       err = mv88e6352_g2_scratch_port_has_serdes(chip, port);
+       mv88e6xxx_reg_unlock(chip);
+       if (err <= 0)
+               return err;
 
        return 32 * sizeof(u16);
 }
@@ -432,7 +434,8 @@ void mv88e6352_serdes_get_regs(struct mv88e6xxx_chip *chip, int port, void *_p)
        int err;
        int i;
 
-       if (!mv88e6352_port_has_serdes(chip, port))
+       err = mv88e6352_g2_scratch_port_has_serdes(chip, port);
+       if (err <= 0)
                return;
 
        for (i = 0 ; i < 32; i++) {
index 282fe08..a990271 100644 (file)
@@ -55,11 +55,15 @@ static int mv88e6xxx_smi_direct_write(struct mv88e6xxx_chip *chip,
 static int mv88e6xxx_smi_direct_wait(struct mv88e6xxx_chip *chip,
                                     int dev, int reg, int bit, int val)
 {
+       const unsigned long timeout = jiffies + msecs_to_jiffies(50);
        u16 data;
        int err;
        int i;
 
-       for (i = 0; i < 16; i++) {
+       /* Even if the initial poll takes longer than 50ms, always do
+        * at least one more attempt.
+        */
+       for (i = 0; time_before(jiffies, timeout) || (i < 2); i++) {
                err = mv88e6xxx_smi_direct_read(chip, dev, reg, &data);
                if (err)
                        return err;
@@ -67,7 +71,10 @@ static int mv88e6xxx_smi_direct_wait(struct mv88e6xxx_chip *chip,
                if (!!(data & BIT(bit)) == !!val)
                        return 0;
 
-               usleep_range(1000, 2000);
+               if (i < 2)
+                       cpu_relax();
+               else
+                       usleep_range(1000, 2000);
        }
 
        return -ETIMEDOUT;
@@ -104,11 +111,6 @@ static int mv88e6xxx_smi_indirect_read(struct mv88e6xxx_chip *chip,
 {
        int err;
 
-       err = mv88e6xxx_smi_direct_wait(chip, chip->sw_addr,
-                                       MV88E6XXX_SMI_CMD, 15, 0);
-       if (err)
-               return err;
-
        err = mv88e6xxx_smi_direct_write(chip, chip->sw_addr,
                                         MV88E6XXX_SMI_CMD,
                                         MV88E6XXX_SMI_CMD_BUSY |
@@ -132,11 +134,6 @@ static int mv88e6xxx_smi_indirect_write(struct mv88e6xxx_chip *chip,
 {
        int err;
 
-       err = mv88e6xxx_smi_direct_wait(chip, chip->sw_addr,
-                                       MV88E6XXX_SMI_CMD, 15, 0);
-       if (err)
-               return err;
-
        err = mv88e6xxx_smi_direct_write(chip, chip->sw_addr,
                                         MV88E6XXX_SMI_DATA, data);
        if (err)
@@ -155,9 +152,20 @@ static int mv88e6xxx_smi_indirect_write(struct mv88e6xxx_chip *chip,
                                         MV88E6XXX_SMI_CMD, 15, 0);
 }
 
+static int mv88e6xxx_smi_indirect_init(struct mv88e6xxx_chip *chip)
+{
+       /* Ensure that the chip starts out in the ready state. As both
+        * reads and writes always ensure this on return, they can
+        * safely depend on the chip not being busy on entry.
+        */
+       return mv88e6xxx_smi_direct_wait(chip, chip->sw_addr,
+                                        MV88E6XXX_SMI_CMD, 15, 0);
+}
+
 static const struct mv88e6xxx_bus_ops mv88e6xxx_smi_indirect_ops = {
        .read = mv88e6xxx_smi_indirect_read,
        .write = mv88e6xxx_smi_indirect_write,
+       .init = mv88e6xxx_smi_indirect_init,
 };
 
 int mv88e6xxx_smi_init(struct mv88e6xxx_chip *chip,
@@ -175,5 +183,8 @@ int mv88e6xxx_smi_init(struct mv88e6xxx_chip *chip,
        chip->bus = bus;
        chip->sw_addr = sw_addr;
 
+       if (chip->smi_ops->init)
+               return chip->smi_ops->init(chip);
+
        return 0;
 }
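
Hoisting the busy-wait out of every indirect read and write into a one-time init hook relies on the invariant that each transfer leaves the chip idle on return. The hook is optional per bus type, hence the guard above. A sketch of the optional-hook idiom with illustrative names:

#include <stdio.h>

struct bus_ops {
        int (*read)(int reg, int *val);
        int (*init)(void);              /* optional: may be NULL */
};

static int direct_read(int reg, int *val) { *val = reg; return 0; }

/* Direct buses need no initial synchronisation, so .init stays NULL */
static const struct bus_ops direct_ops = { .read = direct_read };

static int bus_attach(const struct bus_ops *ops)
{
        /* Only buses that install .init need to drain a busy chip */
        if (ops->init)
                return ops->init();
        return 0;
}

int main(void)
{
        printf("attach: %d\n", bus_attach(&direct_ops));
        return 0;
}
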
index c39de2a..e5098cf 100644 (file)
@@ -499,52 +499,27 @@ static enum dsa_tag_protocol ar9331_sw_get_tag_protocol(struct dsa_switch *ds,
        return DSA_TAG_PROTO_AR9331;
 }
 
-static void ar9331_sw_phylink_validate(struct dsa_switch *ds, int port,
-                                      unsigned long *supported,
-                                      struct phylink_link_state *state)
+static void ar9331_sw_phylink_get_caps(struct dsa_switch *ds, int port,
+                                      struct phylink_config *config)
 {
-       __ETHTOOL_DECLARE_LINK_MODE_MASK(mask) = { 0, };
+       config->mac_capabilities = MAC_ASYM_PAUSE | MAC_SYM_PAUSE |
+               MAC_10 | MAC_100;
 
        switch (port) {
        case 0:
-               if (state->interface != PHY_INTERFACE_MODE_GMII)
-                       goto unsupported;
-
-               phylink_set(mask, 1000baseT_Full);
-               phylink_set(mask, 1000baseT_Half);
+               __set_bit(PHY_INTERFACE_MODE_GMII,
+                         config->supported_interfaces);
+               config->mac_capabilities |= MAC_1000;
                break;
        case 1:
        case 2:
        case 3:
        case 4:
        case 5:
-               if (state->interface != PHY_INTERFACE_MODE_INTERNAL)
-                       goto unsupported;
+               __set_bit(PHY_INTERFACE_MODE_INTERNAL,
+                         config->supported_interfaces);
                break;
-       default:
-               linkmode_zero(supported);
-               dev_err(ds->dev, "Unsupported port: %i\n", port);
-               return;
        }
-
-       phylink_set_port_modes(mask);
-       phylink_set(mask, Pause);
-       phylink_set(mask, Asym_Pause);
-
-       phylink_set(mask, 10baseT_Half);
-       phylink_set(mask, 10baseT_Full);
-       phylink_set(mask, 100baseT_Half);
-       phylink_set(mask, 100baseT_Full);
-
-       linkmode_and(supported, supported, mask);
-       linkmode_and(state->advertising, state->advertising, mask);
-
-       return;
-
-unsupported:
-       linkmode_zero(supported);
-       dev_err(ds->dev, "Unsupported interface: %d, port: %d\n",
-               state->interface, port);
 }
 
 static void ar9331_sw_phylink_mac_config(struct dsa_switch *ds, int port,
@@ -697,7 +672,7 @@ static const struct dsa_switch_ops ar9331_sw_ops = {
        .get_tag_protocol       = ar9331_sw_get_tag_protocol,
        .setup                  = ar9331_sw_setup,
        .port_disable           = ar9331_sw_port_disable,
-       .phylink_validate       = ar9331_sw_phylink_validate,
+       .phylink_get_caps       = ar9331_sw_phylink_get_caps,
        .phylink_mac_config     = ar9331_sw_phylink_mac_config,
        .phylink_mac_link_down  = ar9331_sw_phylink_mac_link_down,
        .phylink_mac_link_up    = ar9331_sw_phylink_mac_link_up,
index 0396945..c09d156 100644 (file)
@@ -20,6 +20,7 @@
 #include <linux/phylink.h>
 #include <linux/gpio/consumer.h>
 #include <linux/etherdevice.h>
+#include <linux/dsa/tag_qca.h>
 
 #include "qca8k.h"
 
@@ -74,12 +75,6 @@ static const struct qca8k_mib_desc ar8327_mib[] = {
        MIB_DESC(1, 0xac, "TXUnicast"),
 };
 
-/* The 32bit switch registers are accessed indirectly. To achieve this we need
- * to set the page of the register. Track the last page that was set to reduce
- * mdio writes
- */
-static u16 qca8k_current_page = 0xffff;
-
 static void
 qca8k_split_addr(u32 regaddr, u16 *r1, u16 *r2, u16 *page)
 {
@@ -94,6 +89,44 @@ qca8k_split_addr(u32 regaddr, u16 *r1, u16 *r2, u16 *page)
 }
 
 static int
+qca8k_set_lo(struct qca8k_priv *priv, int phy_id, u32 regnum, u16 lo)
+{
+       u16 *cached_lo = &priv->mdio_cache.lo;
+       struct mii_bus *bus = priv->bus;
+       int ret;
+
+       if (lo == *cached_lo)
+               return 0;
+
+       ret = bus->write(bus, phy_id, regnum, lo);
+       if (ret < 0)
+               dev_err_ratelimited(&bus->dev,
+                                   "failed to write qca8k 32bit lo register\n");
+
+       *cached_lo = lo;
+       return 0;
+}
+
+static int
+qca8k_set_hi(struct qca8k_priv *priv, int phy_id, u32 regnum, u16 hi)
+{
+       u16 *cached_hi = &priv->mdio_cache.hi;
+       struct mii_bus *bus = priv->bus;
+       int ret;
+
+       if (hi == *cached_hi)
+               return 0;
+
+       ret = bus->write(bus, phy_id, regnum, hi);
+       if (ret < 0)
+               dev_err_ratelimited(&bus->dev,
+                                   "failed to write qca8k 32bit hi register\n");
+
+       *cached_hi = hi;
+       return 0;
+}
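
Consecutive 32-bit writes frequently share one 16-bit half, so caching the last value written to each half and skipping redundant bus writes saves MDIO transactions. A userspace sketch of the same write-skipping cache; bus_write() is a stand-in that just counts transactions:

#include <stdint.h>
#include <stdio.h>

static int bus_writes;                  /* count real transactions */

static void bus_write(int reg, uint16_t v) { (void)reg; (void)v; bus_writes++; }

struct cache { uint16_t lo, hi; };

static void write32_cached(struct cache *c, int reg, uint32_t val)
{
        uint16_t lo = val & 0xffff, hi = val >> 16;

        if (lo != c->lo) { bus_write(reg, lo);     c->lo = lo; }
        if (hi != c->hi) { bus_write(reg + 1, hi); c->hi = hi; }
}

int main(void)
{
        struct cache c = { 0xffff, 0xffff };    /* poison so first write hits */

        write32_cached(&c, 0, 0x00010002);
        write32_cached(&c, 0, 0x00010003);      /* hi unchanged: one write */
        printf("bus writes: %d\n", bus_writes); /* 3, not 4 */
        return 0;
}
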
+
+static int
 qca8k_mii_read32(struct mii_bus *bus, int phy_id, u32 regnum, u32 *val)
 {
        int ret;
@@ -116,7 +149,7 @@ qca8k_mii_read32(struct mii_bus *bus, int phy_id, u32 regnum, u32 *val)
 }
 
 static void
-qca8k_mii_write32(struct mii_bus *bus, int phy_id, u32 regnum, u32 val)
+qca8k_mii_write32(struct qca8k_priv *priv, int phy_id, u32 regnum, u32 val)
 {
        u16 lo, hi;
        int ret;
@@ -124,20 +157,19 @@ qca8k_mii_write32(struct mii_bus *bus, int phy_id, u32 regnum, u32 val)
        lo = val & 0xffff;
        hi = (u16)(val >> 16);
 
-       ret = bus->write(bus, phy_id, regnum, lo);
+       ret = qca8k_set_lo(priv, phy_id, regnum, lo);
        if (ret >= 0)
-               ret = bus->write(bus, phy_id, regnum + 1, hi);
-       if (ret < 0)
-               dev_err_ratelimited(&bus->dev,
-                                   "failed to write qca8k 32bit register\n");
+               ret = qca8k_set_hi(priv, phy_id, regnum + 1, hi);
 }
 
 static int
-qca8k_set_page(struct mii_bus *bus, u16 page)
+qca8k_set_page(struct qca8k_priv *priv, u16 page)
 {
+       u16 *cached_page = &priv->mdio_cache.page;
+       struct mii_bus *bus = priv->bus;
        int ret;
 
-       if (page == qca8k_current_page)
+       if (page == *cached_page)
                return 0;
 
        ret = bus->write(bus, 0x18, 0, page);
@@ -147,7 +179,7 @@ qca8k_set_page(struct mii_bus *bus, u16 page)
                return ret;
        }
 
-       qca8k_current_page = page;
+       *cached_page = page;
        usleep_range(1000, 2000);
        return 0;
 }
@@ -170,6 +202,252 @@ qca8k_rmw(struct qca8k_priv *priv, u32 reg, u32 mask, u32 write_val)
        return regmap_update_bits(priv->regmap, reg, mask, write_val);
 }
 
+static void qca8k_rw_reg_ack_handler(struct dsa_switch *ds, struct sk_buff *skb)
+{
+       struct qca8k_mgmt_eth_data *mgmt_eth_data;
+       struct qca8k_priv *priv = ds->priv;
+       struct qca_mgmt_ethhdr *mgmt_ethhdr;
+       u8 len, cmd;
+
+       mgmt_ethhdr = (struct qca_mgmt_ethhdr *)skb_mac_header(skb);
+       mgmt_eth_data = &priv->mgmt_eth_data;
+
+       cmd = FIELD_GET(QCA_HDR_MGMT_CMD, mgmt_ethhdr->command);
+       len = FIELD_GET(QCA_HDR_MGMT_LENGTH, mgmt_ethhdr->command);
+
+       /* Make sure the seq matches the requested packet */
+       if (mgmt_ethhdr->seq == mgmt_eth_data->seq)
+               mgmt_eth_data->ack = true;
+
+       if (cmd == MDIO_READ) {
+               mgmt_eth_data->data[0] = mgmt_ethhdr->mdio_data;
+
+               /* Get the remaining 12 bytes of data.
+                * The read/write function will extract the requested data.
+                */
+               if (len > QCA_HDR_MGMT_DATA1_LEN)
+                       memcpy(mgmt_eth_data->data + 1, skb->data,
+                              QCA_HDR_MGMT_DATA2_LEN);
+       }
+
+       complete(&mgmt_eth_data->rw_done);
+}
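
The handler only sets `ack` when the response's sequence number matches the one stamped on the outstanding request, so a late reply to a request that already timed out leaves `ack` false and the waiter fails with -EINVAL instead of consuming stale data. A compact sketch of that sequence check, with the completion reduced to a flag:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct pending {
        uint32_t seq;   /* seq stamped on the outstanding request */
        bool ack;       /* set only by a matching response */
        uint32_t data;
};

static void handle_response(struct pending *p, uint32_t seq, uint32_t data)
{
        /* A stale reply (seq mismatch) never sets ack */
        if (seq != p->seq)
                return;
        p->data = data;
        p->ack = true;
}

int main(void)
{
        struct pending p = { .seq = 7 };

        handle_response(&p, 6, 0xdead);         /* stale: ignored */
        handle_response(&p, 7, 0xbeef);         /* current: accepted */
        printf("ack=%d data=0x%x\n", p.ack, p.data);
        return 0;
}
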
+
+static struct sk_buff *qca8k_alloc_mdio_header(enum mdio_cmd cmd, u32 reg, u32 *val,
+                                              int priority, unsigned int len)
+{
+       struct qca_mgmt_ethhdr *mgmt_ethhdr;
+       unsigned int real_len;
+       struct sk_buff *skb;
+       u32 *data2;
+       u16 hdr;
+
+       skb = dev_alloc_skb(QCA_HDR_MGMT_PKT_LEN);
+       if (!skb)
+               return NULL;
+
+       /* The max value for the len reg is 15 (0xf), but the switch actually
+        * returns 16 bytes. For some reason the steps are:
+        * 0: nothing
+        * 1-4: first 4 bytes
+        * 5-6: first 12 bytes
+        * 7-15: all 16 bytes
+        */
+       if (len == 16)
+               real_len = 15;
+       else
+               real_len = len;
+
+       skb_reset_mac_header(skb);
+       skb_set_network_header(skb, skb->len);
+
+       mgmt_ethhdr = skb_push(skb, QCA_HDR_MGMT_HEADER_LEN + QCA_HDR_LEN);
+
+       hdr = FIELD_PREP(QCA_HDR_XMIT_VERSION, QCA_HDR_VERSION);
+       hdr |= FIELD_PREP(QCA_HDR_XMIT_PRIORITY, priority);
+       hdr |= QCA_HDR_XMIT_FROM_CPU;
+       hdr |= FIELD_PREP(QCA_HDR_XMIT_DP_BIT, BIT(0));
+       hdr |= FIELD_PREP(QCA_HDR_XMIT_CONTROL, QCA_HDR_XMIT_TYPE_RW_REG);
+
+       mgmt_ethhdr->command = FIELD_PREP(QCA_HDR_MGMT_ADDR, reg);
+       mgmt_ethhdr->command |= FIELD_PREP(QCA_HDR_MGMT_LENGTH, real_len);
+       mgmt_ethhdr->command |= FIELD_PREP(QCA_HDR_MGMT_CMD, cmd);
+       mgmt_ethhdr->command |= FIELD_PREP(QCA_HDR_MGMT_CHECK_CODE,
+                                          QCA_HDR_MGMT_CHECK_CODE_VAL);
+
+       if (cmd == MDIO_WRITE)
+               mgmt_ethhdr->mdio_data = *val;
+
+       mgmt_ethhdr->hdr = htons(hdr);
+
+       data2 = skb_put_zero(skb, QCA_HDR_MGMT_DATA2_LEN + QCA_HDR_MGMT_PADDING_LEN);
+       if (cmd == MDIO_WRITE && len > QCA_HDR_MGMT_DATA1_LEN)
+               memcpy(data2, val + 1, len - QCA_HDR_MGMT_DATA1_LEN);
+
+       return skb;
+}
+
+static void qca8k_mdio_header_fill_seq_num(struct sk_buff *skb, u32 seq_num)
+{
+       struct qca_mgmt_ethhdr *mgmt_ethhdr;
+
+       mgmt_ethhdr = (struct qca_mgmt_ethhdr *)skb->data;
+       mgmt_ethhdr->seq = FIELD_PREP(QCA_HDR_MGMT_SEQ_NUM, seq_num);
+}
+
+static int qca8k_read_eth(struct qca8k_priv *priv, u32 reg, u32 *val, int len)
+{
+       struct qca8k_mgmt_eth_data *mgmt_eth_data = &priv->mgmt_eth_data;
+       struct sk_buff *skb;
+       bool ack;
+       int ret;
+
+       skb = qca8k_alloc_mdio_header(MDIO_READ, reg, NULL,
+                                     QCA8K_ETHERNET_MDIO_PRIORITY, len);
+       if (!skb)
+               return -ENOMEM;
+
+       mutex_lock(&mgmt_eth_data->mutex);
+
+       /* Check if the mgmt_master is operational */
+       if (!priv->mgmt_master) {
+               kfree_skb(skb);
+               mutex_unlock(&mgmt_eth_data->mutex);
+               return -EINVAL;
+       }
+
+       skb->dev = priv->mgmt_master;
+
+       reinit_completion(&mgmt_eth_data->rw_done);
+
+       /* Increment seq_num and set it in the mdio pkt */
+       mgmt_eth_data->seq++;
+       qca8k_mdio_header_fill_seq_num(skb, mgmt_eth_data->seq);
+       mgmt_eth_data->ack = false;
+
+       dev_queue_xmit(skb);
+
+       ret = wait_for_completion_timeout(&mgmt_eth_data->rw_done,
+                                         msecs_to_jiffies(QCA8K_ETHERNET_TIMEOUT));
+
+       *val = mgmt_eth_data->data[0];
+       if (len > QCA_HDR_MGMT_DATA1_LEN)
+               memcpy(val + 1, mgmt_eth_data->data + 1, len - QCA_HDR_MGMT_DATA1_LEN);
+
+       ack = mgmt_eth_data->ack;
+
+       mutex_unlock(&mgmt_eth_data->mutex);
+
+       if (ret <= 0)
+               return -ETIMEDOUT;
+
+       if (!ack)
+               return -EINVAL;
+
+       return 0;
+}
+
+static int qca8k_write_eth(struct qca8k_priv *priv, u32 reg, u32 *val, int len)
+{
+       struct qca8k_mgmt_eth_data *mgmt_eth_data = &priv->mgmt_eth_data;
+       struct sk_buff *skb;
+       bool ack;
+       int ret;
+
+       skb = qca8k_alloc_mdio_header(MDIO_WRITE, reg, val,
+                                     QCA8K_ETHERNET_MDIO_PRIORITY, len);
+       if (!skb)
+               return -ENOMEM;
+
+       mutex_lock(&mgmt_eth_data->mutex);
+
+       /* Check if the mgmt_master is operational */
+       if (!priv->mgmt_master) {
+               kfree_skb(skb);
+               mutex_unlock(&mgmt_eth_data->mutex);
+               return -EINVAL;
+       }
+
+       skb->dev = priv->mgmt_master;
+
+       reinit_completion(&mgmt_eth_data->rw_done);
+
+       /* Increment seq_num and set it in the mdio pkt */
+       mgmt_eth_data->seq++;
+       qca8k_mdio_header_fill_seq_num(skb, mgmt_eth_data->seq);
+       mgmt_eth_data->ack = false;
+
+       dev_queue_xmit(skb);
+
+       ret = wait_for_completion_timeout(&mgmt_eth_data->rw_done,
+                                         msecs_to_jiffies(QCA8K_ETHERNET_TIMEOUT));
+
+       ack = mgmt_eth_data->ack;
+
+       mutex_unlock(&mgmt_eth_data->mutex);
+
+       if (ret <= 0)
+               return -ETIMEDOUT;
+
+       if (!ack)
+               return -EINVAL;
+
+       return 0;
+}
+
+static int
+qca8k_regmap_update_bits_eth(struct qca8k_priv *priv, u32 reg, u32 mask, u32 write_val)
+{
+       u32 val = 0;
+       int ret;
+
+       ret = qca8k_read_eth(priv, reg, &val, sizeof(val));
+       if (ret)
+               return ret;
+
+       val &= ~mask;
+       val |= write_val;
+
+       return qca8k_write_eth(priv, reg, &val, sizeof(val));
+}
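
Unlike the MDIO regmap path, the Ethernet path has no hardware update primitive, so update_bits is implemented as a plain read-modify-write costing two management frames. A trivial sketch of the RMW core over a fake register:

#include <stdint.h>
#include <stdio.h>

static uint32_t reg;    /* fake register */

static int reg_update_bits(uint32_t mask, uint32_t val)
{
        uint32_t v = reg;       /* read */
        v &= ~mask;             /* clear the field */
        v |= val;               /* set the new value */
        reg = v;                /* write back */
        return 0;
}

int main(void)
{
        reg = 0xffff0000;
        reg_update_bits(0x00f0, 0x0050);
        printf("reg = 0x%08x\n", reg);  /* 0xffff0050 */
        return 0;
}
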
+
+static int
+qca8k_bulk_read(struct qca8k_priv *priv, u32 reg, u32 *val, int len)
+{
+       int i, count = len / sizeof(u32), ret;
+
+       if (priv->mgmt_master && !qca8k_read_eth(priv, reg, val, len))
+               return 0;
+
+       for (i = 0; i < count; i++) {
+               ret = regmap_read(priv->regmap, reg + (i * 4), val + i);
+               if (ret < 0)
+                       return ret;
+       }
+
+       return 0;
+}
+
+static int
+qca8k_bulk_write(struct qca8k_priv *priv, u32 reg, u32 *val, int len)
+{
+       int i, count = len / sizeof(u32), ret;
+       u32 tmp;
+
+       if (priv->mgmt_master && !qca8k_write_eth(priv, reg, val, len))
+               return 0;
+
+       for (i = 0; i < count; i++) {
+               tmp = val[i];
+
+               ret = regmap_write(priv->regmap, reg + (i * 4), tmp);
+               if (ret < 0)
+                       return ret;
+       }
+
+       return 0;
+}
+
 static int
 qca8k_regmap_read(void *ctx, uint32_t reg, uint32_t *val)
 {
@@ -178,11 +456,14 @@ qca8k_regmap_read(void *ctx, uint32_t reg, uint32_t *val)
        u16 r1, r2, page;
        int ret;
 
+       if (!qca8k_read_eth(priv, reg, val, sizeof(*val)))
+               return 0;
+
        qca8k_split_addr(reg, &r1, &r2, &page);
 
        mutex_lock_nested(&bus->mdio_lock, MDIO_MUTEX_NESTED);
 
-       ret = qca8k_set_page(bus, page);
+       ret = qca8k_set_page(priv, page);
        if (ret < 0)
                goto exit;
 
@@ -201,15 +482,18 @@ qca8k_regmap_write(void *ctx, uint32_t reg, uint32_t val)
        u16 r1, r2, page;
        int ret;
 
+       if (!qca8k_write_eth(priv, reg, &val, sizeof(val)))
+               return 0;
+
        qca8k_split_addr(reg, &r1, &r2, &page);
 
        mutex_lock_nested(&bus->mdio_lock, MDIO_MUTEX_NESTED);
 
-       ret = qca8k_set_page(bus, page);
+       ret = qca8k_set_page(priv, page);
        if (ret < 0)
                goto exit;
 
-       qca8k_mii_write32(bus, 0x10 | r2, r1, val);
+       qca8k_mii_write32(priv, 0x10 | r2, r1, val);
 
 exit:
        mutex_unlock(&bus->mdio_lock);
@@ -225,11 +509,14 @@ qca8k_regmap_update_bits(void *ctx, uint32_t reg, uint32_t mask, uint32_t write_
        u32 val;
        int ret;
 
+       if (!qca8k_regmap_update_bits_eth(priv, reg, mask, write_val))
+               return 0;
+
        qca8k_split_addr(reg, &r1, &r2, &page);
 
        mutex_lock_nested(&bus->mdio_lock, MDIO_MUTEX_NESTED);
 
-       ret = qca8k_set_page(bus, page);
+       ret = qca8k_set_page(priv, page);
        if (ret < 0)
                goto exit;
 
@@ -239,7 +526,7 @@ qca8k_regmap_update_bits(void *ctx, uint32_t reg, uint32_t mask, uint32_t write_
 
        val &= ~mask;
        val |= write_val;
-       qca8k_mii_write32(bus, 0x10 | r2, r1, val);
+       qca8k_mii_write32(priv, 0x10 | r2, r1, val);
 
 exit:
        mutex_unlock(&bus->mdio_lock);
@@ -296,17 +583,13 @@ qca8k_busy_wait(struct qca8k_priv *priv, u32 reg, u32 mask)
 static int
 qca8k_fdb_read(struct qca8k_priv *priv, struct qca8k_fdb *fdb)
 {
-       u32 reg[4], val;
-       int i, ret;
+       u32 reg[3];
+       int ret;
 
        /* load the ARL table into an array */
-       for (i = 0; i < 4; i++) {
-               ret = qca8k_read(priv, QCA8K_REG_ATU_DATA0 + (i * 4), &val);
-               if (ret < 0)
-                       return ret;
-
-               reg[i] = val;
-       }
+       ret = qca8k_bulk_read(priv, QCA8K_REG_ATU_DATA0, reg, sizeof(reg));
+       if (ret)
+               return ret;
 
        /* vid - 83:72 */
        fdb->vid = FIELD_GET(QCA8K_ATU_VID_MASK, reg[2]);
@@ -330,7 +613,6 @@ qca8k_fdb_write(struct qca8k_priv *priv, u16 vid, u8 port_mask, const u8 *mac,
                u8 aging)
 {
        u32 reg[3] = { 0 };
-       int i;
 
        /* vid - 83:72 */
        reg[2] = FIELD_PREP(QCA8K_ATU_VID_MASK, vid);
@@ -347,8 +629,7 @@ qca8k_fdb_write(struct qca8k_priv *priv, u16 vid, u8 port_mask, const u8 *mac,
        reg[0] |= FIELD_PREP(QCA8K_ATU_ADDR5_MASK, mac[5]);
 
        /* load the array into the ARL table */
-       for (i = 0; i < 3; i++)
-               qca8k_write(priv, QCA8K_REG_ATU_DATA0 + (i * 4), reg[i]);
+       qca8k_bulk_write(priv, QCA8K_REG_ATU_DATA0, reg, sizeof(reg));
 }
 
 static int
@@ -632,7 +913,10 @@ qca8k_mib_init(struct qca8k_priv *priv)
        int ret;
 
        mutex_lock(&priv->reg_mutex);
-       ret = regmap_set_bits(priv->regmap, QCA8K_REG_MIB, QCA8K_MIB_FLUSH | QCA8K_MIB_BUSY);
+       ret = regmap_update_bits(priv->regmap, QCA8K_REG_MIB,
+                                QCA8K_MIB_FUNC | QCA8K_MIB_BUSY,
+                                FIELD_PREP(QCA8K_MIB_FUNC, QCA8K_MIB_FLUSH) |
+                                QCA8K_MIB_BUSY);
        if (ret)
                goto exit;
 
@@ -666,6 +950,199 @@ qca8k_port_set_status(struct qca8k_priv *priv, int port, int enable)
                regmap_clear_bits(priv->regmap, QCA8K_REG_PORT_STATUS(port), mask);
 }
 
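+/* Poll the MDIO master busy flag over Ethernet. Every poll sends a fresh
+ * copy of the prepared read skb, stamped with a new sequence number, and
+ * waits for the tagger ack.
+ */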
+static int
+qca8k_phy_eth_busy_wait(struct qca8k_mgmt_eth_data *mgmt_eth_data,
+                       struct sk_buff *read_skb, u32 *val)
+{
+       struct sk_buff *skb = skb_copy(read_skb, GFP_KERNEL);
+       bool ack;
+       int ret;
+
+       if (!skb)
+               return -ENOMEM;
+
+       reinit_completion(&mgmt_eth_data->rw_done);
+
+       /* Increment seq_num and set it in the copy pkt */
+       mgmt_eth_data->seq++;
+       qca8k_mdio_header_fill_seq_num(skb, mgmt_eth_data->seq);
+       mgmt_eth_data->ack = false;
+
+       dev_queue_xmit(skb);
+
+       ret = wait_for_completion_timeout(&mgmt_eth_data->rw_done,
+                                         msecs_to_jiffies(QCA8K_ETHERNET_TIMEOUT));
+
+       ack = mgmt_eth_data->ack;
+
+       if (ret <= 0)
+               return -ETIMEDOUT;
+
+       if (!ack)
+               return -EINVAL;
+
+       *val = mgmt_eth_data->data[0];
+
+       return 0;
+}
+
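+/* Perform a PHY read or write through the switch MDIO master using the
+ * Ethernet management protocol. All three request skbs (write, read and
+ * clear) are preallocated so that no allocation happens while the mgmt
+ * mutex is held.
+ */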
+static int
+qca8k_phy_eth_command(struct qca8k_priv *priv, bool read, int phy,
+                     int regnum, u16 data)
+{
+       struct sk_buff *write_skb, *clear_skb, *read_skb;
+       struct qca8k_mgmt_eth_data *mgmt_eth_data;
+       u32 write_val, clear_val = 0, val;
+       struct net_device *mgmt_master;
+       int ret, ret1;
+       bool ack;
+
+       if (regnum >= QCA8K_MDIO_MASTER_MAX_REG)
+               return -EINVAL;
+
+       mgmt_eth_data = &priv->mgmt_eth_data;
+
+       write_val = QCA8K_MDIO_MASTER_BUSY | QCA8K_MDIO_MASTER_EN |
+                   QCA8K_MDIO_MASTER_PHY_ADDR(phy) |
+                   QCA8K_MDIO_MASTER_REG_ADDR(regnum);
+
+       if (read) {
+               write_val |= QCA8K_MDIO_MASTER_READ;
+       } else {
+               write_val |= QCA8K_MDIO_MASTER_WRITE;
+               write_val |= QCA8K_MDIO_MASTER_DATA(data);
+       }
+
+       /* Prealloc all the needed skb before the lock */
+       write_skb = qca8k_alloc_mdio_header(MDIO_WRITE, QCA8K_MDIO_MASTER_CTRL, &write_val,
+                                           QCA8K_ETHERNET_PHY_PRIORITY, sizeof(write_val));
+       if (!write_skb)
+               return -ENOMEM;
+
+       clear_skb = qca8k_alloc_mdio_header(MDIO_WRITE, QCA8K_MDIO_MASTER_CTRL, &clear_val,
+                                           QCA8K_ETHERNET_PHY_PRIORITY, sizeof(clear_val));
+       if (!clear_skb) {
+               ret = -ENOMEM;
+               goto err_clear_skb;
+       }
+
+       read_skb = qca8k_alloc_mdio_header(MDIO_READ, QCA8K_MDIO_MASTER_CTRL, &clear_val,
+                                          QCA8K_ETHERNET_PHY_PRIORITY, sizeof(clear_val));
+       if (!read_skb) {
+               ret = -ENOMEM;
+               goto err_read_skb;
+       }
+
+       /* Actually start the request:
+        * 1. Send the mdio master packet
+        * 2. Busy-wait for the mdio master command to complete
+        * 3. Get the data if we are reading
+        * 4. Reset the mdio master (even on error)
+        */
+       mutex_lock(&mgmt_eth_data->mutex);
+
+       /* Check if mgmt_master is operational */
+       mgmt_master = priv->mgmt_master;
+       if (!mgmt_master) {
+               mutex_unlock(&mgmt_eth_data->mutex);
+               ret = -EINVAL;
+               goto err_mgmt_master;
+       }
+
+       read_skb->dev = mgmt_master;
+       clear_skb->dev = mgmt_master;
+       write_skb->dev = mgmt_master;
+
+       reinit_completion(&mgmt_eth_data->rw_done);
+
+       /* Increment seq_num and set it in the write pkt */
+       mgmt_eth_data->seq++;
+       qca8k_mdio_header_fill_seq_num(write_skb, mgmt_eth_data->seq);
+       mgmt_eth_data->ack = false;
+
+       dev_queue_xmit(write_skb);
+
+       ret = wait_for_completion_timeout(&mgmt_eth_data->rw_done,
+                                         msecs_to_jiffies(QCA8K_ETHERNET_TIMEOUT));
+
+       ack = mgmt_eth_data->ack;
+
+       if (ret <= 0) {
+               ret = -ETIMEDOUT;
+               kfree_skb(read_skb);
+               goto exit;
+       }
+
+       if (!ack) {
+               ret = -EINVAL;
+               kfree_skb(read_skb);
+               goto exit;
+       }
+
+       ret = read_poll_timeout(qca8k_phy_eth_busy_wait, ret1,
+                               !(val & QCA8K_MDIO_MASTER_BUSY), 0,
+                               QCA8K_BUSY_WAIT_TIMEOUT * USEC_PER_MSEC, false,
+                               mgmt_eth_data, read_skb, &val);
+
+       if (ret < 0 && ret1 < 0) {
+               ret = ret1;
+               goto exit;
+       }
+
+       if (read) {
+               reinit_completion(&mgmt_eth_data->rw_done);
+
+               /* Increment seq_num and set it in the read pkt */
+               mgmt_eth_data->seq++;
+               qca8k_mdio_header_fill_seq_num(read_skb, mgmt_eth_data->seq);
+               mgmt_eth_data->ack = false;
+
+               dev_queue_xmit(read_skb);
+
+               ret = wait_for_completion_timeout(&mgmt_eth_data->rw_done,
+                                                 msecs_to_jiffies(QCA8K_ETHERNET_TIMEOUT));
+
+               ack = mgmt_eth_data->ack;
+
+               if (ret <= 0) {
+                       ret = -ETIMEDOUT;
+                       goto exit;
+               }
+
+               if (!ack) {
+                       ret = -EINVAL;
+                       goto exit;
+               }
+
+               ret = mgmt_eth_data->data[0] & QCA8K_MDIO_MASTER_DATA_MASK;
+       } else {
+               kfree_skb(read_skb);
+       }
+exit:
+       reinit_completion(&mgmt_eth_data->rw_done);
+
+       /* Increment seq_num and set it in the clear pkt */
+       mgmt_eth_data->seq++;
+       qca8k_mdio_header_fill_seq_num(clear_skb, mgmt_eth_data->seq);
+       mgmt_eth_data->ack = false;
+
+       dev_queue_xmit(clear_skb);
+
+       wait_for_completion_timeout(&mgmt_eth_data->rw_done,
+                                   msecs_to_jiffies(QCA8K_ETHERNET_TIMEOUT));
+
+       mutex_unlock(&mgmt_eth_data->mutex);
+
+       return ret;
+
+       /* Error handling before lock */
+err_mgmt_master:
+       kfree_skb(read_skb);
+err_read_skb:
+       kfree_skb(clear_skb);
+err_clear_skb:
+       kfree_skb(write_skb);
+
+       return ret;
+}
+
 static u32
 qca8k_port_to_phy(int port)
 {
@@ -704,8 +1181,9 @@ qca8k_mdio_busy_wait(struct mii_bus *bus, u32 reg, u32 mask)
 }
 
 static int
-qca8k_mdio_write(struct mii_bus *bus, int phy, int regnum, u16 data)
+qca8k_mdio_write(struct qca8k_priv *priv, int phy, int regnum, u16 data)
 {
+       struct mii_bus *bus = priv->bus;
        u16 r1, r2, page;
        u32 val;
        int ret;
@@ -722,18 +1200,18 @@ qca8k_mdio_write(struct mii_bus *bus, int phy, int regnum, u16 data)
 
        mutex_lock_nested(&bus->mdio_lock, MDIO_MUTEX_NESTED);
 
-       ret = qca8k_set_page(bus, page);
+       ret = qca8k_set_page(priv, page);
        if (ret)
                goto exit;
 
-       qca8k_mii_write32(bus, 0x10 | r2, r1, val);
+       qca8k_mii_write32(priv, 0x10 | r2, r1, val);
 
        ret = qca8k_mdio_busy_wait(bus, QCA8K_MDIO_MASTER_CTRL,
                                   QCA8K_MDIO_MASTER_BUSY);
 
 exit:
        /* even if the busy_wait times out, try to clear the MASTER_EN */
-       qca8k_mii_write32(bus, 0x10 | r2, r1, 0);
+       qca8k_mii_write32(priv, 0x10 | r2, r1, 0);
 
        mutex_unlock(&bus->mdio_lock);
 
@@ -741,8 +1219,9 @@ exit:
 }
 
 static int
-qca8k_mdio_read(struct mii_bus *bus, int phy, int regnum)
+qca8k_mdio_read(struct qca8k_priv *priv, int phy, int regnum)
 {
+       struct mii_bus *bus = priv->bus;
        u16 r1, r2, page;
        u32 val;
        int ret;
@@ -758,11 +1237,11 @@ qca8k_mdio_read(struct mii_bus *bus, int phy, int regnum)
 
        mutex_lock_nested(&bus->mdio_lock, MDIO_MUTEX_NESTED);
 
-       ret = qca8k_set_page(bus, page);
+       ret = qca8k_set_page(priv, page);
        if (ret)
                goto exit;
 
-       qca8k_mii_write32(bus, 0x10 | r2, r1, val);
+       qca8k_mii_write32(priv, 0x10 | r2, r1, val);
 
        ret = qca8k_mdio_busy_wait(bus, QCA8K_MDIO_MASTER_CTRL,
                                   QCA8K_MDIO_MASTER_BUSY);
@@ -773,7 +1252,7 @@ qca8k_mdio_read(struct mii_bus *bus, int phy, int regnum)
 
 exit:
        /* even if the busy_wait times out, try to clear the MASTER_EN */
-       qca8k_mii_write32(bus, 0x10 | r2, r1, 0);
+       qca8k_mii_write32(priv, 0x10 | r2, r1, 0);
 
        mutex_unlock(&bus->mdio_lock);
 
@@ -787,24 +1266,35 @@ static int
 qca8k_internal_mdio_write(struct mii_bus *slave_bus, int phy, int regnum, u16 data)
 {
        struct qca8k_priv *priv = slave_bus->priv;
-       struct mii_bus *bus = priv->bus;
+       int ret;
 
-       return qca8k_mdio_write(bus, phy, regnum, data);
+       /* Use mdio Ethernet when available, fall back to the legacy path on error */
+       ret = qca8k_phy_eth_command(priv, false, phy, regnum, data);
+       if (!ret)
+               return 0;
+
+       return qca8k_mdio_write(priv, phy, regnum, data);
 }
 
 static int
 qca8k_internal_mdio_read(struct mii_bus *slave_bus, int phy, int regnum)
 {
        struct qca8k_priv *priv = slave_bus->priv;
-       struct mii_bus *bus = priv->bus;
+       int ret;
 
-       return qca8k_mdio_read(bus, phy, regnum);
+       /* Use mdio Ethernet when available, fall back to the legacy path on error */
+       ret = qca8k_phy_eth_command(priv, true, phy, regnum, 0);
+       if (ret >= 0)
+               return ret;
+
+       return qca8k_mdio_read(priv, phy, regnum);
 }
 
 static int
 qca8k_phy_write(struct dsa_switch *ds, int port, int regnum, u16 data)
 {
        struct qca8k_priv *priv = ds->priv;
+       int ret;
 
        /* Check if the legacy mapping should be used and the
         * port is not correctly mapped to the right PHY in the
@@ -813,7 +1303,12 @@ qca8k_phy_write(struct dsa_switch *ds, int port, int regnum, u16 data)
        if (priv->legacy_phy_port_mapping)
                port = qca8k_port_to_phy(port) % PHY_MAX_ADDR;
 
-       return qca8k_mdio_write(priv->bus, port, regnum, data);
+       /* Use mdio Ethernet when available, fall back to the legacy path on error */
+       ret = qca8k_phy_eth_command(priv, false, port, regnum, data);
+       if (!ret)
+               return ret;
+
+       return qca8k_mdio_write(priv, port, regnum, data);
 }
 
 static int
@@ -829,7 +1324,12 @@ qca8k_phy_read(struct dsa_switch *ds, int port, int regnum)
        if (priv->legacy_phy_port_mapping)
                port = qca8k_port_to_phy(port) % PHY_MAX_ADDR;
 
-       ret = qca8k_mdio_read(priv->bus, port, regnum);
+       /* Use mdio Ethernet when available, fall back to the legacy path on error */
+       ret = qca8k_phy_eth_command(priv, true, port, regnum, 0);
+       if (ret >= 0)
+               return ret;
+
+       ret = qca8k_mdio_read(priv, port, regnum);
 
        if (ret < 0)
                return 0xffff;
@@ -1531,67 +2031,39 @@ qca8k_phylink_mac_config(struct dsa_switch *ds, int port, unsigned int mode,
        }
 }
 
-static void
-qca8k_phylink_validate(struct dsa_switch *ds, int port,
-                      unsigned long *supported,
-                      struct phylink_link_state *state)
+static void qca8k_phylink_get_caps(struct dsa_switch *ds, int port,
+                                  struct phylink_config *config)
 {
-       __ETHTOOL_DECLARE_LINK_MODE_MASK(mask) = { 0, };
-
        switch (port) {
        case 0: /* 1st CPU port */
-               if (state->interface != PHY_INTERFACE_MODE_NA &&
-                   state->interface != PHY_INTERFACE_MODE_RGMII &&
-                   state->interface != PHY_INTERFACE_MODE_RGMII_ID &&
-                   state->interface != PHY_INTERFACE_MODE_RGMII_TXID &&
-                   state->interface != PHY_INTERFACE_MODE_RGMII_RXID &&
-                   state->interface != PHY_INTERFACE_MODE_SGMII)
-                       goto unsupported;
+               phy_interface_set_rgmii(config->supported_interfaces);
+               __set_bit(PHY_INTERFACE_MODE_SGMII,
+                         config->supported_interfaces);
                break;
+
        case 1:
        case 2:
        case 3:
        case 4:
        case 5:
                /* Internal PHY */
-               if (state->interface != PHY_INTERFACE_MODE_NA &&
-                   state->interface != PHY_INTERFACE_MODE_GMII &&
-                   state->interface != PHY_INTERFACE_MODE_INTERNAL)
-                       goto unsupported;
+               __set_bit(PHY_INTERFACE_MODE_GMII,
+                         config->supported_interfaces);
+               __set_bit(PHY_INTERFACE_MODE_INTERNAL,
+                         config->supported_interfaces);
                break;
+
        case 6: /* 2nd CPU port / external PHY */
-               if (state->interface != PHY_INTERFACE_MODE_NA &&
-                   state->interface != PHY_INTERFACE_MODE_RGMII &&
-                   state->interface != PHY_INTERFACE_MODE_RGMII_ID &&
-                   state->interface != PHY_INTERFACE_MODE_RGMII_TXID &&
-                   state->interface != PHY_INTERFACE_MODE_RGMII_RXID &&
-                   state->interface != PHY_INTERFACE_MODE_SGMII &&
-                   state->interface != PHY_INTERFACE_MODE_1000BASEX)
-                       goto unsupported;
+               phy_interface_set_rgmii(config->supported_interfaces);
+               __set_bit(PHY_INTERFACE_MODE_SGMII,
+                         config->supported_interfaces);
+               __set_bit(PHY_INTERFACE_MODE_1000BASEX,
+                         config->supported_interfaces);
                break;
-       default:
-unsupported:
-               linkmode_zero(supported);
-               return;
        }
 
-       phylink_set_port_modes(mask);
-       phylink_set(mask, Autoneg);
-
-       phylink_set(mask, 1000baseT_Full);
-       phylink_set(mask, 10baseT_Half);
-       phylink_set(mask, 10baseT_Full);
-       phylink_set(mask, 100baseT_Half);
-       phylink_set(mask, 100baseT_Full);
-
-       if (state->interface == PHY_INTERFACE_MODE_1000BASEX)
-               phylink_set(mask, 1000baseX_Full);
-
-       phylink_set(mask, Pause);
-       phylink_set(mask, Asym_Pause);
-
-       linkmode_and(supported, supported, mask);
-       linkmode_and(state->advertising, state->advertising, mask);
+       config->mac_capabilities = MAC_ASYM_PAUSE | MAC_SYM_PAUSE |
+               MAC_10 | MAC_100 | MAC_1000FD;
 }
 
 static int
@@ -1703,6 +2175,97 @@ qca8k_get_strings(struct dsa_switch *ds, int port, u32 stringset, uint8_t *data)
                        ETH_GSTRING_LEN);
 }
 
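+/* Called by the tagger for every received MIB autocast packet. The first
+ * three counters are carried in the mib_ethhdr itself, the rest follow in
+ * the payload as a mix of 32-bit and 64-bit values in ar8327_mib order.
+ */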
+static void qca8k_mib_autocast_handler(struct dsa_switch *ds, struct sk_buff *skb)
+{
+       const struct qca8k_match_data *match_data;
+       struct qca8k_mib_eth_data *mib_eth_data;
+       struct qca8k_priv *priv = ds->priv;
+       const struct qca8k_mib_desc *mib;
+       struct mib_ethhdr *mib_ethhdr;
+       int i, mib_len, offset = 0;
+       u64 *data;
+       u8 port;
+
+       mib_ethhdr = (struct mib_ethhdr *)skb_mac_header(skb);
+       mib_eth_data = &priv->mib_eth_data;
+
+       /* The switch autocasts MIB data for every port. Ignore packets
+        * from other ports and parse only the requested one.
+        */
+       port = FIELD_GET(QCA_HDR_RECV_SOURCE_PORT, ntohs(mib_ethhdr->hdr));
+       if (port != mib_eth_data->req_port)
+               goto exit;
+
+       match_data = device_get_match_data(priv->dev);
+       data = mib_eth_data->data;
+
+       for (i = 0; i < match_data->mib_count; i++) {
+               mib = &ar8327_mib[i];
+
+               /* The first 3 MIB counters are carried in the mib_ethhdr itself */
+               if (i < 3) {
+                       data[i] = mib_ethhdr->data[i];
+                       continue;
+               }
+
+               mib_len = sizeof(uint32_t);
+
+               /* Some MIB counters are 64 bits wide */
+               if (mib->size == 2)
+                       mib_len = sizeof(uint64_t);
+
+               /* Copy the mib value from the packet into the data array */
+               memcpy(data + i, skb->data + offset, mib_len);
+
+               /* Set the offset for the next mib */
+               offset += mib_len;
+       }
+
+exit:
+       /* Complete once the mib packets of all ports have been received */
+       if (refcount_dec_and_test(&mib_eth_data->port_parsed))
+               complete(&mib_eth_data->rw_done);
+}
+
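+/* Gather ethtool stats via MIB autocast: trigger a MIB_CAST request and
+ * wait until the handler above has parsed a packet for every port, as
+ * tracked by the port_parsed refcount, or the timeout expires.
+ */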
+static int
+qca8k_get_ethtool_stats_eth(struct dsa_switch *ds, int port, u64 *data)
+{
+       struct dsa_port *dp = dsa_to_port(ds, port);
+       struct qca8k_mib_eth_data *mib_eth_data;
+       struct qca8k_priv *priv = ds->priv;
+       int ret;
+
+       mib_eth_data = &priv->mib_eth_data;
+
+       mutex_lock(&mib_eth_data->mutex);
+
+       reinit_completion(&mib_eth_data->rw_done);
+
+       mib_eth_data->req_port = dp->index;
+       mib_eth_data->data = data;
+       refcount_set(&mib_eth_data->port_parsed, QCA8K_NUM_PORTS);
+
+       mutex_lock(&priv->reg_mutex);
+
+       /* Send mib autocast request */
+       ret = regmap_update_bits(priv->regmap, QCA8K_REG_MIB,
+                                QCA8K_MIB_FUNC | QCA8K_MIB_BUSY,
+                                FIELD_PREP(QCA8K_MIB_FUNC, QCA8K_MIB_CAST) |
+                                QCA8K_MIB_BUSY);
+
+       mutex_unlock(&priv->reg_mutex);
+
+       if (ret)
+               goto exit;
+
+       ret = wait_for_completion_timeout(&mib_eth_data->rw_done,
+                                         msecs_to_jiffies(QCA8K_ETHERNET_TIMEOUT));
+
+exit:
+       mutex_unlock(&mib_eth_data->mutex);
+
+       return ret;
+}
+
 static void
 qca8k_get_ethtool_stats(struct dsa_switch *ds, int port,
                        uint64_t *data)
@@ -1714,6 +2277,10 @@ qca8k_get_ethtool_stats(struct dsa_switch *ds, int port,
        u32 hi = 0;
        int ret;
 
+       if (priv->mgmt_master &&
+           qca8k_get_ethtool_stats_eth(ds, port, data) > 0)
+               return;
+
        match_data = of_device_get_match_data(priv->dev);
 
        for (i = 0; i < match_data->mib_count; i++) {
@@ -2383,6 +2950,46 @@ qca8k_port_lag_leave(struct dsa_switch *ds, int port,
        return qca8k_lag_refresh_portmap(ds, port, lag, true);
 }
 
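+/* DSA reports CPU master state transitions here. The Ethernet MIB/MDIO
+ * path is only usable while the master of CPU port 0 is operational, so
+ * cache it (or NULL) with both mutexes held to avoid racing in-flight
+ * requests.
+ */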
+static void
+qca8k_master_change(struct dsa_switch *ds, const struct net_device *master,
+                   bool operational)
+{
+       struct dsa_port *dp = master->dsa_ptr;
+       struct qca8k_priv *priv = ds->priv;
+
+       /* Ethernet MIB/MDIO is only supported for CPU port 0 */
+       if (dp->index != 0)
+               return;
+
+       mutex_lock(&priv->mgmt_eth_data.mutex);
+       mutex_lock(&priv->mib_eth_data.mutex);
+
+       priv->mgmt_master = operational ? (struct net_device *)master : NULL;
+
+       mutex_unlock(&priv->mib_eth_data.mutex);
+       mutex_unlock(&priv->mgmt_eth_data.mutex);
+}
+
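+/* Hook the qca8k handlers into the shared tagger data so that tag_qca can
+ * dispatch mgmt acks and MIB autocast packets back to this driver.
+ */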
+static int qca8k_connect_tag_protocol(struct dsa_switch *ds,
+                                     enum dsa_tag_protocol proto)
+{
+       struct qca_tagger_data *tagger_data;
+
+       switch (proto) {
+       case DSA_TAG_PROTO_QCA:
+               tagger_data = ds->tagger_data;
+
+               tagger_data->rw_reg_ack_handler = qca8k_rw_reg_ack_handler;
+               tagger_data->mib_autocast_handler = qca8k_mib_autocast_handler;
+
+               break;
+       default:
+               return -EOPNOTSUPP;
+       }
+
+       return 0;
+}
+
 static const struct dsa_switch_ops qca8k_switch_ops = {
        .get_tag_protocol       = qca8k_get_tag_protocol,
        .setup                  = qca8k_setup,
@@ -2410,7 +3017,7 @@ static const struct dsa_switch_ops qca8k_switch_ops = {
        .port_vlan_filtering    = qca8k_port_vlan_filtering,
        .port_vlan_add          = qca8k_port_vlan_add,
        .port_vlan_del          = qca8k_port_vlan_del,
-       .phylink_validate       = qca8k_phylink_validate,
+       .phylink_get_caps       = qca8k_phylink_get_caps,
        .phylink_mac_link_state = qca8k_phylink_mac_link_state,
        .phylink_mac_config     = qca8k_phylink_mac_config,
        .phylink_mac_link_down  = qca8k_phylink_mac_link_down,
@@ -2418,6 +3025,8 @@ static const struct dsa_switch_ops qca8k_switch_ops = {
        .get_phy_flags          = qca8k_get_phy_flags,
        .port_lag_join          = qca8k_port_lag_join,
        .port_lag_leave         = qca8k_port_lag_leave,
+       .master_state_change    = qca8k_master_change,
+       .connect_tag_protocol   = qca8k_connect_tag_protocol,
 };
 
 static int qca8k_read_switch_id(struct qca8k_priv *priv)
@@ -2488,6 +3097,10 @@ qca8k_sw_probe(struct mdio_device *mdiodev)
                return PTR_ERR(priv->regmap);
        }
 
+       priv->mdio_cache.page = 0xffff;
+       priv->mdio_cache.lo = 0xffff;
+       priv->mdio_cache.hi = 0xffff;
+
        /* Check the detected switch id */
        ret = qca8k_read_switch_id(priv);
        if (ret)
@@ -2497,6 +3110,12 @@ qca8k_sw_probe(struct mdio_device *mdiodev)
        if (!priv->ds)
                return -ENOMEM;
 
+       mutex_init(&priv->mgmt_eth_data.mutex);
+       init_completion(&priv->mgmt_eth_data.rw_done);
+
+       mutex_init(&priv->mib_eth_data.mutex);
+       init_completion(&priv->mib_eth_data.rw_done);
+
        priv->ds->dev = &mdiodev->dev;
        priv->ds->num_ports = QCA8K_NUM_PORTS;
        priv->ds->priv = priv;
index ab4a417..c3d3c22 100644 (file)
 #include <linux/delay.h>
 #include <linux/regmap.h>
 #include <linux/gpio.h>
+#include <linux/dsa/tag_qca.h>
+
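+/* Priorities used for management and PHY request packets, and the request
+ * timeout in milliseconds.
+ */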
+#define QCA8K_ETHERNET_MDIO_PRIORITY                   7
+#define QCA8K_ETHERNET_PHY_PRIORITY                    6
+#define QCA8K_ETHERNET_TIMEOUT                         100
 
 #define QCA8K_NUM_PORTS                                        7
 #define QCA8K_NUM_CPU_PORTS                            2
@@ -63,7 +68,7 @@
 #define QCA8K_REG_MODULE_EN                            0x030
 #define   QCA8K_MODULE_EN_MIB                          BIT(0)
 #define QCA8K_REG_MIB                                  0x034
-#define   QCA8K_MIB_FLUSH                              BIT(24)
+#define   QCA8K_MIB_FUNC                               GENMASK(26, 24)
 #define   QCA8K_MIB_CPU_KEEP                           BIT(20)
 #define   QCA8K_MIB_BUSY                               BIT(17)
 #define QCA8K_MDIO_MASTER_CTRL                         0x3c
@@ -313,6 +318,12 @@ enum qca8k_vlan_cmd {
        QCA8K_VLAN_READ = 6,
 };
 
+enum qca8k_mib_cmd {
+       QCA8K_MIB_FLUSH = 1,
+       QCA8K_MIB_FLUSH_PORT = 2,
+       QCA8K_MIB_CAST = 3,
+};
+
 struct ar8xxx_port_status {
        int enabled;
 };
@@ -328,6 +339,22 @@ enum {
        QCA8K_CPU_PORT6,
 };
 
+struct qca8k_mgmt_eth_data {
+       struct completion rw_done;
+       struct mutex mutex; /* Enforce one mdio read/write at a time */
+       bool ack;
+       u32 seq;
+       u32 data[4];
+};
+
+struct qca8k_mib_eth_data {
+       struct completion rw_done;
+       struct mutex mutex; /* Process one command at a time */
+       refcount_t port_parsed; /* Counter to track parsed ports */
+       u8 req_port;
+       u64 *data; /* pointer to ethtool data */
+};
+
 struct qca8k_ports_config {
        bool sgmii_rx_clk_falling_edge;
        bool sgmii_tx_clk_falling_edge;
@@ -336,6 +363,19 @@ struct qca8k_ports_config {
        u8 rgmii_tx_delay[QCA8K_NUM_CPU_PORTS]; /* 0: CPU port0, 1: CPU port6 */
 };
 
+struct qca8k_mdio_cache {
+/* The 32bit switch registers are accessed indirectly. To achieve this we need
+ * to set the page of the register. Track the last page that was set to reduce
+ * mdio writes.
+ */
+       u16 page;
+/* lo and hi can also be cached and, per the documentation, we can skip one
+ * extra mdio write if lo or hi didn't change.
+ */
+       u16 lo;
+       u16 hi;
+};
+
 struct qca8k_priv {
        u8 switch_id;
        u8 switch_revision;
@@ -353,6 +393,10 @@ struct qca8k_priv {
        struct dsa_switch_ops ops;
        struct gpio_desc *reset_gpio;
        unsigned int port_mtu[QCA8K_NUM_PORTS];
+       struct net_device *mgmt_master; /* Track if mdio/mib Ethernet is available */
+       struct qca8k_mgmt_eth_data mgmt_eth_data;
+       struct qca8k_mib_eth_data mib_eth_data;
+       struct qca8k_mdio_cache mdio_cache;
 };
 
 struct qca8k_mib_desc {
diff --git a/drivers/net/dsa/realtek-smi-core.c b/drivers/net/dsa/realtek-smi-core.c
deleted file mode 100644 (file)
index aae46ad..0000000
+++ /dev/null
@@ -1,523 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0+
-/* Realtek Simple Management Interface (SMI) driver
- * It can be discussed how "simple" this interface is.
- *
- * The SMI protocol piggy-backs the MDIO MDC and MDIO signals levels
- * but the protocol is not MDIO at all. Instead it is a Realtek
- * pecularity that need to bit-bang the lines in a special way to
- * communicate with the switch.
- *
- * ASICs we intend to support with this driver:
- *
- * RTL8366   - The original version, apparently
- * RTL8369   - Similar enough to have the same datsheet as RTL8366
- * RTL8366RB - Probably reads out "RTL8366 revision B", has a quite
- *             different register layout from the other two
- * RTL8366S  - Is this "RTL8366 super"?
- * RTL8367   - Has an OpenWRT driver as well
- * RTL8368S  - Seems to be an alternative name for RTL8366RB
- * RTL8370   - Also uses SMI
- *
- * Copyright (C) 2017 Linus Walleij <linus.walleij@linaro.org>
- * Copyright (C) 2010 Antti Seppälä <a.seppala@gmail.com>
- * Copyright (C) 2010 Roman Yeryomin <roman@advem.lv>
- * Copyright (C) 2011 Colin Leitner <colin.leitner@googlemail.com>
- * Copyright (C) 2009-2010 Gabor Juhos <juhosg@openwrt.org>
- */
-
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/device.h>
-#include <linux/spinlock.h>
-#include <linux/skbuff.h>
-#include <linux/of.h>
-#include <linux/of_device.h>
-#include <linux/of_mdio.h>
-#include <linux/delay.h>
-#include <linux/gpio/consumer.h>
-#include <linux/platform_device.h>
-#include <linux/regmap.h>
-#include <linux/bitops.h>
-#include <linux/if_bridge.h>
-
-#include "realtek-smi-core.h"
-
-#define REALTEK_SMI_ACK_RETRY_COUNT            5
-#define REALTEK_SMI_HW_STOP_DELAY              25      /* msecs */
-#define REALTEK_SMI_HW_START_DELAY             100     /* msecs */
-
-static inline void realtek_smi_clk_delay(struct realtek_smi *smi)
-{
-       ndelay(smi->clk_delay);
-}
-
-static void realtek_smi_start(struct realtek_smi *smi)
-{
-       /* Set GPIO pins to output mode, with initial state:
-        * SCK = 0, SDA = 1
-        */
-       gpiod_direction_output(smi->mdc, 0);
-       gpiod_direction_output(smi->mdio, 1);
-       realtek_smi_clk_delay(smi);
-
-       /* CLK 1: 0 -> 1, 1 -> 0 */
-       gpiod_set_value(smi->mdc, 1);
-       realtek_smi_clk_delay(smi);
-       gpiod_set_value(smi->mdc, 0);
-       realtek_smi_clk_delay(smi);
-
-       /* CLK 2: */
-       gpiod_set_value(smi->mdc, 1);
-       realtek_smi_clk_delay(smi);
-       gpiod_set_value(smi->mdio, 0);
-       realtek_smi_clk_delay(smi);
-       gpiod_set_value(smi->mdc, 0);
-       realtek_smi_clk_delay(smi);
-       gpiod_set_value(smi->mdio, 1);
-}
-
-static void realtek_smi_stop(struct realtek_smi *smi)
-{
-       realtek_smi_clk_delay(smi);
-       gpiod_set_value(smi->mdio, 0);
-       gpiod_set_value(smi->mdc, 1);
-       realtek_smi_clk_delay(smi);
-       gpiod_set_value(smi->mdio, 1);
-       realtek_smi_clk_delay(smi);
-       gpiod_set_value(smi->mdc, 1);
-       realtek_smi_clk_delay(smi);
-       gpiod_set_value(smi->mdc, 0);
-       realtek_smi_clk_delay(smi);
-       gpiod_set_value(smi->mdc, 1);
-
-       /* Add a click */
-       realtek_smi_clk_delay(smi);
-       gpiod_set_value(smi->mdc, 0);
-       realtek_smi_clk_delay(smi);
-       gpiod_set_value(smi->mdc, 1);
-
-       /* Set GPIO pins to input mode */
-       gpiod_direction_input(smi->mdio);
-       gpiod_direction_input(smi->mdc);
-}
-
-static void realtek_smi_write_bits(struct realtek_smi *smi, u32 data, u32 len)
-{
-       for (; len > 0; len--) {
-               realtek_smi_clk_delay(smi);
-
-               /* Prepare data */
-               gpiod_set_value(smi->mdio, !!(data & (1 << (len - 1))));
-               realtek_smi_clk_delay(smi);
-
-               /* Clocking */
-               gpiod_set_value(smi->mdc, 1);
-               realtek_smi_clk_delay(smi);
-               gpiod_set_value(smi->mdc, 0);
-       }
-}
-
-static void realtek_smi_read_bits(struct realtek_smi *smi, u32 len, u32 *data)
-{
-       gpiod_direction_input(smi->mdio);
-
-       for (*data = 0; len > 0; len--) {
-               u32 u;
-
-               realtek_smi_clk_delay(smi);
-
-               /* Clocking */
-               gpiod_set_value(smi->mdc, 1);
-               realtek_smi_clk_delay(smi);
-               u = !!gpiod_get_value(smi->mdio);
-               gpiod_set_value(smi->mdc, 0);
-
-               *data |= (u << (len - 1));
-       }
-
-       gpiod_direction_output(smi->mdio, 0);
-}
-
-static int realtek_smi_wait_for_ack(struct realtek_smi *smi)
-{
-       int retry_cnt;
-
-       retry_cnt = 0;
-       do {
-               u32 ack;
-
-               realtek_smi_read_bits(smi, 1, &ack);
-               if (ack == 0)
-                       break;
-
-               if (++retry_cnt > REALTEK_SMI_ACK_RETRY_COUNT) {
-                       dev_err(smi->dev, "ACK timeout\n");
-                       return -ETIMEDOUT;
-               }
-       } while (1);
-
-       return 0;
-}
-
-static int realtek_smi_write_byte(struct realtek_smi *smi, u8 data)
-{
-       realtek_smi_write_bits(smi, data, 8);
-       return realtek_smi_wait_for_ack(smi);
-}
-
-static int realtek_smi_write_byte_noack(struct realtek_smi *smi, u8 data)
-{
-       realtek_smi_write_bits(smi, data, 8);
-       return 0;
-}
-
-static int realtek_smi_read_byte0(struct realtek_smi *smi, u8 *data)
-{
-       u32 t;
-
-       /* Read data */
-       realtek_smi_read_bits(smi, 8, &t);
-       *data = (t & 0xff);
-
-       /* Send an ACK */
-       realtek_smi_write_bits(smi, 0x00, 1);
-
-       return 0;
-}
-
-static int realtek_smi_read_byte1(struct realtek_smi *smi, u8 *data)
-{
-       u32 t;
-
-       /* Read data */
-       realtek_smi_read_bits(smi, 8, &t);
-       *data = (t & 0xff);
-
-       /* Send an ACK */
-       realtek_smi_write_bits(smi, 0x01, 1);
-
-       return 0;
-}
-
-static int realtek_smi_read_reg(struct realtek_smi *smi, u32 addr, u32 *data)
-{
-       unsigned long flags;
-       u8 lo = 0;
-       u8 hi = 0;
-       int ret;
-
-       spin_lock_irqsave(&smi->lock, flags);
-
-       realtek_smi_start(smi);
-
-       /* Send READ command */
-       ret = realtek_smi_write_byte(smi, smi->cmd_read);
-       if (ret)
-               goto out;
-
-       /* Set ADDR[7:0] */
-       ret = realtek_smi_write_byte(smi, addr & 0xff);
-       if (ret)
-               goto out;
-
-       /* Set ADDR[15:8] */
-       ret = realtek_smi_write_byte(smi, addr >> 8);
-       if (ret)
-               goto out;
-
-       /* Read DATA[7:0] */
-       realtek_smi_read_byte0(smi, &lo);
-       /* Read DATA[15:8] */
-       realtek_smi_read_byte1(smi, &hi);
-
-       *data = ((u32)lo) | (((u32)hi) << 8);
-
-       ret = 0;
-
- out:
-       realtek_smi_stop(smi);
-       spin_unlock_irqrestore(&smi->lock, flags);
-
-       return ret;
-}
-
-static int realtek_smi_write_reg(struct realtek_smi *smi,
-                                u32 addr, u32 data, bool ack)
-{
-       unsigned long flags;
-       int ret;
-
-       spin_lock_irqsave(&smi->lock, flags);
-
-       realtek_smi_start(smi);
-
-       /* Send WRITE command */
-       ret = realtek_smi_write_byte(smi, smi->cmd_write);
-       if (ret)
-               goto out;
-
-       /* Set ADDR[7:0] */
-       ret = realtek_smi_write_byte(smi, addr & 0xff);
-       if (ret)
-               goto out;
-
-       /* Set ADDR[15:8] */
-       ret = realtek_smi_write_byte(smi, addr >> 8);
-       if (ret)
-               goto out;
-
-       /* Write DATA[7:0] */
-       ret = realtek_smi_write_byte(smi, data & 0xff);
-       if (ret)
-               goto out;
-
-       /* Write DATA[15:8] */
-       if (ack)
-               ret = realtek_smi_write_byte(smi, data >> 8);
-       else
-               ret = realtek_smi_write_byte_noack(smi, data >> 8);
-       if (ret)
-               goto out;
-
-       ret = 0;
-
- out:
-       realtek_smi_stop(smi);
-       spin_unlock_irqrestore(&smi->lock, flags);
-
-       return ret;
-}
-
-/* There is one single case when we need to use this accessor and that
- * is when issueing soft reset. Since the device reset as soon as we write
- * that bit, no ACK will come back for natural reasons.
- */
-int realtek_smi_write_reg_noack(struct realtek_smi *smi, u32 addr,
-                               u32 data)
-{
-       return realtek_smi_write_reg(smi, addr, data, false);
-}
-EXPORT_SYMBOL_GPL(realtek_smi_write_reg_noack);
-
-/* Regmap accessors */
-
-static int realtek_smi_write(void *ctx, u32 reg, u32 val)
-{
-       struct realtek_smi *smi = ctx;
-
-       return realtek_smi_write_reg(smi, reg, val, true);
-}
-
-static int realtek_smi_read(void *ctx, u32 reg, u32 *val)
-{
-       struct realtek_smi *smi = ctx;
-
-       return realtek_smi_read_reg(smi, reg, val);
-}
-
-static const struct regmap_config realtek_smi_mdio_regmap_config = {
-       .reg_bits = 10, /* A4..A0 R4..R0 */
-       .val_bits = 16,
-       .reg_stride = 1,
-       /* PHY regs are at 0x8000 */
-       .max_register = 0xffff,
-       .reg_format_endian = REGMAP_ENDIAN_BIG,
-       .reg_read = realtek_smi_read,
-       .reg_write = realtek_smi_write,
-       .cache_type = REGCACHE_NONE,
-};
-
-static int realtek_smi_mdio_read(struct mii_bus *bus, int addr, int regnum)
-{
-       struct realtek_smi *smi = bus->priv;
-
-       return smi->ops->phy_read(smi, addr, regnum);
-}
-
-static int realtek_smi_mdio_write(struct mii_bus *bus, int addr, int regnum,
-                                 u16 val)
-{
-       struct realtek_smi *smi = bus->priv;
-
-       return smi->ops->phy_write(smi, addr, regnum, val);
-}
-
-int realtek_smi_setup_mdio(struct realtek_smi *smi)
-{
-       struct device_node *mdio_np;
-       int ret;
-
-       mdio_np = of_get_compatible_child(smi->dev->of_node, "realtek,smi-mdio");
-       if (!mdio_np) {
-               dev_err(smi->dev, "no MDIO bus node\n");
-               return -ENODEV;
-       }
-
-       smi->slave_mii_bus = devm_mdiobus_alloc(smi->dev);
-       if (!smi->slave_mii_bus) {
-               ret = -ENOMEM;
-               goto err_put_node;
-       }
-       smi->slave_mii_bus->priv = smi;
-       smi->slave_mii_bus->name = "SMI slave MII";
-       smi->slave_mii_bus->read = realtek_smi_mdio_read;
-       smi->slave_mii_bus->write = realtek_smi_mdio_write;
-       snprintf(smi->slave_mii_bus->id, MII_BUS_ID_SIZE, "SMI-%d",
-                smi->ds->index);
-       smi->slave_mii_bus->dev.of_node = mdio_np;
-       smi->slave_mii_bus->parent = smi->dev;
-       smi->ds->slave_mii_bus = smi->slave_mii_bus;
-
-       ret = devm_of_mdiobus_register(smi->dev, smi->slave_mii_bus, mdio_np);
-       if (ret) {
-               dev_err(smi->dev, "unable to register MDIO bus %s\n",
-                       smi->slave_mii_bus->id);
-               goto err_put_node;
-       }
-
-       return 0;
-
-err_put_node:
-       of_node_put(mdio_np);
-
-       return ret;
-}
-
-static int realtek_smi_probe(struct platform_device *pdev)
-{
-       const struct realtek_smi_variant *var;
-       struct device *dev = &pdev->dev;
-       struct realtek_smi *smi;
-       struct device_node *np;
-       int ret;
-
-       var = of_device_get_match_data(dev);
-       np = dev->of_node;
-
-       smi = devm_kzalloc(dev, sizeof(*smi) + var->chip_data_sz, GFP_KERNEL);
-       if (!smi)
-               return -ENOMEM;
-       smi->chip_data = (void *)smi + sizeof(*smi);
-       smi->map = devm_regmap_init(dev, NULL, smi,
-                                   &realtek_smi_mdio_regmap_config);
-       if (IS_ERR(smi->map)) {
-               ret = PTR_ERR(smi->map);
-               dev_err(dev, "regmap init failed: %d\n", ret);
-               return ret;
-       }
-
-       /* Link forward and backward */
-       smi->dev = dev;
-       smi->clk_delay = var->clk_delay;
-       smi->cmd_read = var->cmd_read;
-       smi->cmd_write = var->cmd_write;
-       smi->ops = var->ops;
-
-       dev_set_drvdata(dev, smi);
-       spin_lock_init(&smi->lock);
-
-       /* TODO: if power is software controlled, set up any regulators here */
-
-       /* Assert then deassert RESET */
-       smi->reset = devm_gpiod_get_optional(dev, "reset", GPIOD_OUT_HIGH);
-       if (IS_ERR(smi->reset)) {
-               dev_err(dev, "failed to get RESET GPIO\n");
-               return PTR_ERR(smi->reset);
-       }
-       msleep(REALTEK_SMI_HW_STOP_DELAY);
-       gpiod_set_value(smi->reset, 0);
-       msleep(REALTEK_SMI_HW_START_DELAY);
-       dev_info(dev, "deasserted RESET\n");
-
-       /* Fetch MDIO pins */
-       smi->mdc = devm_gpiod_get_optional(dev, "mdc", GPIOD_OUT_LOW);
-       if (IS_ERR(smi->mdc))
-               return PTR_ERR(smi->mdc);
-       smi->mdio = devm_gpiod_get_optional(dev, "mdio", GPIOD_OUT_LOW);
-       if (IS_ERR(smi->mdio))
-               return PTR_ERR(smi->mdio);
-
-       smi->leds_disabled = of_property_read_bool(np, "realtek,disable-leds");
-
-       ret = smi->ops->detect(smi);
-       if (ret) {
-               dev_err(dev, "unable to detect switch\n");
-               return ret;
-       }
-
-       smi->ds = devm_kzalloc(dev, sizeof(*smi->ds), GFP_KERNEL);
-       if (!smi->ds)
-               return -ENOMEM;
-
-       smi->ds->dev = dev;
-       smi->ds->num_ports = smi->num_ports;
-       smi->ds->priv = smi;
-
-       smi->ds->ops = var->ds_ops;
-       ret = dsa_register_switch(smi->ds);
-       if (ret) {
-               dev_err_probe(dev, ret, "unable to register switch\n");
-               return ret;
-       }
-       return 0;
-}
-
-static int realtek_smi_remove(struct platform_device *pdev)
-{
-       struct realtek_smi *smi = platform_get_drvdata(pdev);
-
-       if (!smi)
-               return 0;
-
-       dsa_unregister_switch(smi->ds);
-       if (smi->slave_mii_bus)
-               of_node_put(smi->slave_mii_bus->dev.of_node);
-       gpiod_set_value(smi->reset, 1);
-
-       platform_set_drvdata(pdev, NULL);
-
-       return 0;
-}
-
-static void realtek_smi_shutdown(struct platform_device *pdev)
-{
-       struct realtek_smi *smi = platform_get_drvdata(pdev);
-
-       if (!smi)
-               return;
-
-       dsa_switch_shutdown(smi->ds);
-
-       platform_set_drvdata(pdev, NULL);
-}
-
-static const struct of_device_id realtek_smi_of_match[] = {
-       {
-               .compatible = "realtek,rtl8366rb",
-               .data = &rtl8366rb_variant,
-       },
-       {
-               /* FIXME: add support for RTL8366S and more */
-               .compatible = "realtek,rtl8366s",
-               .data = NULL,
-       },
-       {
-               .compatible = "realtek,rtl8365mb",
-               .data = &rtl8365mb_variant,
-       },
-       { /* sentinel */ },
-};
-MODULE_DEVICE_TABLE(of, realtek_smi_of_match);
-
-static struct platform_driver realtek_smi_driver = {
-       .driver = {
-               .name = "realtek-smi",
-               .of_match_table = of_match_ptr(realtek_smi_of_match),
-       },
-       .probe  = realtek_smi_probe,
-       .remove = realtek_smi_remove,
-       .shutdown = realtek_smi_shutdown,
-};
-module_platform_driver(realtek_smi_driver);
-
-MODULE_LICENSE("GPL");
diff --git a/drivers/net/dsa/realtek/Kconfig b/drivers/net/dsa/realtek/Kconfig
new file mode 100644 (file)
index 0000000..b7427a8
--- /dev/null
@@ -0,0 +1,40 @@
+# SPDX-License-Identifier: GPL-2.0-only
+menuconfig NET_DSA_REALTEK
+       tristate "Realtek Ethernet switch family support"
+       depends on NET_DSA
+       select FIXED_PHY
+       select IRQ_DOMAIN
+       select REALTEK_PHY
+       select REGMAP
+       help
+         Select to enable support for Realtek Ethernet switch chips.
+
+config NET_DSA_REALTEK_MDIO
+       tristate "Realtek MDIO connected switch driver"
+       depends on NET_DSA_REALTEK
+       help
+         Select to enable support for registering switches configured
+         through MDIO.
+
+config NET_DSA_REALTEK_SMI
+       tristate "Realtek SMI connected switch driver"
+       depends on NET_DSA_REALTEK
+       help
+         Select to enable support for registering switches connected
+         through SMI.
+
+config NET_DSA_REALTEK_RTL8365MB
+       tristate "Realtek RTL8365MB switch subdriver"
+       depends on NET_DSA_REALTEK
+       depends on NET_DSA_REALTEK_SMI || NET_DSA_REALTEK_MDIO
+       select NET_DSA_TAG_RTL8_4
+       help
+         Select to enable support for Realtek RTL8365MB-VC and RTL8367S.
+
+config NET_DSA_REALTEK_RTL8366RB
+       tristate "Realtek RTL8366RB switch subdriver"
+       depends on NET_DSA_REALTEK
+       depends on NET_DSA_REALTEK_SMI || NET_DSA_REALTEK_MDIO
+       select NET_DSA_TAG_RTL4_A
+       help
+         Select to enable support for Realtek RTL8366RB.
diff --git a/drivers/net/dsa/realtek/Makefile b/drivers/net/dsa/realtek/Makefile
new file mode 100644 (file)
index 0000000..0aab572
--- /dev/null
@@ -0,0 +1,6 @@
+# SPDX-License-Identifier: GPL-2.0
+obj-$(CONFIG_NET_DSA_REALTEK_MDIO)     += realtek-mdio.o
+obj-$(CONFIG_NET_DSA_REALTEK_SMI)      += realtek-smi.o
+obj-$(CONFIG_NET_DSA_REALTEK_RTL8366RB) += rtl8366.o
+rtl8366-objs                           := rtl8366-core.o rtl8366rb.o
+obj-$(CONFIG_NET_DSA_REALTEK_RTL8365MB) += rtl8365mb.o
diff --git a/drivers/net/dsa/realtek/realtek-mdio.c b/drivers/net/dsa/realtek/realtek-mdio.c
new file mode 100644 (file)
index 0000000..0c5f2bd
--- /dev/null
@@ -0,0 +1,229 @@
+// SPDX-License-Identifier: GPL-2.0+
+/* Realtek MDIO interface driver
+ *
+ * ASICs we intend to support with this driver:
+ *
+ * RTL8366   - The original version, apparently
+ * RTL8369   - Similar enough to have the same datasheet as RTL8366
+ * RTL8366RB - Probably reads out "RTL8366 revision B", has a quite
+ *             different register layout from the other two
+ * RTL8366S  - Is this "RTL8366 super"?
+ * RTL8367   - Has an OpenWRT driver as well
+ * RTL8368S  - Seems to be an alternative name for RTL8366RB
+ * RTL8370   - Also uses SMI
+ *
+ * Copyright (C) 2017 Linus Walleij <linus.walleij@linaro.org>
+ * Copyright (C) 2010 Antti Seppälä <a.seppala@gmail.com>
+ * Copyright (C) 2010 Roman Yeryomin <roman@advem.lv>
+ * Copyright (C) 2011 Colin Leitner <colin.leitner@googlemail.com>
+ * Copyright (C) 2009-2010 Gabor Juhos <juhosg@openwrt.org>
+ */
+
+#include <linux/module.h>
+#include <linux/of_device.h>
+#include <linux/regmap.h>
+
+#include "realtek.h"
+
+/* Read/write via mdiobus */
+#define REALTEK_MDIO_CTRL0_REG         31
+#define REALTEK_MDIO_START_REG         29
+#define REALTEK_MDIO_CTRL1_REG         21
+#define REALTEK_MDIO_ADDRESS_REG       23
+#define REALTEK_MDIO_DATA_WRITE_REG    24
+#define REALTEK_MDIO_DATA_READ_REG     25
+
+#define REALTEK_MDIO_START_OP          0xFFFF
+#define REALTEK_MDIO_ADDR_OP           0x000E
+#define REALTEK_MDIO_READ_OP           0x0001
+#define REALTEK_MDIO_WRITE_OP          0x0003
+
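+/* The switch registers are reached indirectly through a mailbox at the
+ * device's own MDIO address: write the ADDR opcode, then the target
+ * register (and the data for writes), then kick the transaction with a
+ * READ or WRITE opcode, all under the MDIO bus lock.
+ */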
+static int realtek_mdio_write(void *ctx, u32 reg, u32 val)
+{
+       struct realtek_priv *priv = ctx;
+       struct mii_bus *bus = priv->bus;
+       int ret;
+
+       mutex_lock(&bus->mdio_lock);
+
+       ret = bus->write(bus, priv->mdio_addr, REALTEK_MDIO_CTRL0_REG, REALTEK_MDIO_ADDR_OP);
+       if (ret)
+               goto out_unlock;
+
+       ret = bus->write(bus, priv->mdio_addr, REALTEK_MDIO_ADDRESS_REG, reg);
+       if (ret)
+               goto out_unlock;
+
+       ret = bus->write(bus, priv->mdio_addr, REALTEK_MDIO_DATA_WRITE_REG, val);
+       if (ret)
+               goto out_unlock;
+
+       ret = bus->write(bus, priv->mdio_addr, REALTEK_MDIO_CTRL1_REG, REALTEK_MDIO_WRITE_OP);
+
+out_unlock:
+       mutex_unlock(&bus->mdio_lock);
+
+       return ret;
+}
+
+static int realtek_mdio_read(void *ctx, u32 reg, u32 *val)
+{
+       struct realtek_priv *priv = ctx;
+       struct mii_bus *bus = priv->bus;
+       int ret;
+
+       mutex_lock(&bus->mdio_lock);
+
+       ret = bus->write(bus, priv->mdio_addr, REALTEK_MDIO_CTRL0_REG, REALTEK_MDIO_ADDR_OP);
+       if (ret)
+               goto out_unlock;
+
+       ret = bus->write(bus, priv->mdio_addr, REALTEK_MDIO_ADDRESS_REG, reg);
+       if (ret)
+               goto out_unlock;
+
+       ret = bus->write(bus, priv->mdio_addr, REALTEK_MDIO_CTRL1_REG, REALTEK_MDIO_READ_OP);
+       if (ret)
+               goto out_unlock;
+
+       ret = bus->read(bus, priv->mdio_addr, REALTEK_MDIO_DATA_READ_REG);
+       if (ret >= 0) {
+               *val = ret;
+               ret = 0;
+       }
+
+out_unlock:
+       mutex_unlock(&bus->mdio_lock);
+
+       return ret;
+}
+
+static const struct regmap_config realtek_mdio_regmap_config = {
+       .reg_bits = 10, /* A4..A0 R4..R0 */
+       .val_bits = 16,
+       .reg_stride = 1,
+       /* PHY regs are at 0x8000 */
+       .max_register = 0xffff,
+       .reg_format_endian = REGMAP_ENDIAN_BIG,
+       .reg_read = realtek_mdio_read,
+       .reg_write = realtek_mdio_write,
+       .cache_type = REGCACHE_NONE,
+};
+
+static int realtek_mdio_probe(struct mdio_device *mdiodev)
+{
+       struct realtek_priv *priv;
+       struct device *dev = &mdiodev->dev;
+       const struct realtek_variant *var;
+       int ret;
+       struct device_node *np;
+
+       var = of_device_get_match_data(dev);
+       if (!var)
+               return -EINVAL;
+
+       priv = devm_kzalloc(&mdiodev->dev, sizeof(*priv) + var->chip_data_sz,
+                           GFP_KERNEL);
+       if (!priv)
+               return -ENOMEM;
+
+       priv->map = devm_regmap_init(dev, NULL, priv, &realtek_mdio_regmap_config);
+       if (IS_ERR(priv->map)) {
+               ret = PTR_ERR(priv->map);
+               dev_err(dev, "regmap init failed: %d\n", ret);
+               return ret;
+       }
+
+       priv->mdio_addr = mdiodev->addr;
+       priv->bus = mdiodev->bus;
+       priv->dev = &mdiodev->dev;
+       priv->chip_data = (void *)priv + sizeof(*priv);
+
+       priv->clk_delay = var->clk_delay;
+       priv->cmd_read = var->cmd_read;
+       priv->cmd_write = var->cmd_write;
+       priv->ops = var->ops;
+
+       priv->write_reg_noack = realtek_mdio_write;
+
+       np = dev->of_node;
+
+       dev_set_drvdata(dev, priv);
+
+       /* TODO: if power is software controlled, set up any regulators here */
+       priv->leds_disabled = of_property_read_bool(np, "realtek,disable-leds");
+
+       ret = priv->ops->detect(priv);
+       if (ret) {
+               dev_err(dev, "unable to detect switch\n");
+               return ret;
+       }
+
+       priv->ds = devm_kzalloc(dev, sizeof(*priv->ds), GFP_KERNEL);
+       if (!priv->ds)
+               return -ENOMEM;
+
+       priv->ds->dev = dev;
+       priv->ds->num_ports = priv->num_ports;
+       priv->ds->priv = priv;
+       priv->ds->ops = var->ds_ops_mdio;
+
+       ret = dsa_register_switch(priv->ds);
+       if (ret) {
+               dev_err(priv->dev, "unable to register switch: %d\n", ret);
+               return ret;
+       }
+
+       return 0;
+}
+
+static void realtek_mdio_remove(struct mdio_device *mdiodev)
+{
+       struct realtek_priv *priv = dev_get_drvdata(&mdiodev->dev);
+
+       if (!priv)
+               return;
+
+       dsa_unregister_switch(priv->ds);
+
+       dev_set_drvdata(&mdiodev->dev, NULL);
+}
+
+static void realtek_mdio_shutdown(struct mdio_device *mdiodev)
+{
+       struct realtek_priv *priv = dev_get_drvdata(&mdiodev->dev);
+
+       if (!priv)
+               return;
+
+       dsa_switch_shutdown(priv->ds);
+
+       dev_set_drvdata(&mdiodev->dev, NULL);
+}
+
+static const struct of_device_id realtek_mdio_of_match[] = {
+#if IS_ENABLED(CONFIG_NET_DSA_REALTEK_RTL8366RB)
+       { .compatible = "realtek,rtl8366rb", .data = &rtl8366rb_variant, },
+#endif
+#if IS_ENABLED(CONFIG_NET_DSA_REALTEK_RTL8365MB)
+       { .compatible = "realtek,rtl8365mb", .data = &rtl8365mb_variant, },
+       { .compatible = "realtek,rtl8367s", .data = &rtl8365mb_variant, },
+#endif
+       { /* sentinel */ },
+};
+MODULE_DEVICE_TABLE(of, realtek_mdio_of_match);
+
+static struct mdio_driver realtek_mdio_driver = {
+       .mdiodrv.driver = {
+               .name = "realtek-mdio",
+               .of_match_table = of_match_ptr(realtek_mdio_of_match),
+       },
+       .probe  = realtek_mdio_probe,
+       .remove = realtek_mdio_remove,
+       .shutdown = realtek_mdio_shutdown,
+};
+
+mdio_module_driver(realtek_mdio_driver);
+
+MODULE_AUTHOR("Luiz Angelo Daros de Luca <luizluca@gmail.com>");
+MODULE_DESCRIPTION("Driver for Realtek ethernet switch connected via MDIO interface");
+MODULE_LICENSE("GPL");
diff --git a/drivers/net/dsa/realtek/realtek-smi.c b/drivers/net/dsa/realtek/realtek-smi.c
new file mode 100644 (file)
index 0000000..946fbbd
--- /dev/null
@@ -0,0 +1,535 @@
+// SPDX-License-Identifier: GPL-2.0+
+/* Realtek Simple Management Interface (SMI) driver
+ * It can be discussed how "simple" this interface is.
+ *
+ * The SMI protocol piggy-backs on the MDIO MDC and MDIO signal levels,
+ * but the protocol is not MDIO at all. Instead it is a Realtek
+ * peculiarity that needs to bit-bang the lines in a special way to
+ * communicate with the switch.
+ *
+ * ASICs we intend to support with this driver:
+ *
+ * RTL8366   - The original version, apparently
+ * RTL8369   - Similar enough to have the same datasheet as RTL8366
+ * RTL8366RB - Probably reads out "RTL8366 revision B", has a quite
+ *             different register layout from the other two
+ * RTL8366S  - Is this "RTL8366 super"?
+ * RTL8367   - Has an OpenWRT driver as well
+ * RTL8368S  - Seems to be an alternative name for RTL8366RB
+ * RTL8370   - Also uses SMI
+ *
+ * Copyright (C) 2017 Linus Walleij <linus.walleij@linaro.org>
+ * Copyright (C) 2010 Antti Seppälä <a.seppala@gmail.com>
+ * Copyright (C) 2010 Roman Yeryomin <roman@advem.lv>
+ * Copyright (C) 2011 Colin Leitner <colin.leitner@googlemail.com>
+ * Copyright (C) 2009-2010 Gabor Juhos <juhosg@openwrt.org>
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/device.h>
+#include <linux/spinlock.h>
+#include <linux/skbuff.h>
+#include <linux/of.h>
+#include <linux/of_device.h>
+#include <linux/of_mdio.h>
+#include <linux/delay.h>
+#include <linux/gpio/consumer.h>
+#include <linux/platform_device.h>
+#include <linux/regmap.h>
+#include <linux/bitops.h>
+#include <linux/if_bridge.h>
+
+#include "realtek.h"
+
+#define REALTEK_SMI_ACK_RETRY_COUNT            5
+#define REALTEK_SMI_HW_STOP_DELAY              25      /* msecs */
+#define REALTEK_SMI_HW_START_DELAY             100     /* msecs */
+
+static inline void realtek_smi_clk_delay(struct realtek_priv *priv)
+{
+       ndelay(priv->clk_delay);
+}
+
+static void realtek_smi_start(struct realtek_priv *priv)
+{
+       /* Set GPIO pins to output mode, with initial state:
+        * SCK = 0, SDA = 1
+        */
+       gpiod_direction_output(priv->mdc, 0);
+       gpiod_direction_output(priv->mdio, 1);
+       realtek_smi_clk_delay(priv);
+
+       /* CLK 1: 0 -> 1, 1 -> 0 */
+       gpiod_set_value(priv->mdc, 1);
+       realtek_smi_clk_delay(priv);
+       gpiod_set_value(priv->mdc, 0);
+       realtek_smi_clk_delay(priv);
+
+       /* CLK 2: */
+       gpiod_set_value(priv->mdc, 1);
+       realtek_smi_clk_delay(priv);
+       gpiod_set_value(priv->mdio, 0);
+       realtek_smi_clk_delay(priv);
+       gpiod_set_value(priv->mdc, 0);
+       realtek_smi_clk_delay(priv);
+       gpiod_set_value(priv->mdio, 1);
+}
+
+static void realtek_smi_stop(struct realtek_priv *priv)
+{
+       realtek_smi_clk_delay(priv);
+       gpiod_set_value(priv->mdio, 0);
+       gpiod_set_value(priv->mdc, 1);
+       realtek_smi_clk_delay(priv);
+       gpiod_set_value(priv->mdio, 1);
+       realtek_smi_clk_delay(priv);
+       gpiod_set_value(priv->mdc, 1);
+       realtek_smi_clk_delay(priv);
+       gpiod_set_value(priv->mdc, 0);
+       realtek_smi_clk_delay(priv);
+       gpiod_set_value(priv->mdc, 1);
+
+       /* Add a click */
+       realtek_smi_clk_delay(priv);
+       gpiod_set_value(priv->mdc, 0);
+       realtek_smi_clk_delay(priv);
+       gpiod_set_value(priv->mdc, 1);
+
+       /* Set GPIO pins to input mode */
+       gpiod_direction_input(priv->mdio);
+       gpiod_direction_input(priv->mdc);
+}
+
+static void realtek_smi_write_bits(struct realtek_priv *priv, u32 data, u32 len)
+{
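+       /* Bits go out MSB first: bit (len - 1) of data is driven onto
+        * the MDIO line first, then latched by an MDC pulse.
+        */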
+       for (; len > 0; len--) {
+               realtek_smi_clk_delay(priv);
+
+               /* Prepare data */
+               gpiod_set_value(priv->mdio, !!(data & (1 << (len - 1))));
+               realtek_smi_clk_delay(priv);
+
+               /* Clocking */
+               gpiod_set_value(priv->mdc, 1);
+               realtek_smi_clk_delay(priv);
+               gpiod_set_value(priv->mdc, 0);
+       }
+}
+
+static void realtek_smi_read_bits(struct realtek_priv *priv, u32 len, u32 *data)
+{
+       gpiod_direction_input(priv->mdio);
+
+       for (*data = 0; len > 0; len--) {
+               u32 u;
+
+               realtek_smi_clk_delay(priv);
+
+               /* Clocking */
+               gpiod_set_value(priv->mdc, 1);
+               realtek_smi_clk_delay(priv);
+               u = !!gpiod_get_value(priv->mdio);
+               gpiod_set_value(priv->mdc, 0);
+
+               *data |= (u << (len - 1));
+       }
+
+       gpiod_direction_output(priv->mdio, 0);
+}
+
+static int realtek_smi_wait_for_ack(struct realtek_priv *priv)
+{
+       int retry_cnt;
+
+       retry_cnt = 0;
+       do {
+               u32 ack;
+
+               realtek_smi_read_bits(priv, 1, &ack);
+               if (ack == 0)
+                       break;
+
+               if (++retry_cnt > REALTEK_SMI_ACK_RETRY_COUNT) {
+                       dev_err(priv->dev, "ACK timeout\n");
+                       return -ETIMEDOUT;
+               }
+       } while (1);
+
+       return 0;
+}
+
+static int realtek_smi_write_byte(struct realtek_priv *priv, u8 data)
+{
+       realtek_smi_write_bits(priv, data, 8);
+       return realtek_smi_wait_for_ack(priv);
+}
+
+static int realtek_smi_write_byte_noack(struct realtek_priv *priv, u8 data)
+{
+       realtek_smi_write_bits(priv, data, 8);
+       return 0;
+}
+
+static int realtek_smi_read_byte0(struct realtek_priv *priv, u8 *data)
+{
+       u32 t;
+
+       /* Read data */
+       realtek_smi_read_bits(priv, 8, &t);
+       *data = (t & 0xff);
+
+       /* Send an ACK */
+       realtek_smi_write_bits(priv, 0x00, 1);
+
+       return 0;
+}
+
+static int realtek_smi_read_byte1(struct realtek_priv *priv, u8 *data)
+{
+       u32 t;
+
+       /* Read data */
+       realtek_smi_read_bits(priv, 8, &t);
+       *data = (t & 0xff);
+
+       /* Send an ACK */
+       realtek_smi_write_bits(priv, 0x01, 1);
+
+       return 0;
+}
+
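+/* A whole register access is framed by realtek_smi_start() and
+ * realtek_smi_stop(). In between, the host sends a command byte
+ * (priv->cmd_read or priv->cmd_write) and the low then high address
+ * byte, each ACKed by the switch, followed by two data bytes: the
+ * switch ACKs written data, while the host drives the ACK bit for
+ * data it reads back.
+ */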
+static int realtek_smi_read_reg(struct realtek_priv *priv, u32 addr, u32 *data)
+{
+       unsigned long flags;
+       u8 lo = 0;
+       u8 hi = 0;
+       int ret;
+
+       spin_lock_irqsave(&priv->lock, flags);
+
+       realtek_smi_start(priv);
+
+       /* Send READ command */
+       ret = realtek_smi_write_byte(priv, priv->cmd_read);
+       if (ret)
+               goto out;
+
+       /* Set ADDR[7:0] */
+       ret = realtek_smi_write_byte(priv, addr & 0xff);
+       if (ret)
+               goto out;
+
+       /* Set ADDR[15:8] */
+       ret = realtek_smi_write_byte(priv, addr >> 8);
+       if (ret)
+               goto out;
+
+       /* Read DATA[7:0] */
+       realtek_smi_read_byte0(priv, &lo);
+       /* Read DATA[15:8] */
+       realtek_smi_read_byte1(priv, &hi);
+
+       *data = ((u32)lo) | (((u32)hi) << 8);
+
+       ret = 0;
+
+ out:
+       realtek_smi_stop(priv);
+       spin_unlock_irqrestore(&priv->lock, flags);
+
+       return ret;
+}
+
+static int realtek_smi_write_reg(struct realtek_priv *priv,
+                                u32 addr, u32 data, bool ack)
+{
+       unsigned long flags;
+       int ret;
+
+       spin_lock_irqsave(&priv->lock, flags);
+
+       realtek_smi_start(priv);
+
+       /* Send WRITE command */
+       ret = realtek_smi_write_byte(priv, priv->cmd_write);
+       if (ret)
+               goto out;
+
+       /* Set ADDR[7:0] */
+       ret = realtek_smi_write_byte(priv, addr & 0xff);
+       if (ret)
+               goto out;
+
+       /* Set ADDR[15:8] */
+       ret = realtek_smi_write_byte(priv, addr >> 8);
+       if (ret)
+               goto out;
+
+       /* Write DATA[7:0] */
+       ret = realtek_smi_write_byte(priv, data & 0xff);
+       if (ret)
+               goto out;
+
+       /* Write DATA[15:8] */
+       if (ack)
+               ret = realtek_smi_write_byte(priv, data >> 8);
+       else
+               ret = realtek_smi_write_byte_noack(priv, data >> 8);
+       if (ret)
+               goto out;
+
+       ret = 0;
+
+ out:
+       realtek_smi_stop(priv);
+       spin_unlock_irqrestore(&priv->lock, flags);
+
+       return ret;
+}
+
+/* There is one single case where we need to use this accessor and that
+ * is when issuing a soft reset. Since the device resets as soon as we write
+ * that bit, no ACK will come back for natural reasons.
+ */
+static int realtek_smi_write_reg_noack(void *ctx, u32 reg, u32 val)
+{
+       return realtek_smi_write_reg(ctx, reg, val, false);
+}
+
+/* Regmap accessors */
+
+static int realtek_smi_write(void *ctx, u32 reg, u32 val)
+{
+       struct realtek_priv *priv = ctx;
+
+       return realtek_smi_write_reg(priv, reg, val, true);
+}
+
+static int realtek_smi_read(void *ctx, u32 reg, u32 *val)
+{
+       struct realtek_priv *priv = ctx;
+
+       return realtek_smi_read_reg(priv, reg, val);
+}
+
+static const struct regmap_config realtek_smi_mdio_regmap_config = {
+       .reg_bits = 10, /* A4..A0 R4..R0 */
+       .val_bits = 16,
+       .reg_stride = 1,
+       /* PHY regs are at 0x8000 */
+       .max_register = 0xffff,
+       .reg_format_endian = REGMAP_ENDIAN_BIG,
+       .reg_read = realtek_smi_read,
+       .reg_write = realtek_smi_write,
+       .cache_type = REGCACHE_NONE,
+};
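+
+/* With this config in place, chip register access is plain regmap calls
+ * that bit-bang a full SMI transaction underneath. A minimal sketch (the
+ * 0x1300 offset is an assumption, borrowed from the RTL8365MB chip id
+ * register used elsewhere in this driver family):
+ *
+ *     u32 val;
+ *     int ret = regmap_read(priv->map, 0x1300, &val);
+ */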
+
+static int realtek_smi_mdio_read(struct mii_bus *bus, int addr, int regnum)
+{
+       struct realtek_priv *priv = bus->priv;
+
+       return priv->ops->phy_read(priv, addr, regnum);
+}
+
+static int realtek_smi_mdio_write(struct mii_bus *bus, int addr, int regnum,
+                                 u16 val)
+{
+       struct realtek_priv *priv = bus->priv;
+
+       return priv->ops->phy_write(priv, addr, regnum, val);
+}
+
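+/* A minimal sketch of the devicetree layout this function expects.
+ * The compatible strings and GPIO names match the lookups in this
+ * file; the phandles and flags are purely illustrative:
+ *
+ *     switch {
+ *             compatible = "realtek,rtl8366rb";
+ *             mdc-gpios = <&gpio 1 GPIO_ACTIVE_HIGH>;
+ *             mdio-gpios = <&gpio 2 GPIO_ACTIVE_HIGH>;
+ *             reset-gpios = <&gpio 3 GPIO_ACTIVE_LOW>;
+ *
+ *             mdio {
+ *                     compatible = "realtek,smi-mdio";
+ *             };
+ *     };
+ */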
+static int realtek_smi_setup_mdio(struct dsa_switch *ds)
+{
+       struct realtek_priv *priv = ds->priv;
+       struct device_node *mdio_np;
+       int ret;
+
+       mdio_np = of_get_compatible_child(priv->dev->of_node, "realtek,smi-mdio");
+       if (!mdio_np) {
+               dev_err(priv->dev, "no MDIO bus node\n");
+               return -ENODEV;
+       }
+
+       priv->slave_mii_bus = devm_mdiobus_alloc(priv->dev);
+       if (!priv->slave_mii_bus) {
+               ret = -ENOMEM;
+               goto err_put_node;
+       }
+       priv->slave_mii_bus->priv = priv;
+       priv->slave_mii_bus->name = "SMI slave MII";
+       priv->slave_mii_bus->read = realtek_smi_mdio_read;
+       priv->slave_mii_bus->write = realtek_smi_mdio_write;
+       snprintf(priv->slave_mii_bus->id, MII_BUS_ID_SIZE, "SMI-%d",
+                ds->index);
+       priv->slave_mii_bus->dev.of_node = mdio_np;
+       priv->slave_mii_bus->parent = priv->dev;
+       ds->slave_mii_bus = priv->slave_mii_bus;
+
+       ret = devm_of_mdiobus_register(priv->dev, priv->slave_mii_bus, mdio_np);
+       if (ret) {
+               dev_err(priv->dev, "unable to register MDIO bus %s\n",
+                       priv->slave_mii_bus->id);
+               goto err_put_node;
+       }
+
+       return 0;
+
+err_put_node:
+       of_node_put(mdio_np);
+
+       return ret;
+}
+
+static int realtek_smi_probe(struct platform_device *pdev)
+{
+       const struct realtek_variant *var;
+       struct device *dev = &pdev->dev;
+       struct realtek_priv *priv;
+       struct device_node *np;
+       int ret;
+
+       var = of_device_get_match_data(dev);
+       np = dev->of_node;
+
+       priv = devm_kzalloc(dev, sizeof(*priv) + var->chip_data_sz, GFP_KERNEL);
+       if (!priv)
+               return -ENOMEM;
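+       /* The per-chip data lives directly after realtek_priv in the
+        * same allocation, sized per variant via chip_data_sz.
+        */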
+       priv->chip_data = (void *)priv + sizeof(*priv);
+       priv->map = devm_regmap_init(dev, NULL, priv,
+                                    &realtek_smi_mdio_regmap_config);
+       if (IS_ERR(priv->map)) {
+               ret = PTR_ERR(priv->map);
+               dev_err(dev, "regmap init failed: %d\n", ret);
+               return ret;
+       }
+
+       /* Link forward and backward */
+       priv->dev = dev;
+       priv->clk_delay = var->clk_delay;
+       priv->cmd_read = var->cmd_read;
+       priv->cmd_write = var->cmd_write;
+       priv->ops = var->ops;
+
+       priv->setup_interface = realtek_smi_setup_mdio;
+       priv->write_reg_noack = realtek_smi_write_reg_noack;
+
+       dev_set_drvdata(dev, priv);
+       spin_lock_init(&priv->lock);
+
+       /* TODO: if power is software controlled, set up any regulators here */
+
+       /* Assert then deassert RESET */
+       priv->reset = devm_gpiod_get_optional(dev, "reset", GPIOD_OUT_HIGH);
+       if (IS_ERR(priv->reset)) {
+               dev_err(dev, "failed to get RESET GPIO\n");
+               return PTR_ERR(priv->reset);
+       }
+       msleep(REALTEK_SMI_HW_STOP_DELAY);
+       gpiod_set_value(priv->reset, 0);
+       msleep(REALTEK_SMI_HW_START_DELAY);
+       dev_info(dev, "deasserted RESET\n");
+
+       /* Fetch MDIO pins */
+       priv->mdc = devm_gpiod_get_optional(dev, "mdc", GPIOD_OUT_LOW);
+       if (IS_ERR(priv->mdc))
+               return PTR_ERR(priv->mdc);
+       priv->mdio = devm_gpiod_get_optional(dev, "mdio", GPIOD_OUT_LOW);
+       if (IS_ERR(priv->mdio))
+               return PTR_ERR(priv->mdio);
+
+       priv->leds_disabled = of_property_read_bool(np, "realtek,disable-leds");
+
+       ret = priv->ops->detect(priv);
+       if (ret) {
+               dev_err(dev, "unable to detect switch\n");
+               return ret;
+       }
+
+       priv->ds = devm_kzalloc(dev, sizeof(*priv->ds), GFP_KERNEL);
+       if (!priv->ds)
+               return -ENOMEM;
+
+       priv->ds->dev = dev;
+       priv->ds->num_ports = priv->num_ports;
+       priv->ds->priv = priv;
+
+       priv->ds->ops = var->ds_ops_smi;
+       ret = dsa_register_switch(priv->ds);
+       if (ret) {
+               dev_err_probe(dev, ret, "unable to register switch\n");
+               return ret;
+       }
+       return 0;
+}
+
+static int realtek_smi_remove(struct platform_device *pdev)
+{
+       struct realtek_priv *priv = platform_get_drvdata(pdev);
+
+       if (!priv)
+               return 0;
+
+       dsa_unregister_switch(priv->ds);
+       if (priv->slave_mii_bus)
+               of_node_put(priv->slave_mii_bus->dev.of_node);
+       gpiod_set_value(priv->reset, 1);
+
+       platform_set_drvdata(pdev, NULL);
+
+       return 0;
+}
+
+static void realtek_smi_shutdown(struct platform_device *pdev)
+{
+       struct realtek_priv *priv = platform_get_drvdata(pdev);
+
+       if (!priv)
+               return;
+
+       dsa_switch_shutdown(priv->ds);
+
+       platform_set_drvdata(pdev, NULL);
+}
+
+static const struct of_device_id realtek_smi_of_match[] = {
+#if IS_ENABLED(CONFIG_NET_DSA_REALTEK_RTL8366RB)
+       {
+               .compatible = "realtek,rtl8366rb",
+               .data = &rtl8366rb_variant,
+       },
+#endif
+       {
+               /* FIXME: add support for RTL8366S and more */
+               .compatible = "realtek,rtl8366s",
+               .data = NULL,
+       },
+#if IS_ENABLED(CONFIG_NET_DSA_REALTEK_RTL8365MB)
+       {
+               .compatible = "realtek,rtl8365mb",
+               .data = &rtl8365mb_variant,
+       },
+       {
+               .compatible = "realtek,rtl8367s",
+               .data = &rtl8365mb_variant,
+       },
+#endif
+       { /* sentinel */ },
+};
+MODULE_DEVICE_TABLE(of, realtek_smi_of_match);
+
+static struct platform_driver realtek_smi_driver = {
+       .driver = {
+               .name = "realtek-smi",
+               .of_match_table = of_match_ptr(realtek_smi_of_match),
+       },
+       .probe  = realtek_smi_probe,
+       .remove = realtek_smi_remove,
+       .shutdown = realtek_smi_shutdown,
+};
+module_platform_driver(realtek_smi_driver);
+
+MODULE_AUTHOR("Linus Walleij <linus.walleij@linaro.org>");
+MODULE_DESCRIPTION("Driver for Realtek ethernet switch connected via SMI interface");
+MODULE_LICENSE("GPL");
similarity index 55%
rename from drivers/net/dsa/realtek-smi-core.h
rename to drivers/net/dsa/realtek/realtek.h
index 5bfa53e..ed5abf6 100644 (file)
@@ -13,7 +13,7 @@
 #include <linux/gpio/consumer.h>
 #include <net/dsa.h>
 
-struct realtek_smi_ops;
+struct realtek_ops;
 struct dentry;
 struct inode;
 struct file;
@@ -25,7 +25,7 @@ struct rtl8366_mib_counter {
        const char      *name;
 };
 
-/**
+/*
  * struct rtl8366_vlan_mc - Virtual LAN member configuration
  */
 struct rtl8366_vlan_mc {
@@ -43,13 +43,15 @@ struct rtl8366_vlan_4k {
        u8      fid;
 };
 
-struct realtek_smi {
+struct realtek_priv {
        struct device           *dev;
        struct gpio_desc        *reset;
        struct gpio_desc        *mdc;
        struct gpio_desc        *mdio;
        struct regmap           *map;
        struct mii_bus          *slave_mii_bus;
+       struct mii_bus          *bus;
+       int                     mdio_addr;
 
        unsigned int            clk_delay;
        u8                      cmd_read;
@@ -65,7 +67,9 @@ struct realtek_smi {
        unsigned int            num_mib_counters;
        struct rtl8366_mib_counter *mib_counters;
 
-       const struct realtek_smi_ops *ops;
+       const struct realtek_ops *ops;
+       int                     (*setup_interface)(struct dsa_switch *ds);
+       int                     (*write_reg_noack)(void *ctx, u32 addr, u32 data);
 
        int                     vlan_enabled;
        int                     vlan4k_enabled;
@@ -74,61 +78,57 @@ struct realtek_smi {
        void                    *chip_data; /* Per-chip extra variant data */
 };
 
-/**
- * struct realtek_smi_ops - vtable for the per-SMI-chiptype operations
+/*
+ * struct realtek_ops - vtable for the per-SMI-chiptype operations
  * @detect: detects the chiptype
  */
-struct realtek_smi_ops {
-       int     (*detect)(struct realtek_smi *smi);
-       int     (*reset_chip)(struct realtek_smi *smi);
-       int     (*setup)(struct realtek_smi *smi);
-       void    (*cleanup)(struct realtek_smi *smi);
-       int     (*get_mib_counter)(struct realtek_smi *smi,
+struct realtek_ops {
+       int     (*detect)(struct realtek_priv *priv);
+       int     (*reset_chip)(struct realtek_priv *priv);
+       int     (*setup)(struct realtek_priv *priv);
+       void    (*cleanup)(struct realtek_priv *priv);
+       int     (*get_mib_counter)(struct realtek_priv *priv,
                                   int port,
                                   struct rtl8366_mib_counter *mib,
                                   u64 *mibvalue);
-       int     (*get_vlan_mc)(struct realtek_smi *smi, u32 index,
+       int     (*get_vlan_mc)(struct realtek_priv *priv, u32 index,
                               struct rtl8366_vlan_mc *vlanmc);
-       int     (*set_vlan_mc)(struct realtek_smi *smi, u32 index,
+       int     (*set_vlan_mc)(struct realtek_priv *priv, u32 index,
                               const struct rtl8366_vlan_mc *vlanmc);
-       int     (*get_vlan_4k)(struct realtek_smi *smi, u32 vid,
+       int     (*get_vlan_4k)(struct realtek_priv *priv, u32 vid,
                               struct rtl8366_vlan_4k *vlan4k);
-       int     (*set_vlan_4k)(struct realtek_smi *smi,
+       int     (*set_vlan_4k)(struct realtek_priv *priv,
                               const struct rtl8366_vlan_4k *vlan4k);
-       int     (*get_mc_index)(struct realtek_smi *smi, int port, int *val);
-       int     (*set_mc_index)(struct realtek_smi *smi, int port, int index);
-       bool    (*is_vlan_valid)(struct realtek_smi *smi, unsigned int vlan);
-       int     (*enable_vlan)(struct realtek_smi *smi, bool enable);
-       int     (*enable_vlan4k)(struct realtek_smi *smi, bool enable);
-       int     (*enable_port)(struct realtek_smi *smi, int port, bool enable);
-       int     (*phy_read)(struct realtek_smi *smi, int phy, int regnum);
-       int     (*phy_write)(struct realtek_smi *smi, int phy, int regnum,
+       int     (*get_mc_index)(struct realtek_priv *priv, int port, int *val);
+       int     (*set_mc_index)(struct realtek_priv *priv, int port, int index);
+       bool    (*is_vlan_valid)(struct realtek_priv *priv, unsigned int vlan);
+       int     (*enable_vlan)(struct realtek_priv *priv, bool enable);
+       int     (*enable_vlan4k)(struct realtek_priv *priv, bool enable);
+       int     (*enable_port)(struct realtek_priv *priv, int port, bool enable);
+       int     (*phy_read)(struct realtek_priv *priv, int phy, int regnum);
+       int     (*phy_write)(struct realtek_priv *priv, int phy, int regnum,
                             u16 val);
 };
 
-struct realtek_smi_variant {
-       const struct dsa_switch_ops *ds_ops;
-       const struct realtek_smi_ops *ops;
+struct realtek_variant {
+       const struct dsa_switch_ops *ds_ops_smi;
+       const struct dsa_switch_ops *ds_ops_mdio;
+       const struct realtek_ops *ops;
        unsigned int clk_delay;
        u8 cmd_read;
        u8 cmd_write;
        size_t chip_data_sz;
 };
 
-/* SMI core calls */
-int realtek_smi_write_reg_noack(struct realtek_smi *smi, u32 addr,
-                               u32 data);
-int realtek_smi_setup_mdio(struct realtek_smi *smi);
-
 /* RTL8366 library helpers */
-int rtl8366_mc_is_used(struct realtek_smi *smi, int mc_index, int *used);
-int rtl8366_set_vlan(struct realtek_smi *smi, int vid, u32 member,
+int rtl8366_mc_is_used(struct realtek_priv *priv, int mc_index, int *used);
+int rtl8366_set_vlan(struct realtek_priv *priv, int vid, u32 member,
                     u32 untag, u32 fid);
-int rtl8366_set_pvid(struct realtek_smi *smi, unsigned int port,
+int rtl8366_set_pvid(struct realtek_priv *priv, unsigned int port,
                     unsigned int vid);
-int rtl8366_enable_vlan4k(struct realtek_smi *smi, bool enable);
-int rtl8366_enable_vlan(struct realtek_smi *smi, bool enable);
-int rtl8366_reset_vlan(struct realtek_smi *smi);
+int rtl8366_enable_vlan4k(struct realtek_priv *priv, bool enable);
+int rtl8366_enable_vlan(struct realtek_priv *priv, bool enable);
+int rtl8366_reset_vlan(struct realtek_priv *priv);
 int rtl8366_vlan_add(struct dsa_switch *ds, int port,
                     const struct switchdev_obj_port_vlan *vlan,
                     struct netlink_ext_ack *extack);
@@ -139,7 +139,7 @@ void rtl8366_get_strings(struct dsa_switch *ds, int port, u32 stringset,
 int rtl8366_get_sset_count(struct dsa_switch *ds, int port, int sset);
 void rtl8366_get_ethtool_stats(struct dsa_switch *ds, int port, uint64_t *data);
 
-extern const struct realtek_smi_variant rtl8366rb_variant;
-extern const struct realtek_smi_variant rtl8365mb_variant;
+extern const struct realtek_variant rtl8366rb_variant;
+extern const struct realtek_variant rtl8365mb_variant;
 
 #endif /*  _REALTEK_SMI_H */
similarity index 75%
rename from drivers/net/dsa/rtl8365mb.c
rename to drivers/net/dsa/realtek/rtl8365mb.c
index 3b72954..2ed5921 100644 (file)
 #include <linux/regmap.h>
 #include <linux/if_bridge.h>
 
-#include "realtek-smi-core.h"
+#include "realtek.h"
 
 /* Chip-specific data and limits */
-#define RTL8365MB_CHIP_ID_8365MB_VC            0x6367
-#define RTL8365MB_CPU_PORT_NUM_8365MB_VC       6
-#define RTL8365MB_LEARN_LIMIT_MAX_8365MB_VC    2112
+#define RTL8365MB_CHIP_ID_8365MB_VC    0x6367
+#define RTL8365MB_CHIP_VER_8365MB_VC   0x0040
+
+#define RTL8365MB_CHIP_ID_8367S                0x6367
+#define RTL8365MB_CHIP_VER_8367S       0x00A0
+
+#define RTL8365MB_CHIP_ID_8367RB       0x6367
+#define RTL8365MB_CHIP_VER_8367RB      0x0020
 
 /* Family-specific data and limits */
-#define RTL8365MB_PHYADDRMAX   7
-#define RTL8365MB_NUM_PHYREGS  32
-#define RTL8365MB_PHYREGMAX    (RTL8365MB_NUM_PHYREGS - 1)
-#define RTL8365MB_MAX_NUM_PORTS        (RTL8365MB_CPU_PORT_NUM_8365MB_VC + 1)
+#define RTL8365MB_PHYADDRMAX           7
+#define RTL8365MB_NUM_PHYREGS          32
+#define RTL8365MB_PHYREGMAX            (RTL8365MB_NUM_PHYREGS - 1)
+/* RTL8370MB and RTL8310SR, possibly supportable by this driver, have 10 ports */
+#define RTL8365MB_MAX_NUM_PORTS                10
+#define RTL8365MB_LEARN_LIMIT_MAX      2112
+
+/* Valid for all variants with 6 ports or fewer */
+static const int rtl8365mb_extint_port_map[] = { -1, -1, -1, -1, -1, -1, 1, 2, -1, -1 };
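+/* e.g. physical ports 6 and 7 above are external interfaces EXT1 and
+ * EXT2 respectively; -1 marks ports with no external interface.
+ */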
 
 /* Chip identification registers */
 #define RTL8365MB_CHIP_ID_REG          0x1300
 /* The PHY OCP addresses of PHY registers 0~31 start here */
 #define RTL8365MB_PHY_OCP_ADDR_PHYREG_BASE             0xA400
 
-/* EXT port interface mode values - used in DIGITAL_INTERFACE_SELECT */
+/* EXT interface port mode values - used in DIGITAL_INTERFACE_SELECT */
 #define RTL8365MB_EXT_PORT_MODE_DISABLE                0
 #define RTL8365MB_EXT_PORT_MODE_RGMII          1
 #define RTL8365MB_EXT_PORT_MODE_MII_MAC                2
 #define RTL8365MB_EXT_PORT_MODE_1000X          12
 #define RTL8365MB_EXT_PORT_MODE_100FX          13
 
-/* EXT port interface mode configuration registers 0~1 */
-#define RTL8365MB_DIGITAL_INTERFACE_SELECT_REG0                0x1305
-#define RTL8365MB_DIGITAL_INTERFACE_SELECT_REG1                0x13C3
-#define RTL8365MB_DIGITAL_INTERFACE_SELECT_REG(_extport)   \
-               (RTL8365MB_DIGITAL_INTERFACE_SELECT_REG0 + \
-                ((_extport) >> 1) * (0x13C3 - 0x1305))
-#define   RTL8365MB_DIGITAL_INTERFACE_SELECT_MODE_MASK(_extport) \
-               (0xF << (((_extport) % 2)))
-#define   RTL8365MB_DIGITAL_INTERFACE_SELECT_MODE_OFFSET(_extport) \
-               (((_extport) % 2) * 4)
-
-/* EXT port RGMII TX/RX delay configuration registers 1~2 */
-#define RTL8365MB_EXT_RGMXF_REG1               0x1307
-#define RTL8365MB_EXT_RGMXF_REG2               0x13C5
-#define RTL8365MB_EXT_RGMXF_REG(_extport)   \
-               (RTL8365MB_EXT_RGMXF_REG1 + \
-                (((_extport) >> 1) * (0x13C5 - 0x1307)))
+/* Realtek docs and drivers use logic numbers EXT_PORT0=16, EXT_PORT1=17,
+ * EXT_PORT2=18 to interact with switch ports. A logic number is internally
+ * converted to either a physical port number (0..9) or an external interface
+ * id (0..2), depending on which function is called. The external interface
+ * id is calculated as ext_id = logic_port - 15, while the logical to
+ * physical map depends on the chip id/version.
+ *
+ * EXT_PORT0, mentioned in datasheets and the rtl8367c driver, is used in
+ * this driver as ext_id == 1. EXT_PORT2, mentioned in the Realtek rtl8367c
+ * driver for 10-port switches, would have an ext_id of 3 (out of range for
+ * most extint macros), and ext_id 0 does not seem to be used for this
+ * family either.
+ */
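+
+/* Worked example: EXT_PORT1 is logic port 17, so ext_id = 17 - 15 = 2,
+ * and RTL8365MB_DIGITAL_INTERFACE_SELECT_REG(2) below resolves to
+ * register 0x13C3.
+ */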
+
+/* EXT interface mode configuration registers 0~1 */
+#define RTL8365MB_DIGITAL_INTERFACE_SELECT_REG0                0x1305 /* EXT1 */
+#define RTL8365MB_DIGITAL_INTERFACE_SELECT_REG1                0x13C3 /* EXT2 */
+#define RTL8365MB_DIGITAL_INTERFACE_SELECT_REG(_extint) \
+               ((_extint) == 1 ? RTL8365MB_DIGITAL_INTERFACE_SELECT_REG0 : \
+                (_extint) == 2 ? RTL8365MB_DIGITAL_INTERFACE_SELECT_REG1 : \
+                0x0)
+#define   RTL8365MB_DIGITAL_INTERFACE_SELECT_MODE_MASK(_extint) \
+               (0xF << (((_extint) % 2)))
+#define   RTL8365MB_DIGITAL_INTERFACE_SELECT_MODE_OFFSET(_extint) \
+               (((_extint) % 2) * 4)
+
+/* EXT interface RGMII TX/RX delay configuration registers 0~2 */
+#define RTL8365MB_EXT_RGMXF_REG0               0x1306 /* EXT0 */
+#define RTL8365MB_EXT_RGMXF_REG1               0x1307 /* EXT1 */
+#define RTL8365MB_EXT_RGMXF_REG2               0x13C5 /* EXT2 */
+#define RTL8365MB_EXT_RGMXF_REG(_extint) \
+               ((_extint) == 0 ? RTL8365MB_EXT_RGMXF_REG0 : \
+                (_extint) == 1 ? RTL8365MB_EXT_RGMXF_REG1 : \
+                (_extint) == 2 ? RTL8365MB_EXT_RGMXF_REG2 : \
+                0x0)
 #define   RTL8365MB_EXT_RGMXF_RXDELAY_MASK     0x0007
 #define   RTL8365MB_EXT_RGMXF_TXDELAY_MASK     0x0008
 
-/* External port speed values - used in DIGITAL_INTERFACE_FORCE */
+/* External interface port speed values - used in DIGITAL_INTERFACE_FORCE */
 #define RTL8365MB_PORT_SPEED_10M       0
 #define RTL8365MB_PORT_SPEED_100M      1
 #define RTL8365MB_PORT_SPEED_1000M     2
 
-/* EXT port force configuration registers 0~2 */
-#define RTL8365MB_DIGITAL_INTERFACE_FORCE_REG0                 0x1310
-#define RTL8365MB_DIGITAL_INTERFACE_FORCE_REG1                 0x1311
-#define RTL8365MB_DIGITAL_INTERFACE_FORCE_REG2                 0x13C4
-#define RTL8365MB_DIGITAL_INTERFACE_FORCE_REG(_extport)   \
-               (RTL8365MB_DIGITAL_INTERFACE_FORCE_REG0 + \
-                ((_extport) & 0x1) +                     \
-                ((((_extport) >> 1) & 0x1) * (0x13C4 - 0x1310)))
+/* EXT interface force configuration registers 0~2 */
+#define RTL8365MB_DIGITAL_INTERFACE_FORCE_REG0         0x1310 /* EXT0 */
+#define RTL8365MB_DIGITAL_INTERFACE_FORCE_REG1         0x1311 /* EXT1 */
+#define RTL8365MB_DIGITAL_INTERFACE_FORCE_REG2         0x13C4 /* EXT2 */
+#define RTL8365MB_DIGITAL_INTERFACE_FORCE_REG(_extint) \
+               ((_extint) == 0 ? RTL8365MB_DIGITAL_INTERFACE_FORCE_REG0 : \
+                (_extint) == 1 ? RTL8365MB_DIGITAL_INTERFACE_FORCE_REG1 : \
+                (_extint) == 2 ? RTL8365MB_DIGITAL_INTERFACE_FORCE_REG2 : \
+                0x0)
 #define   RTL8365MB_DIGITAL_INTERFACE_FORCE_EN_MASK            0x1000
 #define   RTL8365MB_DIGITAL_INTERFACE_FORCE_NWAY_MASK          0x0080
 #define   RTL8365MB_DIGITAL_INTERFACE_FORCE_TXPAUSE_MASK       0x0040
@@ -516,7 +543,7 @@ struct rtl8365mb_cpu {
 
 /**
  * struct rtl8365mb_port - private per-port data
- * @smi: pointer to parent realtek_smi data
+ * @priv: pointer to parent realtek_priv data
  * @index: DSA port index, same as dsa_port::index
  * @stats: link statistics populated by rtl8365mb_stats_poll, ready for atomic
  *         access via rtl8365mb_get_stats64
@@ -524,7 +551,7 @@ struct rtl8365mb_cpu {
  * @mib_work: delayed work for polling MIB counters
  */
 struct rtl8365mb_port {
-       struct realtek_smi *smi;
+       struct realtek_priv *priv;
        unsigned int index;
        struct rtnl_link_stats64 stats;
        spinlock_t stats_lock;
@@ -533,13 +560,12 @@ struct rtl8365mb_port {
 
 /**
  * struct rtl8365mb - private chip-specific driver data
- * @smi: pointer to parent realtek_smi data
+ * @priv: pointer to parent realtek_priv data
  * @irq: registered IRQ or zero
  * @chip_id: chip identifier
  * @chip_ver: chip silicon revision
  * @port_mask: mask of all ports
  * @learn_limit_max: maximum number of L2 addresses the chip can learn
- * @cpu: CPU tagging and CPU port configuration for this chip
  * @mib_lock: prevent concurrent reads of MIB counters
  * @ports: per-port data
  * @jam_table: chip-specific initialization jam table
@@ -548,29 +574,28 @@ struct rtl8365mb_port {
  * Private data for this driver.
  */
 struct rtl8365mb {
-       struct realtek_smi *smi;
+       struct realtek_priv *priv;
        int irq;
        u32 chip_id;
        u32 chip_ver;
        u32 port_mask;
        u32 learn_limit_max;
-       struct rtl8365mb_cpu cpu;
        struct mutex mib_lock;
        struct rtl8365mb_port ports[RTL8365MB_MAX_NUM_PORTS];
        const struct rtl8365mb_jam_tbl_entry *jam_table;
        size_t jam_size;
 };
 
-static int rtl8365mb_phy_poll_busy(struct realtek_smi *smi)
+static int rtl8365mb_phy_poll_busy(struct realtek_priv *priv)
 {
        u32 val;
 
-       return regmap_read_poll_timeout(smi->map,
+       return regmap_read_poll_timeout(priv->map,
                                        RTL8365MB_INDIRECT_ACCESS_STATUS_REG,
                                        val, !val, 10, 100);
 }
 
-static int rtl8365mb_phy_ocp_prepare(struct realtek_smi *smi, int phy,
+static int rtl8365mb_phy_ocp_prepare(struct realtek_priv *priv, int phy,
                                     u32 ocp_addr)
 {
        u32 val;
@@ -579,7 +604,7 @@ static int rtl8365mb_phy_ocp_prepare(struct realtek_smi *smi, int phy,
        /* Set OCP prefix */
        val = FIELD_GET(RTL8365MB_PHY_OCP_ADDR_PREFIX_MASK, ocp_addr);
        ret = regmap_update_bits(
-               smi->map, RTL8365MB_GPHY_OCP_MSB_0_REG,
+               priv->map, RTL8365MB_GPHY_OCP_MSB_0_REG,
                RTL8365MB_GPHY_OCP_MSB_0_CFG_CPU_OCPADR_MASK,
                FIELD_PREP(RTL8365MB_GPHY_OCP_MSB_0_CFG_CPU_OCPADR_MASK, val));
        if (ret)
@@ -592,7 +617,7 @@ static int rtl8365mb_phy_ocp_prepare(struct realtek_smi *smi, int phy,
                          ocp_addr >> 1);
        val |= FIELD_PREP(RTL8365MB_INDIRECT_ACCESS_ADDRESS_OCPADR_9_6_MASK,
                          ocp_addr >> 6);
-       ret = regmap_write(smi->map, RTL8365MB_INDIRECT_ACCESS_ADDRESS_REG,
+       ret = regmap_write(priv->map, RTL8365MB_INDIRECT_ACCESS_ADDRESS_REG,
                           val);
        if (ret)
                return ret;
@@ -600,17 +625,17 @@ static int rtl8365mb_phy_ocp_prepare(struct realtek_smi *smi, int phy,
        return 0;
 }
 
-static int rtl8365mb_phy_ocp_read(struct realtek_smi *smi, int phy,
+static int rtl8365mb_phy_ocp_read(struct realtek_priv *priv, int phy,
                                  u32 ocp_addr, u16 *data)
 {
        u32 val;
        int ret;
 
-       ret = rtl8365mb_phy_poll_busy(smi);
+       ret = rtl8365mb_phy_poll_busy(priv);
        if (ret)
                return ret;
 
-       ret = rtl8365mb_phy_ocp_prepare(smi, phy, ocp_addr);
+       ret = rtl8365mb_phy_ocp_prepare(priv, phy, ocp_addr);
        if (ret)
                return ret;
 
@@ -619,16 +644,16 @@ static int rtl8365mb_phy_ocp_read(struct realtek_smi *smi, int phy,
                         RTL8365MB_INDIRECT_ACCESS_CTRL_CMD_VALUE) |
              FIELD_PREP(RTL8365MB_INDIRECT_ACCESS_CTRL_RW_MASK,
                         RTL8365MB_INDIRECT_ACCESS_CTRL_RW_READ);
-       ret = regmap_write(smi->map, RTL8365MB_INDIRECT_ACCESS_CTRL_REG, val);
+       ret = regmap_write(priv->map, RTL8365MB_INDIRECT_ACCESS_CTRL_REG, val);
        if (ret)
                return ret;
 
-       ret = rtl8365mb_phy_poll_busy(smi);
+       ret = rtl8365mb_phy_poll_busy(priv);
        if (ret)
                return ret;
 
        /* Get PHY register data */
-       ret = regmap_read(smi->map, RTL8365MB_INDIRECT_ACCESS_READ_DATA_REG,
+       ret = regmap_read(priv->map, RTL8365MB_INDIRECT_ACCESS_READ_DATA_REG,
                          &val);
        if (ret)
                return ret;
@@ -638,22 +663,22 @@ static int rtl8365mb_phy_ocp_read(struct realtek_smi *smi, int phy,
        return 0;
 }
 
-static int rtl8365mb_phy_ocp_write(struct realtek_smi *smi, int phy,
+static int rtl8365mb_phy_ocp_write(struct realtek_priv *priv, int phy,
                                   u32 ocp_addr, u16 data)
 {
        u32 val;
        int ret;
 
-       ret = rtl8365mb_phy_poll_busy(smi);
+       ret = rtl8365mb_phy_poll_busy(priv);
        if (ret)
                return ret;
 
-       ret = rtl8365mb_phy_ocp_prepare(smi, phy, ocp_addr);
+       ret = rtl8365mb_phy_ocp_prepare(priv, phy, ocp_addr);
        if (ret)
                return ret;
 
        /* Set PHY register data */
-       ret = regmap_write(smi->map, RTL8365MB_INDIRECT_ACCESS_WRITE_DATA_REG,
+       ret = regmap_write(priv->map, RTL8365MB_INDIRECT_ACCESS_WRITE_DATA_REG,
                           data);
        if (ret)
                return ret;
@@ -663,18 +688,18 @@ static int rtl8365mb_phy_ocp_write(struct realtek_smi *smi, int phy,
                         RTL8365MB_INDIRECT_ACCESS_CTRL_CMD_VALUE) |
              FIELD_PREP(RTL8365MB_INDIRECT_ACCESS_CTRL_RW_MASK,
                         RTL8365MB_INDIRECT_ACCESS_CTRL_RW_WRITE);
-       ret = regmap_write(smi->map, RTL8365MB_INDIRECT_ACCESS_CTRL_REG, val);
+       ret = regmap_write(priv->map, RTL8365MB_INDIRECT_ACCESS_CTRL_REG, val);
        if (ret)
                return ret;
 
-       ret = rtl8365mb_phy_poll_busy(smi);
+       ret = rtl8365mb_phy_poll_busy(priv);
        if (ret)
                return ret;
 
        return 0;
 }
 
-static int rtl8365mb_phy_read(struct realtek_smi *smi, int phy, int regnum)
+static int rtl8365mb_phy_read(struct realtek_priv *priv, int phy, int regnum)
 {
        u32 ocp_addr;
        u16 val;
@@ -688,21 +713,21 @@ static int rtl8365mb_phy_read(struct realtek_smi *smi, int phy, int regnum)
 
        ocp_addr = RTL8365MB_PHY_OCP_ADDR_PHYREG_BASE + regnum * 2;
 
-       ret = rtl8365mb_phy_ocp_read(smi, phy, ocp_addr, &val);
+       ret = rtl8365mb_phy_ocp_read(priv, phy, ocp_addr, &val);
        if (ret) {
-               dev_err(smi->dev,
+               dev_err(priv->dev,
                        "failed to read PHY%d reg %02x @ %04x, ret %d\n", phy,
                        regnum, ocp_addr, ret);
                return ret;
        }
 
-       dev_dbg(smi->dev, "read PHY%d register 0x%02x @ %04x, val <- %04x\n",
+       dev_dbg(priv->dev, "read PHY%d register 0x%02x @ %04x, val <- %04x\n",
                phy, regnum, ocp_addr, val);
 
        return val;
 }
 
-static int rtl8365mb_phy_write(struct realtek_smi *smi, int phy, int regnum,
+static int rtl8365mb_phy_write(struct realtek_priv *priv, int phy, int regnum,
                               u16 val)
 {
        u32 ocp_addr;
@@ -716,20 +741,31 @@ static int rtl8365mb_phy_write(struct realtek_smi *smi, int phy, int regnum,
 
        ocp_addr = RTL8365MB_PHY_OCP_ADDR_PHYREG_BASE + regnum * 2;
 
-       ret = rtl8365mb_phy_ocp_write(smi, phy, ocp_addr, val);
+       ret = rtl8365mb_phy_ocp_write(priv, phy, ocp_addr, val);
        if (ret) {
-               dev_err(smi->dev,
+               dev_err(priv->dev,
                        "failed to write PHY%d reg %02x @ %04x, ret %d\n", phy,
                        regnum, ocp_addr, ret);
                return ret;
        }
 
-       dev_dbg(smi->dev, "write PHY%d register 0x%02x @ %04x, val -> %04x\n",
+       dev_dbg(priv->dev, "write PHY%d register 0x%02x @ %04x, val -> %04x\n",
                phy, regnum, ocp_addr, val);
 
        return 0;
 }
 
+static int rtl8365mb_dsa_phy_read(struct dsa_switch *ds, int phy, int regnum)
+{
+       return rtl8365mb_phy_read(ds->priv, phy, regnum);
+}
+
+static int rtl8365mb_dsa_phy_write(struct dsa_switch *ds, int phy, int regnum,
+                                  u16 val)
+{
+       return rtl8365mb_phy_write(ds->priv, phy, regnum, val);
+}
+
 static enum dsa_tag_protocol
 rtl8365mb_get_tag_protocol(struct dsa_switch *ds, int port,
                           enum dsa_tag_protocol mp)
@@ -737,25 +773,25 @@ rtl8365mb_get_tag_protocol(struct dsa_switch *ds, int port,
        return DSA_TAG_PROTO_RTL8_4;
 }
 
-static int rtl8365mb_ext_config_rgmii(struct realtek_smi *smi, int port,
+static int rtl8365mb_ext_config_rgmii(struct realtek_priv *priv, int port,
                                      phy_interface_t interface)
 {
        struct device_node *dn;
        struct dsa_port *dp;
        int tx_delay = 0;
        int rx_delay = 0;
-       int ext_port;
+       int ext_int;
        u32 val;
        int ret;
 
-       if (port == smi->cpu_port) {
-               ext_port = 1;
-       } else {
-               dev_err(smi->dev, "only one EXT port is currently supported\n");
+       ext_int = rtl8365mb_extint_port_map[port];
+
+       if (ext_int <= 0) {
+               dev_err(priv->dev, "Port %d is not an external interface port\n", port);
                return -EINVAL;
        }
 
-       dp = dsa_to_port(smi->ds, port);
+       dp = dsa_to_port(priv->ds, port);
        dn = dp->dn;
 
        /* Set the RGMII TX/RX delay
@@ -786,8 +822,8 @@ static int rtl8365mb_ext_config_rgmii(struct realtek_smi *smi, int port,
                if (val == 0 || val == 2)
                        tx_delay = val / 2;
                else
-                       dev_warn(smi->dev,
-                                "EXT port TX delay must be 0 or 2 ns\n");
+                       dev_warn(priv->dev,
+                                "EXT interface TX delay must be 0 or 2 ns\n");
        }
 
        if (!of_property_read_u32(dn, "rx-internal-delay-ps", &val)) {
@@ -796,12 +832,12 @@ static int rtl8365mb_ext_config_rgmii(struct realtek_smi *smi, int port,
                if (val <= 7)
                        rx_delay = val;
                else
-                       dev_warn(smi->dev,
-                                "EXT port RX delay must be 0 to 2.1 ns\n");
+                       dev_warn(priv->dev,
+                                "EXT interface RX delay must be 0 to 2.1 ns\n");
        }
 
        ret = regmap_update_bits(
-               smi->map, RTL8365MB_EXT_RGMXF_REG(ext_port),
+               priv->map, RTL8365MB_EXT_RGMXF_REG(ext_int),
                RTL8365MB_EXT_RGMXF_TXDELAY_MASK |
                        RTL8365MB_EXT_RGMXF_RXDELAY_MASK,
                FIELD_PREP(RTL8365MB_EXT_RGMXF_TXDELAY_MASK, tx_delay) |
@@ -810,18 +846,18 @@ static int rtl8365mb_ext_config_rgmii(struct realtek_smi *smi, int port,
                return ret;
 
        ret = regmap_update_bits(
-               smi->map, RTL8365MB_DIGITAL_INTERFACE_SELECT_REG(ext_port),
-               RTL8365MB_DIGITAL_INTERFACE_SELECT_MODE_MASK(ext_port),
+               priv->map, RTL8365MB_DIGITAL_INTERFACE_SELECT_REG(ext_int),
+               RTL8365MB_DIGITAL_INTERFACE_SELECT_MODE_MASK(ext_int),
                RTL8365MB_EXT_PORT_MODE_RGMII
                        << RTL8365MB_DIGITAL_INTERFACE_SELECT_MODE_OFFSET(
-                                  ext_port));
+                                  ext_int));
        if (ret)
                return ret;
 
        return 0;
 }
 
-static int rtl8365mb_ext_config_forcemode(struct realtek_smi *smi, int port,
+static int rtl8365mb_ext_config_forcemode(struct realtek_priv *priv, int port,
                                          bool link, int speed, int duplex,
                                          bool tx_pause, bool rx_pause)
 {
@@ -830,14 +866,14 @@ static int rtl8365mb_ext_config_forcemode(struct realtek_smi *smi, int port,
        u32 r_duplex;
        u32 r_speed;
        u32 r_link;
-       int ext_port;
+       int ext_int;
        int val;
        int ret;
 
-       if (port == smi->cpu_port) {
-               ext_port = 1;
-       } else {
-               dev_err(smi->dev, "only one EXT port is currently supported\n");
+       ext_int = rtl8365mb_extint_port_map[port];
+
+       if (ext_int <= 0) {
+               dev_err(priv->dev, "Port %d is not an external interface port\n", port);
                return -EINVAL;
        }
 
@@ -854,7 +890,7 @@ static int rtl8365mb_ext_config_forcemode(struct realtek_smi *smi, int port,
                } else if (speed == SPEED_10) {
                        r_speed = RTL8365MB_PORT_SPEED_10M;
                } else {
-                       dev_err(smi->dev, "unsupported port speed %s\n",
+                       dev_err(priv->dev, "unsupported port speed %s\n",
                                phy_speed_to_str(speed));
                        return -EINVAL;
                }
@@ -864,7 +900,7 @@ static int rtl8365mb_ext_config_forcemode(struct realtek_smi *smi, int port,
                } else if (duplex == DUPLEX_HALF) {
                        r_duplex = 0;
                } else {
-                       dev_err(smi->dev, "unsupported duplex %s\n",
+                       dev_err(priv->dev, "unsupported duplex %s\n",
                                phy_duplex_to_str(duplex));
                        return -EINVAL;
                }
@@ -886,8 +922,8 @@ static int rtl8365mb_ext_config_forcemode(struct realtek_smi *smi, int port,
              FIELD_PREP(RTL8365MB_DIGITAL_INTERFACE_FORCE_DUPLEX_MASK,
                         r_duplex) |
              FIELD_PREP(RTL8365MB_DIGITAL_INTERFACE_FORCE_SPEED_MASK, r_speed);
-       ret = regmap_write(smi->map,
-                          RTL8365MB_DIGITAL_INTERFACE_FORCE_REG(ext_port),
+       ret = regmap_write(priv->map,
+                          RTL8365MB_DIGITAL_INTERFACE_FORCE_REG(ext_int),
                           val);
        if (ret)
                return ret;
@@ -898,13 +934,17 @@ static int rtl8365mb_ext_config_forcemode(struct realtek_smi *smi, int port,
 static bool rtl8365mb_phy_mode_supported(struct dsa_switch *ds, int port,
                                         phy_interface_t interface)
 {
-       if (dsa_is_user_port(ds, port) &&
+       int ext_int;
+
+       ext_int = rtl8365mb_extint_port_map[port];
+
+       if (ext_int < 0 &&
            (interface == PHY_INTERFACE_MODE_NA ||
             interface == PHY_INTERFACE_MODE_INTERNAL ||
             interface == PHY_INTERFACE_MODE_GMII))
                /* Internal PHY */
                return true;
-       else if (dsa_is_cpu_port(ds, port) &&
+       else if ((ext_int >= 1) &&
                 phy_interface_mode_is_rgmii(interface))
                /* Extension MAC */
                return true;
@@ -912,65 +952,43 @@ static bool rtl8365mb_phy_mode_supported(struct dsa_switch *ds, int port,
        return false;
 }
 
-static void rtl8365mb_phylink_validate(struct dsa_switch *ds, int port,
-                                      unsigned long *supported,
-                                      struct phylink_link_state *state)
+static void rtl8365mb_phylink_get_caps(struct dsa_switch *ds, int port,
+                                      struct phylink_config *config)
 {
-       struct realtek_smi *smi = ds->priv;
-       __ETHTOOL_DECLARE_LINK_MODE_MASK(mask) = { 0 };
-
-       /* include/linux/phylink.h says:
-        *     When @state->interface is %PHY_INTERFACE_MODE_NA, phylink
-        *     expects the MAC driver to return all supported link modes.
-        */
-       if (state->interface != PHY_INTERFACE_MODE_NA &&
-           !rtl8365mb_phy_mode_supported(ds, port, state->interface)) {
-               dev_err(smi->dev, "phy mode %s is unsupported on port %d\n",
-                       phy_modes(state->interface), port);
-               linkmode_zero(supported);
-               return;
-       }
-
-       phylink_set_port_modes(mask);
-
-       phylink_set(mask, Autoneg);
-       phylink_set(mask, Pause);
-       phylink_set(mask, Asym_Pause);
-
-       phylink_set(mask, 10baseT_Half);
-       phylink_set(mask, 10baseT_Full);
-       phylink_set(mask, 100baseT_Half);
-       phylink_set(mask, 100baseT_Full);
-       phylink_set(mask, 1000baseT_Full);
-
-       linkmode_and(supported, supported, mask);
-       linkmode_and(state->advertising, state->advertising, mask);
+       if (dsa_is_user_port(ds, port))
+               __set_bit(PHY_INTERFACE_MODE_INTERNAL,
+                         config->supported_interfaces);
+       else if (dsa_is_cpu_port(ds, port))
+               phy_interface_set_rgmii(config->supported_interfaces);
+
+       config->mac_capabilities = MAC_SYM_PAUSE | MAC_ASYM_PAUSE |
+                                  MAC_10 | MAC_100 | MAC_1000FD;
 }
 
 static void rtl8365mb_phylink_mac_config(struct dsa_switch *ds, int port,
                                         unsigned int mode,
                                         const struct phylink_link_state *state)
 {
-       struct realtek_smi *smi = ds->priv;
+       struct realtek_priv *priv = ds->priv;
        int ret;
 
        if (!rtl8365mb_phy_mode_supported(ds, port, state->interface)) {
-               dev_err(smi->dev, "phy mode %s is unsupported on port %d\n",
+               dev_err(priv->dev, "phy mode %s is unsupported on port %d\n",
                        phy_modes(state->interface), port);
                return;
        }
 
        if (mode != MLO_AN_PHY && mode != MLO_AN_FIXED) {
-               dev_err(smi->dev,
+               dev_err(priv->dev,
                        "port %d supports only conventional PHY or fixed-link\n",
                        port);
                return;
        }
 
        if (phy_interface_mode_is_rgmii(state->interface)) {
-               ret = rtl8365mb_ext_config_rgmii(smi, port, state->interface);
+               ret = rtl8365mb_ext_config_rgmii(priv, port, state->interface);
                if (ret)
-                       dev_err(smi->dev,
+                       dev_err(priv->dev,
                                "failed to configure RGMII mode on port %d: %d\n",
                                port, ret);
                return;
@@ -985,20 +1003,20 @@ static void rtl8365mb_phylink_mac_link_down(struct dsa_switch *ds, int port,
                                            unsigned int mode,
                                            phy_interface_t interface)
 {
-       struct realtek_smi *smi = ds->priv;
+       struct realtek_priv *priv = ds->priv;
        struct rtl8365mb_port *p;
        struct rtl8365mb *mb;
        int ret;
 
-       mb = smi->chip_data;
+       mb = priv->chip_data;
        p = &mb->ports[port];
        cancel_delayed_work_sync(&p->mib_work);
 
        if (phy_interface_mode_is_rgmii(interface)) {
-               ret = rtl8365mb_ext_config_forcemode(smi, port, false, 0, 0,
+               ret = rtl8365mb_ext_config_forcemode(priv, port, false, 0, 0,
                                                     false, false);
                if (ret)
-                       dev_err(smi->dev,
+                       dev_err(priv->dev,
                                "failed to reset forced mode on port %d: %d\n",
                                port, ret);
 
@@ -1013,21 +1031,21 @@ static void rtl8365mb_phylink_mac_link_up(struct dsa_switch *ds, int port,
                                          int duplex, bool tx_pause,
                                          bool rx_pause)
 {
-       struct realtek_smi *smi = ds->priv;
+       struct realtek_priv *priv = ds->priv;
        struct rtl8365mb_port *p;
        struct rtl8365mb *mb;
        int ret;
 
-       mb = smi->chip_data;
+       mb = priv->chip_data;
        p = &mb->ports[port];
        schedule_delayed_work(&p->mib_work, 0);
 
        if (phy_interface_mode_is_rgmii(interface)) {
-               ret = rtl8365mb_ext_config_forcemode(smi, port, true, speed,
+               ret = rtl8365mb_ext_config_forcemode(priv, port, true, speed,
                                                     duplex, tx_pause,
                                                     rx_pause);
                if (ret)
-                       dev_err(smi->dev,
+                       dev_err(priv->dev,
                                "failed to force mode on port %d: %d\n", port,
                                ret);
 
@@ -1038,7 +1056,7 @@ static void rtl8365mb_phylink_mac_link_up(struct dsa_switch *ds, int port,
 static void rtl8365mb_port_stp_state_set(struct dsa_switch *ds, int port,
                                         u8 state)
 {
-       struct realtek_smi *smi = ds->priv;
+       struct realtek_priv *priv = ds->priv;
        enum rtl8365mb_stp_state val;
        int msti = 0;
 
@@ -1057,36 +1075,36 @@ static void rtl8365mb_port_stp_state_set(struct dsa_switch *ds, int port,
                val = RTL8365MB_STP_STATE_FORWARDING;
                break;
        default:
-               dev_err(smi->dev, "invalid STP state: %u\n", state);
+               dev_err(priv->dev, "invalid STP state: %u\n", state);
                return;
        }
 
-       regmap_update_bits(smi->map, RTL8365MB_MSTI_CTRL_REG(msti, port),
+       regmap_update_bits(priv->map, RTL8365MB_MSTI_CTRL_REG(msti, port),
                           RTL8365MB_MSTI_CTRL_PORT_STATE_MASK(port),
                           val << RTL8365MB_MSTI_CTRL_PORT_STATE_OFFSET(port));
 }
 
-static int rtl8365mb_port_set_learning(struct realtek_smi *smi, int port,
+static int rtl8365mb_port_set_learning(struct realtek_priv *priv, int port,
                                       bool enable)
 {
-       struct rtl8365mb *mb = smi->chip_data;
+       struct rtl8365mb *mb = priv->chip_data;
 
        /* Enable/disable learning by limiting the number of L2 addresses the
         * port can learn. Realtek documentation states that a limit of zero
         * disables learning. When enabling learning, set it to the chip's
         * maximum.
         */
-       return regmap_write(smi->map, RTL8365MB_LUT_PORT_LEARN_LIMIT_REG(port),
+       return regmap_write(priv->map, RTL8365MB_LUT_PORT_LEARN_LIMIT_REG(port),
                            enable ? mb->learn_limit_max : 0);
 }
 
-static int rtl8365mb_port_set_isolation(struct realtek_smi *smi, int port,
+static int rtl8365mb_port_set_isolation(struct realtek_priv *priv, int port,
                                        u32 mask)
 {
-       return regmap_write(smi->map, RTL8365MB_PORT_ISOLATION_REG(port), mask);
+       return regmap_write(priv->map, RTL8365MB_PORT_ISOLATION_REG(port), mask);
 }
 
-static int rtl8365mb_mib_counter_read(struct realtek_smi *smi, int port,
+static int rtl8365mb_mib_counter_read(struct realtek_priv *priv, int port,
                                      u32 offset, u32 length, u64 *mibvalue)
 {
        u64 tmpvalue = 0;
@@ -1098,13 +1116,13 @@ static int rtl8365mb_mib_counter_read(struct realtek_smi *smi, int port,
         * and then poll the control register before reading the value from some
         * counter registers.
         */
-       ret = regmap_write(smi->map, RTL8365MB_MIB_ADDRESS_REG,
+       ret = regmap_write(priv->map, RTL8365MB_MIB_ADDRESS_REG,
                           RTL8365MB_MIB_ADDRESS(port, offset));
        if (ret)
                return ret;
 
        /* Poll for completion */
-       ret = regmap_read_poll_timeout(smi->map, RTL8365MB_MIB_CTRL0_REG, val,
+       ret = regmap_read_poll_timeout(priv->map, RTL8365MB_MIB_CTRL0_REG, val,
                                       !(val & RTL8365MB_MIB_CTRL0_BUSY_MASK),
                                       10, 100);
        if (ret)
@@ -1126,7 +1144,7 @@ static int rtl8365mb_mib_counter_read(struct realtek_smi *smi, int port,
 
        /* Read the MIB counter 16 bits at a time */
        for (i = 0; i < length; i++) {
-               ret = regmap_read(smi->map,
+               ret = regmap_read(priv->map,
                                  RTL8365MB_MIB_COUNTER_REG(offset - i), &val);
                if (ret)
                        return ret;
@@ -1142,21 +1160,21 @@ static int rtl8365mb_mib_counter_read(struct realtek_smi *smi, int port,
 
 static void rtl8365mb_get_ethtool_stats(struct dsa_switch *ds, int port, u64 *data)
 {
-       struct realtek_smi *smi = ds->priv;
+       struct realtek_priv *priv = ds->priv;
        struct rtl8365mb *mb;
        int ret;
        int i;
 
-       mb = smi->chip_data;
+       mb = priv->chip_data;
 
        mutex_lock(&mb->mib_lock);
        for (i = 0; i < RTL8365MB_MIB_END; i++) {
                struct rtl8365mb_mib_counter *mib = &rtl8365mb_mib_counters[i];
 
-               ret = rtl8365mb_mib_counter_read(smi, port, mib->offset,
+               ret = rtl8365mb_mib_counter_read(priv, port, mib->offset,
                                                 mib->length, &data[i]);
                if (ret) {
-                       dev_err(smi->dev,
+                       dev_err(priv->dev,
                                "failed to read port %d counters: %d\n", port,
                                ret);
                        break;
@@ -1190,15 +1208,15 @@ static int rtl8365mb_get_sset_count(struct dsa_switch *ds, int port, int sset)
 static void rtl8365mb_get_phy_stats(struct dsa_switch *ds, int port,
                                    struct ethtool_eth_phy_stats *phy_stats)
 {
-       struct realtek_smi *smi = ds->priv;
+       struct realtek_priv *priv = ds->priv;
        struct rtl8365mb_mib_counter *mib;
        struct rtl8365mb *mb;
 
-       mb = smi->chip_data;
+       mb = priv->chip_data;
        mib = &rtl8365mb_mib_counters[RTL8365MB_MIB_dot3StatsSymbolErrors];
 
        mutex_lock(&mb->mib_lock);
-       rtl8365mb_mib_counter_read(smi, port, mib->offset, mib->length,
+       rtl8365mb_mib_counter_read(priv, port, mib->offset, mib->length,
                                   &phy_stats->SymbolErrorDuringCarrier);
        mutex_unlock(&mb->mib_lock);
 }
@@ -1226,12 +1244,12 @@ static void rtl8365mb_get_mac_stats(struct dsa_switch *ds, int port,
                [RTL8365MB_MIB_dot3StatsExcessiveCollisions] = 1,
 
        };
-       struct realtek_smi *smi = ds->priv;
+       struct realtek_priv *priv = ds->priv;
        struct rtl8365mb *mb;
        int ret;
        int i;
 
-       mb = smi->chip_data;
+       mb = priv->chip_data;
 
        mutex_lock(&mb->mib_lock);
        for (i = 0; i < RTL8365MB_MIB_END; i++) {
@@ -1241,7 +1259,7 @@ static void rtl8365mb_get_mac_stats(struct dsa_switch *ds, int port,
                if (!cnt[i])
                        continue;
 
-               ret = rtl8365mb_mib_counter_read(smi, port, mib->offset,
+               ret = rtl8365mb_mib_counter_read(priv, port, mib->offset,
                                                 mib->length, &cnt[i]);
                if (ret)
                        break;
@@ -1291,20 +1309,20 @@ static void rtl8365mb_get_mac_stats(struct dsa_switch *ds, int port,
 static void rtl8365mb_get_ctrl_stats(struct dsa_switch *ds, int port,
                                     struct ethtool_eth_ctrl_stats *ctrl_stats)
 {
-       struct realtek_smi *smi = ds->priv;
+       struct realtek_priv *priv = ds->priv;
        struct rtl8365mb_mib_counter *mib;
        struct rtl8365mb *mb;
 
-       mb = smi->chip_data;
+       mb = priv->chip_data;
        mib = &rtl8365mb_mib_counters[RTL8365MB_MIB_dot3ControlInUnknownOpcodes];
 
        mutex_lock(&mb->mib_lock);
-       rtl8365mb_mib_counter_read(smi, port, mib->offset, mib->length,
+       rtl8365mb_mib_counter_read(priv, port, mib->offset, mib->length,
                                   &ctrl_stats->UnsupportedOpcodesReceived);
        mutex_unlock(&mb->mib_lock);
 }
 
-static void rtl8365mb_stats_update(struct realtek_smi *smi, int port)
+static void rtl8365mb_stats_update(struct realtek_priv *priv, int port)
 {
        u64 cnt[RTL8365MB_MIB_END] = {
                [RTL8365MB_MIB_ifOutOctets] = 1,
@@ -1323,7 +1341,7 @@ static void rtl8365mb_stats_update(struct realtek_smi *smi, int port)
                [RTL8365MB_MIB_dot3StatsFCSErrors] = 1,
                [RTL8365MB_MIB_dot3StatsLateCollisions] = 1,
        };
-       struct rtl8365mb *mb = smi->chip_data;
+       struct rtl8365mb *mb = priv->chip_data;
        struct rtnl_link_stats64 *stats;
        int ret;
        int i;
@@ -1338,7 +1356,7 @@ static void rtl8365mb_stats_update(struct realtek_smi *smi, int port)
                if (!cnt[i])
                        continue;
 
-               ret = rtl8365mb_mib_counter_read(smi, port, c->offset,
+               ret = rtl8365mb_mib_counter_read(priv, port, c->offset,
                                                 c->length, &cnt[i]);
                if (ret)
                        break;
@@ -1388,9 +1406,9 @@ static void rtl8365mb_stats_poll(struct work_struct *work)
        struct rtl8365mb_port *p = container_of(to_delayed_work(work),
                                                struct rtl8365mb_port,
                                                mib_work);
-       struct realtek_smi *smi = p->smi;
+       struct realtek_priv *priv = p->priv;
 
-       rtl8365mb_stats_update(smi, p->index);
+       rtl8365mb_stats_update(priv, p->index);
 
        schedule_delayed_work(&p->mib_work, RTL8365MB_STATS_INTERVAL_JIFFIES);
 }
@@ -1398,11 +1416,11 @@ static void rtl8365mb_stats_poll(struct work_struct *work)
 static void rtl8365mb_get_stats64(struct dsa_switch *ds, int port,
                                  struct rtnl_link_stats64 *s)
 {
-       struct realtek_smi *smi = ds->priv;
+       struct realtek_priv *priv = ds->priv;
        struct rtl8365mb_port *p;
        struct rtl8365mb *mb;
 
-       mb = smi->chip_data;
+       mb = priv->chip_data;
        p = &mb->ports[port];
 
        spin_lock(&p->stats_lock);
@@ -1410,9 +1428,9 @@ static void rtl8365mb_get_stats64(struct dsa_switch *ds, int port,
        spin_unlock(&p->stats_lock);
 }
 
-static void rtl8365mb_stats_setup(struct realtek_smi *smi)
+static void rtl8365mb_stats_setup(struct realtek_priv *priv)
 {
-       struct rtl8365mb *mb = smi->chip_data;
+       struct rtl8365mb *mb = priv->chip_data;
        int i;
 
        /* Per-chip global mutex to protect MIB counter access, since doing
@@ -1420,10 +1438,10 @@ static void rtl8365mb_stats_setup(struct realtek_smi *smi)
         */
        mutex_init(&mb->mib_lock);
 
-       for (i = 0; i < smi->num_ports; i++) {
+       for (i = 0; i < priv->num_ports; i++) {
                struct rtl8365mb_port *p = &mb->ports[i];
 
-               if (dsa_is_unused_port(smi->ds, i))
+               if (dsa_is_unused_port(priv->ds, i))
                        continue;
 
                /* Per-port spinlock to protect the stats64 data */
@@ -1436,45 +1454,45 @@ static void rtl8365mb_stats_setup(struct realtek_smi *smi)
        }
 }
 
-static void rtl8365mb_stats_teardown(struct realtek_smi *smi)
+static void rtl8365mb_stats_teardown(struct realtek_priv *priv)
 {
-       struct rtl8365mb *mb = smi->chip_data;
+       struct rtl8365mb *mb = priv->chip_data;
        int i;
 
-       for (i = 0; i < smi->num_ports; i++) {
+       for (i = 0; i < priv->num_ports; i++) {
                struct rtl8365mb_port *p = &mb->ports[i];
 
-               if (dsa_is_unused_port(smi->ds, i))
+               if (dsa_is_unused_port(priv->ds, i))
                        continue;
 
                cancel_delayed_work_sync(&p->mib_work);
        }
 }
 
-static int rtl8365mb_get_and_clear_status_reg(struct realtek_smi *smi, u32 reg,
+static int rtl8365mb_get_and_clear_status_reg(struct realtek_priv *priv, u32 reg,
                                              u32 *val)
 {
        int ret;
 
-       ret = regmap_read(smi->map, reg, val);
+       ret = regmap_read(priv->map, reg, val);
        if (ret)
                return ret;
 
-       return regmap_write(smi->map, reg, *val);
+       return regmap_write(priv->map, reg, *val);
 }
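
A note on the read-then-write-back pattern above: writing back exactly the value just read acknowledges only the status bits that were observed set (presumably write-1-to-clear semantics), so an event arriving between the read and the write is not lost.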
 
 static irqreturn_t rtl8365mb_irq(int irq, void *data)
 {
-       struct realtek_smi *smi = data;
+       struct realtek_priv *priv = data;
        unsigned long line_changes = 0;
        struct rtl8365mb *mb;
        u32 stat;
        int line;
        int ret;
 
-       mb = smi->chip_data;
+       mb = priv->chip_data;
 
-       ret = rtl8365mb_get_and_clear_status_reg(smi, RTL8365MB_INTR_STATUS_REG,
+       ret = rtl8365mb_get_and_clear_status_reg(priv, RTL8365MB_INTR_STATUS_REG,
                                                 &stat);
        if (ret)
                goto out_error;
@@ -1485,14 +1503,14 @@ static irqreturn_t rtl8365mb_irq(int irq, void *data)
                u32 val;
 
                ret = rtl8365mb_get_and_clear_status_reg(
-                       smi, RTL8365MB_PORT_LINKUP_IND_REG, &val);
+                       priv, RTL8365MB_PORT_LINKUP_IND_REG, &val);
                if (ret)
                        goto out_error;
 
                linkup_ind = FIELD_GET(RTL8365MB_PORT_LINKUP_IND_MASK, val);
 
                ret = rtl8365mb_get_and_clear_status_reg(
-                       smi, RTL8365MB_PORT_LINKDOWN_IND_REG, &val);
+                       priv, RTL8365MB_PORT_LINKDOWN_IND_REG, &val);
                if (ret)
                        goto out_error;
 
@@ -1504,8 +1522,8 @@ static irqreturn_t rtl8365mb_irq(int irq, void *data)
        if (!line_changes)
                goto out_none;
 
-       for_each_set_bit(line, &line_changes, smi->num_ports) {
-               int child_irq = irq_find_mapping(smi->irqdomain, line);
+       for_each_set_bit(line, &line_changes, priv->num_ports) {
+               int child_irq = irq_find_mapping(priv->irqdomain, line);
 
                handle_nested_irq(child_irq);
        }
@@ -1513,7 +1531,7 @@ static irqreturn_t rtl8365mb_irq(int irq, void *data)
        return IRQ_HANDLED;
 
 out_error:
-       dev_err(smi->dev, "failed to read interrupt status: %d\n", ret);
+       dev_err(priv->dev, "failed to read interrupt status: %d\n", ret);
 
 out_none:
        return IRQ_NONE;
@@ -1548,27 +1566,27 @@ static const struct irq_domain_ops rtl8365mb_irqdomain_ops = {
        .xlate = irq_domain_xlate_onecell,
 };
 
-static int rtl8365mb_set_irq_enable(struct realtek_smi *smi, bool enable)
+static int rtl8365mb_set_irq_enable(struct realtek_priv *priv, bool enable)
 {
-       return regmap_update_bits(smi->map, RTL8365MB_INTR_CTRL_REG,
+       return regmap_update_bits(priv->map, RTL8365MB_INTR_CTRL_REG,
                                  RTL8365MB_INTR_LINK_CHANGE_MASK,
                                  FIELD_PREP(RTL8365MB_INTR_LINK_CHANGE_MASK,
                                             enable ? 1 : 0));
 }
 
-static int rtl8365mb_irq_enable(struct realtek_smi *smi)
+static int rtl8365mb_irq_enable(struct realtek_priv *priv)
 {
-       return rtl8365mb_set_irq_enable(smi, true);
+       return rtl8365mb_set_irq_enable(priv, true);
 }
 
-static int rtl8365mb_irq_disable(struct realtek_smi *smi)
+static int rtl8365mb_irq_disable(struct realtek_priv *priv)
 {
-       return rtl8365mb_set_irq_enable(smi, false);
+       return rtl8365mb_set_irq_enable(priv, false);
 }
 
-static int rtl8365mb_irq_setup(struct realtek_smi *smi)
+static int rtl8365mb_irq_setup(struct realtek_priv *priv)
 {
-       struct rtl8365mb *mb = smi->chip_data;
+       struct rtl8365mb *mb = priv->chip_data;
        struct device_node *intc;
        u32 irq_trig;
        int virq;
@@ -1577,9 +1595,9 @@ static int rtl8365mb_irq_setup(struct realtek_smi *smi)
        int ret;
        int i;
 
-       intc = of_get_child_by_name(smi->dev->of_node, "interrupt-controller");
+       intc = of_get_child_by_name(priv->dev->of_node, "interrupt-controller");
        if (!intc) {
-               dev_err(smi->dev, "missing child interrupt-controller node\n");
+               dev_err(priv->dev, "missing child interrupt-controller node\n");
                return -EINVAL;
        }
 
@@ -1587,24 +1605,24 @@ static int rtl8365mb_irq_setup(struct realtek_smi *smi)
        irq = of_irq_get(intc, 0);
        if (irq <= 0) {
                if (irq != -EPROBE_DEFER)
-                       dev_err(smi->dev, "failed to get parent irq: %d\n",
+                       dev_err(priv->dev, "failed to get parent irq: %d\n",
                                irq);
                ret = irq ? irq : -EINVAL;
                goto out_put_node;
        }
 
-       smi->irqdomain = irq_domain_add_linear(intc, smi->num_ports,
-                                              &rtl8365mb_irqdomain_ops, smi);
-       if (!smi->irqdomain) {
-               dev_err(smi->dev, "failed to add irq domain\n");
+       priv->irqdomain = irq_domain_add_linear(intc, priv->num_ports,
+                                               &rtl8365mb_irqdomain_ops, priv);
+       if (!priv->irqdomain) {
+               dev_err(priv->dev, "failed to add irq domain\n");
                ret = -ENOMEM;
                goto out_put_node;
        }
 
-       for (i = 0; i < smi->num_ports; i++) {
-               virq = irq_create_mapping(smi->irqdomain, i);
+       for (i = 0; i < priv->num_ports; i++) {
+               virq = irq_create_mapping(priv->irqdomain, i);
                if (!virq) {
-                       dev_err(smi->dev,
+                       dev_err(priv->dev,
                                "failed to create irq domain mapping\n");
                        ret = -EINVAL;
                        goto out_remove_irqdomain;
@@ -1625,40 +1643,40 @@ static int rtl8365mb_irq_setup(struct realtek_smi *smi)
                val = RTL8365MB_INTR_POLARITY_LOW;
                break;
        default:
-               dev_err(smi->dev, "unsupported irq trigger type %u\n",
+               dev_err(priv->dev, "unsupported irq trigger type %u\n",
                        irq_trig);
                ret = -EINVAL;
                goto out_remove_irqdomain;
        }
 
-       ret = regmap_update_bits(smi->map, RTL8365MB_INTR_POLARITY_REG,
+       ret = regmap_update_bits(priv->map, RTL8365MB_INTR_POLARITY_REG,
                                 RTL8365MB_INTR_POLARITY_MASK,
                                 FIELD_PREP(RTL8365MB_INTR_POLARITY_MASK, val));
        if (ret)
                goto out_remove_irqdomain;
 
        /* Disable the interrupt in case the chip has it enabled on reset */
-       ret = rtl8365mb_irq_disable(smi);
+       ret = rtl8365mb_irq_disable(priv);
        if (ret)
                goto out_remove_irqdomain;
 
        /* Clear the interrupt status register */
-       ret = regmap_write(smi->map, RTL8365MB_INTR_STATUS_REG,
+       ret = regmap_write(priv->map, RTL8365MB_INTR_STATUS_REG,
                           RTL8365MB_INTR_ALL_MASK);
        if (ret)
                goto out_remove_irqdomain;
 
        ret = request_threaded_irq(irq, NULL, rtl8365mb_irq, IRQF_ONESHOT,
-                                  "rtl8365mb", smi);
+                                  "rtl8365mb", priv);
        if (ret) {
-               dev_err(smi->dev, "failed to request irq: %d\n", ret);
+               dev_err(priv->dev, "failed to request irq: %d\n", ret);
                goto out_remove_irqdomain;
        }
 
        /* Store the irq so that we know to free it during teardown */
        mb->irq = irq;
 
-       ret = rtl8365mb_irq_enable(smi);
+       ret = rtl8365mb_irq_enable(priv);
        if (ret)
                goto out_free_irq;
 
@@ -1667,17 +1685,17 @@ static int rtl8365mb_irq_setup(struct realtek_smi *smi)
        return 0;
 
 out_free_irq:
-       free_irq(mb->irq, smi);
+       free_irq(mb->irq, priv);
        mb->irq = 0;
 
 out_remove_irqdomain:
-       for (i = 0; i < smi->num_ports; i++) {
-               virq = irq_find_mapping(smi->irqdomain, i);
+       for (i = 0; i < priv->num_ports; i++) {
+               virq = irq_find_mapping(priv->irqdomain, i);
                irq_dispose_mapping(virq);
        }
 
-       irq_domain_remove(smi->irqdomain);
-       smi->irqdomain = NULL;
+       irq_domain_remove(priv->irqdomain);
+       priv->irqdomain = NULL;
 
 out_put_node:
        of_node_put(intc);
@@ -1685,36 +1703,34 @@ out_put_node:
        return ret;
 }
 
-static void rtl8365mb_irq_teardown(struct realtek_smi *smi)
+static void rtl8365mb_irq_teardown(struct realtek_priv *priv)
 {
-       struct rtl8365mb *mb = smi->chip_data;
+       struct rtl8365mb *mb = priv->chip_data;
        int virq;
        int i;
 
        if (mb->irq) {
-               free_irq(mb->irq, smi);
+               free_irq(mb->irq, priv);
                mb->irq = 0;
        }
 
-       if (smi->irqdomain) {
-               for (i = 0; i < smi->num_ports; i++) {
-                       virq = irq_find_mapping(smi->irqdomain, i);
+       if (priv->irqdomain) {
+               for (i = 0; i < priv->num_ports; i++) {
+                       virq = irq_find_mapping(priv->irqdomain, i);
                        irq_dispose_mapping(virq);
                }
 
-               irq_domain_remove(smi->irqdomain);
-               smi->irqdomain = NULL;
+               irq_domain_remove(priv->irqdomain);
+               priv->irqdomain = NULL;
        }
 }
 
-static int rtl8365mb_cpu_config(struct realtek_smi *smi)
+static int rtl8365mb_cpu_config(struct realtek_priv *priv, const struct rtl8365mb_cpu *cpu)
 {
-       struct rtl8365mb *mb = smi->chip_data;
-       struct rtl8365mb_cpu *cpu = &mb->cpu;
        u32 val;
        int ret;
 
-       ret = regmap_update_bits(smi->map, RTL8365MB_CPU_PORT_MASK_REG,
+       ret = regmap_update_bits(priv->map, RTL8365MB_CPU_PORT_MASK_REG,
                                 RTL8365MB_CPU_PORT_MASK_MASK,
                                 FIELD_PREP(RTL8365MB_CPU_PORT_MASK_MASK,
                                            cpu->mask));
@@ -1726,26 +1742,26 @@ static int rtl8365mb_cpu_config(struct realtek_smi *smi)
              FIELD_PREP(RTL8365MB_CPU_CTRL_TAG_POSITION_MASK, cpu->position) |
              FIELD_PREP(RTL8365MB_CPU_CTRL_RXBYTECOUNT_MASK, cpu->rx_length) |
              FIELD_PREP(RTL8365MB_CPU_CTRL_TAG_FORMAT_MASK, cpu->format) |
-             FIELD_PREP(RTL8365MB_CPU_CTRL_TRAP_PORT_MASK, cpu->trap_port) |
+             FIELD_PREP(RTL8365MB_CPU_CTRL_TRAP_PORT_MASK, cpu->trap_port & 0x7) |
              FIELD_PREP(RTL8365MB_CPU_CTRL_TRAP_PORT_EXT_MASK,
-                        cpu->trap_port >> 3);
-       ret = regmap_write(smi->map, RTL8365MB_CPU_CTRL_REG, val);
+                        cpu->trap_port >> 3 & 0x1);
+       ret = regmap_write(priv->map, RTL8365MB_CPU_CTRL_REG, val);
        if (ret)
                return ret;
 
        return 0;
 }
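
The new trap-port masking is easiest to see with a worked example: trap_port = 10 encodes as (10 & 0x7) = 2 in the TRAP_PORT field plus (10 >> 3) & 0x1 = 1 in the TRAP_PORT_EXT field, while trap_port = 6 yields 6 and 0. The extension bit is what lets a trap port above 7 be expressed now that the driver handles up to RTL8365MB_MAX_NUM_PORTS ports.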
 
-static int rtl8365mb_switch_init(struct realtek_smi *smi)
+static int rtl8365mb_switch_init(struct realtek_priv *priv)
 {
-       struct rtl8365mb *mb = smi->chip_data;
+       struct rtl8365mb *mb = priv->chip_data;
        int ret;
        int i;
 
        /* Do any chip-specific init jam before getting to the common stuff */
        if (mb->jam_table) {
                for (i = 0; i < mb->jam_size; i++) {
-                       ret = regmap_write(smi->map, mb->jam_table[i].reg,
+                       ret = regmap_write(priv->map, mb->jam_table[i].reg,
                                           mb->jam_table[i].val);
                        if (ret)
                                return ret;
@@ -1754,7 +1770,7 @@ static int rtl8365mb_switch_init(struct realtek_smi *smi)
 
        /* Common init jam */
        for (i = 0; i < ARRAY_SIZE(rtl8365mb_init_jam_common); i++) {
-               ret = regmap_write(smi->map, rtl8365mb_init_jam_common[i].reg,
+               ret = regmap_write(priv->map, rtl8365mb_init_jam_common[i].reg,
                                   rtl8365mb_init_jam_common[i].val);
                if (ret)
                        return ret;
@@ -1763,75 +1779,86 @@ static int rtl8365mb_switch_init(struct realtek_smi *smi)
        return 0;
 }
 
-static int rtl8365mb_reset_chip(struct realtek_smi *smi)
+static int rtl8365mb_reset_chip(struct realtek_priv *priv)
 {
        u32 val;
 
-       realtek_smi_write_reg_noack(smi, RTL8365MB_CHIP_RESET_REG,
-                                   FIELD_PREP(RTL8365MB_CHIP_RESET_HW_MASK,
-                                              1));
+       priv->write_reg_noack(priv, RTL8365MB_CHIP_RESET_REG,
+                             FIELD_PREP(RTL8365MB_CHIP_RESET_HW_MASK, 1));
 
        /* Realtek documentation says the chip needs 1 second to reset. Sleep
         * for 100 ms before accessing any registers to prevent ACK timeouts.
         */
        msleep(100);
-       return regmap_read_poll_timeout(smi->map, RTL8365MB_CHIP_RESET_REG, val,
+       return regmap_read_poll_timeout(priv->map, RTL8365MB_CHIP_RESET_REG, val,
                                        !(val & RTL8365MB_CHIP_RESET_HW_MASK),
                                        20000, 1e6);
 }
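
For reference, the regmap_read_poll_timeout() call above re-reads RTL8365MB_CHIP_RESET_REG every 20 ms (20000 us) until the hardware reset bit self-clears, returning -ETIMEDOUT if that takes longer than 1 s (1e6 us).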
 
 static int rtl8365mb_setup(struct dsa_switch *ds)
 {
-       struct realtek_smi *smi = ds->priv;
+       struct realtek_priv *priv = ds->priv;
+       struct rtl8365mb_cpu cpu = {0};
+       struct dsa_port *cpu_dp;
        struct rtl8365mb *mb;
        int ret;
        int i;
 
-       mb = smi->chip_data;
+       mb = priv->chip_data;
 
-       ret = rtl8365mb_reset_chip(smi);
+       ret = rtl8365mb_reset_chip(priv);
        if (ret) {
-               dev_err(smi->dev, "failed to reset chip: %d\n", ret);
+               dev_err(priv->dev, "failed to reset chip: %d\n", ret);
                goto out_error;
        }
 
        /* Configure switch to vendor-defined initial state */
-       ret = rtl8365mb_switch_init(smi);
+       ret = rtl8365mb_switch_init(priv);
        if (ret) {
-               dev_err(smi->dev, "failed to initialize switch: %d\n", ret);
+               dev_err(priv->dev, "failed to initialize switch: %d\n", ret);
                goto out_error;
        }
 
        /* Set up cascading IRQs */
-       ret = rtl8365mb_irq_setup(smi);
+       ret = rtl8365mb_irq_setup(priv);
        if (ret == -EPROBE_DEFER)
                return ret;
        else if (ret)
-               dev_info(smi->dev, "no interrupt support\n");
+               dev_info(priv->dev, "no interrupt support\n");
 
        /* Configure CPU tagging */
-       ret = rtl8365mb_cpu_config(smi);
+       cpu.trap_port = RTL8365MB_MAX_NUM_PORTS;
+       dsa_switch_for_each_cpu_port(cpu_dp, priv->ds) {
+               cpu.mask |= BIT(cpu_dp->index);
+
+               if (cpu.trap_port == RTL8365MB_MAX_NUM_PORTS)
+                       cpu.trap_port = cpu_dp->index;
+       }
+
+       cpu.enable = cpu.mask > 0;
+       cpu.insert = RTL8365MB_CPU_INSERT_TO_ALL;
+       cpu.position = RTL8365MB_CPU_POS_AFTER_SA;
+       cpu.rx_length = RTL8365MB_CPU_RXLEN_64BYTES;
+       cpu.format = RTL8365MB_CPU_FORMAT_8BYTES;
+
+       ret = rtl8365mb_cpu_config(priv, &cpu);
        if (ret)
                goto out_teardown_irq;
 
        /* Configure ports */
-       for (i = 0; i < smi->num_ports; i++) {
+       for (i = 0; i < priv->num_ports; i++) {
                struct rtl8365mb_port *p = &mb->ports[i];
 
-               if (dsa_is_unused_port(smi->ds, i))
+               if (dsa_is_unused_port(priv->ds, i))
                        continue;
 
-               /* Set up per-port private data */
-               p->smi = smi;
-               p->index = i;
-
                /* Forward only to the CPU */
-               ret = rtl8365mb_port_set_isolation(smi, i, BIT(smi->cpu_port));
+               ret = rtl8365mb_port_set_isolation(priv, i, cpu.mask);
                if (ret)
                        goto out_teardown_irq;
 
                /* Disable learning */
-               ret = rtl8365mb_port_set_learning(smi, i, false);
+               ret = rtl8365mb_port_set_learning(priv, i, false);
                if (ret)
                        goto out_teardown_irq;
 
@@ -1839,29 +1866,35 @@ static int rtl8365mb_setup(struct dsa_switch *ds)
                 * ports will still forward frames to the CPU despite being
                 * administratively down by default.
                 */
-               rtl8365mb_port_stp_state_set(smi->ds, i, BR_STATE_DISABLED);
+               rtl8365mb_port_stp_state_set(priv->ds, i, BR_STATE_DISABLED);
+
+               /* Set up per-port private data */
+               p->priv = priv;
+               p->index = i;
        }
 
        /* Set maximum packet length to 1536 bytes */
-       ret = regmap_update_bits(smi->map, RTL8365MB_CFG0_MAX_LEN_REG,
+       ret = regmap_update_bits(priv->map, RTL8365MB_CFG0_MAX_LEN_REG,
                                 RTL8365MB_CFG0_MAX_LEN_MASK,
                                 FIELD_PREP(RTL8365MB_CFG0_MAX_LEN_MASK, 1536));
        if (ret)
                goto out_teardown_irq;
 
-       ret = realtek_smi_setup_mdio(smi);
-       if (ret) {
-               dev_err(smi->dev, "could not set up MDIO bus\n");
-               goto out_teardown_irq;
+       if (priv->setup_interface) {
+               ret = priv->setup_interface(ds);
+               if (ret) {
+                       dev_err(priv->dev, "could not set up MDIO bus\n");
+                       goto out_teardown_irq;
+               }
        }
 
        /* Start statistics counter polling */
-       rtl8365mb_stats_setup(smi);
+       rtl8365mb_stats_setup(priv);
 
        return 0;
 
 out_teardown_irq:
-       rtl8365mb_irq_teardown(smi);
+       rtl8365mb_irq_teardown(priv);
 
 out_error:
        return ret;
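
The indirect calls above (priv->write_reg_noack, priv->setup_interface) are the seam between this chip driver and the SMI/MDIO back ends introduced by the split. A sketch of the hook fields, with types inferred from the call sites in this file rather than copied from realtek.h:

#include <linux/types.h>
#include <net/dsa.h>

/* Inferred from the call sites above -- the real declarations live in
 * realtek.h and may differ in detail.
 */
struct realtek_priv_hooks_sketch {
        int (*write_reg_noack)(void *ctx, u32 reg, u32 val);
        int (*setup_interface)(struct dsa_switch *ds);
};

The no-ack write variant exists presumably because a chip in mid-reset cannot ACK the transaction, while setup_interface lets each back end register the user MDIO bus in its own way.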
@@ -1869,10 +1902,10 @@ out_error:
 
 static void rtl8365mb_teardown(struct dsa_switch *ds)
 {
-       struct realtek_smi *smi = ds->priv;
+       struct realtek_priv *priv = ds->priv;
 
-       rtl8365mb_stats_teardown(smi);
-       rtl8365mb_irq_teardown(smi);
+       rtl8365mb_stats_teardown(priv);
+       rtl8365mb_irq_teardown(priv);
 }
 
 static int rtl8365mb_get_chip_id_and_ver(struct regmap *map, u32 *id, u32 *ver)
@@ -1902,48 +1935,57 @@ static int rtl8365mb_get_chip_id_and_ver(struct regmap *map, u32 *id, u32 *ver)
        return 0;
 }
 
-static int rtl8365mb_detect(struct realtek_smi *smi)
+static int rtl8365mb_detect(struct realtek_priv *priv)
 {
-       struct rtl8365mb *mb = smi->chip_data;
+       struct rtl8365mb *mb = priv->chip_data;
        u32 chip_id;
        u32 chip_ver;
        int ret;
 
-       ret = rtl8365mb_get_chip_id_and_ver(smi->map, &chip_id, &chip_ver);
+       ret = rtl8365mb_get_chip_id_and_ver(priv->map, &chip_id, &chip_ver);
        if (ret) {
-               dev_err(smi->dev, "failed to read chip id and version: %d\n",
+               dev_err(priv->dev, "failed to read chip id and version: %d\n",
                        ret);
                return ret;
        }
 
        switch (chip_id) {
        case RTL8365MB_CHIP_ID_8365MB_VC:
-               dev_info(smi->dev,
-                        "found an RTL8365MB-VC switch (ver=0x%04x)\n",
-                        chip_ver);
+               switch (chip_ver) {
+               case RTL8365MB_CHIP_VER_8365MB_VC:
+                       dev_info(priv->dev,
+                                "found an RTL8365MB-VC switch (ver=0x%04x)\n",
+                                chip_ver);
+                       break;
+               case RTL8365MB_CHIP_VER_8367RB:
+                       dev_info(priv->dev,
+                                "found an RTL8367RB-VB switch (ver=0x%04x)\n",
+                                chip_ver);
+                       break;
+               case RTL8365MB_CHIP_VER_8367S:
+                       dev_info(priv->dev,
+                                "found an RTL8367S switch (ver=0x%04x)\n",
+                                chip_ver);
+                       break;
+               default:
+                       dev_err(priv->dev, "unrecognized switch version (ver=0x%04x)\n",
+                               chip_ver);
+                       return -ENODEV;
+               }
 
-               smi->cpu_port = RTL8365MB_CPU_PORT_NUM_8365MB_VC;
-               smi->num_ports = smi->cpu_port + 1;
+               priv->num_ports = RTL8365MB_MAX_NUM_PORTS;
 
-               mb->smi = smi;
+               mb->priv = priv;
                mb->chip_id = chip_id;
                mb->chip_ver = chip_ver;
-               mb->port_mask = BIT(smi->num_ports) - 1;
-               mb->learn_limit_max = RTL8365MB_LEARN_LIMIT_MAX_8365MB_VC;
+               mb->port_mask = GENMASK(priv->num_ports - 1, 0);
+               mb->learn_limit_max = RTL8365MB_LEARN_LIMIT_MAX;
                mb->jam_table = rtl8365mb_init_jam_8365mb_vc;
                mb->jam_size = ARRAY_SIZE(rtl8365mb_init_jam_8365mb_vc);
 
-               mb->cpu.enable = 1;
-               mb->cpu.mask = BIT(smi->cpu_port);
-               mb->cpu.trap_port = smi->cpu_port;
-               mb->cpu.insert = RTL8365MB_CPU_INSERT_TO_ALL;
-               mb->cpu.position = RTL8365MB_CPU_POS_AFTER_SA;
-               mb->cpu.rx_length = RTL8365MB_CPU_RXLEN_64BYTES;
-               mb->cpu.format = RTL8365MB_CPU_FORMAT_8BYTES;
-
                break;
        default:
-               dev_err(smi->dev,
+               dev_err(priv->dev,
                        "found an unknown Realtek switch (id=0x%04x, ver=0x%04x)\n",
                        chip_id, chip_ver);
                return -ENODEV;
@@ -1952,14 +1994,34 @@ static int rtl8365mb_detect(struct realtek_smi *smi)
        return 0;
 }
 
-static const struct dsa_switch_ops rtl8365mb_switch_ops = {
+static const struct dsa_switch_ops rtl8365mb_switch_ops_smi = {
+       .get_tag_protocol = rtl8365mb_get_tag_protocol,
+       .setup = rtl8365mb_setup,
+       .teardown = rtl8365mb_teardown,
+       .phylink_get_caps = rtl8365mb_phylink_get_caps,
+       .phylink_mac_config = rtl8365mb_phylink_mac_config,
+       .phylink_mac_link_down = rtl8365mb_phylink_mac_link_down,
+       .phylink_mac_link_up = rtl8365mb_phylink_mac_link_up,
+       .port_stp_state_set = rtl8365mb_port_stp_state_set,
+       .get_strings = rtl8365mb_get_strings,
+       .get_ethtool_stats = rtl8365mb_get_ethtool_stats,
+       .get_sset_count = rtl8365mb_get_sset_count,
+       .get_eth_phy_stats = rtl8365mb_get_phy_stats,
+       .get_eth_mac_stats = rtl8365mb_get_mac_stats,
+       .get_eth_ctrl_stats = rtl8365mb_get_ctrl_stats,
+       .get_stats64 = rtl8365mb_get_stats64,
+};
+
+static const struct dsa_switch_ops rtl8365mb_switch_ops_mdio = {
        .get_tag_protocol = rtl8365mb_get_tag_protocol,
        .setup = rtl8365mb_setup,
        .teardown = rtl8365mb_teardown,
-       .phylink_validate = rtl8365mb_phylink_validate,
+       .phylink_get_caps = rtl8365mb_phylink_get_caps,
        .phylink_mac_config = rtl8365mb_phylink_mac_config,
        .phylink_mac_link_down = rtl8365mb_phylink_mac_link_down,
        .phylink_mac_link_up = rtl8365mb_phylink_mac_link_up,
+       .phy_read = rtl8365mb_dsa_phy_read,
+       .phy_write = rtl8365mb_dsa_phy_write,
        .port_stp_state_set = rtl8365mb_port_stp_state_set,
        .get_strings = rtl8365mb_get_strings,
        .get_ethtool_stats = rtl8365mb_get_ethtool_stats,
@@ -1970,18 +2032,23 @@ static const struct dsa_switch_ops rtl8365mb_switch_ops = {
        .get_stats64 = rtl8365mb_get_stats64,
 };
 
-static const struct realtek_smi_ops rtl8365mb_smi_ops = {
+static const struct realtek_ops rtl8365mb_ops = {
        .detect = rtl8365mb_detect,
        .phy_read = rtl8365mb_phy_read,
        .phy_write = rtl8365mb_phy_write,
 };
 
-const struct realtek_smi_variant rtl8365mb_variant = {
-       .ds_ops = &rtl8365mb_switch_ops,
-       .ops = &rtl8365mb_smi_ops,
+const struct realtek_variant rtl8365mb_variant = {
+       .ds_ops_smi = &rtl8365mb_switch_ops_smi,
+       .ds_ops_mdio = &rtl8365mb_switch_ops_mdio,
+       .ops = &rtl8365mb_ops,
        .clk_delay = 10,
        .cmd_read = 0xb9,
        .cmd_write = 0xb8,
        .chip_data_sz = sizeof(struct rtl8365mb),
 };
 EXPORT_SYMBOL_GPL(rtl8365mb_variant);
+
+MODULE_AUTHOR("Alvin Å ipraga <alsi@bang-olufsen.dk>");
+MODULE_DESCRIPTION("Driver for RTL8365MB-VC ethernet switch");
+MODULE_LICENSE("GPL");
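
Both ops tables end up in the variant; presumably the interface-specific probe installs whichever one matches its bus. A hypothetical sketch (the helper name, the using_mdio flag and the variant field layout are all assumptions, not the actual realtek-smi.c/realtek-mdio.c code):

/* Hypothetical helper, for illustration only */
static void realtek_install_ds_ops(struct realtek_priv *priv, bool using_mdio)
{
        priv->ds->ops = using_mdio ? priv->variant->ds_ops_mdio
                                   : priv->variant->ds_ops_smi;
}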
similarity index 61%
rename from drivers/net/dsa/rtl8366.c
rename to drivers/net/dsa/realtek/rtl8366-core.c
index bdb8d8d..dc5f75b 100644
 #include <linux/if_bridge.h>
 #include <net/dsa.h>
 
-#include "realtek-smi-core.h"
+#include "realtek.h"
 
-int rtl8366_mc_is_used(struct realtek_smi *smi, int mc_index, int *used)
+int rtl8366_mc_is_used(struct realtek_priv *priv, int mc_index, int *used)
 {
        int ret;
        int i;
 
        *used = 0;
-       for (i = 0; i < smi->num_ports; i++) {
+       for (i = 0; i < priv->num_ports; i++) {
                int index = 0;
 
-               ret = smi->ops->get_mc_index(smi, i, &index);
+               ret = priv->ops->get_mc_index(priv, i, &index);
                if (ret)
                        return ret;
 
@@ -38,13 +38,13 @@ EXPORT_SYMBOL_GPL(rtl8366_mc_is_used);
 
 /**
  * rtl8366_obtain_mc() - retrieve or allocate a VLAN member configuration
- * @smi: the Realtek SMI device instance
+ * @priv: the Realtek SMI device instance
  * @vid: the VLAN ID to look up or allocate
  * @vlanmc: filled in with a valid member config on success
  * @return: index of the member config, or a negative error number
  */
-static int rtl8366_obtain_mc(struct realtek_smi *smi, int vid,
+static int rtl8366_obtain_mc(struct realtek_priv *priv, int vid,
                             struct rtl8366_vlan_mc *vlanmc)
 {
        struct rtl8366_vlan_4k vlan4k;
@@ -52,10 +52,10 @@ static int rtl8366_obtain_mc(struct realtek_smi *smi, int vid,
        int i;
 
        /* Try to find an existing member config entry for this VID */
-       for (i = 0; i < smi->num_vlan_mc; i++) {
-               ret = smi->ops->get_vlan_mc(smi, i, vlanmc);
+       for (i = 0; i < priv->num_vlan_mc; i++) {
+               ret = priv->ops->get_vlan_mc(priv, i, vlanmc);
                if (ret) {
-                       dev_err(smi->dev, "error searching for VLAN MC %d for VID %d\n",
+                       dev_err(priv->dev, "error searching for VLAN MC %d for VID %d\n",
                                i, vid);
                        return ret;
                }
@@ -65,19 +65,19 @@ static int rtl8366_obtain_mc(struct realtek_smi *smi, int vid,
        }
 
        /* We have no MC entry for this VID, try to find an empty one */
-       for (i = 0; i < smi->num_vlan_mc; i++) {
-               ret = smi->ops->get_vlan_mc(smi, i, vlanmc);
+       for (i = 0; i < priv->num_vlan_mc; i++) {
+               ret = priv->ops->get_vlan_mc(priv, i, vlanmc);
                if (ret) {
-                       dev_err(smi->dev, "error searching for VLAN MC %d for VID %d\n",
+                       dev_err(priv->dev, "error searching for VLAN MC %d for VID %d\n",
                                i, vid);
                        return ret;
                }
 
                if (vlanmc->vid == 0 && vlanmc->member == 0) {
                        /* Update the entry from the 4K table */
-                       ret = smi->ops->get_vlan_4k(smi, vid, &vlan4k);
+                       ret = priv->ops->get_vlan_4k(priv, vid, &vlan4k);
                        if (ret) {
-                               dev_err(smi->dev, "error looking for 4K VLAN MC %d for VID %d\n",
+                               dev_err(priv->dev, "error looking for 4K VLAN MC %d for VID %d\n",
                                        i, vid);
                                return ret;
                        }
@@ -86,30 +86,30 @@ static int rtl8366_obtain_mc(struct realtek_smi *smi, int vid,
                        vlanmc->member = vlan4k.member;
                        vlanmc->untag = vlan4k.untag;
                        vlanmc->fid = vlan4k.fid;
-                       ret = smi->ops->set_vlan_mc(smi, i, vlanmc);
+                       ret = priv->ops->set_vlan_mc(priv, i, vlanmc);
                        if (ret) {
-                               dev_err(smi->dev, "unable to set/update VLAN MC %d for VID %d\n",
+                               dev_err(priv->dev, "unable to set/update VLAN MC %d for VID %d\n",
                                        i, vid);
                                return ret;
                        }
 
-                       dev_dbg(smi->dev, "created new MC at index %d for VID %d\n",
+                       dev_dbg(priv->dev, "created new MC at index %d for VID %d\n",
                                i, vid);
                        return i;
                }
        }
 
        /* MC table is full, try to find an unused entry and replace it */
-       for (i = 0; i < smi->num_vlan_mc; i++) {
+       for (i = 0; i < priv->num_vlan_mc; i++) {
                int used;
 
-               ret = rtl8366_mc_is_used(smi, i, &used);
+               ret = rtl8366_mc_is_used(priv, i, &used);
                if (ret)
                        return ret;
 
                if (!used) {
                        /* Update the entry from the 4K table */
-                       ret = smi->ops->get_vlan_4k(smi, vid, &vlan4k);
+                       ret = priv->ops->get_vlan_4k(priv, vid, &vlan4k);
                        if (ret)
                                return ret;
 
@@ -117,23 +117,23 @@ static int rtl8366_obtain_mc(struct realtek_smi *smi, int vid,
                        vlanmc->member = vlan4k.member;
                        vlanmc->untag = vlan4k.untag;
                        vlanmc->fid = vlan4k.fid;
-                       ret = smi->ops->set_vlan_mc(smi, i, vlanmc);
+                       ret = priv->ops->set_vlan_mc(priv, i, vlanmc);
                        if (ret) {
-                               dev_err(smi->dev, "unable to set/update VLAN MC %d for VID %d\n",
+                               dev_err(priv->dev, "unable to set/update VLAN MC %d for VID %d\n",
                                        i, vid);
                                return ret;
                        }
-                       dev_dbg(smi->dev, "recycled MC at index %i for VID %d\n",
+                       dev_dbg(priv->dev, "recycled MC at index %i for VID %d\n",
                                i, vid);
                        return i;
                }
        }
 
-       dev_err(smi->dev, "all VLAN member configurations are in use\n");
+       dev_err(priv->dev, "all VLAN member configurations are in use\n");
        return -ENOSPC;
 }
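
In brief, the allocation order implemented above: (1) reuse the MC entry already holding this VID; (2) otherwise claim an empty entry (vid == 0 && member == 0), seeding it from the 4K VLAN table; (3) otherwise recycle an entry that no port's MC index currently references. Only when all priv->num_vlan_mc entries are live does the function give up with -ENOSPC.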
 
-int rtl8366_set_vlan(struct realtek_smi *smi, int vid, u32 member,
+int rtl8366_set_vlan(struct realtek_priv *priv, int vid, u32 member,
                     u32 untag, u32 fid)
 {
        struct rtl8366_vlan_mc vlanmc;
@@ -141,31 +141,31 @@ int rtl8366_set_vlan(struct realtek_smi *smi, int vid, u32 member,
        int mc;
        int ret;
 
-       if (!smi->ops->is_vlan_valid(smi, vid))
+       if (!priv->ops->is_vlan_valid(priv, vid))
                return -EINVAL;
 
-       dev_dbg(smi->dev,
+       dev_dbg(priv->dev,
                "setting VLAN%d 4k members: 0x%02x, untagged: 0x%02x\n",
                vid, member, untag);
 
        /* Update the 4K table */
-       ret = smi->ops->get_vlan_4k(smi, vid, &vlan4k);
+       ret = priv->ops->get_vlan_4k(priv, vid, &vlan4k);
        if (ret)
                return ret;
 
        vlan4k.member |= member;
        vlan4k.untag |= untag;
        vlan4k.fid = fid;
-       ret = smi->ops->set_vlan_4k(smi, &vlan4k);
+       ret = priv->ops->set_vlan_4k(priv, &vlan4k);
        if (ret)
                return ret;
 
-       dev_dbg(smi->dev,
+       dev_dbg(priv->dev,
                "resulting VLAN%d 4k members: 0x%02x, untagged: 0x%02x\n",
                vid, vlan4k.member, vlan4k.untag);
 
        /* Find or allocate a member config for this VID */
-       ret = rtl8366_obtain_mc(smi, vid, &vlanmc);
+       ret = rtl8366_obtain_mc(priv, vid, &vlanmc);
        if (ret < 0)
                return ret;
        mc = ret;
@@ -176,12 +176,12 @@ int rtl8366_set_vlan(struct realtek_smi *smi, int vid, u32 member,
        vlanmc.fid = fid;
 
        /* Commit updates to the MC entry */
-       ret = smi->ops->set_vlan_mc(smi, mc, &vlanmc);
+       ret = priv->ops->set_vlan_mc(priv, mc, &vlanmc);
        if (ret)
-               dev_err(smi->dev, "failed to commit changes to VLAN MC index %d for VID %d\n",
+               dev_err(priv->dev, "failed to commit changes to VLAN MC index %d for VID %d\n",
                        mc, vid);
        else
-               dev_dbg(smi->dev,
+               dev_dbg(priv->dev,
                        "resulting VLAN%d MC members: 0x%02x, untagged: 0x%02x\n",
                        vid, vlanmc.member, vlanmc.untag);
 
@@ -189,37 +189,37 @@ int rtl8366_set_vlan(struct realtek_smi *smi, int vid, u32 member,
 }
 EXPORT_SYMBOL_GPL(rtl8366_set_vlan);
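
A hypothetical call, to make the bitmask parameters concrete (values illustrative):

        /* Ports 0-3 become members of VID 100, ports 0 and 1 egress
         * untagged, filtering database 0. Note that the 4K-table update
         * above ORs member/untag into the existing sets rather than
         * replacing them.
         */
        int err = rtl8366_set_vlan(priv, 100, 0x0f, 0x03, 0);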
 
-int rtl8366_set_pvid(struct realtek_smi *smi, unsigned int port,
+int rtl8366_set_pvid(struct realtek_priv *priv, unsigned int port,
                     unsigned int vid)
 {
        struct rtl8366_vlan_mc vlanmc;
        int mc;
        int ret;
 
-       if (!smi->ops->is_vlan_valid(smi, vid))
+       if (!priv->ops->is_vlan_valid(priv, vid))
                return -EINVAL;
 
        /* Find or allocate a member config for this VID */
-       ret = rtl8366_obtain_mc(smi, vid, &vlanmc);
+       ret = rtl8366_obtain_mc(priv, vid, &vlanmc);
        if (ret < 0)
                return ret;
        mc = ret;
 
-       ret = smi->ops->set_mc_index(smi, port, mc);
+       ret = priv->ops->set_mc_index(priv, port, mc);
        if (ret) {
-               dev_err(smi->dev, "set PVID: failed to set MC index %d for port %d\n",
+               dev_err(priv->dev, "set PVID: failed to set MC index %d for port %d\n",
                        mc, port);
                return ret;
        }
 
-       dev_dbg(smi->dev, "set PVID: the PVID for port %d set to %d using existing MC index %d\n",
+       dev_dbg(priv->dev, "set PVID: the PVID for port %d set to %d using existing MC index %d\n",
                port, vid, mc);
 
        return 0;
 }
 EXPORT_SYMBOL_GPL(rtl8366_set_pvid);
 
-int rtl8366_enable_vlan4k(struct realtek_smi *smi, bool enable)
+int rtl8366_enable_vlan4k(struct realtek_priv *priv, bool enable)
 {
        int ret;
 
@@ -229,52 +229,52 @@ int rtl8366_enable_vlan4k(struct realtek_smi *smi, bool enable)
         */
        if (enable) {
                /* Make sure VLAN is ON */
-               ret = smi->ops->enable_vlan(smi, true);
+               ret = priv->ops->enable_vlan(priv, true);
                if (ret)
                        return ret;
 
-               smi->vlan_enabled = true;
+               priv->vlan_enabled = true;
        }
 
-       ret = smi->ops->enable_vlan4k(smi, enable);
+       ret = priv->ops->enable_vlan4k(priv, enable);
        if (ret)
                return ret;
 
-       smi->vlan4k_enabled = enable;
+       priv->vlan4k_enabled = enable;
        return 0;
 }
 EXPORT_SYMBOL_GPL(rtl8366_enable_vlan4k);
 
-int rtl8366_enable_vlan(struct realtek_smi *smi, bool enable)
+int rtl8366_enable_vlan(struct realtek_priv *priv, bool enable)
 {
        int ret;
 
-       ret = smi->ops->enable_vlan(smi, enable);
+       ret = priv->ops->enable_vlan(priv, enable);
        if (ret)
                return ret;
 
-       smi->vlan_enabled = enable;
+       priv->vlan_enabled = enable;
 
        /* If we turn VLAN off, make sure that we turn off
         * 4k VLAN as well, if that happened to be on.
         */
        if (!enable) {
-               smi->vlan4k_enabled = false;
-               ret = smi->ops->enable_vlan4k(smi, false);
+               priv->vlan4k_enabled = false;
+               ret = priv->ops->enable_vlan4k(priv, false);
        }
 
        return ret;
 }
 EXPORT_SYMBOL_GPL(rtl8366_enable_vlan);
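
The invariant maintained by the two helpers above is "4K VLAN implies VLAN": enabling 4K first forces the plain VLAN feature on, and disabling plain VLAN forces 4K off, with priv->vlan_enabled and priv->vlan4k_enabled shadowing the hardware state.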
 
-int rtl8366_reset_vlan(struct realtek_smi *smi)
+int rtl8366_reset_vlan(struct realtek_priv *priv)
 {
        struct rtl8366_vlan_mc vlanmc;
        int ret;
        int i;
 
-       rtl8366_enable_vlan(smi, false);
-       rtl8366_enable_vlan4k(smi, false);
+       rtl8366_enable_vlan(priv, false);
+       rtl8366_enable_vlan4k(priv, false);
 
        /* Clear the 16 VLAN member configurations */
        vlanmc.vid = 0;
@@ -282,8 +282,8 @@ int rtl8366_reset_vlan(struct realtek_smi *smi)
        vlanmc.member = 0;
        vlanmc.untag = 0;
        vlanmc.fid = 0;
-       for (i = 0; i < smi->num_vlan_mc; i++) {
-               ret = smi->ops->set_vlan_mc(smi, i, &vlanmc);
+       for (i = 0; i < priv->num_vlan_mc; i++) {
+               ret = priv->ops->set_vlan_mc(priv, i, &vlanmc);
                if (ret)
                        return ret;
        }
@@ -298,12 +298,12 @@ int rtl8366_vlan_add(struct dsa_switch *ds, int port,
 {
        bool untagged = !!(vlan->flags & BRIDGE_VLAN_INFO_UNTAGGED);
        bool pvid = !!(vlan->flags & BRIDGE_VLAN_INFO_PVID);
-       struct realtek_smi *smi = ds->priv;
+       struct realtek_priv *priv = ds->priv;
        u32 member = 0;
        u32 untag = 0;
        int ret;
 
-       if (!smi->ops->is_vlan_valid(smi, vlan->vid)) {
+       if (!priv->ops->is_vlan_valid(priv, vlan->vid)) {
                NL_SET_ERR_MSG_MOD(extack, "VLAN ID not valid");
                return -EINVAL;
        }
@@ -312,13 +312,13 @@ int rtl8366_vlan_add(struct dsa_switch *ds, int port,
         * FIXME: what's with this 4k business?
         * Just rtl8366_enable_vlan() seems inconclusive.
         */
-       ret = rtl8366_enable_vlan4k(smi, true);
+       ret = rtl8366_enable_vlan4k(priv, true);
        if (ret) {
                NL_SET_ERR_MSG_MOD(extack, "Failed to enable VLAN 4K");
                return ret;
        }
 
-       dev_dbg(smi->dev, "add VLAN %d on port %d, %s, %s\n",
+       dev_dbg(priv->dev, "add VLAN %d on port %d, %s, %s\n",
                vlan->vid, port, untagged ? "untagged" : "tagged",
                pvid ? "PVID" : "no PVID");
 
@@ -327,18 +327,18 @@ int rtl8366_vlan_add(struct dsa_switch *ds, int port,
        if (untagged)
                untag |= BIT(port);
 
-       ret = rtl8366_set_vlan(smi, vlan->vid, member, untag, 0);
+       ret = rtl8366_set_vlan(priv, vlan->vid, member, untag, 0);
        if (ret) {
-               dev_err(smi->dev, "failed to set up VLAN %04x", vlan->vid);
+               dev_err(priv->dev, "failed to set up VLAN %04x", vlan->vid);
                return ret;
        }
 
        if (!pvid)
                return 0;
 
-       ret = rtl8366_set_pvid(smi, port, vlan->vid);
+       ret = rtl8366_set_pvid(priv, port, vlan->vid);
        if (ret) {
-               dev_err(smi->dev, "failed to set PVID on port %d to VLAN %04x",
+               dev_err(priv->dev, "failed to set PVID on port %d to VLAN %04x",
                        port, vlan->vid);
                return ret;
        }
@@ -350,15 +350,15 @@ EXPORT_SYMBOL_GPL(rtl8366_vlan_add);
 int rtl8366_vlan_del(struct dsa_switch *ds, int port,
                     const struct switchdev_obj_port_vlan *vlan)
 {
-       struct realtek_smi *smi = ds->priv;
+       struct realtek_priv *priv = ds->priv;
        int ret, i;
 
-       dev_dbg(smi->dev, "del VLAN %d on port %d\n", vlan->vid, port);
+       dev_dbg(priv->dev, "del VLAN %d on port %d\n", vlan->vid, port);
 
-       for (i = 0; i < smi->num_vlan_mc; i++) {
+       for (i = 0; i < priv->num_vlan_mc; i++) {
                struct rtl8366_vlan_mc vlanmc;
 
-               ret = smi->ops->get_vlan_mc(smi, i, &vlanmc);
+               ret = priv->ops->get_vlan_mc(priv, i, &vlanmc);
                if (ret)
                        return ret;
 
@@ -376,9 +376,9 @@ int rtl8366_vlan_del(struct dsa_switch *ds, int port,
                                vlanmc.priority = 0;
                                vlanmc.fid = 0;
                        }
-                       ret = smi->ops->set_vlan_mc(smi, i, &vlanmc);
+                       ret = priv->ops->set_vlan_mc(priv, i, &vlanmc);
                        if (ret) {
-                               dev_err(smi->dev,
+                               dev_err(priv->dev,
                                        "failed to remove VLAN %04x\n",
                                        vlan->vid);
                                return ret;
@@ -394,15 +394,15 @@ EXPORT_SYMBOL_GPL(rtl8366_vlan_del);
 void rtl8366_get_strings(struct dsa_switch *ds, int port, u32 stringset,
                         uint8_t *data)
 {
-       struct realtek_smi *smi = ds->priv;
+       struct realtek_priv *priv = ds->priv;
        struct rtl8366_mib_counter *mib;
        int i;
 
-       if (port >= smi->num_ports)
+       if (port >= priv->num_ports)
                return;
 
-       for (i = 0; i < smi->num_mib_counters; i++) {
-               mib = &smi->mib_counters[i];
+       for (i = 0; i < priv->num_mib_counters; i++) {
+               mib = &priv->mib_counters[i];
                strncpy(data + i * ETH_GSTRING_LEN,
                        mib->name, ETH_GSTRING_LEN);
        }
@@ -411,35 +411,35 @@ EXPORT_SYMBOL_GPL(rtl8366_get_strings);
 
 int rtl8366_get_sset_count(struct dsa_switch *ds, int port, int sset)
 {
-       struct realtek_smi *smi = ds->priv;
+       struct realtek_priv *priv = ds->priv;
 
        /* We only support SS_STATS */
        if (sset != ETH_SS_STATS)
                return 0;
-       if (port >= smi->num_ports)
+       if (port >= priv->num_ports)
                return -EINVAL;
 
-       return smi->num_mib_counters;
+       return priv->num_mib_counters;
 }
 EXPORT_SYMBOL_GPL(rtl8366_get_sset_count);
 
 void rtl8366_get_ethtool_stats(struct dsa_switch *ds, int port, uint64_t *data)
 {
-       struct realtek_smi *smi = ds->priv;
+       struct realtek_priv *priv = ds->priv;
        int i;
        int ret;
 
-       if (port >= smi->num_ports)
+       if (port >= priv->num_ports)
                return;
 
-       for (i = 0; i < smi->num_mib_counters; i++) {
+       for (i = 0; i < priv->num_mib_counters; i++) {
                struct rtl8366_mib_counter *mib;
                u64 mibvalue = 0;
 
-               mib = &smi->mib_counters[i];
-               ret = smi->ops->get_mib_counter(smi, port, mib, &mibvalue);
+               mib = &priv->mib_counters[i];
+               ret = priv->ops->get_mib_counter(priv, port, mib, &mibvalue);
                if (ret) {
-                       dev_err(smi->dev, "error reading MIB counter %s\n",
+                       dev_err(priv->dev, "error reading MIB counter %s\n",
                                mib->name);
                }
                data[i] = mibvalue;
similarity index 78%
rename from drivers/net/dsa/rtl8366rb.c
rename to drivers/net/dsa/realtek/rtl8366rb.c
index ecc19bd..fb6565e 100644
@@ -21,7 +21,7 @@
 #include <linux/of_irq.h>
 #include <linux/regmap.h>
 
-#include "realtek-smi-core.h"
+#include "realtek.h"
 
 #define RTL8366RB_PORT_NUM_CPU         5
 #define RTL8366RB_NUM_PORTS            6
@@ -396,7 +396,7 @@ static struct rtl8366_mib_counter rtl8366rb_mib_counters[] = {
        { 0, 70, 2, "IfOutBroadcastPkts"                        },
 };
 
-static int rtl8366rb_get_mib_counter(struct realtek_smi *smi,
+static int rtl8366rb_get_mib_counter(struct realtek_priv *priv,
                                     int port,
                                     struct rtl8366_mib_counter *mib,
                                     u64 *mibvalue)
@@ -412,12 +412,12 @@ static int rtl8366rb_get_mib_counter(struct realtek_smi *smi,
        /* Write the access counter address first; the ASIC will then
         * prepare the 64-bit counter to be retrieved
         */
-       ret = regmap_write(smi->map, addr, 0); /* Write whatever */
+       ret = regmap_write(priv->map, addr, 0); /* Write whatever */
        if (ret)
                return ret;
 
        /* Read MIB control register */
-       ret = regmap_read(smi->map, RTL8366RB_MIB_CTRL_REG, &val);
+       ret = regmap_read(priv->map, RTL8366RB_MIB_CTRL_REG, &val);
        if (ret)
                return -EIO;
 
@@ -430,7 +430,7 @@ static int rtl8366rb_get_mib_counter(struct realtek_smi *smi,
        /* Read each individual MIB 16 bits at a time */
        *mibvalue = 0;
        for (i = mib->length; i > 0; i--) {
-               ret = regmap_read(smi->map, addr + (i - 1), &val);
+               ret = regmap_read(priv->map, addr + (i - 1), &val);
                if (ret)
                        return ret;
                *mibvalue = (*mibvalue << 16) | (val & 0xFFFF);
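
Worked example for the assembly loop above: with mib->length == 2 the reads happen at addr + 1 first, then addr, so register values of 0x1234 at addr + 1 and 0x5678 at addr assemble to the 32-bit counter 0x12345678; the more significant word lives at the higher address.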
@@ -455,38 +455,38 @@ static u32 rtl8366rb_get_irqmask(struct irq_data *d)
 
 static void rtl8366rb_mask_irq(struct irq_data *d)
 {
-       struct realtek_smi *smi = irq_data_get_irq_chip_data(d);
+       struct realtek_priv *priv = irq_data_get_irq_chip_data(d);
        int ret;
 
-       ret = regmap_update_bits(smi->map, RTL8366RB_INTERRUPT_MASK_REG,
+       ret = regmap_update_bits(priv->map, RTL8366RB_INTERRUPT_MASK_REG,
                                 rtl8366rb_get_irqmask(d), 0);
        if (ret)
-               dev_err(smi->dev, "could not mask IRQ\n");
+               dev_err(priv->dev, "could not mask IRQ\n");
 }
 
 static void rtl8366rb_unmask_irq(struct irq_data *d)
 {
-       struct realtek_smi *smi = irq_data_get_irq_chip_data(d);
+       struct realtek_priv *priv = irq_data_get_irq_chip_data(d);
        int ret;
 
-       ret = regmap_update_bits(smi->map, RTL8366RB_INTERRUPT_MASK_REG,
+       ret = regmap_update_bits(priv->map, RTL8366RB_INTERRUPT_MASK_REG,
                                 rtl8366rb_get_irqmask(d),
                                 rtl8366rb_get_irqmask(d));
        if (ret)
-               dev_err(smi->dev, "could not unmask IRQ\n");
+               dev_err(priv->dev, "could not unmask IRQ\n");
 }
 
 static irqreturn_t rtl8366rb_irq(int irq, void *data)
 {
-       struct realtek_smi *smi = data;
+       struct realtek_priv *priv = data;
        u32 stat;
        int ret;
 
        /* This clears the IRQ status register */
-       ret = regmap_read(smi->map, RTL8366RB_INTERRUPT_STATUS_REG,
+       ret = regmap_read(priv->map, RTL8366RB_INTERRUPT_STATUS_REG,
                          &stat);
        if (ret) {
-               dev_err(smi->dev, "can't read interrupt status\n");
+               dev_err(priv->dev, "can't read interrupt status\n");
                return IRQ_NONE;
        }
        stat &= RTL8366RB_INTERRUPT_VALID;
@@ -502,7 +502,7 @@ static irqreturn_t rtl8366rb_irq(int irq, void *data)
                 */
                if (line < 12 && line > 5)
                        line -= 5;
-               child_irq = irq_find_mapping(smi->irqdomain, line);
+               child_irq = irq_find_mapping(priv->irqdomain, line);
                handle_nested_irq(child_irq);
        }
        return IRQ_HANDLED;
@@ -538,7 +538,7 @@ static const struct irq_domain_ops rtl8366rb_irqdomain_ops = {
        .xlate  = irq_domain_xlate_onecell,
 };
 
-static int rtl8366rb_setup_cascaded_irq(struct realtek_smi *smi)
+static int rtl8366rb_setup_cascaded_irq(struct realtek_priv *priv)
 {
        struct device_node *intc;
        unsigned long irq_trig;
@@ -547,24 +547,24 @@ static int rtl8366rb_setup_cascaded_irq(struct realtek_smi *smi)
        u32 val;
        int i;
 
-       intc = of_get_child_by_name(smi->dev->of_node, "interrupt-controller");
+       intc = of_get_child_by_name(priv->dev->of_node, "interrupt-controller");
        if (!intc) {
-               dev_err(smi->dev, "missing child interrupt-controller node\n");
+               dev_err(priv->dev, "missing child interrupt-controller node\n");
                return -EINVAL;
        }
        /* RTL8366RB IRQs cascade off this one */
        irq = of_irq_get(intc, 0);
        if (irq <= 0) {
-               dev_err(smi->dev, "failed to get parent IRQ\n");
+               dev_err(priv->dev, "failed to get parent IRQ\n");
                ret = irq ? irq : -EINVAL;
                goto out_put_node;
        }
 
        /* This clears the IRQ status register */
-       ret = regmap_read(smi->map, RTL8366RB_INTERRUPT_STATUS_REG,
+       ret = regmap_read(priv->map, RTL8366RB_INTERRUPT_STATUS_REG,
                          &val);
        if (ret) {
-               dev_err(smi->dev, "can't read interrupt status\n");
+               dev_err(priv->dev, "can't read interrupt status\n");
                goto out_put_node;
        }
 
@@ -573,48 +573,48 @@ static int rtl8366rb_setup_cascaded_irq(struct realtek_smi *smi)
        switch (irq_trig) {
        case IRQF_TRIGGER_RISING:
        case IRQF_TRIGGER_HIGH:
-               dev_info(smi->dev, "active high/rising IRQ\n");
+               dev_info(priv->dev, "active high/rising IRQ\n");
                val = 0;
                break;
        case IRQF_TRIGGER_FALLING:
        case IRQF_TRIGGER_LOW:
-               dev_info(smi->dev, "active low/falling IRQ\n");
+               dev_info(priv->dev, "active low/falling IRQ\n");
                val = RTL8366RB_INTERRUPT_POLARITY;
                break;
        }
-       ret = regmap_update_bits(smi->map, RTL8366RB_INTERRUPT_CONTROL_REG,
+       ret = regmap_update_bits(priv->map, RTL8366RB_INTERRUPT_CONTROL_REG,
                                 RTL8366RB_INTERRUPT_POLARITY,
                                 val);
        if (ret) {
-               dev_err(smi->dev, "could not configure IRQ polarity\n");
+               dev_err(priv->dev, "could not configure IRQ polarity\n");
                goto out_put_node;
        }
 
-       ret = devm_request_threaded_irq(smi->dev, irq, NULL,
+       ret = devm_request_threaded_irq(priv->dev, irq, NULL,
                                        rtl8366rb_irq, IRQF_ONESHOT,
-                                       "RTL8366RB", smi);
+                                       "RTL8366RB", priv);
        if (ret) {
-               dev_err(smi->dev, "unable to request irq: %d\n", ret);
+               dev_err(priv->dev, "unable to request irq: %d\n", ret);
                goto out_put_node;
        }
-       smi->irqdomain = irq_domain_add_linear(intc,
-                                              RTL8366RB_NUM_INTERRUPT,
-                                              &rtl8366rb_irqdomain_ops,
-                                              smi);
-       if (!smi->irqdomain) {
-               dev_err(smi->dev, "failed to create IRQ domain\n");
+       priv->irqdomain = irq_domain_add_linear(intc,
+                                               RTL8366RB_NUM_INTERRUPT,
+                                               &rtl8366rb_irqdomain_ops,
+                                               priv);
+       if (!priv->irqdomain) {
+               dev_err(priv->dev, "failed to create IRQ domain\n");
                ret = -EINVAL;
                goto out_put_node;
        }
-       for (i = 0; i < smi->num_ports; i++)
-               irq_set_parent(irq_create_mapping(smi->irqdomain, i), irq);
+       for (i = 0; i < priv->num_ports; i++)
+               irq_set_parent(irq_create_mapping(priv->irqdomain, i), irq);
 
 out_put_node:
        of_node_put(intc);
        return ret;
 }
 
-static int rtl8366rb_set_addr(struct realtek_smi *smi)
+static int rtl8366rb_set_addr(struct realtek_priv *priv)
 {
        u8 addr[ETH_ALEN];
        u16 val;
@@ -622,18 +622,18 @@ static int rtl8366rb_set_addr(struct realtek_smi *smi)
 
        eth_random_addr(addr);
 
-       dev_info(smi->dev, "set MAC: %02X:%02X:%02X:%02X:%02X:%02X\n",
+       dev_info(priv->dev, "set MAC: %02X:%02X:%02X:%02X:%02X:%02X\n",
                 addr[0], addr[1], addr[2], addr[3], addr[4], addr[5]);
        val = addr[0] << 8 | addr[1];
-       ret = regmap_write(smi->map, RTL8366RB_SMAR0, val);
+       ret = regmap_write(priv->map, RTL8366RB_SMAR0, val);
        if (ret)
                return ret;
        val = addr[2] << 8 | addr[3];
-       ret = regmap_write(smi->map, RTL8366RB_SMAR1, val);
+       ret = regmap_write(priv->map, RTL8366RB_SMAR1, val);
        if (ret)
                return ret;
        val = addr[4] << 8 | addr[5];
-       ret = regmap_write(smi->map, RTL8366RB_SMAR2, val);
+       ret = regmap_write(priv->map, RTL8366RB_SMAR2, val);
        if (ret)
                return ret;
 
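Worked example for the three writes above: a generated MAC of 00:11:22:33:44:55 becomes SMAR0 = 0x0011, SMAR1 = 0x2233 and SMAR2 = 0x4455, two address octets per 16-bit register.
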
@@ -765,7 +765,7 @@ static const struct rtl8366rb_jam_tbl_entry rtl8366rb_green_jam[] = {
 
 /* Function that jams the tables in the proper registers */
 static int rtl8366rb_jam_table(const struct rtl8366rb_jam_tbl_entry *jam_table,
-                              int jam_size, struct realtek_smi *smi,
+                              int jam_size, struct realtek_priv *priv,
                               bool write_dbg)
 {
        u32 val;
@@ -774,24 +774,24 @@ static int rtl8366rb_jam_table(const struct rtl8366rb_jam_tbl_entry *jam_table,
 
        for (i = 0; i < jam_size; i++) {
                if ((jam_table[i].reg & 0xBE00) == 0xBE00) {
-                       ret = regmap_read(smi->map,
+                       ret = regmap_read(priv->map,
                                          RTL8366RB_PHY_ACCESS_BUSY_REG,
                                          &val);
                        if (ret)
                                return ret;
                        if (!(val & RTL8366RB_PHY_INT_BUSY)) {
-                               ret = regmap_write(smi->map,
-                                               RTL8366RB_PHY_ACCESS_CTRL_REG,
-                                               RTL8366RB_PHY_CTRL_WRITE);
+                               ret = regmap_write(priv->map,
+                                                  RTL8366RB_PHY_ACCESS_CTRL_REG,
+                                                  RTL8366RB_PHY_CTRL_WRITE);
                                if (ret)
                                        return ret;
                        }
                }
                if (write_dbg)
-                       dev_dbg(smi->dev, "jam %04x into register %04x\n",
+                       dev_dbg(priv->dev, "jam %04x into register %04x\n",
                                jam_table[i].val,
                                jam_table[i].reg);
-               ret = regmap_write(smi->map,
+               ret = regmap_write(priv->map,
                                   jam_table[i].reg,
                                   jam_table[i].val);
                if (ret)
@@ -802,7 +802,7 @@ static int rtl8366rb_jam_table(const struct rtl8366rb_jam_tbl_entry *jam_table,
 
 static int rtl8366rb_setup(struct dsa_switch *ds)
 {
-       struct realtek_smi *smi = ds->priv;
+       struct realtek_priv *priv = ds->priv;
        const struct rtl8366rb_jam_tbl_entry *jam_table;
        struct rtl8366rb *rb;
        u32 chip_ver = 0;
@@ -812,11 +812,11 @@ static int rtl8366rb_setup(struct dsa_switch *ds)
        int ret;
        int i;
 
-       rb = smi->chip_data;
+       rb = priv->chip_data;
 
-       ret = regmap_read(smi->map, RTL8366RB_CHIP_ID_REG, &chip_id);
+       ret = regmap_read(priv->map, RTL8366RB_CHIP_ID_REG, &chip_id);
        if (ret) {
-               dev_err(smi->dev, "unable to read chip id\n");
+               dev_err(priv->dev, "unable to read chip id\n");
                return ret;
        }
 
@@ -824,18 +824,18 @@ static int rtl8366rb_setup(struct dsa_switch *ds)
        case RTL8366RB_CHIP_ID_8366:
                break;
        default:
-               dev_err(smi->dev, "unknown chip id (%04x)\n", chip_id);
+               dev_err(priv->dev, "unknown chip id (%04x)\n", chip_id);
                return -ENODEV;
        }
 
-       ret = regmap_read(smi->map, RTL8366RB_CHIP_VERSION_CTRL_REG,
+       ret = regmap_read(priv->map, RTL8366RB_CHIP_VERSION_CTRL_REG,
                          &chip_ver);
        if (ret) {
-               dev_err(smi->dev, "unable to read chip version\n");
+               dev_err(priv->dev, "unable to read chip version\n");
                return ret;
        }
 
-       dev_info(smi->dev, "RTL%04x ver %u chip found\n",
+       dev_info(priv->dev, "RTL%04x ver %u chip found\n",
                 chip_id, chip_ver & RTL8366RB_CHIP_VERSION_MASK);
 
        /* Do the init dance using the right jam table */
@@ -872,20 +872,20 @@ static int rtl8366rb_setup(struct dsa_switch *ds)
                jam_size = ARRAY_SIZE(rtl8366rb_init_jam_dgn3500);
        }
 
-       ret = rtl8366rb_jam_table(jam_table, jam_size, smi, true);
+       ret = rtl8366rb_jam_table(jam_table, jam_size, priv, true);
        if (ret)
                return ret;
 
        /* Isolate all user ports so they can only send packets to themselves and the CPU port */
        for (i = 0; i < RTL8366RB_PORT_NUM_CPU; i++) {
-               ret = regmap_write(smi->map, RTL8366RB_PORT_ISO(i),
+               ret = regmap_write(priv->map, RTL8366RB_PORT_ISO(i),
                                   RTL8366RB_PORT_ISO_PORTS(BIT(RTL8366RB_PORT_NUM_CPU)) |
                                   RTL8366RB_PORT_ISO_EN);
                if (ret)
                        return ret;
        }
        /* CPU port can send packets to all ports */
-       ret = regmap_write(smi->map, RTL8366RB_PORT_ISO(RTL8366RB_PORT_NUM_CPU),
+       ret = regmap_write(priv->map, RTL8366RB_PORT_ISO(RTL8366RB_PORT_NUM_CPU),
                           RTL8366RB_PORT_ISO_PORTS(dsa_user_ports(ds)) |
                           RTL8366RB_PORT_ISO_EN);
        if (ret)
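
The isolation set up above is a pair of complementary bitmaps: every user port may reach only the CPU port, and the CPU port may reach all user ports. A standalone sketch of the mask arithmetic (assuming the CPU port is index 5, as RTL8366RB_PORT_NUM_CPU suggests):

#include <stdio.h>

#define BIT(n)		(1U << (n))
#define CPU_PORT	5	/* assumed value of RTL8366RB_PORT_NUM_CPU */

int main(void)
{
	unsigned int user_ports = BIT(0) | BIT(1) | BIT(2) | BIT(3) | BIT(4);
	int i;

	/* Each user port is allowed to talk to the CPU port only... */
	for (i = 0; i < CPU_PORT; i++)
		printf("PORT_ISO(%d) mask = 0x%02x\n", i, BIT(CPU_PORT));

	/* ...while the CPU port is allowed to talk to every user port. */
	printf("PORT_ISO(%d) mask = 0x%02x\n", CPU_PORT, user_ports);
	return 0;
}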
@@ -893,26 +893,26 @@ static int rtl8366rb_setup(struct dsa_switch *ds)
 
        /* Set up the "green ethernet" feature */
        ret = rtl8366rb_jam_table(rtl8366rb_green_jam,
-                                 ARRAY_SIZE(rtl8366rb_green_jam), smi, false);
+                                 ARRAY_SIZE(rtl8366rb_green_jam), priv, false);
        if (ret)
                return ret;
 
-       ret = regmap_write(smi->map,
+       ret = regmap_write(priv->map,
                           RTL8366RB_GREEN_FEATURE_REG,
                           (chip_ver == 1) ? 0x0007 : 0x0003);
        if (ret)
                return ret;
 
        /* Vendor driver sets 0x240 in registers 0xc and 0xd (undocumented) */
-       ret = regmap_write(smi->map, 0x0c, 0x240);
+       ret = regmap_write(priv->map, 0x0c, 0x240);
        if (ret)
                return ret;
-       ret = regmap_write(smi->map, 0x0d, 0x240);
+       ret = regmap_write(priv->map, 0x0d, 0x240);
        if (ret)
                return ret;
 
        /* Set some random MAC address */
-       ret = rtl8366rb_set_addr(smi);
+       ret = rtl8366rb_set_addr(priv);
        if (ret)
                return ret;
 
@@ -921,21 +921,21 @@ static int rtl8366rb_setup(struct dsa_switch *ds)
         * If you set RTL8368RB_CPU_NO_TAG (bit 15) in this register
         * the custom tag is turned off.
         */
-       ret = regmap_update_bits(smi->map, RTL8368RB_CPU_CTRL_REG,
+       ret = regmap_update_bits(priv->map, RTL8368RB_CPU_CTRL_REG,
                                 0xFFFF,
-                                BIT(smi->cpu_port));
+                                BIT(priv->cpu_port));
        if (ret)
                return ret;
 
        /* Make sure we default-enable the fixed CPU port */
-       ret = regmap_update_bits(smi->map, RTL8366RB_PECR,
-                                BIT(smi->cpu_port),
+       ret = regmap_update_bits(priv->map, RTL8366RB_PECR,
+                                BIT(priv->cpu_port),
                                 0);
        if (ret)
                return ret;
 
        /* Set maximum packet length to 1536 bytes */
-       ret = regmap_update_bits(smi->map, RTL8366RB_SGCR,
+       ret = regmap_update_bits(priv->map, RTL8366RB_SGCR,
                                 RTL8366RB_SGCR_MAX_LENGTH_MASK,
                                 RTL8366RB_SGCR_MAX_LENGTH_1536);
        if (ret)
@@ -945,13 +945,13 @@ static int rtl8366rb_setup(struct dsa_switch *ds)
                rb->max_mtu[i] = 1532;
 
        /* Disable learning for all ports */
-       ret = regmap_write(smi->map, RTL8366RB_PORT_LEARNDIS_CTRL,
+       ret = regmap_write(priv->map, RTL8366RB_PORT_LEARNDIS_CTRL,
                           RTL8366RB_PORT_ALL);
        if (ret)
                return ret;
 
        /* Enable auto ageing for all ports */
-       ret = regmap_write(smi->map, RTL8366RB_SECURITY_CTRL, 0);
+       ret = regmap_write(priv->map, RTL8366RB_SECURITY_CTRL, 0);
        if (ret)
                return ret;
 
@@ -962,30 +962,30 @@ static int rtl8366rb_setup(struct dsa_switch *ds)
         * connected to something exotic such as fiber, then this might
         * be worth experimenting with.
         */
-       ret = regmap_update_bits(smi->map, RTL8366RB_PMC0,
+       ret = regmap_update_bits(priv->map, RTL8366RB_PMC0,
                                 RTL8366RB_PMC0_P4_IOMODE_MASK,
                                 0 << RTL8366RB_PMC0_P4_IOMODE_SHIFT);
        if (ret)
                return ret;
 
        /* Accept all packets by default, we enable filtering on-demand */
-       ret = regmap_write(smi->map, RTL8366RB_VLAN_INGRESS_CTRL1_REG,
+       ret = regmap_write(priv->map, RTL8366RB_VLAN_INGRESS_CTRL1_REG,
                           0);
        if (ret)
                return ret;
-       ret = regmap_write(smi->map, RTL8366RB_VLAN_INGRESS_CTRL2_REG,
+       ret = regmap_write(priv->map, RTL8366RB_VLAN_INGRESS_CTRL2_REG,
                           0);
        if (ret)
                return ret;
 
        /* Don't drop packets whose DA has not been learned */
-       ret = regmap_update_bits(smi->map, RTL8366RB_SSCR2,
+       ret = regmap_update_bits(priv->map, RTL8366RB_SSCR2,
                                 RTL8366RB_SSCR2_DROP_UNKNOWN_DA, 0);
        if (ret)
                return ret;
 
        /* Set blinking, TODO: make this configurable */
-       ret = regmap_update_bits(smi->map, RTL8366RB_LED_BLINKRATE_REG,
+       ret = regmap_update_bits(priv->map, RTL8366RB_LED_BLINKRATE_REG,
                                 RTL8366RB_LED_BLINKRATE_MASK,
                                 RTL8366RB_LED_BLINKRATE_56MS);
        if (ret)
@@ -996,15 +996,15 @@ static int rtl8366rb_setup(struct dsa_switch *ds)
         * behaviour (no individual config) but we can set up each
         * LED separately.
         */
-       if (smi->leds_disabled) {
+       if (priv->leds_disabled) {
                /* Turn everything off */
-               regmap_update_bits(smi->map,
+               regmap_update_bits(priv->map,
                                   RTL8366RB_LED_0_1_CTRL_REG,
                                   0x0FFF, 0);
-               regmap_update_bits(smi->map,
+               regmap_update_bits(priv->map,
                                   RTL8366RB_LED_2_3_CTRL_REG,
                                   0x0FFF, 0);
-               regmap_update_bits(smi->map,
+               regmap_update_bits(priv->map,
                                   RTL8366RB_INTERRUPT_CONTROL_REG,
                                   RTL8366RB_P4_RGMII_LED,
                                   0);
@@ -1014,7 +1014,7 @@ static int rtl8366rb_setup(struct dsa_switch *ds)
                val = RTL8366RB_LED_FORCE;
        }
        for (i = 0; i < 4; i++) {
-               ret = regmap_update_bits(smi->map,
+               ret = regmap_update_bits(priv->map,
                                         RTL8366RB_LED_CTRL_REG,
                                         0xf << (i * 4),
                                         val << (i * 4));
@@ -1022,18 +1022,20 @@ static int rtl8366rb_setup(struct dsa_switch *ds)
                        return ret;
        }
 
-       ret = rtl8366_reset_vlan(smi);
+       ret = rtl8366_reset_vlan(priv);
        if (ret)
                return ret;
 
-       ret = rtl8366rb_setup_cascaded_irq(smi);
+       ret = rtl8366rb_setup_cascaded_irq(priv);
        if (ret)
-               dev_info(smi->dev, "no interrupt support\n");
+               dev_info(priv->dev, "no interrupt support\n");
 
-       ret = realtek_smi_setup_mdio(smi);
-       if (ret) {
-               dev_info(smi->dev, "could not set up MDIO bus\n");
-               return -ENODEV;
+       if (priv->setup_interface) {
+               ret = priv->setup_interface(ds);
+               if (ret) {
+                       dev_err(priv->dev, "could not set up MDIO bus\n");
+                       return -ENODEV;
+               }
        }
 
        return 0;
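
Note the shape change at the end of setup: the unconditional realtek_smi_setup_mdio() call becomes an optional setup_interface callback, so a variant that does not need a slave MDIO bus can simply leave it NULL. A tiny model of that optional-callback pattern (the names below are hypothetical, not the driver's):

#include <stdio.h>

struct priv {
	int (*setup_interface)(struct priv *p);	/* NULL when not needed */
};

static int smi_setup_mdio(struct priv *p)
{
	printf("registering slave MDIO bus\n");
	return 0;
}

int main(void)
{
	struct priv smi_managed = { .setup_interface = smi_setup_mdio };
	struct priv mdio_managed = { .setup_interface = NULL };

	if (smi_managed.setup_interface)
		smi_managed.setup_interface(&smi_managed);
	if (mdio_managed.setup_interface)
		mdio_managed.setup_interface(&mdio_managed);
	return 0;
}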
@@ -1052,35 +1054,35 @@ rtl8366rb_mac_link_up(struct dsa_switch *ds, int port, unsigned int mode,
                      phy_interface_t interface, struct phy_device *phydev,
                      int speed, int duplex, bool tx_pause, bool rx_pause)
 {
-       struct realtek_smi *smi = ds->priv;
+       struct realtek_priv *priv = ds->priv;
        int ret;
 
-       if (port != smi->cpu_port)
+       if (port != priv->cpu_port)
                return;
 
-       dev_dbg(smi->dev, "MAC link up on CPU port (%d)\n", port);
+       dev_dbg(priv->dev, "MAC link up on CPU port (%d)\n", port);
 
        /* Force the fixed CPU port into 1Gbit mode, no autonegotiation */
-       ret = regmap_update_bits(smi->map, RTL8366RB_MAC_FORCE_CTRL_REG,
+       ret = regmap_update_bits(priv->map, RTL8366RB_MAC_FORCE_CTRL_REG,
                                 BIT(port), BIT(port));
        if (ret) {
-               dev_err(smi->dev, "failed to force 1Gbit on CPU port\n");
+               dev_err(priv->dev, "failed to force 1Gbit on CPU port\n");
                return;
        }
 
-       ret = regmap_update_bits(smi->map, RTL8366RB_PAACR2,
+       ret = regmap_update_bits(priv->map, RTL8366RB_PAACR2,
                                 0xFF00U,
                                 RTL8366RB_PAACR_CPU_PORT << 8);
        if (ret) {
-               dev_err(smi->dev, "failed to set PAACR on CPU port\n");
+               dev_err(priv->dev, "failed to set PAACR on CPU port\n");
                return;
        }
 
        /* Enable the CPU port */
-       ret = regmap_update_bits(smi->map, RTL8366RB_PECR, BIT(port),
+       ret = regmap_update_bits(priv->map, RTL8366RB_PECR, BIT(port),
                                 0);
        if (ret) {
-               dev_err(smi->dev, "failed to enable the CPU port\n");
+               dev_err(priv->dev, "failed to enable the CPU port\n");
                return;
        }
 }
@@ -1089,99 +1091,99 @@ static void
 rtl8366rb_mac_link_down(struct dsa_switch *ds, int port, unsigned int mode,
                        phy_interface_t interface)
 {
-       struct realtek_smi *smi = ds->priv;
+       struct realtek_priv *priv = ds->priv;
        int ret;
 
-       if (port != smi->cpu_port)
+       if (port != priv->cpu_port)
                return;
 
-       dev_dbg(smi->dev, "MAC link down on CPU port (%d)\n", port);
+       dev_dbg(priv->dev, "MAC link down on CPU port (%d)\n", port);
 
        /* Disable the CPU port */
-       ret = regmap_update_bits(smi->map, RTL8366RB_PECR, BIT(port),
+       ret = regmap_update_bits(priv->map, RTL8366RB_PECR, BIT(port),
                                 BIT(port));
        if (ret) {
-               dev_err(smi->dev, "failed to disable the CPU port\n");
+               dev_err(priv->dev, "failed to disable the CPU port\n");
                return;
        }
 }
 
-static void rb8366rb_set_port_led(struct realtek_smi *smi,
+static void rb8366rb_set_port_led(struct realtek_priv *priv,
                                  int port, bool enable)
 {
        u16 val = enable ? 0x3f : 0;
        int ret;
 
-       if (smi->leds_disabled)
+       if (priv->leds_disabled)
                return;
 
        switch (port) {
        case 0:
-               ret = regmap_update_bits(smi->map,
+               ret = regmap_update_bits(priv->map,
                                         RTL8366RB_LED_0_1_CTRL_REG,
                                         0x3F, val);
                break;
        case 1:
-               ret = regmap_update_bits(smi->map,
+               ret = regmap_update_bits(priv->map,
                                         RTL8366RB_LED_0_1_CTRL_REG,
                                         0x3F << RTL8366RB_LED_1_OFFSET,
                                         val << RTL8366RB_LED_1_OFFSET);
                break;
        case 2:
-               ret = regmap_update_bits(smi->map,
+               ret = regmap_update_bits(priv->map,
                                         RTL8366RB_LED_2_3_CTRL_REG,
                                         0x3F, val);
                break;
        case 3:
-               ret = regmap_update_bits(smi->map,
+               ret = regmap_update_bits(priv->map,
                                         RTL8366RB_LED_2_3_CTRL_REG,
                                         0x3F << RTL8366RB_LED_3_OFFSET,
                                         val << RTL8366RB_LED_3_OFFSET);
                break;
        case 4:
-               ret = regmap_update_bits(smi->map,
+               ret = regmap_update_bits(priv->map,
                                         RTL8366RB_INTERRUPT_CONTROL_REG,
                                         RTL8366RB_P4_RGMII_LED,
                                         enable ? RTL8366RB_P4_RGMII_LED : 0);
                break;
        default:
-               dev_err(smi->dev, "no LED for port %d\n", port);
+               dev_err(priv->dev, "no LED for port %d\n", port);
                return;
        }
        if (ret)
-               dev_err(smi->dev, "error updating LED on port %d\n", port);
+               dev_err(priv->dev, "error updating LED on port %d\n", port);
 }
 
 static int
 rtl8366rb_port_enable(struct dsa_switch *ds, int port,
                      struct phy_device *phy)
 {
-       struct realtek_smi *smi = ds->priv;
+       struct realtek_priv *priv = ds->priv;
        int ret;
 
-       dev_dbg(smi->dev, "enable port %d\n", port);
-       ret = regmap_update_bits(smi->map, RTL8366RB_PECR, BIT(port),
+       dev_dbg(priv->dev, "enable port %d\n", port);
+       ret = regmap_update_bits(priv->map, RTL8366RB_PECR, BIT(port),
                                 0);
        if (ret)
                return ret;
 
-       rb8366rb_set_port_led(smi, port, true);
+       rb8366rb_set_port_led(priv, port, true);
        return 0;
 }
 
 static void
 rtl8366rb_port_disable(struct dsa_switch *ds, int port)
 {
-       struct realtek_smi *smi = ds->priv;
+       struct realtek_priv *priv = ds->priv;
        int ret;
 
-       dev_dbg(smi->dev, "disable port %d\n", port);
-       ret = regmap_update_bits(smi->map, RTL8366RB_PECR, BIT(port),
+       dev_dbg(priv->dev, "disable port %d\n", port);
+       ret = regmap_update_bits(priv->map, RTL8366RB_PECR, BIT(port),
                                 BIT(port));
        if (ret)
                return;
 
-       rb8366rb_set_port_led(smi, port, false);
+       rb8366rb_set_port_led(priv, port, false);
 }
 
 static int
@@ -1189,7 +1191,7 @@ rtl8366rb_port_bridge_join(struct dsa_switch *ds, int port,
                           struct dsa_bridge bridge,
                           bool *tx_fwd_offload)
 {
-       struct realtek_smi *smi = ds->priv;
+       struct realtek_priv *priv = ds->priv;
        unsigned int port_bitmap = 0;
        int ret, i;
 
@@ -1202,17 +1204,17 @@ rtl8366rb_port_bridge_join(struct dsa_switch *ds, int port,
                if (!dsa_port_offloads_bridge(dsa_to_port(ds, i), &bridge))
                        continue;
                /* Join this port to each other port on the bridge */
-               ret = regmap_update_bits(smi->map, RTL8366RB_PORT_ISO(i),
+               ret = regmap_update_bits(priv->map, RTL8366RB_PORT_ISO(i),
                                         RTL8366RB_PORT_ISO_PORTS(BIT(port)),
                                         RTL8366RB_PORT_ISO_PORTS(BIT(port)));
                if (ret)
-                       dev_err(smi->dev, "failed to join port %d\n", port);
+                       dev_err(priv->dev, "failed to join port %d\n", port);
 
                port_bitmap |= BIT(i);
        }
 
        /* Set the bits for the ports we can access */
-       return regmap_update_bits(smi->map, RTL8366RB_PORT_ISO(port),
+       return regmap_update_bits(priv->map, RTL8366RB_PORT_ISO(port),
                                  RTL8366RB_PORT_ISO_PORTS(port_bitmap),
                                  RTL8366RB_PORT_ISO_PORTS(port_bitmap));
 }
@@ -1221,7 +1223,7 @@ static void
 rtl8366rb_port_bridge_leave(struct dsa_switch *ds, int port,
                            struct dsa_bridge bridge)
 {
-       struct realtek_smi *smi = ds->priv;
+       struct realtek_priv *priv = ds->priv;
        unsigned int port_bitmap = 0;
        int ret, i;
 
@@ -1234,28 +1236,30 @@ rtl8366rb_port_bridge_leave(struct dsa_switch *ds, int port,
                if (!dsa_port_offloads_bridge(dsa_to_port(ds, i), &bridge))
                        continue;
                /* Remove this port from any other port on the bridge */
-               ret = regmap_update_bits(smi->map, RTL8366RB_PORT_ISO(i),
+               ret = regmap_update_bits(priv->map, RTL8366RB_PORT_ISO(i),
                                         RTL8366RB_PORT_ISO_PORTS(BIT(port)), 0);
                if (ret)
-                       dev_err(smi->dev, "failed to leave port %d\n", port);
+                       dev_err(priv->dev, "failed to leave port %d\n", port);
 
                port_bitmap |= BIT(i);
        }
 
        /* Clear the bits for the ports we cannot access, leave ourselves */
-       regmap_update_bits(smi->map, RTL8366RB_PORT_ISO(port),
+       regmap_update_bits(priv->map, RTL8366RB_PORT_ISO(port),
                           RTL8366RB_PORT_ISO_PORTS(port_bitmap), 0);
 }
 
 /**
  * rtl8366rb_drop_untagged() - make the switch drop untagged and C-tagged frames
- * @smi: SMI state container
+ * @priv: driver state container
  * @port: the port to drop untagged and C-tagged frames on
  * @drop: whether to drop or pass untagged and C-tagged frames
+ *
+ * Return: zero for success, a negative number on error.
  */
-static int rtl8366rb_drop_untagged(struct realtek_smi *smi, int port, bool drop)
+static int rtl8366rb_drop_untagged(struct realtek_priv *priv, int port, bool drop)
 {
-       return regmap_update_bits(smi->map, RTL8366RB_VLAN_INGRESS_CTRL1_REG,
+       return regmap_update_bits(priv->map, RTL8366RB_VLAN_INGRESS_CTRL1_REG,
                                  RTL8366RB_VLAN_INGRESS_CTRL1_DROP(port),
                                  drop ? RTL8366RB_VLAN_INGRESS_CTRL1_DROP(port) : 0);
 }
@@ -1264,17 +1268,17 @@ static int rtl8366rb_vlan_filtering(struct dsa_switch *ds, int port,
                                    bool vlan_filtering,
                                    struct netlink_ext_ack *extack)
 {
-       struct realtek_smi *smi = ds->priv;
+       struct realtek_priv *priv = ds->priv;
        struct rtl8366rb *rb;
        int ret;
 
-       rb = smi->chip_data;
+       rb = priv->chip_data;
 
-       dev_dbg(smi->dev, "port %d: %s VLAN filtering\n", port,
+       dev_dbg(priv->dev, "port %d: %s VLAN filtering\n", port,
                vlan_filtering ? "enable" : "disable");
 
        /* If the port is not in the member set, the frame will be dropped */
-       ret = regmap_update_bits(smi->map, RTL8366RB_VLAN_INGRESS_CTRL2_REG,
+       ret = regmap_update_bits(priv->map, RTL8366RB_VLAN_INGRESS_CTRL2_REG,
                                 BIT(port), vlan_filtering ? BIT(port) : 0);
        if (ret)
                return ret;
@@ -1284,9 +1288,9 @@ static int rtl8366rb_vlan_filtering(struct dsa_switch *ds, int port,
         * filtering on a port, we need to accept any frames.
         */
        if (vlan_filtering)
-               ret = rtl8366rb_drop_untagged(smi, port, !rb->pvid_enabled[port]);
+               ret = rtl8366rb_drop_untagged(priv, port, !rb->pvid_enabled[port]);
        else
-               ret = rtl8366rb_drop_untagged(smi, port, false);
+               ret = rtl8366rb_drop_untagged(priv, port, false);
 
        return ret;
 }
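
The policy encoded above is worth spelling out: untagged and C-tagged frames are dropped only when VLAN filtering is enabled on a port that has no PVID; with filtering off the switch must accept everything. As a one-line decision helper (a sketch of the policy, not driver code):

#include <stdbool.h>
#include <stdio.h>

/* Drop untagged frames only when filtering without a PVID. */
static bool should_drop_untagged(bool vlan_filtering, bool pvid_enabled)
{
	return vlan_filtering && !pvid_enabled;
}

int main(void)
{
	printf("%d %d %d %d\n",
	       should_drop_untagged(false, false),
	       should_drop_untagged(false, true),
	       should_drop_untagged(true, false),
	       should_drop_untagged(true, true));
	return 0;
}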
@@ -1308,11 +1312,11 @@ rtl8366rb_port_bridge_flags(struct dsa_switch *ds, int port,
                            struct switchdev_brport_flags flags,
                            struct netlink_ext_ack *extack)
 {
-       struct realtek_smi *smi = ds->priv;
+       struct realtek_priv *priv = ds->priv;
        int ret;
 
        if (flags.mask & BR_LEARNING) {
-               ret = regmap_update_bits(smi->map, RTL8366RB_PORT_LEARNDIS_CTRL,
+               ret = regmap_update_bits(priv->map, RTL8366RB_PORT_LEARNDIS_CTRL,
                                         BIT(port),
                                         (flags.val & BR_LEARNING) ? 0 : BIT(port));
                if (ret)
@@ -1325,7 +1329,7 @@ rtl8366rb_port_bridge_flags(struct dsa_switch *ds, int port,
 static void
 rtl8366rb_port_stp_state_set(struct dsa_switch *ds, int port, u8 state)
 {
-       struct realtek_smi *smi = ds->priv;
+       struct realtek_priv *priv = ds->priv;
        u32 val;
        int i;
 
@@ -1344,13 +1348,13 @@ rtl8366rb_port_stp_state_set(struct dsa_switch *ds, int port, u8 state)
                val = RTL8366RB_STP_STATE_FORWARDING;
                break;
        default:
-               dev_err(smi->dev, "unknown bridge state requested\n");
+               dev_err(priv->dev, "unknown bridge state requested\n");
                return;
        }
 
        /* Set the same status for the port on all the FIDs */
        for (i = 0; i < RTL8366RB_NUM_FIDS; i++) {
-               regmap_update_bits(smi->map, RTL8366RB_STP_STATE_BASE + i,
+               regmap_update_bits(priv->map, RTL8366RB_STP_STATE_BASE + i,
                                   RTL8366RB_STP_STATE_MASK(port),
                                   RTL8366RB_STP_STATE(port, val));
        }
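
The per-FID loop writes the same state into every filtering database. The MASK/STATE macros suggest a small per-port bit field inside each 16-bit register; a standalone sketch of that packing, assuming two bits per port (four STP states fit in two bits):

#include <stdio.h>

/* Assumed layout: a two-bit STP state per port. */
#define STP_STATE_SHIFT(port)	((port) * 2)
#define STP_STATE_MASK(port)	(3U << STP_STATE_SHIFT(port))
#define STP_STATE(port, val)	((unsigned)(val) << STP_STATE_SHIFT(port))

int main(void)
{
	unsigned int reg = 0;
	int port = 3, val = 2;	/* e.g. "learning" */

	reg = (reg & ~STP_STATE_MASK(port)) | STP_STATE(port, val);
	printf("reg = 0x%04x\n", reg);
	return 0;
}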
@@ -1359,26 +1363,26 @@ rtl8366rb_port_stp_state_set(struct dsa_switch *ds, int port, u8 state)
 static void
 rtl8366rb_port_fast_age(struct dsa_switch *ds, int port)
 {
-       struct realtek_smi *smi = ds->priv;
+       struct realtek_priv *priv = ds->priv;
 
        /* This will age out any learned L2 entries */
-       regmap_update_bits(smi->map, RTL8366RB_SECURITY_CTRL,
+       regmap_update_bits(priv->map, RTL8366RB_SECURITY_CTRL,
                           BIT(port), BIT(port));
        /* Restore the normal state of things */
-       regmap_update_bits(smi->map, RTL8366RB_SECURITY_CTRL,
+       regmap_update_bits(priv->map, RTL8366RB_SECURITY_CTRL,
                           BIT(port), 0);
 }
 
 static int rtl8366rb_change_mtu(struct dsa_switch *ds, int port, int new_mtu)
 {
-       struct realtek_smi *smi = ds->priv;
+       struct realtek_priv *priv = ds->priv;
        struct rtl8366rb *rb;
        unsigned int max_mtu;
        u32 len;
        int i;
 
        /* Cache the per-port MTU setting */
-       rb = smi->chip_data;
+       rb = priv->chip_data;
        rb->max_mtu[port] = new_mtu;
 
        /* Roof out the MTU for the entire switch to the greatest
@@ -1406,7 +1410,7 @@ static int rtl8366rb_change_mtu(struct dsa_switch *ds, int port, int new_mtu)
        else
                len = RTL8366RB_SGCR_MAX_LENGTH_16000;
 
-       return regmap_update_bits(smi->map, RTL8366RB_SGCR,
+       return regmap_update_bits(priv->map, RTL8366RB_SGCR,
                                  RTL8366RB_SGCR_MAX_LENGTH_MASK,
                                  len);
 }
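
So the MTU path is: cache the per-port value, take the maximum across all ports, then round the resulting frame length up to the next hardware bucket. A standalone sketch of that roofing; the 1522/1552 bucket values are assumed from the driver's other SGCR length settings, since only the 1536 and 16000 cases appear in this hunk:

#include <stdio.h>

static const int buckets[] = { 1522, 1536, 1552, 16000 };

/* Round a needed frame length up to the next supported bucket. */
static int roof_len(int frame_len)
{
	unsigned int i;

	for (i = 0; i < sizeof(buckets) / sizeof(buckets[0]); i++)
		if (frame_len <= buckets[i])
			return buckets[i];
	return -1;	/* cannot satisfy */
}

int main(void)
{
	int max_mtu[6] = { 1500, 1500, 1532, 1500, 1500, 1500 };
	int i, max = 0;

	for (i = 0; i < 6; i++)
		if (max_mtu[i] > max)
			max = max_mtu[i];

	/* MTU plus Ethernet header, FCS and CPU tag, roughly as the driver adds. */
	printf("roofed length = %d\n", roof_len(max + 18 + 4));
	return 0;
}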
@@ -1419,7 +1423,7 @@ static int rtl8366rb_max_mtu(struct dsa_switch *ds, int port)
        return 15996;
 }
 
-static int rtl8366rb_get_vlan_4k(struct realtek_smi *smi, u32 vid,
+static int rtl8366rb_get_vlan_4k(struct realtek_priv *priv, u32 vid,
                                 struct rtl8366_vlan_4k *vlan4k)
 {
        u32 data[3];
@@ -1432,19 +1436,19 @@ static int rtl8366rb_get_vlan_4k(struct realtek_smi *smi, u32 vid,
                return -EINVAL;
 
        /* write VID */
-       ret = regmap_write(smi->map, RTL8366RB_VLAN_TABLE_WRITE_BASE,
+       ret = regmap_write(priv->map, RTL8366RB_VLAN_TABLE_WRITE_BASE,
                           vid & RTL8366RB_VLAN_VID_MASK);
        if (ret)
                return ret;
 
        /* write table access control word */
-       ret = regmap_write(smi->map, RTL8366RB_TABLE_ACCESS_CTRL_REG,
+       ret = regmap_write(priv->map, RTL8366RB_TABLE_ACCESS_CTRL_REG,
                           RTL8366RB_TABLE_VLAN_READ_CTRL);
        if (ret)
                return ret;
 
        for (i = 0; i < 3; i++) {
-               ret = regmap_read(smi->map,
+               ret = regmap_read(priv->map,
                                  RTL8366RB_VLAN_TABLE_READ_BASE + i,
                                  &data[i]);
                if (ret)
@@ -1460,7 +1464,7 @@ static int rtl8366rb_get_vlan_4k(struct realtek_smi *smi, u32 vid,
        return 0;
 }
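
Reading the 4K VLAN table is a three-step indirect access: select the VID via the write-base register, trigger the lookup through the table-access control word, then collect three data words. A compact model with stubbed register I/O (the register numbers below are placeholders, not the chip's):

#include <stdint.h>
#include <stdio.h>

static uint32_t fake_regs[0x10000];

static int reg_write(uint16_t reg, uint32_t val) { fake_regs[reg] = val; return 0; }
static int reg_read(uint16_t reg, uint32_t *val) { *val = fake_regs[reg]; return 0; }

/* Placeholder register numbers for the demo only. */
#define VLAN_TABLE_WRITE_BASE	0x0185
#define VLAN_TABLE_READ_BASE	0x018c
#define TABLE_ACCESS_CTRL_REG	0x0180
#define TABLE_VLAN_READ_CTRL	0x0e01

int main(void)
{
	uint32_t data[3];
	int i;

	reg_write(VLAN_TABLE_WRITE_BASE, 100 & 0xfff);		 /* 1. select VID */
	reg_write(TABLE_ACCESS_CTRL_REG, TABLE_VLAN_READ_CTRL);  /* 2. trigger */
	for (i = 0; i < 3; i++)					 /* 3. collect */
		reg_read(VLAN_TABLE_READ_BASE + i, &data[i]);

	printf("%04x %04x %04x\n", (unsigned)data[0], (unsigned)data[1],
	       (unsigned)data[2]);
	return 0;
}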
 
-static int rtl8366rb_set_vlan_4k(struct realtek_smi *smi,
+static int rtl8366rb_set_vlan_4k(struct realtek_priv *priv,
                                 const struct rtl8366_vlan_4k *vlan4k)
 {
        u32 data[3];
@@ -1480,7 +1484,7 @@ static int rtl8366rb_set_vlan_4k(struct realtek_smi *smi,
        data[2] = vlan4k->fid & RTL8366RB_VLAN_FID_MASK;
 
        for (i = 0; i < 3; i++) {
-               ret = regmap_write(smi->map,
+               ret = regmap_write(priv->map,
                                   RTL8366RB_VLAN_TABLE_WRITE_BASE + i,
                                   data[i]);
                if (ret)
@@ -1488,13 +1492,13 @@ static int rtl8366rb_set_vlan_4k(struct realtek_smi *smi,
        }
 
        /* write table access control word */
-       ret = regmap_write(smi->map, RTL8366RB_TABLE_ACCESS_CTRL_REG,
+       ret = regmap_write(priv->map, RTL8366RB_TABLE_ACCESS_CTRL_REG,
                           RTL8366RB_TABLE_VLAN_WRITE_CTRL);
 
        return ret;
 }
 
-static int rtl8366rb_get_vlan_mc(struct realtek_smi *smi, u32 index,
+static int rtl8366rb_get_vlan_mc(struct realtek_priv *priv, u32 index,
                                 struct rtl8366_vlan_mc *vlanmc)
 {
        u32 data[3];
@@ -1507,7 +1511,7 @@ static int rtl8366rb_get_vlan_mc(struct realtek_smi *smi, u32 index,
                return -EINVAL;
 
        for (i = 0; i < 3; i++) {
-               ret = regmap_read(smi->map,
+               ret = regmap_read(priv->map,
                                  RTL8366RB_VLAN_MC_BASE(index) + i,
                                  &data[i]);
                if (ret)
@@ -1525,7 +1529,7 @@ static int rtl8366rb_get_vlan_mc(struct realtek_smi *smi, u32 index,
        return 0;
 }
 
-static int rtl8366rb_set_vlan_mc(struct realtek_smi *smi, u32 index,
+static int rtl8366rb_set_vlan_mc(struct realtek_priv *priv, u32 index,
                                 const struct rtl8366_vlan_mc *vlanmc)
 {
        u32 data[3];
@@ -1549,7 +1553,7 @@ static int rtl8366rb_set_vlan_mc(struct realtek_smi *smi, u32 index,
        data[2] = vlanmc->fid & RTL8366RB_VLAN_FID_MASK;
 
        for (i = 0; i < 3; i++) {
-               ret = regmap_write(smi->map,
+               ret = regmap_write(priv->map,
                                   RTL8366RB_VLAN_MC_BASE(index) + i,
                                   data[i]);
                if (ret)
@@ -1559,15 +1563,15 @@ static int rtl8366rb_set_vlan_mc(struct realtek_smi *smi, u32 index,
        return 0;
 }
 
-static int rtl8366rb_get_mc_index(struct realtek_smi *smi, int port, int *val)
+static int rtl8366rb_get_mc_index(struct realtek_priv *priv, int port, int *val)
 {
        u32 data;
        int ret;
 
-       if (port >= smi->num_ports)
+       if (port >= priv->num_ports)
                return -EINVAL;
 
-       ret = regmap_read(smi->map, RTL8366RB_PORT_VLAN_CTRL_REG(port),
+       ret = regmap_read(priv->map, RTL8366RB_PORT_VLAN_CTRL_REG(port),
                          &data);
        if (ret)
                return ret;
@@ -1578,22 +1582,22 @@ static int rtl8366rb_get_mc_index(struct realtek_smi *smi, int port, int *val)
        return 0;
 }
 
-static int rtl8366rb_set_mc_index(struct realtek_smi *smi, int port, int index)
+static int rtl8366rb_set_mc_index(struct realtek_priv *priv, int port, int index)
 {
        struct rtl8366rb *rb;
        bool pvid_enabled;
        int ret;
 
-       rb = smi->chip_data;
+       rb = priv->chip_data;
        pvid_enabled = !!index;
 
-       if (port >= smi->num_ports || index >= RTL8366RB_NUM_VLANS)
+       if (port >= priv->num_ports || index >= RTL8366RB_NUM_VLANS)
                return -EINVAL;
 
-       ret = regmap_update_bits(smi->map, RTL8366RB_PORT_VLAN_CTRL_REG(port),
-                               RTL8366RB_PORT_VLAN_CTRL_MASK <<
+       ret = regmap_update_bits(priv->map, RTL8366RB_PORT_VLAN_CTRL_REG(port),
+                                RTL8366RB_PORT_VLAN_CTRL_MASK <<
                                        RTL8366RB_PORT_VLAN_CTRL_SHIFT(port),
-                               (index & RTL8366RB_PORT_VLAN_CTRL_MASK) <<
+                                (index & RTL8366RB_PORT_VLAN_CTRL_MASK) <<
                                        RTL8366RB_PORT_VLAN_CTRL_SHIFT(port));
        if (ret)
                return ret;
@@ -1604,17 +1608,17 @@ static int rtl8366rb_set_mc_index(struct realtek_smi *smi, int port, int index)
         * not drop any untagged or C-tagged frames. Make sure to update the
         * filtering setting.
         */
-       if (dsa_port_is_vlan_filtering(dsa_to_port(smi->ds, port)))
-               ret = rtl8366rb_drop_untagged(smi, port, !pvid_enabled);
+       if (dsa_port_is_vlan_filtering(dsa_to_port(priv->ds, port)))
+               ret = rtl8366rb_drop_untagged(priv, port, !pvid_enabled);
 
        return ret;
 }
 
-static bool rtl8366rb_is_vlan_valid(struct realtek_smi *smi, unsigned int vlan)
+static bool rtl8366rb_is_vlan_valid(struct realtek_priv *priv, unsigned int vlan)
 {
        unsigned int max = RTL8366RB_NUM_VLANS - 1;
 
-       if (smi->vlan4k_enabled)
+       if (priv->vlan4k_enabled)
                max = RTL8366RB_NUM_VIDS - 1;
 
        if (vlan > max)
@@ -1623,23 +1627,23 @@ static bool rtl8366rb_is_vlan_valid(struct realtek_smi *smi, unsigned int vlan)
        return true;
 }
 
-static int rtl8366rb_enable_vlan(struct realtek_smi *smi, bool enable)
+static int rtl8366rb_enable_vlan(struct realtek_priv *priv, bool enable)
 {
-       dev_dbg(smi->dev, "%s VLAN\n", enable ? "enable" : "disable");
-       return regmap_update_bits(smi->map,
+       dev_dbg(priv->dev, "%s VLAN\n", enable ? "enable" : "disable");
+       return regmap_update_bits(priv->map,
                                  RTL8366RB_SGCR, RTL8366RB_SGCR_EN_VLAN,
                                  enable ? RTL8366RB_SGCR_EN_VLAN : 0);
 }
 
-static int rtl8366rb_enable_vlan4k(struct realtek_smi *smi, bool enable)
+static int rtl8366rb_enable_vlan4k(struct realtek_priv *priv, bool enable)
 {
-       dev_dbg(smi->dev, "%s VLAN 4k\n", enable ? "enable" : "disable");
-       return regmap_update_bits(smi->map, RTL8366RB_SGCR,
+       dev_dbg(priv->dev, "%s VLAN 4k\n", enable ? "enable" : "disable");
+       return regmap_update_bits(priv->map, RTL8366RB_SGCR,
                                  RTL8366RB_SGCR_EN_VLAN_4KTB,
                                  enable ? RTL8366RB_SGCR_EN_VLAN_4KTB : 0);
 }
 
-static int rtl8366rb_phy_read(struct realtek_smi *smi, int phy, int regnum)
+static int rtl8366rb_phy_read(struct realtek_priv *priv, int phy, int regnum)
 {
        u32 val;
        u32 reg;
@@ -1648,32 +1652,32 @@ static int rtl8366rb_phy_read(struct realtek_smi *smi, int phy, int regnum)
        if (phy > RTL8366RB_PHY_NO_MAX)
                return -EINVAL;
 
-       ret = regmap_write(smi->map, RTL8366RB_PHY_ACCESS_CTRL_REG,
+       ret = regmap_write(priv->map, RTL8366RB_PHY_ACCESS_CTRL_REG,
                           RTL8366RB_PHY_CTRL_READ);
        if (ret)
                return ret;
 
        reg = 0x8000 | (1 << (phy + RTL8366RB_PHY_NO_OFFSET)) | regnum;
 
-       ret = regmap_write(smi->map, reg, 0);
+       ret = regmap_write(priv->map, reg, 0);
        if (ret) {
-               dev_err(smi->dev,
+               dev_err(priv->dev,
                        "failed to write PHY%d reg %04x @ %04x, ret %d\n",
                        phy, regnum, reg, ret);
                return ret;
        }
 
-       ret = regmap_read(smi->map, RTL8366RB_PHY_ACCESS_DATA_REG, &val);
+       ret = regmap_read(priv->map, RTL8366RB_PHY_ACCESS_DATA_REG, &val);
        if (ret)
                return ret;
 
-       dev_dbg(smi->dev, "read PHY%d register 0x%04x @ %08x, val <- %04x\n",
+       dev_dbg(priv->dev, "read PHY%d register 0x%04x @ %08x, val <- %04x\n",
                phy, regnum, reg, val);
 
        return val;
 }
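
The PHY access above encodes its target into a synthetic register address: bit 15 set, a one-hot PHY select shifted by the PHY-number offset, and the low bits carrying regnum. In isolation (the offset value is assumed; the formula is the one in the hunk):

#include <stdint.h>
#include <stdio.h>

#define PHY_NO_OFFSET	9	/* assumed value of RTL8366RB_PHY_NO_OFFSET */

static uint16_t phy_access_reg(int phy, int regnum)
{
	return 0x8000 | (1 << (phy + PHY_NO_OFFSET)) | regnum;
}

int main(void)
{
	/* e.g. PHY 2, MII register 1 (BMSR) */
	printf("access reg = 0x%04x\n", (unsigned)phy_access_reg(2, 1));
	return 0;
}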
 
-static int rtl8366rb_phy_write(struct realtek_smi *smi, int phy, int regnum,
+static int rtl8366rb_phy_write(struct realtek_priv *priv, int phy, int regnum,
                               u16 val)
 {
        u32 reg;
@@ -1682,34 +1686,45 @@ static int rtl8366rb_phy_write(struct realtek_smi *smi, int phy, int regnum,
        if (phy > RTL8366RB_PHY_NO_MAX)
                return -EINVAL;
 
-       ret = regmap_write(smi->map, RTL8366RB_PHY_ACCESS_CTRL_REG,
+       ret = regmap_write(priv->map, RTL8366RB_PHY_ACCESS_CTRL_REG,
                           RTL8366RB_PHY_CTRL_WRITE);
        if (ret)
                return ret;
 
        reg = 0x8000 | (1 << (phy + RTL8366RB_PHY_NO_OFFSET)) | regnum;
 
-       dev_dbg(smi->dev, "write PHY%d register 0x%04x @ %04x, val -> %04x\n",
+       dev_dbg(priv->dev, "write PHY%d register 0x%04x @ %04x, val -> %04x\n",
                phy, regnum, reg, val);
 
-       ret = regmap_write(smi->map, reg, val);
+       ret = regmap_write(priv->map, reg, val);
        if (ret)
                return ret;
 
        return 0;
 }
 
-static int rtl8366rb_reset_chip(struct realtek_smi *smi)
+static int rtl8366rb_dsa_phy_read(struct dsa_switch *ds, int phy, int regnum)
+{
+       return rtl8366rb_phy_read(ds->priv, phy, regnum);
+}
+
+static int rtl8366rb_dsa_phy_write(struct dsa_switch *ds, int phy, int regnum,
+                                  u16 val)
+{
+       return rtl8366rb_phy_write(ds->priv, phy, regnum, val);
+}
+
+static int rtl8366rb_reset_chip(struct realtek_priv *priv)
 {
        int timeout = 10;
        u32 val;
        int ret;
 
-       realtek_smi_write_reg_noack(smi, RTL8366RB_RESET_CTRL_REG,
-                                   RTL8366RB_CHIP_CTRL_RESET_HW);
+       priv->write_reg_noack(priv, RTL8366RB_RESET_CTRL_REG,
+                             RTL8366RB_CHIP_CTRL_RESET_HW);
        do {
                usleep_range(20000, 25000);
-               ret = regmap_read(smi->map, RTL8366RB_RESET_CTRL_REG, &val);
+               ret = regmap_read(priv->map, RTL8366RB_RESET_CTRL_REG, &val);
                if (ret)
                        return ret;
 
@@ -1718,21 +1733,21 @@ static int rtl8366rb_reset_chip(struct realtek_smi *smi)
        } while (--timeout);
 
        if (!timeout) {
-               dev_err(smi->dev, "timeout waiting for the switch to reset\n");
+               dev_err(priv->dev, "timeout waiting for the switch to reset\n");
                return -EIO;
        }
 
        return 0;
 }
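
The reset path is the usual bounded-poll idiom: kick the reset bit with a no-ack write, since the chip may stop answering mid-reset, then poll until the bit clears or the retry budget runs out. A generic userspace model of the loop:

#include <stdbool.h>
#include <stdio.h>

static int polls_left = 3;

/* Stand-in for reading the reset-control register. */
static bool reset_pending(void)
{
	return --polls_left > 0;
}

int main(void)
{
	int timeout = 10;

	do {
		/* usleep_range(20000, 25000) in the driver */
		if (!reset_pending())
			break;
	} while (--timeout);

	if (!timeout) {
		fprintf(stderr, "timeout waiting for the switch to reset\n");
		return 1;
	}
	printf("reset complete\n");
	return 0;
}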
 
-static int rtl8366rb_detect(struct realtek_smi *smi)
+static int rtl8366rb_detect(struct realtek_priv *priv)
 {
-       struct device *dev = smi->dev;
+       struct device *dev = priv->dev;
        int ret;
        u32 val;
 
        /* Detect device */
-       ret = regmap_read(smi->map, 0x5c, &val);
+       ret = regmap_read(priv->map, 0x5c, &val);
        if (ret) {
                dev_err(dev, "can't get chip ID (%d)\n", ret);
                return ret;
@@ -1745,11 +1760,11 @@ static int rtl8366rb_detect(struct realtek_smi *smi)
                return -ENODEV;
        case 0x5937:
                dev_info(dev, "found an RTL8366RB switch\n");
-               smi->cpu_port = RTL8366RB_PORT_NUM_CPU;
-               smi->num_ports = RTL8366RB_NUM_PORTS;
-               smi->num_vlan_mc = RTL8366RB_NUM_VLANS;
-               smi->mib_counters = rtl8366rb_mib_counters;
-               smi->num_mib_counters = ARRAY_SIZE(rtl8366rb_mib_counters);
+               priv->cpu_port = RTL8366RB_PORT_NUM_CPU;
+               priv->num_ports = RTL8366RB_NUM_PORTS;
+               priv->num_vlan_mc = RTL8366RB_NUM_VLANS;
+               priv->mib_counters = rtl8366rb_mib_counters;
+               priv->num_mib_counters = ARRAY_SIZE(rtl8366rb_mib_counters);
                break;
        default:
                dev_info(dev, "found an Unknown Realtek switch (id=0x%04x)\n",
@@ -1757,14 +1772,14 @@ static int rtl8366rb_detect(struct realtek_smi *smi)
                break;
        }
 
-       ret = rtl8366rb_reset_chip(smi);
+       ret = rtl8366rb_reset_chip(priv);
        if (ret)
                return ret;
 
        return 0;
 }
 
-static const struct dsa_switch_ops rtl8366rb_switch_ops = {
+static const struct dsa_switch_ops rtl8366rb_switch_ops_smi = {
        .get_tag_protocol = rtl8366_get_tag_protocol,
        .setup = rtl8366rb_setup,
        .phylink_mac_link_up = rtl8366rb_mac_link_up,
@@ -1787,7 +1802,32 @@ static const struct dsa_switch_ops rtl8366rb_switch_ops = {
        .port_max_mtu = rtl8366rb_max_mtu,
 };
 
-static const struct realtek_smi_ops rtl8366rb_smi_ops = {
+static const struct dsa_switch_ops rtl8366rb_switch_ops_mdio = {
+       .get_tag_protocol = rtl8366_get_tag_protocol,
+       .setup = rtl8366rb_setup,
+       .phy_read = rtl8366rb_dsa_phy_read,
+       .phy_write = rtl8366rb_dsa_phy_write,
+       .phylink_mac_link_up = rtl8366rb_mac_link_up,
+       .phylink_mac_link_down = rtl8366rb_mac_link_down,
+       .get_strings = rtl8366_get_strings,
+       .get_ethtool_stats = rtl8366_get_ethtool_stats,
+       .get_sset_count = rtl8366_get_sset_count,
+       .port_bridge_join = rtl8366rb_port_bridge_join,
+       .port_bridge_leave = rtl8366rb_port_bridge_leave,
+       .port_vlan_filtering = rtl8366rb_vlan_filtering,
+       .port_vlan_add = rtl8366_vlan_add,
+       .port_vlan_del = rtl8366_vlan_del,
+       .port_enable = rtl8366rb_port_enable,
+       .port_disable = rtl8366rb_port_disable,
+       .port_pre_bridge_flags = rtl8366rb_port_pre_bridge_flags,
+       .port_bridge_flags = rtl8366rb_port_bridge_flags,
+       .port_stp_state_set = rtl8366rb_port_stp_state_set,
+       .port_fast_age = rtl8366rb_port_fast_age,
+       .port_change_mtu = rtl8366rb_change_mtu,
+       .port_max_mtu = rtl8366rb_max_mtu,
+};
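+
Only the MDIO flavour of the ops table wires up .phy_read/.phy_write; on SMI the PHYs sit behind the slave MDIO bus that setup_interface registers instead. A sketch of how a probe path might pick between the two tables (the selection logic is not part of this hunk; the names below are a model, not the driver's):

#include <stdio.h>

struct dsa_ops { const char *name; };

struct variant {
	const struct dsa_ops *ds_ops_smi;
	const struct dsa_ops *ds_ops_mdio;
};

static const struct dsa_ops ops_smi  = { "smi"  };
static const struct dsa_ops ops_mdio = { "mdio" };

static const struct variant rtl8366rb = {
	.ds_ops_smi  = &ops_smi,
	.ds_ops_mdio = &ops_mdio,
};

int main(void)
{
	int probed_on_mdio = 1;	/* decided by which bus driver matched */
	const struct dsa_ops *ops = probed_on_mdio ?
		rtl8366rb.ds_ops_mdio : rtl8366rb.ds_ops_smi;

	printf("using %s ops\n", ops->name);
	return 0;
}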
+
+static const struct realtek_ops rtl8366rb_ops = {
        .detect         = rtl8366rb_detect,
        .get_vlan_mc    = rtl8366rb_get_vlan_mc,
        .set_vlan_mc    = rtl8366rb_set_vlan_mc,
@@ -1803,12 +1843,17 @@ static const struct realtek_smi_ops rtl8366rb_smi_ops = {
        .phy_write      = rtl8366rb_phy_write,
 };
 
-const struct realtek_smi_variant rtl8366rb_variant = {
-       .ds_ops = &rtl8366rb_switch_ops,
-       .ops = &rtl8366rb_smi_ops,
+const struct realtek_variant rtl8366rb_variant = {
+       .ds_ops_smi = &rtl8366rb_switch_ops_smi,
+       .ds_ops_mdio = &rtl8366rb_switch_ops_mdio,
+       .ops = &rtl8366rb_ops,
        .clk_delay = 10,
        .cmd_read = 0xa9,
        .cmd_write = 0xa8,
        .chip_data_sz = sizeof(struct rtl8366rb),
 };
 EXPORT_SYMBOL_GPL(rtl8366rb_variant);
+
+MODULE_AUTHOR("Linus Walleij <linus.walleij@linaro.org>");
+MODULE_DESCRIPTION("Driver for RTL8366RB ethernet switch");
+MODULE_LICENSE("GPL");
index 0730352..bc06fe6 100644 (file)
@@ -442,34 +442,27 @@ static void xrs700x_teardown(struct dsa_switch *ds)
        cancel_delayed_work_sync(&priv->mib_work);
 }
 
-static void xrs700x_phylink_validate(struct dsa_switch *ds, int port,
-                                    unsigned long *supported,
-                                    struct phylink_link_state *state)
+static void xrs700x_phylink_get_caps(struct dsa_switch *ds, int port,
+                                    struct phylink_config *config)
 {
-       __ETHTOOL_DECLARE_LINK_MODE_MASK(mask) = { 0, };
-
        switch (port) {
        case 0:
+               __set_bit(PHY_INTERFACE_MODE_RMII,
+                         config->supported_interfaces);
+               config->mac_capabilities = MAC_10FD | MAC_100FD;
                break;
+
        case 1:
        case 2:
        case 3:
-               phylink_set(mask, 1000baseT_Full);
+               phy_interface_set_rgmii(config->supported_interfaces);
+               config->mac_capabilities = MAC_10FD | MAC_100FD | MAC_1000FD;
                break;
+
        default:
-               linkmode_zero(supported);
                dev_err(ds->dev, "Unsupported port: %i\n", port);
-               return;
+               break;
        }
-
-       phylink_set_port_modes(mask);
-
-       /* The switch only supports full duplex. */
-       phylink_set(mask, 10baseT_Full);
-       phylink_set(mask, 100baseT_Full);
-
-       linkmode_and(supported, supported, mask);
-       linkmode_and(state->advertising, state->advertising, mask);
 }
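
The conversion replaces imperative mask filtering with a declarative description: the driver states which interface modes and MAC capabilities each port has, and phylink derives the supported link modes. A standalone model of the per-port capability table (the flag values are illustrative, not phylink's):

#include <stdio.h>

/* Illustrative capability bits, standing in for MAC_10FD etc. */
#define CAP_10FD	(1U << 0)
#define CAP_100FD	(1U << 1)
#define CAP_1000FD	(1U << 2)

static unsigned int port_caps(int port)
{
	switch (port) {
	case 0:			/* RMII host port */
		return CAP_10FD | CAP_100FD;
	case 1: case 2: case 3:	/* RGMII user ports */
		return CAP_10FD | CAP_100FD | CAP_1000FD;
	default:
		return 0;	/* unsupported port */
	}
}

int main(void)
{
	int port;

	for (port = 0; port < 5; port++)
		printf("port %d caps = 0x%x\n", port, port_caps(port));
	return 0;
}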
 
 static void xrs700x_mac_link_up(struct dsa_switch *ds, int port,
@@ -703,7 +696,7 @@ static const struct dsa_switch_ops xrs700x_ops = {
        .setup                  = xrs700x_setup,
        .teardown               = xrs700x_teardown,
        .port_stp_state_set     = xrs700x_port_stp_state_set,
-       .phylink_validate       = xrs700x_phylink_validate,
+       .phylink_get_caps       = xrs700x_phylink_get_caps,
        .phylink_mac_link_up    = xrs700x_mac_link_up,
        .get_strings            = xrs700x_get_strings,
        .get_sset_count         = xrs700x_get_sset_count,
index 8aec5d9..ad57209 100644 (file)
@@ -138,11 +138,6 @@ MODULE_PARM_DESC(use_mmio, "Use MMIO (1) or PIO(0) to access the NIC. "
 module_param(rx_copybreak, int, 0);
 module_param(use_mmio, int, 0);
 
-#if defined(NETIF_F_TSO) && MAX_SKB_FRAGS > 32
-#warning Typhoon only supports 32 entries in its SG list for TSO, disabling TSO
-#undef NETIF_F_TSO
-#endif
-
 #if TXLO_ENTRIES <= (2 * MAX_SKB_FRAGS)
 #error TX ring too small!
 #endif
@@ -2261,9 +2256,28 @@ out:
        return mode;
 }
 
+#if MAX_SKB_FRAGS > 32
+
+#include <net/vxlan.h>
+
+static netdev_features_t typhoon_features_check(struct sk_buff *skb,
+                                               struct net_device *dev,
+                                               netdev_features_t features)
+{
+       if (skb_shinfo(skb)->nr_frags > 32 && skb_is_gso(skb))
+               features &= ~NETIF_F_GSO_MASK;
+
+       features = vlan_features_check(skb, features);
+       return vxlan_features_check(skb, features);
+}
+#endif
+
 static const struct net_device_ops typhoon_netdev_ops = {
        .ndo_open               = typhoon_open,
        .ndo_stop               = typhoon_close,
+#if MAX_SKB_FRAGS > 32
+       .ndo_features_check     = typhoon_features_check,
+#endif
        .ndo_start_xmit         = typhoon_start_tx,
        .ndo_set_rx_mode        = typhoon_set_rx_mode,
        .ndo_tx_timeout         = typhoon_tx_timeout,
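
The removed #warning/#undef block disabled TSO at build time whenever MAX_SKB_FRAGS exceeded the NIC's 32-entry SG limit; the ndo_features_check hook instead drops GSO per packet, only when a given skb really has too many fragments. The decision reduces to a small predicate (a model of the policy, not the kernel helper):

#include <stdbool.h>
#include <stdio.h>

#define HW_SG_LIMIT 32

/* Mirrors the typhoon_features_check() condition above. */
static bool must_disable_gso(int nr_frags, bool is_gso)
{
	return nr_frags > HW_SG_LIMIT && is_gso;
}

int main(void)
{
	printf("33 frags, GSO: %s\n",
	       must_disable_gso(33, true) ? "fall back to software GSO" : "ok");
	printf("10 frags, GSO: %s\n",
	       must_disable_gso(10, true) ? "fall back to software GSO" : "ok");
	return 0;
}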
index 537e6a8..fbf4588 100644 (file)
@@ -2413,11 +2413,13 @@ static void et131x_tx_dma_memory_free(struct et131x_adapter *adapter)
        kfree(tx_ring->tcb_ring);
 }
 
+#define MAX_TX_DESC_PER_PKT 24
+
 /* nic_send_packet - NIC specific send handler for version B silicon. */
 static int nic_send_packet(struct et131x_adapter *adapter, struct tcb *tcb)
 {
        u32 i;
-       struct tx_desc desc[24];
+       struct tx_desc desc[MAX_TX_DESC_PER_PKT];
        u32 frag = 0;
        u32 thiscopy, remainder;
        struct sk_buff *skb = tcb->skb;
@@ -2432,9 +2434,6 @@ static int nic_send_packet(struct et131x_adapter *adapter, struct tcb *tcb)
         * more than 5 fragments.
         */
 
-       /* nr_frags should be no more than 18. */
-       BUILD_BUG_ON(MAX_SKB_FRAGS + 1 > 23);
-
        memset(desc, 0, sizeof(struct tx_desc) * (nr_frags + 1));
 
        for (i = 0; i < nr_frags; i++) {
@@ -3762,6 +3761,13 @@ static netdev_tx_t et131x_tx(struct sk_buff *skb, struct net_device *netdev)
        struct et131x_adapter *adapter = netdev_priv(netdev);
        struct tx_ring *tx_ring = &adapter->tx_ring;
 
+       /* This driver does not support TSO, so it is very unlikely
+        * that this condition is true.
+        */
+       if (unlikely(skb_shinfo(skb)->nr_frags > MAX_TX_DESC_PER_PKT - 2)) {
+               if (skb_linearize(skb))
+                       goto drop_err;
+       }
        /* stop the queue if it's getting full */
        if (tx_ring->used >= NUM_TCB - 1 && !netif_queue_stopped(netdev))
                netif_stop_queue(netdev);
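
With the BUILD_BUG_ON gone, an over-fragmented skb is now flattened into a single linear buffer before transmit instead of failing the build; the -2 keeps the fragments plus the header descriptor within the 24-entry array. A userspace sketch of the fallback decision (skb details elided):

#include <stdbool.h>
#include <stdio.h>

#define MAX_TX_DESC_PER_PKT 24

/* Returns true when the frame must be copied into one linear buffer. */
static bool needs_linearize(int nr_frags)
{
	return nr_frags > MAX_TX_DESC_PER_PKT - 2;
}

int main(void)
{
	printf("18 frags: %s\n", needs_linearize(18) ? "linearize" : "send as-is");
	printf("23 frags: %s\n", needs_linearize(23) ? "linearize" : "send as-is");
	return 0;
}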
index 53080fd..07444ae 100644 (file)
@@ -1400,10 +1400,9 @@ static struct sk_buff *ena_alloc_skb(struct ena_ring *rx_ring, void *first_frag)
        struct sk_buff *skb;
 
        if (!first_frag)
-               skb = netdev_alloc_skb_ip_align(rx_ring->netdev,
-                                               rx_ring->rx_copybreak);
+               skb = napi_alloc_skb(rx_ring->napi, rx_ring->rx_copybreak);
        else
-               skb = build_skb(first_frag, ENA_PAGE_SIZE);
+               skb = napi_build_skb(first_frag, ENA_PAGE_SIZE);
 
        if (unlikely(!skb)) {
                ena_increase_stat(&rx_ring->rx_stats.skb_alloc_fail, 1,
index a19dd67..447a75e 100644 (file)
@@ -1271,7 +1271,7 @@ struct bnx2x_fw_stats_data {
        struct per_port_stats           port;
        struct per_pf_stats             pf;
        struct fcoe_statistics_params   fcoe;
-       struct per_queue_stats          queue_stats[1];
+       struct per_queue_stats          queue_stats[];
 };
 
 /* Public slow path states */
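
Replacing the one-element array with a C99 flexible array member means sizeof(struct bnx2x_fw_stats_data) no longer counts any queue stats, so allocations must add the trailing elements explicitly (kernel code typically uses the struct_size() helper for this). A minimal demonstration with simplified fields:

#include <stdio.h>
#include <stdlib.h>

struct per_queue_stats { unsigned long rx, tx; };

struct fw_stats_data {
	int storm;				/* stands in for the fixed members */
	struct per_queue_stats queue_stats[];	/* flexible array member */
};

int main(void)
{
	int nqueues = 8;
	struct fw_stats_data *d =
		malloc(sizeof(*d) + nqueues * sizeof(d->queue_stats[0]));

	if (!d)
		return 1;
	printf("fixed part: %zu bytes, total: %zu bytes\n",
	       sizeof(*d), sizeof(*d) + nqueues * sizeof(d->queue_stats[0]));
	free(d);
	return 0;
}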
index 4f94136..c313221 100644 (file)
@@ -233,6 +233,7 @@ static const u16 bnxt_async_events_arr[] = {
        ASYNC_EVENT_CMPL_EVENT_ID_ECHO_REQUEST,
        ASYNC_EVENT_CMPL_EVENT_ID_PPS_TIMESTAMP,
        ASYNC_EVENT_CMPL_EVENT_ID_ERROR_REPORT,
+       ASYNC_EVENT_CMPL_EVENT_ID_PHC_UPDATE,
 };
 
 static struct workqueue_struct *bnxt_pf_wq;
@@ -2079,6 +2080,16 @@ static void bnxt_event_error_report(struct bnxt *bp, u32 data1, u32 data2)
        (BNXT_EVENT_RING_TYPE(data2) == \
         ASYNC_EVENT_CMPL_RING_MONITOR_MSG_EVENT_DATA2_DISABLE_RING_TYPE_RX)
 
+#define BNXT_EVENT_PHC_EVENT_TYPE(data1)       \
+       (((data1) & ASYNC_EVENT_CMPL_PHC_UPDATE_EVENT_DATA1_FLAGS_MASK) >>\
+        ASYNC_EVENT_CMPL_PHC_UPDATE_EVENT_DATA1_FLAGS_SFT)
+
+#define BNXT_EVENT_PHC_RTC_UPDATE(data1)       \
+       (((data1) & ASYNC_EVENT_CMPL_PHC_UPDATE_EVENT_DATA1_PHC_TIME_MSB_MASK) >>\
+        ASYNC_EVENT_CMPL_PHC_UPDATE_EVENT_DATA1_PHC_TIME_MSB_SFT)
+
+#define BNXT_PHC_BITS  48
+
 static int bnxt_async_event_process(struct bnxt *bp,
                                    struct hwrm_async_event_cmpl *cmpl)
 {
@@ -2258,6 +2269,24 @@ static int bnxt_async_event_process(struct bnxt *bp,
                bnxt_event_error_report(bp, data1, data2);
                goto async_event_process_exit;
        }
+       case ASYNC_EVENT_CMPL_EVENT_ID_PHC_UPDATE: {
+               switch (BNXT_EVENT_PHC_EVENT_TYPE(data1)) {
+               case ASYNC_EVENT_CMPL_PHC_UPDATE_EVENT_DATA1_FLAGS_PHC_RTC_UPDATE:
+                       if (bp->fw_cap & BNXT_FW_CAP_PTP_RTC) {
+                               struct bnxt_ptp_cfg *ptp = bp->ptp_cfg;
+                               u64 ns;
+
+                               spin_lock_bh(&ptp->ptp_lock);
+                               bnxt_ptp_update_current_time(bp);
+                               ns = (((u64)BNXT_EVENT_PHC_RTC_UPDATE(data1) <<
+                                      BNXT_PHC_BITS) | ptp->current_time);
+                               bnxt_ptp_rtc_timecounter_init(ptp, ns);
+                               spin_unlock_bh(&ptp->ptp_lock);
+                       }
+                       break;
+               }
+               goto async_event_process_exit;
+       }
        case ASYNC_EVENT_CMPL_EVENT_ID_DEFERRED_RESPONSE: {
                u16 seq_id = le32_to_cpu(cmpl->event_data2) & 0xffff;
 
@@ -7414,6 +7443,7 @@ static int __bnxt_hwrm_ptp_qcfg(struct bnxt *bp)
        struct hwrm_port_mac_ptp_qcfg_output *resp;
        struct hwrm_port_mac_ptp_qcfg_input *req;
        struct bnxt_ptp_cfg *ptp = bp->ptp_cfg;
+       bool phc_cfg;
        u8 flags;
        int rc;
 
@@ -7456,7 +7486,8 @@ static int __bnxt_hwrm_ptp_qcfg(struct bnxt *bp)
                rc = -ENODEV;
                goto exit;
        }
-       rc = bnxt_ptp_init(bp);
+       phc_cfg = (flags & PORT_MAC_PTP_QCFG_RESP_FLAGS_RTC_CONFIGURED) != 0;
+       rc = bnxt_ptp_init(bp, phc_cfg);
        if (rc)
                netdev_warn(bp->dev, "PTP initialization failed.\n");
 exit:
@@ -7514,6 +7545,8 @@ static int __bnxt_hwrm_func_qcaps(struct bnxt *bp)
                bp->fw_cap |= BNXT_FW_CAP_EXT_HW_STATS_SUPPORTED;
        if (BNXT_PF(bp) && (flags_ext & FUNC_QCAPS_RESP_FLAGS_EXT_PTP_PPS_SUPPORTED))
                bp->fw_cap |= BNXT_FW_CAP_PTP_PPS;
+       if (flags_ext & FUNC_QCAPS_RESP_FLAGS_EXT_PTP_64BIT_RTC_SUPPORTED)
+               bp->fw_cap |= BNXT_FW_CAP_PTP_RTC;
        if (BNXT_PF(bp) && (flags_ext & FUNC_QCAPS_RESP_FLAGS_EXT_HOT_RESET_IF_SUPPORT))
                bp->fw_cap |= BNXT_FW_CAP_HOT_RESET_IF;
        if (BNXT_PF(bp) && (flags_ext & FUNC_QCAPS_RESP_FLAGS_EXT_FW_LIVEPATCH_SUPPORTED))
@@ -10288,6 +10321,7 @@ static int __bnxt_open_nic(struct bnxt *bp, bool irq_re_init, bool link_re_init)
        /* VF-reps may need to be re-opened after the PF is re-opened */
        if (BNXT_PF(bp))
                bnxt_vf_reps_open(bp);
+       bnxt_ptp_init_rtc(bp, true);
        return 0;
 
 open_err_irq:
index 440dfeb..4b023e3 100644 (file)
@@ -1957,6 +1957,7 @@ struct bnxt {
        #define BNXT_FW_CAP_EXT_STATS_SUPPORTED         0x00040000
        #define BNXT_FW_CAP_ERR_RECOVER_RELOAD          0x00100000
        #define BNXT_FW_CAP_HOT_RESET                   0x00200000
+       #define BNXT_FW_CAP_PTP_RTC                     0x00400000
        #define BNXT_FW_CAP_VLAN_RX_STRIP               0x01000000
        #define BNXT_FW_CAP_VLAN_TX_INSERT              0x02000000
        #define BNXT_FW_CAP_EXT_HW_STATS_SUPPORTED      0x04000000
index 003330e..5edbee9 100644 (file)
@@ -11,6 +11,7 @@
 #include <linux/ctype.h>
 #include <linux/stringify.h>
 #include <linux/ethtool.h>
+#include <linux/ethtool_netlink.h>
 #include <linux/linkmode.h>
 #include <linux/interrupt.h>
 #include <linux/pci.h>
@@ -802,9 +803,11 @@ static void bnxt_get_ringparam(struct net_device *dev,
        if (bp->flags & BNXT_FLAG_AGG_RINGS) {
                ering->rx_max_pending = BNXT_MAX_RX_DESC_CNT_JUM_ENA;
                ering->rx_jumbo_max_pending = BNXT_MAX_RX_JUM_DESC_CNT;
+               kernel_ering->tcp_data_split = ETHTOOL_TCP_DATA_SPLIT_ENABLED;
        } else {
                ering->rx_max_pending = BNXT_MAX_RX_DESC_CNT;
                ering->rx_jumbo_max_pending = 0;
+               kernel_ering->tcp_data_split = ETHTOOL_TCP_DATA_SPLIT_DISABLED;
        }
        ering->tx_max_pending = BNXT_MAX_TX_DESC_CNT;
 
index ea86c54..b7100ed 100644 (file)
@@ -369,6 +369,12 @@ struct cmd_nums {
        #define HWRM_FUNC_PTP_EXT_CFG                     0x1a0UL
        #define HWRM_FUNC_PTP_EXT_QCFG                    0x1a1UL
        #define HWRM_FUNC_KEY_CTX_ALLOC                   0x1a2UL
+       #define HWRM_FUNC_BACKING_STORE_CFG_V2            0x1a3UL
+       #define HWRM_FUNC_BACKING_STORE_QCFG_V2           0x1a4UL
+       #define HWRM_FUNC_DBR_PACING_CFG                  0x1a5UL
+       #define HWRM_FUNC_DBR_PACING_QCFG                 0x1a6UL
+       #define HWRM_FUNC_DBR_PACING_BROADCAST_EVENT      0x1a7UL
+       #define HWRM_FUNC_BACKING_STORE_QCAPS_V2          0x1a8UL
        #define HWRM_SELFTEST_QLIST                       0x200UL
        #define HWRM_SELFTEST_EXEC                        0x201UL
        #define HWRM_SELFTEST_IRQ                         0x202UL
@@ -390,6 +396,9 @@ struct cmd_nums {
        #define HWRM_MFG_PRVSN_IMPORT_CERT                0x212UL
        #define HWRM_MFG_PRVSN_GET_STATE                  0x213UL
        #define HWRM_MFG_GET_NVM_MEASUREMENT              0x214UL
+       #define HWRM_MFG_PSOC_QSTATUS                     0x215UL
+       #define HWRM_MFG_SELFTEST_QLIST                   0x216UL
+       #define HWRM_MFG_SELFTEST_EXEC                    0x217UL
        #define HWRM_TF                                   0x2bcUL
        #define HWRM_TF_VERSION_GET                       0x2bdUL
        #define HWRM_TF_SESSION_OPEN                      0x2c6UL
@@ -532,8 +541,8 @@ struct hwrm_err_output {
 #define HWRM_VERSION_MAJOR 1
 #define HWRM_VERSION_MINOR 10
 #define HWRM_VERSION_UPDATE 2
-#define HWRM_VERSION_RSVD 63
-#define HWRM_VERSION_STR "1.10.2.63"
+#define HWRM_VERSION_RSVD 73
+#define HWRM_VERSION_STR "1.10.2.73"
 
 /* hwrm_ver_get_input (size:192b/24B) */
 struct hwrm_ver_get_input {
@@ -757,10 +766,11 @@ struct hwrm_async_event_cmpl {
        #define ASYNC_EVENT_CMPL_EVENT_ID_DEFERRED_RESPONSE          0x40UL
        #define ASYNC_EVENT_CMPL_EVENT_ID_PFC_WATCHDOG_CFG_CHANGE    0x41UL
        #define ASYNC_EVENT_CMPL_EVENT_ID_ECHO_REQUEST               0x42UL
-       #define ASYNC_EVENT_CMPL_EVENT_ID_PHC_MASTER                 0x43UL
+       #define ASYNC_EVENT_CMPL_EVENT_ID_PHC_UPDATE                 0x43UL
        #define ASYNC_EVENT_CMPL_EVENT_ID_PPS_TIMESTAMP              0x44UL
        #define ASYNC_EVENT_CMPL_EVENT_ID_ERROR_REPORT               0x45UL
-       #define ASYNC_EVENT_CMPL_EVENT_ID_MAX_RGTR_EVENT_ID          0x46UL
+       #define ASYNC_EVENT_CMPL_EVENT_ID_DOORBELL_PACING_THRESHOLD  0x46UL
+       #define ASYNC_EVENT_CMPL_EVENT_ID_MAX_RGTR_EVENT_ID          0x47UL
        #define ASYNC_EVENT_CMPL_EVENT_ID_FW_TRACE_MSG               0xfeUL
        #define ASYNC_EVENT_CMPL_EVENT_ID_HWRM_ERROR                 0xffUL
        #define ASYNC_EVENT_CMPL_EVENT_ID_LAST                      ASYNC_EVENT_CMPL_EVENT_ID_HWRM_ERROR
@@ -1112,34 +1122,37 @@ struct hwrm_async_event_cmpl_echo_request {
        __le32  event_data1;
 };
 
-/* hwrm_async_event_cmpl_phc_master (size:128b/16B) */
-struct hwrm_async_event_cmpl_phc_master {
+/* hwrm_async_event_cmpl_phc_update (size:128b/16B) */
+struct hwrm_async_event_cmpl_phc_update {
        __le16  type;
-       #define ASYNC_EVENT_CMPL_PHC_MASTER_TYPE_MASK            0x3fUL
-       #define ASYNC_EVENT_CMPL_PHC_MASTER_TYPE_SFT             0
-       #define ASYNC_EVENT_CMPL_PHC_MASTER_TYPE_HWRM_ASYNC_EVENT  0x2eUL
-       #define ASYNC_EVENT_CMPL_PHC_MASTER_TYPE_LAST             ASYNC_EVENT_CMPL_PHC_MASTER_TYPE_HWRM_ASYNC_EVENT
+       #define ASYNC_EVENT_CMPL_PHC_UPDATE_TYPE_MASK            0x3fUL
+       #define ASYNC_EVENT_CMPL_PHC_UPDATE_TYPE_SFT             0
+       #define ASYNC_EVENT_CMPL_PHC_UPDATE_TYPE_HWRM_ASYNC_EVENT  0x2eUL
+       #define ASYNC_EVENT_CMPL_PHC_UPDATE_TYPE_LAST             ASYNC_EVENT_CMPL_PHC_UPDATE_TYPE_HWRM_ASYNC_EVENT
        __le16  event_id;
-       #define ASYNC_EVENT_CMPL_PHC_MASTER_EVENT_ID_PHC_MASTER 0x43UL
-       #define ASYNC_EVENT_CMPL_PHC_MASTER_EVENT_ID_LAST      ASYNC_EVENT_CMPL_PHC_MASTER_EVENT_ID_PHC_MASTER
+       #define ASYNC_EVENT_CMPL_PHC_UPDATE_EVENT_ID_PHC_UPDATE 0x43UL
+       #define ASYNC_EVENT_CMPL_PHC_UPDATE_EVENT_ID_LAST      ASYNC_EVENT_CMPL_PHC_UPDATE_EVENT_ID_PHC_UPDATE
        __le32  event_data2;
-       #define ASYNC_EVENT_CMPL_PHC_MASTER_EVENT_DATA2_PHC_MASTER_FID_MASK 0xffffUL
-       #define ASYNC_EVENT_CMPL_PHC_MASTER_EVENT_DATA2_PHC_MASTER_FID_SFT 0
-       #define ASYNC_EVENT_CMPL_PHC_MASTER_EVENT_DATA2_PHC_SEC_FID_MASK   0xffff0000UL
-       #define ASYNC_EVENT_CMPL_PHC_MASTER_EVENT_DATA2_PHC_SEC_FID_SFT    16
+       #define ASYNC_EVENT_CMPL_PHC_UPDATE_EVENT_DATA2_PHC_MASTER_FID_MASK 0xffffUL
+       #define ASYNC_EVENT_CMPL_PHC_UPDATE_EVENT_DATA2_PHC_MASTER_FID_SFT 0
+       #define ASYNC_EVENT_CMPL_PHC_UPDATE_EVENT_DATA2_PHC_SEC_FID_MASK   0xffff0000UL
+       #define ASYNC_EVENT_CMPL_PHC_UPDATE_EVENT_DATA2_PHC_SEC_FID_SFT    16
        u8      opaque_v;
-       #define ASYNC_EVENT_CMPL_PHC_MASTER_V          0x1UL
-       #define ASYNC_EVENT_CMPL_PHC_MASTER_OPAQUE_MASK 0xfeUL
-       #define ASYNC_EVENT_CMPL_PHC_MASTER_OPAQUE_SFT 1
+       #define ASYNC_EVENT_CMPL_PHC_UPDATE_V          0x1UL
+       #define ASYNC_EVENT_CMPL_PHC_UPDATE_OPAQUE_MASK 0xfeUL
+       #define ASYNC_EVENT_CMPL_PHC_UPDATE_OPAQUE_SFT 1
        u8      timestamp_lo;
        __le16  timestamp_hi;
        __le32  event_data1;
-       #define ASYNC_EVENT_CMPL_PHC_MASTER_EVENT_DATA1_FLAGS_MASK         0xfUL
-       #define ASYNC_EVENT_CMPL_PHC_MASTER_EVENT_DATA1_FLAGS_SFT          0
-       #define ASYNC_EVENT_CMPL_PHC_MASTER_EVENT_DATA1_FLAGS_PHC_MASTER     0x1UL
-       #define ASYNC_EVENT_CMPL_PHC_MASTER_EVENT_DATA1_FLAGS_PHC_SECONDARY  0x2UL
-       #define ASYNC_EVENT_CMPL_PHC_MASTER_EVENT_DATA1_FLAGS_PHC_FAILOVER   0x3UL
-       #define ASYNC_EVENT_CMPL_PHC_MASTER_EVENT_DATA1_FLAGS_LAST          ASYNC_EVENT_CMPL_PHC_MASTER_EVENT_DATA1_FLAGS_PHC_FAILOVER
+       #define ASYNC_EVENT_CMPL_PHC_UPDATE_EVENT_DATA1_FLAGS_MASK          0xfUL
+       #define ASYNC_EVENT_CMPL_PHC_UPDATE_EVENT_DATA1_FLAGS_SFT           0
+       #define ASYNC_EVENT_CMPL_PHC_UPDATE_EVENT_DATA1_FLAGS_PHC_MASTER      0x1UL
+       #define ASYNC_EVENT_CMPL_PHC_UPDATE_EVENT_DATA1_FLAGS_PHC_SECONDARY   0x2UL
+       #define ASYNC_EVENT_CMPL_PHC_UPDATE_EVENT_DATA1_FLAGS_PHC_FAILOVER    0x3UL
+       #define ASYNC_EVENT_CMPL_PHC_UPDATE_EVENT_DATA1_FLAGS_PHC_RTC_UPDATE  0x4UL
+       #define ASYNC_EVENT_CMPL_PHC_UPDATE_EVENT_DATA1_FLAGS_LAST           ASYNC_EVENT_CMPL_PHC_UPDATE_EVENT_DATA1_FLAGS_PHC_RTC_UPDATE
+       #define ASYNC_EVENT_CMPL_PHC_UPDATE_EVENT_DATA1_PHC_TIME_MSB_MASK   0xffff0UL
+       #define ASYNC_EVENT_CMPL_PHC_UPDATE_EVENT_DATA1_PHC_TIME_MSB_SFT    4
 };
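
The renamed event packs its payload into event_data1: the low four bits carry the PHC role/transition flags (with the new PHC_RTC_UPDATE value) and bits 19:4 carry the most significant bits of the PHC time. A minimal decode sketch using only the masks defined above; the helper name is illustrative and not part of the patch:

/* Hedged sketch, not from the patch: decode event_data1 of a
 * PHC_UPDATE async event with the masks defined above.
 */
static void example_decode_phc_update(u32 event_data1)
{
	u32 flags, time_msb;

	flags = (event_data1 & ASYNC_EVENT_CMPL_PHC_UPDATE_EVENT_DATA1_FLAGS_MASK) >>
		ASYNC_EVENT_CMPL_PHC_UPDATE_EVENT_DATA1_FLAGS_SFT;
	time_msb = (event_data1 & ASYNC_EVENT_CMPL_PHC_UPDATE_EVENT_DATA1_PHC_TIME_MSB_MASK) >>
		   ASYNC_EVENT_CMPL_PHC_UPDATE_EVENT_DATA1_PHC_TIME_MSB_SFT;

	if (flags == ASYNC_EVENT_CMPL_PHC_UPDATE_EVENT_DATA1_FLAGS_PHC_RTC_UPDATE)
		pr_debug("PHC RTC update, time msb 0x%x\n", time_msb);
}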
 
 /* hwrm_async_event_cmpl_pps_timestamp (size:128b/16B) */
@@ -1330,6 +1343,30 @@ struct hwrm_async_event_cmpl_error_report_nvm {
        #define ASYNC_EVENT_CMPL_ERROR_REPORT_NVM_EVENT_DATA1_NVM_ERR_TYPE_LAST    ASYNC_EVENT_CMPL_ERROR_REPORT_NVM_EVENT_DATA1_NVM_ERR_TYPE_ERASE
 };
 
+/* hwrm_async_event_cmpl_error_report_doorbell_drop_threshold (size:128b/16B) */
+struct hwrm_async_event_cmpl_error_report_doorbell_drop_threshold {
+       __le16  type;
+       #define ASYNC_EVENT_CMPL_ERROR_REPORT_DOORBELL_DROP_THRESHOLD_TYPE_MASK            0x3fUL
+       #define ASYNC_EVENT_CMPL_ERROR_REPORT_DOORBELL_DROP_THRESHOLD_TYPE_SFT             0
+       #define ASYNC_EVENT_CMPL_ERROR_REPORT_DOORBELL_DROP_THRESHOLD_TYPE_HWRM_ASYNC_EVENT  0x2eUL
+       #define ASYNC_EVENT_CMPL_ERROR_REPORT_DOORBELL_DROP_THRESHOLD_TYPE_LAST             ASYNC_EVENT_CMPL_ERROR_REPORT_DOORBELL_DROP_THRESHOLD_TYPE_HWRM_ASYNC_EVENT
+       __le16  event_id;
+       #define ASYNC_EVENT_CMPL_ERROR_REPORT_DOORBELL_DROP_THRESHOLD_EVENT_ID_ERROR_REPORT 0x45UL
+       #define ASYNC_EVENT_CMPL_ERROR_REPORT_DOORBELL_DROP_THRESHOLD_EVENT_ID_LAST        ASYNC_EVENT_CMPL_ERROR_REPORT_DOORBELL_DROP_THRESHOLD_EVENT_ID_ERROR_REPORT
+       __le32  event_data2;
+       u8      opaque_v;
+       #define ASYNC_EVENT_CMPL_ERROR_REPORT_DOORBELL_DROP_THRESHOLD_V          0x1UL
+       #define ASYNC_EVENT_CMPL_ERROR_REPORT_DOORBELL_DROP_THRESHOLD_OPAQUE_MASK 0xfeUL
+       #define ASYNC_EVENT_CMPL_ERROR_REPORT_DOORBELL_DROP_THRESHOLD_OPAQUE_SFT 1
+       u8      timestamp_lo;
+       __le16  timestamp_hi;
+       __le32  event_data1;
+       #define ASYNC_EVENT_CMPL_ERROR_REPORT_DOORBELL_DROP_THRESHOLD_EVENT_DATA1_ERROR_TYPE_MASK                   0xffUL
+       #define ASYNC_EVENT_CMPL_ERROR_REPORT_DOORBELL_DROP_THRESHOLD_EVENT_DATA1_ERROR_TYPE_SFT                    0
+       #define ASYNC_EVENT_CMPL_ERROR_REPORT_DOORBELL_DROP_THRESHOLD_EVENT_DATA1_ERROR_TYPE_DOORBELL_DROP_THRESHOLD  0x4UL
+       #define ASYNC_EVENT_CMPL_ERROR_REPORT_DOORBELL_DROP_THRESHOLD_EVENT_DATA1_ERROR_TYPE_LAST                    ASYNC_EVENT_CMPL_ERROR_REPORT_DOORBELL_DROP_THRESHOLD_EVENT_DATA1_ERROR_TYPE_DOORBELL_DROP_THRESHOLD
+};
+
 /* hwrm_func_reset_input (size:192b/24B) */
 struct hwrm_func_reset_input {
        __le16  req_type;
@@ -1589,6 +1626,10 @@ struct hwrm_func_qcaps_output {
        #define FUNC_QCAPS_RESP_FLAGS_EXT_EP_RATE_CONTROL                        0x800000UL
        #define FUNC_QCAPS_RESP_FLAGS_EXT_MIN_BW_SUPPORTED                       0x1000000UL
        #define FUNC_QCAPS_RESP_FLAGS_EXT_TX_COAL_CMPL_CAP                       0x2000000UL
+       #define FUNC_QCAPS_RESP_FLAGS_EXT_BS_V2_SUPPORTED                        0x4000000UL
+       #define FUNC_QCAPS_RESP_FLAGS_EXT_BS_V2_REQUIRED                         0x8000000UL
+       #define FUNC_QCAPS_RESP_FLAGS_EXT_PTP_64BIT_RTC_SUPPORTED                0x10000000UL
+       #define FUNC_QCAPS_RESP_FLAGS_EXT_DBR_PACING_SUPPORTED                   0x20000000UL
        u8      max_schqs;
        u8      mpc_chnls_cap;
        #define FUNC_QCAPS_RESP_MPC_CHNLS_CAP_TCE         0x1UL
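
PTP_64BIT_RTC_SUPPORTED is the capability the PTP changes later in this diff key off. A hedged sketch of how a driver would latch it into the fw_cap word that the bnxt_ptp code tests as BNXT_FW_CAP_PTP_RTC; the exact call site in bnxt.c is not shown in this hunk:

	u32 flags_ext = le32_to_cpu(resp->flags_ext);

	if (flags_ext & FUNC_QCAPS_RESP_FLAGS_EXT_PTP_64BIT_RTC_SUPPORTED)
		bp->fw_cap |= BNXT_FW_CAP_PTP_RTC;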
@@ -2455,7 +2496,7 @@ struct hwrm_func_backing_store_qcaps_output {
        __le16  rkc_entry_size;
        __le32  tkc_max_entries;
        __le32  rkc_max_entries;
-       u8      rsvd[7];
+       u8      rsvd1[7];
        u8      valid;
 };
 
@@ -3164,7 +3205,7 @@ struct hwrm_func_ptp_pin_cfg_output {
        u8      valid;
 };
 
-/* hwrm_func_ptp_cfg_input (size:320b/40B) */
+/* hwrm_func_ptp_cfg_input (size:384b/48B) */
 struct hwrm_func_ptp_cfg_input {
        __le16  req_type;
        __le16  cmpl_ring;
@@ -3178,6 +3219,7 @@ struct hwrm_func_ptp_cfg_input {
        #define FUNC_PTP_CFG_REQ_ENABLES_PTP_FREQ_ADJ_EXT_PERIOD     0x8UL
        #define FUNC_PTP_CFG_REQ_ENABLES_PTP_FREQ_ADJ_EXT_UP         0x10UL
        #define FUNC_PTP_CFG_REQ_ENABLES_PTP_FREQ_ADJ_EXT_PHASE      0x20UL
+       #define FUNC_PTP_CFG_REQ_ENABLES_PTP_SET_TIME                0x40UL
        u8      ptp_pps_event;
        #define FUNC_PTP_CFG_REQ_PTP_PPS_EVENT_INTERNAL     0x1UL
        #define FUNC_PTP_CFG_REQ_PTP_PPS_EVENT_EXTERNAL     0x2UL
@@ -3204,6 +3246,7 @@ struct hwrm_func_ptp_cfg_input {
        __le32  ptp_freq_adj_ext_up;
        __le32  ptp_freq_adj_ext_phase_lower;
        __le32  ptp_freq_adj_ext_phase_upper;
+       __le64  ptp_set_time;
 };
 
 /* hwrm_func_ptp_cfg_output (size:128b/16B) */
@@ -3243,6 +3286,308 @@ struct hwrm_func_ptp_ts_query_output {
        u8      valid;
 };
 
+/* hwrm_func_ptp_ext_cfg_input (size:256b/32B) */
+struct hwrm_func_ptp_ext_cfg_input {
+       __le16  req_type;
+       __le16  cmpl_ring;
+       __le16  seq_id;
+       __le16  target_id;
+       __le64  resp_addr;
+       __le16  enables;
+       #define FUNC_PTP_EXT_CFG_REQ_ENABLES_PHC_MASTER_FID     0x1UL
+       #define FUNC_PTP_EXT_CFG_REQ_ENABLES_PHC_SEC_FID        0x2UL
+       #define FUNC_PTP_EXT_CFG_REQ_ENABLES_PHC_SEC_MODE       0x4UL
+       #define FUNC_PTP_EXT_CFG_REQ_ENABLES_FAILOVER_TIMER     0x8UL
+       __le16  phc_master_fid;
+       __le16  phc_sec_fid;
+       u8      phc_sec_mode;
+       #define FUNC_PTP_EXT_CFG_REQ_PHC_SEC_MODE_SWITCH  0x0UL
+       #define FUNC_PTP_EXT_CFG_REQ_PHC_SEC_MODE_ALL     0x1UL
+       #define FUNC_PTP_EXT_CFG_REQ_PHC_SEC_MODE_PF_ONLY 0x2UL
+       #define FUNC_PTP_EXT_CFG_REQ_PHC_SEC_MODE_LAST   FUNC_PTP_EXT_CFG_REQ_PHC_SEC_MODE_PF_ONLY
+       u8      unused_0;
+       __le32  failover_timer;
+       u8      unused_1[4];
+};
+
+/* hwrm_func_ptp_ext_cfg_output (size:128b/16B) */
+struct hwrm_func_ptp_ext_cfg_output {
+       __le16  error_code;
+       __le16  req_type;
+       __le16  seq_id;
+       __le16  resp_len;
+       u8      unused_0[7];
+       u8      valid;
+};
+
+/* hwrm_func_ptp_ext_qcfg_input (size:192b/24B) */
+struct hwrm_func_ptp_ext_qcfg_input {
+       __le16  req_type;
+       __le16  cmpl_ring;
+       __le16  seq_id;
+       __le16  target_id;
+       __le64  resp_addr;
+       u8      unused_0[8];
+};
+
+/* hwrm_func_ptp_ext_qcfg_output (size:256b/32B) */
+struct hwrm_func_ptp_ext_qcfg_output {
+       __le16  error_code;
+       __le16  req_type;
+       __le16  seq_id;
+       __le16  resp_len;
+       __le16  phc_master_fid;
+       __le16  phc_sec_fid;
+       __le16  phc_active_fid0;
+       __le16  phc_active_fid1;
+       __le32  last_failover_event;
+       __le16  from_fid;
+       __le16  to_fid;
+       u8      unused_0[7];
+       u8      valid;
+};
+
+/* hwrm_func_backing_store_cfg_v2_input (size:448b/56B) */
+struct hwrm_func_backing_store_cfg_v2_input {
+       __le16  req_type;
+       __le16  cmpl_ring;
+       __le16  seq_id;
+       __le16  target_id;
+       __le64  resp_addr;
+       __le16  type;
+       #define FUNC_BACKING_STORE_CFG_V2_REQ_TYPE_QP          0x0UL
+       #define FUNC_BACKING_STORE_CFG_V2_REQ_TYPE_SRQ         0x1UL
+       #define FUNC_BACKING_STORE_CFG_V2_REQ_TYPE_CQ          0x2UL
+       #define FUNC_BACKING_STORE_CFG_V2_REQ_TYPE_VNIC        0x3UL
+       #define FUNC_BACKING_STORE_CFG_V2_REQ_TYPE_STAT        0x4UL
+       #define FUNC_BACKING_STORE_CFG_V2_REQ_TYPE_SP_TQM_RING 0x5UL
+       #define FUNC_BACKING_STORE_CFG_V2_REQ_TYPE_FP_TQM_RING 0x6UL
+       #define FUNC_BACKING_STORE_CFG_V2_REQ_TYPE_MRAV        0xeUL
+       #define FUNC_BACKING_STORE_CFG_V2_REQ_TYPE_TIM         0xfUL
+       #define FUNC_BACKING_STORE_CFG_V2_REQ_TYPE_TKC         0x13UL
+       #define FUNC_BACKING_STORE_CFG_V2_REQ_TYPE_RKC         0x14UL
+       #define FUNC_BACKING_STORE_CFG_V2_REQ_TYPE_MP_TQM_RING 0x15UL
+       #define FUNC_BACKING_STORE_CFG_V2_REQ_TYPE_INVALID     0xffffUL
+       #define FUNC_BACKING_STORE_CFG_V2_REQ_TYPE_LAST       FUNC_BACKING_STORE_CFG_V2_REQ_TYPE_INVALID
+       __le16  instance;
+       __le32  flags;
+       #define FUNC_BACKING_STORE_CFG_V2_REQ_FLAGS_PREBOOT_MODE     0x1UL
+       __le64  page_dir;
+       __le32  num_entries;
+       __le16  entry_size;
+       u8      page_size_pbl_level;
+       #define FUNC_BACKING_STORE_CFG_V2_REQ_PBL_LEVEL_MASK  0xfUL
+       #define FUNC_BACKING_STORE_CFG_V2_REQ_PBL_LEVEL_SFT   0
+       #define FUNC_BACKING_STORE_CFG_V2_REQ_PBL_LEVEL_LVL_0   0x0UL
+       #define FUNC_BACKING_STORE_CFG_V2_REQ_PBL_LEVEL_LVL_1   0x1UL
+       #define FUNC_BACKING_STORE_CFG_V2_REQ_PBL_LEVEL_LVL_2   0x2UL
+       #define FUNC_BACKING_STORE_CFG_V2_REQ_PBL_LEVEL_LAST   FUNC_BACKING_STORE_CFG_V2_REQ_PBL_LEVEL_LVL_2
+       #define FUNC_BACKING_STORE_CFG_V2_REQ_PAGE_SIZE_MASK  0xf0UL
+       #define FUNC_BACKING_STORE_CFG_V2_REQ_PAGE_SIZE_SFT   4
+       #define FUNC_BACKING_STORE_CFG_V2_REQ_PAGE_SIZE_PG_4K   (0x0UL << 4)
+       #define FUNC_BACKING_STORE_CFG_V2_REQ_PAGE_SIZE_PG_8K   (0x1UL << 4)
+       #define FUNC_BACKING_STORE_CFG_V2_REQ_PAGE_SIZE_PG_64K  (0x2UL << 4)
+       #define FUNC_BACKING_STORE_CFG_V2_REQ_PAGE_SIZE_PG_2M   (0x3UL << 4)
+       #define FUNC_BACKING_STORE_CFG_V2_REQ_PAGE_SIZE_PG_8M   (0x4UL << 4)
+       #define FUNC_BACKING_STORE_CFG_V2_REQ_PAGE_SIZE_PG_1G   (0x5UL << 4)
+       #define FUNC_BACKING_STORE_CFG_V2_REQ_PAGE_SIZE_LAST   FUNC_BACKING_STORE_CFG_V2_REQ_PAGE_SIZE_PG_1G
+       u8      subtype_valid_cnt;
+       __le32  split_entry_0;
+       __le32  split_entry_1;
+       __le32  split_entry_2;
+       __le32  split_entry_3;
+};
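
page_size_pbl_level packs two fields into one byte, PBL level in bits 3:0 and page size in bits 7:4, so the two defines are simply OR'ed together. A hedged sketch of issuing a CFG_V2 request for one QP context instance, following the hwrm_req_* pattern used elsewhere in this merge; it assumes the matching HWRM_FUNC_BACKING_STORE_CFG_V2 request type from the same header, and the helper name and values are illustrative:

static int example_cfg_v2_qp(struct bnxt *bp, dma_addr_t page_dir,
			     u32 num_entries, u16 entry_size)
{
	struct hwrm_func_backing_store_cfg_v2_input *req;
	int rc;

	rc = hwrm_req_init(bp, req, HWRM_FUNC_BACKING_STORE_CFG_V2);
	if (rc)
		return rc;

	req->type = cpu_to_le16(FUNC_BACKING_STORE_CFG_V2_REQ_TYPE_QP);
	req->instance = cpu_to_le16(0);
	req->page_dir = cpu_to_le64(page_dir);
	req->num_entries = cpu_to_le32(num_entries);
	req->entry_size = cpu_to_le16(entry_size);
	/* page size in bits 7:4, PBL level in bits 3:0 */
	req->page_size_pbl_level = FUNC_BACKING_STORE_CFG_V2_REQ_PAGE_SIZE_PG_4K |
				   FUNC_BACKING_STORE_CFG_V2_REQ_PBL_LEVEL_LVL_0;
	return hwrm_req_send(bp, req);
}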
+
+/* hwrm_func_backing_store_cfg_v2_output (size:128b/16B) */
+struct hwrm_func_backing_store_cfg_v2_output {
+       __le16  error_code;
+       __le16  req_type;
+       __le16  seq_id;
+       __le16  resp_len;
+       u8      rsvd0[7];
+       u8      valid;
+};
+
+/* hwrm_func_backing_store_qcfg_v2_input (size:192b/24B) */
+struct hwrm_func_backing_store_qcfg_v2_input {
+       __le16  req_type;
+       __le16  cmpl_ring;
+       __le16  seq_id;
+       __le16  target_id;
+       __le64  resp_addr;
+       __le16  type;
+       #define FUNC_BACKING_STORE_QCFG_V2_REQ_TYPE_QP          0x0UL
+       #define FUNC_BACKING_STORE_QCFG_V2_REQ_TYPE_SRQ         0x1UL
+       #define FUNC_BACKING_STORE_QCFG_V2_REQ_TYPE_CQ          0x2UL
+       #define FUNC_BACKING_STORE_QCFG_V2_REQ_TYPE_VNIC        0x3UL
+       #define FUNC_BACKING_STORE_QCFG_V2_REQ_TYPE_STAT        0x4UL
+       #define FUNC_BACKING_STORE_QCFG_V2_REQ_TYPE_SP_TQM_RING 0x5UL
+       #define FUNC_BACKING_STORE_QCFG_V2_REQ_TYPE_FP_TQM_RING 0x6UL
+       #define FUNC_BACKING_STORE_QCFG_V2_REQ_TYPE_MRAV        0xeUL
+       #define FUNC_BACKING_STORE_QCFG_V2_REQ_TYPE_TIM         0xfUL
+       #define FUNC_BACKING_STORE_QCFG_V2_REQ_TYPE_TKC         0x13UL
+       #define FUNC_BACKING_STORE_QCFG_V2_REQ_TYPE_RKC         0x14UL
+       #define FUNC_BACKING_STORE_QCFG_V2_REQ_TYPE_MP_TQM_RING 0x15UL
+       #define FUNC_BACKING_STORE_QCFG_V2_REQ_TYPE_INVALID     0xffffUL
+       #define FUNC_BACKING_STORE_QCFG_V2_REQ_TYPE_LAST       FUNC_BACKING_STORE_QCFG_V2_REQ_TYPE_INVALID
+       __le16  instance;
+       u8      rsvd[4];
+};
+
+/* hwrm_func_backing_store_qcfg_v2_output (size:448b/56B) */
+struct hwrm_func_backing_store_qcfg_v2_output {
+       __le16  error_code;
+       __le16  req_type;
+       __le16  seq_id;
+       __le16  resp_len;
+       __le16  type;
+       #define FUNC_BACKING_STORE_QCFG_V2_RESP_TYPE_QP          0x0UL
+       #define FUNC_BACKING_STORE_QCFG_V2_RESP_TYPE_SRQ         0x1UL
+       #define FUNC_BACKING_STORE_QCFG_V2_RESP_TYPE_CQ          0x2UL
+       #define FUNC_BACKING_STORE_QCFG_V2_RESP_TYPE_VNIC        0x3UL
+       #define FUNC_BACKING_STORE_QCFG_V2_RESP_TYPE_STAT        0x4UL
+       #define FUNC_BACKING_STORE_QCFG_V2_RESP_TYPE_SP_TQM_RING 0x5UL
+       #define FUNC_BACKING_STORE_QCFG_V2_RESP_TYPE_FP_TQM_RING 0x6UL
+       #define FUNC_BACKING_STORE_QCFG_V2_RESP_TYPE_MRAV        0xeUL
+       #define FUNC_BACKING_STORE_QCFG_V2_RESP_TYPE_TIM         0xfUL
+       #define FUNC_BACKING_STORE_QCFG_V2_RESP_TYPE_TKC         0x13UL
+       #define FUNC_BACKING_STORE_QCFG_V2_RESP_TYPE_RKC         0x14UL
+       #define FUNC_BACKING_STORE_QCFG_V2_RESP_TYPE_MP_TQM_RING 0x15UL
+       #define FUNC_BACKING_STORE_QCFG_V2_RESP_TYPE_INVALID     0xffffUL
+       #define FUNC_BACKING_STORE_QCFG_V2_RESP_TYPE_LAST       FUNC_BACKING_STORE_QCFG_V2_RESP_TYPE_INVALID
+       __le16  instance;
+       __le32  flags;
+       __le64  page_dir;
+       __le32  num_entries;
+       u8      page_size_pbl_level;
+       #define FUNC_BACKING_STORE_QCFG_V2_RESP_PBL_LEVEL_MASK  0xfUL
+       #define FUNC_BACKING_STORE_QCFG_V2_RESP_PBL_LEVEL_SFT   0
+       #define FUNC_BACKING_STORE_QCFG_V2_RESP_PBL_LEVEL_LVL_0   0x0UL
+       #define FUNC_BACKING_STORE_QCFG_V2_RESP_PBL_LEVEL_LVL_1   0x1UL
+       #define FUNC_BACKING_STORE_QCFG_V2_RESP_PBL_LEVEL_LVL_2   0x2UL
+       #define FUNC_BACKING_STORE_QCFG_V2_RESP_PBL_LEVEL_LAST   FUNC_BACKING_STORE_QCFG_V2_RESP_PBL_LEVEL_LVL_2
+       #define FUNC_BACKING_STORE_QCFG_V2_RESP_PAGE_SIZE_MASK  0xf0UL
+       #define FUNC_BACKING_STORE_QCFG_V2_RESP_PAGE_SIZE_SFT   4
+       #define FUNC_BACKING_STORE_QCFG_V2_RESP_PAGE_SIZE_PG_4K   (0x0UL << 4)
+       #define FUNC_BACKING_STORE_QCFG_V2_RESP_PAGE_SIZE_PG_8K   (0x1UL << 4)
+       #define FUNC_BACKING_STORE_QCFG_V2_RESP_PAGE_SIZE_PG_64K  (0x2UL << 4)
+       #define FUNC_BACKING_STORE_QCFG_V2_RESP_PAGE_SIZE_PG_2M   (0x3UL << 4)
+       #define FUNC_BACKING_STORE_QCFG_V2_RESP_PAGE_SIZE_PG_8M   (0x4UL << 4)
+       #define FUNC_BACKING_STORE_QCFG_V2_RESP_PAGE_SIZE_PG_1G   (0x5UL << 4)
+       #define FUNC_BACKING_STORE_QCFG_V2_RESP_PAGE_SIZE_LAST   FUNC_BACKING_STORE_QCFG_V2_RESP_PAGE_SIZE_PG_1G
+       u8      subtype_valid_cnt;
+       u8      rsvd[2];
+       __le32  split_entry_0;
+       __le32  split_entry_1;
+       __le32  split_entry_2;
+       __le32  split_entry_3;
+       u8      rsvd2[7];
+       u8      valid;
+};
+
+/* qpc_split_entries (size:128b/16B) */
+struct qpc_split_entries {
+       __le32  qp_num_l2_entries;
+       __le32  qp_num_qp1_entries;
+       __le32  rsvd[2];
+};
+
+/* srq_split_entries (size:128b/16B) */
+struct srq_split_entries {
+       __le32  srq_num_l2_entries;
+       __le32  rsvd;
+       __le32  rsvd2[2];
+};
+
+/* cq_split_entries (size:128b/16B) */
+struct cq_split_entries {
+       __le32  cq_num_l2_entries;
+       __le32  rsvd;
+       __le32  rsvd2[2];
+};
+
+/* vnic_split_entries (size:128b/16B) */
+struct vnic_split_entries {
+       __le32  vnic_num_vnic_entries;
+       __le32  rsvd;
+       __le32  rsvd2[2];
+};
+
+/* mrav_split_entries (size:128b/16B) */
+struct mrav_split_entries {
+       __le32  mrav_num_av_entries;
+       __le32  rsvd;
+       __le32  rsvd2[2];
+};
+
+/* hwrm_func_backing_store_qcaps_v2_input (size:192b/24B) */
+struct hwrm_func_backing_store_qcaps_v2_input {
+       __le16  req_type;
+       __le16  cmpl_ring;
+       __le16  seq_id;
+       __le16  target_id;
+       __le64  resp_addr;
+       __le16  type;
+       #define FUNC_BACKING_STORE_QCAPS_V2_REQ_TYPE_QP          0x0UL
+       #define FUNC_BACKING_STORE_QCAPS_V2_REQ_TYPE_SRQ         0x1UL
+       #define FUNC_BACKING_STORE_QCAPS_V2_REQ_TYPE_CQ          0x2UL
+       #define FUNC_BACKING_STORE_QCAPS_V2_REQ_TYPE_VNIC        0x3UL
+       #define FUNC_BACKING_STORE_QCAPS_V2_REQ_TYPE_STAT        0x4UL
+       #define FUNC_BACKING_STORE_QCAPS_V2_REQ_TYPE_SP_TQM_RING 0x5UL
+       #define FUNC_BACKING_STORE_QCAPS_V2_REQ_TYPE_FP_TQM_RING 0x6UL
+       #define FUNC_BACKING_STORE_QCAPS_V2_REQ_TYPE_MRAV        0xeUL
+       #define FUNC_BACKING_STORE_QCAPS_V2_REQ_TYPE_TIM         0xfUL
+       #define FUNC_BACKING_STORE_QCAPS_V2_REQ_TYPE_TKC         0x13UL
+       #define FUNC_BACKING_STORE_QCAPS_V2_REQ_TYPE_RKC         0x14UL
+       #define FUNC_BACKING_STORE_QCAPS_V2_REQ_TYPE_MP_TQM_RING 0x15UL
+       #define FUNC_BACKING_STORE_QCAPS_V2_REQ_TYPE_INVALID     0xffffUL
+       #define FUNC_BACKING_STORE_QCAPS_V2_REQ_TYPE_LAST       FUNC_BACKING_STORE_QCAPS_V2_REQ_TYPE_INVALID
+       u8      rsvd[6];
+};
+
+/* hwrm_func_backing_store_qcaps_v2_output (size:448b/56B) */
+struct hwrm_func_backing_store_qcaps_v2_output {
+       __le16  error_code;
+       __le16  req_type;
+       __le16  seq_id;
+       __le16  resp_len;
+       __le16  type;
+       #define FUNC_BACKING_STORE_QCAPS_V2_RESP_TYPE_QP          0x0UL
+       #define FUNC_BACKING_STORE_QCAPS_V2_RESP_TYPE_SRQ         0x1UL
+       #define FUNC_BACKING_STORE_QCAPS_V2_RESP_TYPE_CQ          0x2UL
+       #define FUNC_BACKING_STORE_QCAPS_V2_RESP_TYPE_VNIC        0x3UL
+       #define FUNC_BACKING_STORE_QCAPS_V2_RESP_TYPE_STAT        0x4UL
+       #define FUNC_BACKING_STORE_QCAPS_V2_RESP_TYPE_SP_TQM_RING 0x5UL
+       #define FUNC_BACKING_STORE_QCAPS_V2_RESP_TYPE_FP_TQM_RING 0x6UL
+       #define FUNC_BACKING_STORE_QCAPS_V2_RESP_TYPE_MRAV        0xeUL
+       #define FUNC_BACKING_STORE_QCAPS_V2_RESP_TYPE_TIM         0xfUL
+       #define FUNC_BACKING_STORE_QCAPS_V2_RESP_TYPE_TKC         0x13UL
+       #define FUNC_BACKING_STORE_QCAPS_V2_RESP_TYPE_RKC         0x14UL
+       #define FUNC_BACKING_STORE_QCAPS_V2_RESP_TYPE_MP_TQM_RING 0x15UL
+       #define FUNC_BACKING_STORE_QCAPS_V2_RESP_TYPE_INVALID     0xffffUL
+       #define FUNC_BACKING_STORE_QCAPS_V2_RESP_TYPE_LAST       FUNC_BACKING_STORE_QCAPS_V2_RESP_TYPE_INVALID
+       __le16  entry_size;
+       __le32  flags;
+       #define FUNC_BACKING_STORE_QCAPS_V2_RESP_FLAGS_ENABLE_CTX_KIND_INIT     0x1UL
+       #define FUNC_BACKING_STORE_QCAPS_V2_RESP_FLAGS_TYPE_VALID               0x2UL
+       __le32  instance_bit_map;
+       u8      ctx_init_value;
+       u8      ctx_init_offset;
+       u8      entry_multiple;
+       u8      rsvd;
+       __le32  max_num_entries;
+       __le32  min_num_entries;
+       __le16  next_valid_type;
+       u8      subtype_valid_cnt;
+       u8      rsvd2;
+       __le32  split_entry_0;
+       __le32  split_entry_1;
+       __le32  split_entry_2;
+       __le32  split_entry_3;
+       u8      rsvd3[3];
+       u8      valid;
+};
+
 /* hwrm_func_drv_if_change_input (size:192b/24B) */
 struct hwrm_func_drv_if_change_input {
        __le16  req_type;
@@ -3741,7 +4086,7 @@ struct hwrm_port_phy_qcfg_output {
        u8      valid;
 };
 
-/* hwrm_port_mac_cfg_input (size:384b/48B) */
+/* hwrm_port_mac_cfg_input (size:448b/56B) */
 struct hwrm_port_mac_cfg_input {
        __le16  req_type;
        __le16  cmpl_ring;
@@ -3807,7 +4152,8 @@ struct hwrm_port_mac_cfg_input {
        #define PORT_MAC_CFG_REQ_COS_FIELD_CFG_DEFAULT_COS_SFT           5
        u8      unused_0[3];
        __le32  ptp_freq_adj_ppb;
-       __le32  ptp_adj_phase;
+       u8      unused_1[4];
+       __le64  ptp_adj_phase;
 };
 
 /* hwrm_port_mac_cfg_output (size:128b/16B) */
@@ -3850,6 +4196,7 @@ struct hwrm_port_mac_ptp_qcfg_output {
        #define PORT_MAC_PTP_QCFG_RESP_FLAGS_ONE_STEP_TX_TS                      0x4UL
        #define PORT_MAC_PTP_QCFG_RESP_FLAGS_HWRM_ACCESS                         0x8UL
        #define PORT_MAC_PTP_QCFG_RESP_FLAGS_PARTIAL_DIRECT_ACCESS_REF_CLOCK     0x10UL
+       #define PORT_MAC_PTP_QCFG_RESP_FLAGS_RTC_CONFIGURED                      0x20UL
        u8      unused_0[3];
        __le32  rx_ts_reg_off_lower;
        __le32  rx_ts_reg_off_upper;
@@ -4339,7 +4686,8 @@ struct hwrm_port_phy_qcaps_output {
        #define PORT_PHY_QCAPS_RESP_PORT_CNT_2       0x2UL
        #define PORT_PHY_QCAPS_RESP_PORT_CNT_3       0x3UL
        #define PORT_PHY_QCAPS_RESP_PORT_CNT_4       0x4UL
-       #define PORT_PHY_QCAPS_RESP_PORT_CNT_LAST   PORT_PHY_QCAPS_RESP_PORT_CNT_4
+       #define PORT_PHY_QCAPS_RESP_PORT_CNT_12      0xcUL
+       #define PORT_PHY_QCAPS_RESP_PORT_CNT_LAST   PORT_PHY_QCAPS_RESP_PORT_CNT_12
        __le16  supported_speeds_force_mode;
        #define PORT_PHY_QCAPS_RESP_SUPPORTED_SPEEDS_FORCE_MODE_100MBHD     0x1UL
        #define PORT_PHY_QCAPS_RESP_SUPPORTED_SPEEDS_FORCE_MODE_100MB       0x2UL
@@ -4399,7 +4747,7 @@ struct hwrm_port_phy_qcaps_output {
        __le16  flags2;
        #define PORT_PHY_QCAPS_RESP_FLAGS2_PAUSE_UNSUPPORTED     0x1UL
        #define PORT_PHY_QCAPS_RESP_FLAGS2_PFC_UNSUPPORTED       0x2UL
-       u8      unused_0[1];
+       u8      internal_port_cnt;
        u8      valid;
 };
 
@@ -6221,12 +6569,13 @@ struct hwrm_vnic_rss_cfg_input {
        __le16  target_id;
        __le64  resp_addr;
        __le32  hash_type;
-       #define VNIC_RSS_CFG_REQ_HASH_TYPE_IPV4         0x1UL
-       #define VNIC_RSS_CFG_REQ_HASH_TYPE_TCP_IPV4     0x2UL
-       #define VNIC_RSS_CFG_REQ_HASH_TYPE_UDP_IPV4     0x4UL
-       #define VNIC_RSS_CFG_REQ_HASH_TYPE_IPV6         0x8UL
-       #define VNIC_RSS_CFG_REQ_HASH_TYPE_TCP_IPV6     0x10UL
-       #define VNIC_RSS_CFG_REQ_HASH_TYPE_UDP_IPV6     0x20UL
+       #define VNIC_RSS_CFG_REQ_HASH_TYPE_IPV4                0x1UL
+       #define VNIC_RSS_CFG_REQ_HASH_TYPE_TCP_IPV4            0x2UL
+       #define VNIC_RSS_CFG_REQ_HASH_TYPE_UDP_IPV4            0x4UL
+       #define VNIC_RSS_CFG_REQ_HASH_TYPE_IPV6                0x8UL
+       #define VNIC_RSS_CFG_REQ_HASH_TYPE_TCP_IPV6            0x10UL
+       #define VNIC_RSS_CFG_REQ_HASH_TYPE_UDP_IPV6            0x20UL
+       #define VNIC_RSS_CFG_REQ_HASH_TYPE_IPV6_FLOW_LABEL     0x40UL
        __le16  vnic_id;
        u8      ring_table_pair_index;
        u8      hash_mode_flags;
@@ -7898,6 +8247,7 @@ struct hwrm_cfa_adv_flow_mgnt_qcaps_output {
        u8      valid;
 };
 
+/* hwrm_tunnel_dst_port_query_input (size:192b/24B) */
 struct hwrm_tunnel_dst_port_query_input {
        __le16  req_type;
        __le16  cmpl_ring;
@@ -8909,6 +9259,50 @@ struct hwrm_dbg_qcfg_output {
        u8      valid;
 };
 
+/* hwrm_dbg_crashdump_medium_cfg_input (size:320b/40B) */
+struct hwrm_dbg_crashdump_medium_cfg_input {
+       __le16  req_type;
+       __le16  cmpl_ring;
+       __le16  seq_id;
+       __le16  target_id;
+       __le64  resp_addr;
+       __le16  output_dest_flags;
+       #define DBG_CRASHDUMP_MEDIUM_CFG_REQ_TYPE_DDR     0x1UL
+       __le16  pg_size_lvl;
+       #define DBG_CRASHDUMP_MEDIUM_CFG_REQ_LVL_MASK      0x3UL
+       #define DBG_CRASHDUMP_MEDIUM_CFG_REQ_LVL_SFT       0
+       #define DBG_CRASHDUMP_MEDIUM_CFG_REQ_LVL_LVL_0       0x0UL
+       #define DBG_CRASHDUMP_MEDIUM_CFG_REQ_LVL_LVL_1       0x1UL
+       #define DBG_CRASHDUMP_MEDIUM_CFG_REQ_LVL_LVL_2       0x2UL
+       #define DBG_CRASHDUMP_MEDIUM_CFG_REQ_LVL_LAST       DBG_CRASHDUMP_MEDIUM_CFG_REQ_LVL_LVL_2
+       #define DBG_CRASHDUMP_MEDIUM_CFG_REQ_PG_SIZE_MASK  0x1cUL
+       #define DBG_CRASHDUMP_MEDIUM_CFG_REQ_PG_SIZE_SFT   2
+       #define DBG_CRASHDUMP_MEDIUM_CFG_REQ_PG_SIZE_PG_4K   (0x0UL << 2)
+       #define DBG_CRASHDUMP_MEDIUM_CFG_REQ_PG_SIZE_PG_8K   (0x1UL << 2)
+       #define DBG_CRASHDUMP_MEDIUM_CFG_REQ_PG_SIZE_PG_64K  (0x2UL << 2)
+       #define DBG_CRASHDUMP_MEDIUM_CFG_REQ_PG_SIZE_PG_2M   (0x3UL << 2)
+       #define DBG_CRASHDUMP_MEDIUM_CFG_REQ_PG_SIZE_PG_8M   (0x4UL << 2)
+       #define DBG_CRASHDUMP_MEDIUM_CFG_REQ_PG_SIZE_PG_1G   (0x5UL << 2)
+       #define DBG_CRASHDUMP_MEDIUM_CFG_REQ_PG_SIZE_LAST   DBG_CRASHDUMP_MEDIUM_CFG_REQ_PG_SIZE_PG_1G
+       #define DBG_CRASHDUMP_MEDIUM_CFG_REQ_UNUSED11_MASK 0xffe0UL
+       #define DBG_CRASHDUMP_MEDIUM_CFG_REQ_UNUSED11_SFT  5
+       __le32  size;
+       __le32  coredump_component_disable_flags;
+       #define DBG_CRASHDUMP_MEDIUM_CFG_REQ_NVRAM     0x1UL
+       __le32  unused_0;
+       __le64  pbl;
+};
+
+/* hwrm_dbg_crashdump_medium_cfg_output (size:128b/16B) */
+struct hwrm_dbg_crashdump_medium_cfg_output {
+       __le16  error_code;
+       __le16  req_type;
+       __le16  seq_id;
+       __le16  resp_len;
+       u8      unused_1[7];
+       u8      valid;
+};
+
 /* coredump_segment_record (size:128b/16B) */
 struct coredump_segment_record {
        __le16  component_id;
@@ -9372,8 +9766,35 @@ struct hwrm_nvm_install_update_output {
        __le16  resp_len;
        __le64  installed_items;
        u8      result;
-       #define NVM_INSTALL_UPDATE_RESP_RESULT_SUCCESS 0x0UL
-       #define NVM_INSTALL_UPDATE_RESP_RESULT_LAST   NVM_INSTALL_UPDATE_RESP_RESULT_SUCCESS
+       #define NVM_INSTALL_UPDATE_RESP_RESULT_SUCCESS                      0x0UL
+       #define NVM_INSTALL_UPDATE_RESP_RESULT_FAILURE                      0xffUL
+       #define NVM_INSTALL_UPDATE_RESP_RESULT_MALLOC_FAILURE               0xfdUL
+       #define NVM_INSTALL_UPDATE_RESP_RESULT_INVALID_INDEX_PARAMETER      0xfbUL
+       #define NVM_INSTALL_UPDATE_RESP_RESULT_INVALID_TYPE_PARAMETER       0xf3UL
+       #define NVM_INSTALL_UPDATE_RESP_RESULT_INVALID_PREREQUISITE         0xf2UL
+       #define NVM_INSTALL_UPDATE_RESP_RESULT_INVALID_FILE_HEADER          0xecUL
+       #define NVM_INSTALL_UPDATE_RESP_RESULT_INVALID_SIGNATURE            0xebUL
+       #define NVM_INSTALL_UPDATE_RESP_RESULT_INVALID_PROP_STREAM          0xeaUL
+       #define NVM_INSTALL_UPDATE_RESP_RESULT_INVALID_PROP_LENGTH          0xe9UL
+       #define NVM_INSTALL_UPDATE_RESP_RESULT_INVALID_MANIFEST             0xe8UL
+       #define NVM_INSTALL_UPDATE_RESP_RESULT_INVALID_TRAILER              0xe7UL
+       #define NVM_INSTALL_UPDATE_RESP_RESULT_INVALID_CHECKSUM             0xe6UL
+       #define NVM_INSTALL_UPDATE_RESP_RESULT_INVALID_ITEM_CHECKSUM        0xe5UL
+       #define NVM_INSTALL_UPDATE_RESP_RESULT_INVALID_DATA_LENGTH          0xe4UL
+       #define NVM_INSTALL_UPDATE_RESP_RESULT_INVALID_DIRECTIVE            0xe1UL
+       #define NVM_INSTALL_UPDATE_RESP_RESULT_UNSUPPORTED_CHIP_REV         0xceUL
+       #define NVM_INSTALL_UPDATE_RESP_RESULT_UNSUPPORTED_DEVICE_ID        0xcdUL
+       #define NVM_INSTALL_UPDATE_RESP_RESULT_UNSUPPORTED_SUBSYS_VENDOR    0xccUL
+       #define NVM_INSTALL_UPDATE_RESP_RESULT_UNSUPPORTED_SUBSYS_ID        0xcbUL
+       #define NVM_INSTALL_UPDATE_RESP_RESULT_UNSUPPORTED_PLATFORM         0xc5UL
+       #define NVM_INSTALL_UPDATE_RESP_RESULT_DUPLICATE_ITEM               0xc4UL
+       #define NVM_INSTALL_UPDATE_RESP_RESULT_ZERO_LENGTH_ITEM             0xc3UL
+       #define NVM_INSTALL_UPDATE_RESP_RESULT_INSTALL_CHECKSUM_ERROR       0xb9UL
+       #define NVM_INSTALL_UPDATE_RESP_RESULT_INSTALL_DATA_ERROR           0xb8UL
+       #define NVM_INSTALL_UPDATE_RESP_RESULT_INSTALL_AUTHENTICATION_ERROR 0xb7UL
+       #define NVM_INSTALL_UPDATE_RESP_RESULT_ITEM_NOT_FOUND               0xb0UL
+       #define NVM_INSTALL_UPDATE_RESP_RESULT_ITEM_LOCKED                  0xa7UL
+       #define NVM_INSTALL_UPDATE_RESP_RESULT_LAST                        NVM_INSTALL_UPDATE_RESP_RESULT_ITEM_LOCKED
        u8      problem_item;
        #define NVM_INSTALL_UPDATE_RESP_PROBLEM_ITEM_NONE    0x0UL
        #define NVM_INSTALL_UPDATE_RESP_PROBLEM_ITEM_PACKAGE 0xffUL
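
The result byte now reports fine-grained failure reasons instead of only SUCCESS. A hedged helper sketch mapping a few of them to log strings; coverage is deliberately partial and the function name is illustrative:

static const char *example_nvm_result_str(u8 result)
{
	switch (result) {
	case NVM_INSTALL_UPDATE_RESP_RESULT_SUCCESS:
		return "success";
	case NVM_INSTALL_UPDATE_RESP_RESULT_INVALID_SIGNATURE:
		return "invalid signature";
	case NVM_INSTALL_UPDATE_RESP_RESULT_UNSUPPORTED_DEVICE_ID:
		return "unsupported device id";
	case NVM_INSTALL_UPDATE_RESP_RESULT_ITEM_LOCKED:
		return "item locked";
	default:
		return "unknown failure";
	}
}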
index 4852096..a0b321a 100644 (file)
 #include "bnxt_hwrm.h"
 #include "bnxt_ptp.h"
 
+static int bnxt_ptp_cfg_settime(struct bnxt *bp, u64 time)
+{
+       struct hwrm_func_ptp_cfg_input *req;
+       int rc;
+
+       rc = hwrm_req_init(bp, req, HWRM_FUNC_PTP_CFG);
+       if (rc)
+               return rc;
+
+       req->enables = cpu_to_le16(FUNC_PTP_CFG_REQ_ENABLES_PTP_SET_TIME);
+       req->ptp_set_time = cpu_to_le64(time);
+       return hwrm_req_send(bp, req);
+}
+
 int bnxt_ptp_parse(struct sk_buff *skb, u16 *seq_id, u16 *hdr_off)
 {
        unsigned int ptp_class;
@@ -48,6 +62,9 @@ static int bnxt_ptp_settime(struct ptp_clock_info *ptp_info,
                                                ptp_info);
        u64 ns = timespec64_to_ns(ts);
 
+       if (ptp->bp->fw_cap & BNXT_FW_CAP_PTP_RTC)
+               return bnxt_ptp_cfg_settime(ptp->bp, ns);
+
        spin_lock_bh(&ptp->ptp_lock);
        timecounter_init(&ptp->tc, &ptp->cc, ns);
        spin_unlock_bh(&ptp->ptp_lock);
@@ -131,11 +148,47 @@ static int bnxt_ptp_gettimex(struct ptp_clock_info *ptp_info,
        return 0;
 }
 
+/* Caller holds ptp_lock */
+void bnxt_ptp_update_current_time(struct bnxt *bp)
+{
+       struct bnxt_ptp_cfg *ptp = bp->ptp_cfg;
+
+       bnxt_refclk_read(ptp->bp, NULL, &ptp->current_time);
+       WRITE_ONCE(ptp->old_time, ptp->current_time);
+}
+
+static int bnxt_ptp_adjphc(struct bnxt_ptp_cfg *ptp, s64 delta)
+{
+       struct hwrm_port_mac_cfg_input *req;
+       int rc;
+
+       rc = hwrm_req_init(ptp->bp, req, HWRM_PORT_MAC_CFG);
+       if (rc)
+               return rc;
+
+       req->enables = cpu_to_le32(PORT_MAC_CFG_REQ_ENABLES_PTP_ADJ_PHASE);
+       req->ptp_adj_phase = cpu_to_le64(delta);
+
+       rc = hwrm_req_send(ptp->bp, req);
+       if (rc) {
+               netdev_err(ptp->bp->dev, "ptp adjphc failed. rc = %x\n", rc);
+       } else {
+               spin_lock_bh(&ptp->ptp_lock);
+               bnxt_ptp_update_current_time(ptp->bp);
+               spin_unlock_bh(&ptp->ptp_lock);
+       }
+
+       return rc;
+}
+
 static int bnxt_ptp_adjtime(struct ptp_clock_info *ptp_info, s64 delta)
 {
        struct bnxt_ptp_cfg *ptp = container_of(ptp_info, struct bnxt_ptp_cfg,
                                                ptp_info);
 
+       if (ptp->bp->fw_cap & BNXT_FW_CAP_PTP_RTC)
+               return bnxt_ptp_adjphc(ptp, delta);
+
        spin_lock_bh(&ptp->ptp_lock);
        timecounter_adjtime(&ptp->tc, delta);
        spin_unlock_bh(&ptp->ptp_lock);
@@ -714,7 +767,70 @@ static bool bnxt_pps_config_ok(struct bnxt *bp)
        return !(bp->fw_cap & BNXT_FW_CAP_PTP_PPS) == !ptp->ptp_info.pin_config;
 }
 
-int bnxt_ptp_init(struct bnxt *bp)
+static void bnxt_ptp_timecounter_init(struct bnxt *bp, bool init_tc)
+{
+       struct bnxt_ptp_cfg *ptp = bp->ptp_cfg;
+
+       if (!ptp->ptp_clock) {
+               memset(&ptp->cc, 0, sizeof(ptp->cc));
+               ptp->cc.read = bnxt_cc_read;
+               ptp->cc.mask = CYCLECOUNTER_MASK(48);
+               ptp->cc.shift = 0;
+               ptp->cc.mult = 1;
+               ptp->next_overflow_check = jiffies + BNXT_PHC_OVERFLOW_PERIOD;
+       }
+       if (init_tc)
+               timecounter_init(&ptp->tc, &ptp->cc, ktime_to_ns(ktime_get_real()));
+}
+
+/* Caller holds ptp_lock */
+void bnxt_ptp_rtc_timecounter_init(struct bnxt_ptp_cfg *ptp, u64 ns)
+{
+       timecounter_init(&ptp->tc, &ptp->cc, ns);
+       /* For RTC, cycle_last must be in sync with the timecounter value. */
+       ptp->tc.cycle_last = ns & ptp->cc.mask;
+}
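
The cyclecounter set up in bnxt_ptp_timecounter_init() above uses mult = 1 and shift = 0, so timecounter reads map the raw 48-bit PHC count 1:1 to nanoseconds; that is why cycle_last can be seeded directly from ns masked to 48 bits. A self-contained sketch of the same seeding under those assumptions, with hypothetical names:

#include <linux/timecounter.h>

static u64 example_cc_read(const struct cyclecounter *cc)
{
	return 0;	/* stands in for reading the 48-bit PHC register */
}

static void example_rtc_seed(struct timecounter *tc,
			     struct cyclecounter *cc, u64 ns)
{
	cc->read  = example_cc_read;
	cc->mask  = CYCLECOUNTER_MASK(48);
	cc->mult  = 1;	/* 1 count == 1 ns */
	cc->shift = 0;
	timecounter_init(tc, cc, ns);
	tc->cycle_last = ns & cc->mask;	/* keep counter and time in sync */
}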
+
+int bnxt_ptp_init_rtc(struct bnxt *bp, bool phc_cfg)
+{
+       struct timespec64 tsp;
+       u64 ns;
+       int rc;
+
+       if (!bp->ptp_cfg || !(bp->fw_cap & BNXT_FW_CAP_PTP_RTC))
+               return -ENODEV;
+
+       if (!phc_cfg) {
+               ktime_get_real_ts64(&tsp);
+               ns = timespec64_to_ns(&tsp);
+               rc = bnxt_ptp_cfg_settime(bp, ns);
+               if (rc)
+                       return rc;
+       } else {
+               rc = bnxt_hwrm_port_ts_query(bp, PORT_TS_QUERY_REQ_FLAGS_CURRENT_TIME, &ns);
+               if (rc)
+                       return rc;
+       }
+       spin_lock_bh(&bp->ptp_cfg->ptp_lock);
+       bnxt_ptp_rtc_timecounter_init(bp->ptp_cfg, ns);
+       spin_unlock_bh(&bp->ptp_cfg->ptp_lock);
+
+       return 0;
+}
+
+static void bnxt_ptp_free(struct bnxt *bp)
+{
+       struct bnxt_ptp_cfg *ptp = bp->ptp_cfg;
+
+       if (ptp->ptp_clock) {
+               ptp_clock_unregister(ptp->ptp_clock);
+               ptp->ptp_clock = NULL;
+               kfree(ptp->ptp_info.pin_config);
+               ptp->ptp_info.pin_config = NULL;
+       }
+}
+
+int bnxt_ptp_init(struct bnxt *bp, bool phc_cfg)
 {
        struct bnxt_ptp_cfg *ptp = bp->ptp_cfg;
        int rc;
@@ -726,26 +842,23 @@ int bnxt_ptp_init(struct bnxt *bp)
        if (rc)
                return rc;
 
+       if (bp->fw_cap & BNXT_FW_CAP_PTP_RTC) {
+               bnxt_ptp_timecounter_init(bp, false);
+               rc = bnxt_ptp_init_rtc(bp, phc_cfg);
+               if (rc)
+                       goto out;
+       }
+
        if (ptp->ptp_clock && bnxt_pps_config_ok(bp))
                return 0;
 
-       if (ptp->ptp_clock) {
-               ptp_clock_unregister(ptp->ptp_clock);
-               ptp->ptp_clock = NULL;
-               kfree(ptp->ptp_info.pin_config);
-               ptp->ptp_info.pin_config = NULL;
-       }
+       bnxt_ptp_free(bp);
+
        atomic_set(&ptp->tx_avail, BNXT_MAX_TX_TS);
        spin_lock_init(&ptp->ptp_lock);
 
-       memset(&ptp->cc, 0, sizeof(ptp->cc));
-       ptp->cc.read = bnxt_cc_read;
-       ptp->cc.mask = CYCLECOUNTER_MASK(48);
-       ptp->cc.shift = 0;
-       ptp->cc.mult = 1;
-
-       ptp->next_overflow_check = jiffies + BNXT_PHC_OVERFLOW_PERIOD;
-       timecounter_init(&ptp->tc, &ptp->cc, ktime_to_ns(ktime_get_real()));
+       if (!(bp->fw_cap & BNXT_FW_CAP_PTP_RTC))
+               bnxt_ptp_timecounter_init(bp, true);
 
        ptp->ptp_info = bnxt_ptp_caps;
        if ((bp->fw_cap & BNXT_FW_CAP_PTP_PPS)) {
@@ -757,8 +870,8 @@ int bnxt_ptp_init(struct bnxt *bp)
                int err = PTR_ERR(ptp->ptp_clock);
 
                ptp->ptp_clock = NULL;
-               bnxt_unmap_ptp_regs(bp);
-               return err;
+               rc = err;
+               goto out;
        }
        if (bp->flags & BNXT_FLAG_CHIP_P5) {
                spin_lock_bh(&ptp->ptp_lock);
@@ -768,6 +881,11 @@ int bnxt_ptp_init(struct bnxt *bp)
                ptp_schedule_worker(ptp->ptp_clock, 0);
        }
        return 0;
+
+out:
+       bnxt_ptp_free(bp);
+       bnxt_unmap_ptp_regs(bp);
+       return rc;
 }
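
The new phc_cfg parameter tells bnxt_ptp_init() whether firmware has already programmed the PHC RTC, which selects between pushing the host clock to firmware and reading the firmware time back in bnxt_ptp_init_rtc(). A hedged sketch of deriving it from the PORT_MAC_PTP_QCFG flag added earlier; the real call site lives in bnxt.c, outside this hunk:

	bool phc_cfg = !!(resp->flags & PORT_MAC_PTP_QCFG_RESP_FLAGS_RTC_CONFIGURED);

	rc = bnxt_ptp_init(bp, phc_cfg);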
 
 void bnxt_ptp_clear(struct bnxt *bp)
index 7c528e1..373baf4 100644 (file)
@@ -131,12 +131,15 @@ do {                                              \
 #endif
 
 int bnxt_ptp_parse(struct sk_buff *skb, u16 *seq_id, u16 *hdr_off);
+void bnxt_ptp_update_current_time(struct bnxt *bp);
 void bnxt_ptp_pps_event(struct bnxt *bp, u32 data1, u32 data2);
 void bnxt_ptp_reapply_pps(struct bnxt *bp);
 int bnxt_hwtstamp_set(struct net_device *dev, struct ifreq *ifr);
 int bnxt_hwtstamp_get(struct net_device *dev, struct ifreq *ifr);
 int bnxt_get_tx_ts_p5(struct bnxt *bp, struct sk_buff *skb);
 int bnxt_get_rx_ts_p5(struct bnxt *bp, u64 *ts, u32 pkt_ts);
-int bnxt_ptp_init(struct bnxt *bp);
+void bnxt_ptp_rtc_timecounter_init(struct bnxt_ptp_cfg *ptp, u64 ns);
+int bnxt_ptp_init_rtc(struct bnxt *bp, bool phc_cfg);
+int bnxt_ptp_init(struct bnxt *bp, bool phc_cfg);
 void bnxt_ptp_clear(struct bnxt *bp);
 #endif
index 87f1056..cfe0911 100644 (file)
@@ -1368,7 +1368,7 @@ static int bcmgenet_set_eee(struct net_device *dev, struct ethtool_eee *e)
        if (!p->eee_enabled) {
                bcmgenet_eee_enable_set(dev, false);
        } else {
-               ret = phy_init_eee(dev->phydev, 0);
+               ret = phy_init_eee(dev->phydev, false);
                if (ret) {
                        netif_err(priv, hw, dev, "EEE initialization failed\n");
                        return ret;
index 9ddbee7..f0a7d83 100644 (file)
@@ -12,6 +12,7 @@
 #include <linux/ptp_clock_kernel.h>
 #include <linux/net_tstamp.h>
 #include <linux/interrupt.h>
+#include <linux/phy/phy.h>
 
 #if defined(CONFIG_ARCH_DMA_ADDR_T_64BIT) || defined(CONFIG_MACB_USE_HWSTAMP)
 #define MACB_EXT_DESC
@@ -1291,6 +1292,9 @@ struct macb {
        u32                     wol;
 
        struct macb_ptp_info    *ptp_info;      /* macb-ptp interface */
+
+       struct phy              *sgmii_phy;     /* for ZynqMP SGMII mode */
+
 #ifdef MACB_EXT_DESC
        uint8_t hw_dma_cap;
 #endif
index 98498a7..4c23115 100644 (file)
@@ -34,7 +34,9 @@
 #include <linux/udp.h>
 #include <linux/tcp.h>
 #include <linux/iopoll.h>
+#include <linux/phy/phy.h>
 #include <linux/pm_runtime.h>
+#include <linux/reset.h>
 #include "macb.h"
 
 /* This structure is only used for MACB on SiFive FU540 devices */
@@ -2739,10 +2741,14 @@ static int macb_open(struct net_device *dev)
 
        macb_init_hw(bp);
 
-       err = macb_phylink_connect(bp);
+       err = phy_power_on(bp->sgmii_phy);
        if (err)
                goto reset_hw;
 
+       err = macb_phylink_connect(bp);
+       if (err)
+               goto phy_off;
+
        netif_tx_start_all_queues(dev);
 
        if (bp->ptp_info)
@@ -2750,6 +2756,9 @@ static int macb_open(struct net_device *dev)
 
        return 0;
 
+phy_off:
+       phy_power_off(bp->sgmii_phy);
+
 reset_hw:
        macb_reset_hw(bp);
        for (q = 0, queue = bp->queues; q < bp->num_queues; ++q, ++queue)
@@ -2775,6 +2784,8 @@ static int macb_close(struct net_device *dev)
        phylink_stop(bp->phylink);
        phylink_disconnect_phy(bp->phylink);
 
+       phy_power_off(bp->sgmii_phy);
+
        spin_lock_irqsave(&bp->lock, flags);
        macb_reset_hw(bp);
        netif_carrier_off(dev);
@@ -4544,13 +4555,55 @@ static const struct macb_config np4_config = {
        .usrio = &macb_default_usrio,
 };
 
+static int zynqmp_init(struct platform_device *pdev)
+{
+       struct net_device *dev = platform_get_drvdata(pdev);
+       struct macb *bp = netdev_priv(dev);
+       int ret;
+
+       if (bp->phy_interface == PHY_INTERFACE_MODE_SGMII) {
+               /* Ensure PS-GTR PHY device used in SGMII mode is ready */
+               bp->sgmii_phy = devm_phy_get(&pdev->dev, "sgmii-phy");
+
+               if (IS_ERR(bp->sgmii_phy)) {
+                       ret = PTR_ERR(bp->sgmii_phy);
+                       dev_err_probe(&pdev->dev, ret,
+                                     "failed to get PS-GTR PHY\n");
+                       return ret;
+               }
+
+               ret = phy_init(bp->sgmii_phy);
+               if (ret) {
+                       dev_err(&pdev->dev, "failed to init PS-GTR PHY: %d\n",
+                               ret);
+                       return ret;
+               }
+       }
+
+       /* Fully reset GEM controller at hardware level using zynqmp-reset driver,
+        * if mapped in device tree.
+        */
+       ret = device_reset_optional(&pdev->dev);
+       if (ret) {
+               dev_err_probe(&pdev->dev, ret, "failed to reset controller");
+               phy_exit(bp->sgmii_phy);
+               return ret;
+       }
+
+       ret = macb_init(pdev);
+       if (ret)
+               phy_exit(bp->sgmii_phy);
+
+       return ret;
+}
+
 static const struct macb_config zynqmp_config = {
        .caps = MACB_CAPS_GIGABIT_MODE_AVAILABLE |
                        MACB_CAPS_JUMBO |
                        MACB_CAPS_GEM_HAS_PTP | MACB_CAPS_BD_RD_PREFETCH,
        .dma_burst_length = 16,
        .clk_init = macb_clk_init,
-       .init = macb_init,
+       .init = zynqmp_init,
        .jumbo_max_len = 10240,
        .usrio = &macb_default_usrio,
 };
@@ -4767,7 +4820,7 @@ static int macb_probe(struct platform_device *pdev)
 
        err = macb_mii_init(bp);
        if (err)
-               goto err_out_free_netdev;
+               goto err_out_phy_exit;
 
        netif_carrier_off(dev);
 
@@ -4792,6 +4845,9 @@ err_out_unregister_mdio:
        mdiobus_unregister(bp->mii_bus);
        mdiobus_free(bp->mii_bus);
 
+err_out_phy_exit:
+       phy_exit(bp->sgmii_phy);
+
 err_out_free_netdev:
        free_netdev(dev);
 
@@ -4813,6 +4869,7 @@ static int macb_remove(struct platform_device *pdev)
 
        if (dev) {
                bp = netdev_priv(dev);
+               phy_exit(bp->sgmii_phy);
                mdiobus_unregister(bp->mii_bus);
                mdiobus_free(bp->mii_bus);
 
index ba28aa4..8e07192 100644 (file)
@@ -1539,7 +1539,7 @@ static int liquidio_ptp_adjfreq(struct ptp_clock_info *ptp, s32 ppb)
         * compute the delta in terms of coprocessor clocks.
         */
        delta = (u64)ppb << 32;
-       do_div(delta, oct->coproc_clock_rate);
+       delta = div64_u64(delta, oct->coproc_clock_rate);
 
        spin_lock_irqsave(&lio->ptp_lock, flags);
        comp = lio_pci_readq(oct, CN6XXX_MIO_PTP_CLOCK_COMP);
@@ -1672,7 +1672,7 @@ static void liquidio_ptp_init(struct octeon_device *oct)
        u64 clock_comp, cfg;
 
        clock_comp = (u64)NSEC_PER_SEC << 32;
-       do_div(clock_comp, oct->coproc_clock_rate);
+       clock_comp = div64_u64(clock_comp, oct->coproc_clock_rate);
        lio_pci_writeq(oct, clock_comp, CN6XXX_MIO_PTP_CLOCK_COMP);
 
        /* Enable */
index 574a32f..2f6484d 100644 (file)
@@ -1409,7 +1409,8 @@ static acpi_status bgx_acpi_register_phy(acpi_handle handle,
        struct device *dev = &bgx->pdev->dev;
        struct acpi_device *adev;
 
-       if (acpi_bus_get_device(handle, &adev))
+       adev = acpi_fetch_acpi_dev(handle);
+       if (!adev)
                goto out;
 
        acpi_get_mac_address(dev, adev, bgx->lmac[bgx->acpi_lmac_idx].mac);
index c78b99a..8014eb3 100644 (file)
@@ -2363,11 +2363,13 @@ static void gemini_port_save_mac_addr(struct gemini_ethernet_port *port)
 static int gemini_ethernet_port_probe(struct platform_device *pdev)
 {
        char *port_names[2] = { "ethernet0", "ethernet1" };
+       struct device_node *np = pdev->dev.of_node;
        struct gemini_ethernet_port *port;
        struct device *dev = &pdev->dev;
        struct gemini_ethernet *geth;
        struct net_device *netdev;
        struct device *parent;
+       u8 mac[ETH_ALEN];
        unsigned int id;
        int irq;
        int ret;
@@ -2473,6 +2475,12 @@ static int gemini_ethernet_port_probe(struct platform_device *pdev)
        netif_napi_add(netdev, &port->napi, gmac_napi_poll,
                       DEFAULT_NAPI_WEIGHT);
 
+       ret = of_get_mac_address(np, mac);
+       if (!ret) {
+               dev_info(dev, "Setting macaddr from DT %pM\n", mac);
+               memcpy(port->mac_addr, mac, ETH_ALEN);
+       }
+
        if (is_valid_ether_addr((void *)port->mac_addr)) {
                eth_hw_addr_set(netdev, (u8 *)port->mac_addr);
        } else {
index 3fb39e3..653bde4 100644 (file)
@@ -21,7 +21,7 @@ void pnic_do_nway(struct net_device *dev)
        struct tulip_private *tp = netdev_priv(dev);
        void __iomem *ioaddr = tp->base_addr;
        u32 phy_reg = ioread32(ioaddr + 0xB8);
-       u32 new_csr6 = tp->csr6 & ~0x40C40200;
+       u32 new_csr6;
 
        if (phy_reg & 0x78000000) { /* Ignore baseT4 */
                if (phy_reg & 0x20000000)               dev->if_port = 5;
index c710dc1..8dd7bf9 100644 (file)
@@ -340,7 +340,7 @@ enum wake_event_bits {
 struct netdev_desc {
        __le32 next_desc;
        __le32 status;
-       struct desc_frag { __le32 addr, length; } frag[1];
+       struct desc_frag { __le32 addr, length; } frag;
 };
 
 /* Bits in netdev_desc.status */
@@ -980,8 +980,8 @@ static void tx_timeout(struct net_device *dev, unsigned int txqueue)
                                le32_to_cpu(np->tx_ring[i].next_desc),
                                le32_to_cpu(np->tx_ring[i].status),
                                (le32_to_cpu(np->tx_ring[i].status) >> 2) & 0xff,
-                               le32_to_cpu(np->tx_ring[i].frag[0].addr),
-                               le32_to_cpu(np->tx_ring[i].frag[0].length));
+                               le32_to_cpu(np->tx_ring[i].frag.addr),
+                               le32_to_cpu(np->tx_ring[i].frag.length));
                }
                printk(KERN_DEBUG "TxListPtr=%08x netif_queue_stopped=%d\n",
                        ioread32(np->base + TxListPtr),
@@ -1027,7 +1027,7 @@ static void init_ring(struct net_device *dev)
                np->rx_ring[i].next_desc = cpu_to_le32(np->rx_ring_dma +
                        ((i+1)%RX_RING_SIZE)*sizeof(*np->rx_ring));
                np->rx_ring[i].status = 0;
-               np->rx_ring[i].frag[0].length = 0;
+               np->rx_ring[i].frag.length = 0;
                np->rx_skbuff[i] = NULL;
        }
 
@@ -1039,16 +1039,16 @@ static void init_ring(struct net_device *dev)
                if (skb == NULL)
                        break;
                skb_reserve(skb, 2);    /* 16 byte align the IP header. */
-               np->rx_ring[i].frag[0].addr = cpu_to_le32(
+               np->rx_ring[i].frag.addr = cpu_to_le32(
                        dma_map_single(&np->pci_dev->dev, skb->data,
                                np->rx_buf_sz, DMA_FROM_DEVICE));
                if (dma_mapping_error(&np->pci_dev->dev,
-                                       np->rx_ring[i].frag[0].addr)) {
+                                       np->rx_ring[i].frag.addr)) {
                        dev_kfree_skb(skb);
                        np->rx_skbuff[i] = NULL;
                        break;
                }
-               np->rx_ring[i].frag[0].length = cpu_to_le32(np->rx_buf_sz | LastFrag);
+               np->rx_ring[i].frag.length = cpu_to_le32(np->rx_buf_sz | LastFrag);
        }
        np->dirty_rx = (unsigned int)(i - RX_RING_SIZE);
 
@@ -1097,12 +1097,12 @@ start_tx (struct sk_buff *skb, struct net_device *dev)
 
        txdesc->next_desc = 0;
        txdesc->status = cpu_to_le32 ((entry << 2) | DisableAlign);
-       txdesc->frag[0].addr = cpu_to_le32(dma_map_single(&np->pci_dev->dev,
+       txdesc->frag.addr = cpu_to_le32(dma_map_single(&np->pci_dev->dev,
                                skb->data, skb->len, DMA_TO_DEVICE));
        if (dma_mapping_error(&np->pci_dev->dev,
-                               txdesc->frag[0].addr))
+                               txdesc->frag.addr))
                        goto drop_frame;
-       txdesc->frag[0].length = cpu_to_le32 (skb->len | LastFrag);
+       txdesc->frag.length = cpu_to_le32 (skb->len | LastFrag);
 
        /* Increment cur_tx before tasklet_schedule() */
        np->cur_tx++;
@@ -1151,7 +1151,7 @@ reset_tx (struct net_device *dev)
                skb = np->tx_skbuff[i];
                if (skb) {
                        dma_unmap_single(&np->pci_dev->dev,
-                               le32_to_cpu(np->tx_ring[i].frag[0].addr),
+                               le32_to_cpu(np->tx_ring[i].frag.addr),
                                skb->len, DMA_TO_DEVICE);
                        dev_kfree_skb_any(skb);
                        np->tx_skbuff[i] = NULL;
@@ -1271,12 +1271,12 @@ static irqreturn_t intr_handler(int irq, void *dev_instance)
                                skb = np->tx_skbuff[entry];
                                /* Free the original skb. */
                                dma_unmap_single(&np->pci_dev->dev,
-                                       le32_to_cpu(np->tx_ring[entry].frag[0].addr),
+                                       le32_to_cpu(np->tx_ring[entry].frag.addr),
                                        skb->len, DMA_TO_DEVICE);
                                dev_consume_skb_irq(np->tx_skbuff[entry]);
                                np->tx_skbuff[entry] = NULL;
-                               np->tx_ring[entry].frag[0].addr = 0;
-                               np->tx_ring[entry].frag[0].length = 0;
+                               np->tx_ring[entry].frag.addr = 0;
+                               np->tx_ring[entry].frag.length = 0;
                        }
                        spin_unlock(&np->lock);
                } else {
@@ -1290,12 +1290,12 @@ static irqreturn_t intr_handler(int irq, void *dev_instance)
                                skb = np->tx_skbuff[entry];
                                /* Free the original skb. */
                                dma_unmap_single(&np->pci_dev->dev,
-                                       le32_to_cpu(np->tx_ring[entry].frag[0].addr),
+                                       le32_to_cpu(np->tx_ring[entry].frag.addr),
                                        skb->len, DMA_TO_DEVICE);
                                dev_consume_skb_irq(np->tx_skbuff[entry]);
                                np->tx_skbuff[entry] = NULL;
-                               np->tx_ring[entry].frag[0].addr = 0;
-                               np->tx_ring[entry].frag[0].length = 0;
+                               np->tx_ring[entry].frag.addr = 0;
+                               np->tx_ring[entry].frag.length = 0;
                        }
                        spin_unlock(&np->lock);
                }
@@ -1372,16 +1372,16 @@ static void rx_poll(struct tasklet_struct *t)
                            (skb = netdev_alloc_skb(dev, pkt_len + 2)) != NULL) {
                                skb_reserve(skb, 2);    /* 16 byte align the IP header */
                                dma_sync_single_for_cpu(&np->pci_dev->dev,
-                                               le32_to_cpu(desc->frag[0].addr),
+                                               le32_to_cpu(desc->frag.addr),
                                                np->rx_buf_sz, DMA_FROM_DEVICE);
                                skb_copy_to_linear_data(skb, np->rx_skbuff[entry]->data, pkt_len);
                                dma_sync_single_for_device(&np->pci_dev->dev,
-                                               le32_to_cpu(desc->frag[0].addr),
+                                               le32_to_cpu(desc->frag.addr),
                                                np->rx_buf_sz, DMA_FROM_DEVICE);
                                skb_put(skb, pkt_len);
                        } else {
                                dma_unmap_single(&np->pci_dev->dev,
-                                       le32_to_cpu(desc->frag[0].addr),
+                                       le32_to_cpu(desc->frag.addr),
                                        np->rx_buf_sz, DMA_FROM_DEVICE);
                                skb_put(skb = np->rx_skbuff[entry], pkt_len);
                                np->rx_skbuff[entry] = NULL;
@@ -1427,18 +1427,18 @@ static void refill_rx (struct net_device *dev)
                        if (skb == NULL)
                                break;          /* Better luck next round. */
                        skb_reserve(skb, 2);    /* Align IP on 16 byte boundaries */
-                       np->rx_ring[entry].frag[0].addr = cpu_to_le32(
+                       np->rx_ring[entry].frag.addr = cpu_to_le32(
                                dma_map_single(&np->pci_dev->dev, skb->data,
                                        np->rx_buf_sz, DMA_FROM_DEVICE));
                        if (dma_mapping_error(&np->pci_dev->dev,
-                                   np->rx_ring[entry].frag[0].addr)) {
+                                   np->rx_ring[entry].frag.addr)) {
                            dev_kfree_skb_irq(skb);
                            np->rx_skbuff[entry] = NULL;
                            break;
                        }
                }
                /* Perhaps we need not reset this field. */
-               np->rx_ring[entry].frag[0].length =
+               np->rx_ring[entry].frag.length =
                        cpu_to_le32(np->rx_buf_sz | LastFrag);
                np->rx_ring[entry].status = 0;
                cnt++;
@@ -1870,14 +1870,14 @@ static int netdev_close(struct net_device *dev)
                           (int)(np->tx_ring_dma));
                for (i = 0; i < TX_RING_SIZE; i++)
                        printk(KERN_DEBUG " #%d desc. %4.4x %8.8x %8.8x.\n",
-                                  i, np->tx_ring[i].status, np->tx_ring[i].frag[0].addr,
-                                  np->tx_ring[i].frag[0].length);
+                                  i, np->tx_ring[i].status, np->tx_ring[i].frag.addr,
+                                  np->tx_ring[i].frag.length);
                printk(KERN_DEBUG "  Rx ring %8.8x:\n",
                           (int)(np->rx_ring_dma));
                for (i = 0; i < /*RX_RING_SIZE*/4 ; i++) {
                        printk(KERN_DEBUG " #%d desc. %4.4x %4.4x %8.8x\n",
-                                  i, np->rx_ring[i].status, np->rx_ring[i].frag[0].addr,
-                                  np->rx_ring[i].frag[0].length);
+                                  i, np->rx_ring[i].status, np->rx_ring[i].frag.addr,
+                                  np->rx_ring[i].frag.length);
                }
        }
 #endif /* __i386__ debugging only */
@@ -1892,19 +1892,19 @@ static int netdev_close(struct net_device *dev)
                skb = np->rx_skbuff[i];
                if (skb) {
                        dma_unmap_single(&np->pci_dev->dev,
-                               le32_to_cpu(np->rx_ring[i].frag[0].addr),
+                               le32_to_cpu(np->rx_ring[i].frag.addr),
                                np->rx_buf_sz, DMA_FROM_DEVICE);
                        dev_kfree_skb(skb);
                        np->rx_skbuff[i] = NULL;
                }
-               np->rx_ring[i].frag[0].addr = cpu_to_le32(0xBADF00D0); /* poison */
+               np->rx_ring[i].frag.addr = cpu_to_le32(0xBADF00D0); /* poison */
        }
        for (i = 0; i < TX_RING_SIZE; i++) {
                np->tx_ring[i].next_desc = 0;
                skb = np->tx_skbuff[i];
                if (skb) {
                        dma_unmap_single(&np->pci_dev->dev,
-                               le32_to_cpu(np->tx_ring[i].frag[0].addr),
+                               le32_to_cpu(np->tx_ring[i].frag.addr),
                                skb->len, DMA_TO_DEVICE);
                        dev_kfree_skb(skb);
                        np->tx_skbuff[i] = NULL;
index dd9385d..c4a48e6 100644 (file)
@@ -18,6 +18,7 @@
 #include <linux/ptp_classify.h>
 #include <net/pkt_cls.h>
 #include <net/sock.h>
+#include <net/tso.h>
 
 #include "dpaa2-eth.h"
 
@@ -760,6 +761,39 @@ static void dpaa2_eth_enable_tx_tstamp(struct dpaa2_eth_priv *priv,
        }
 }
 
+static void *dpaa2_eth_sgt_get(struct dpaa2_eth_priv *priv)
+{
+       struct dpaa2_eth_sgt_cache *sgt_cache;
+       void *sgt_buf = NULL;
+       int sgt_buf_size;
+
+       sgt_cache = this_cpu_ptr(priv->sgt_cache);
+       sgt_buf_size = priv->tx_data_offset +
+               DPAA2_ETH_SG_ENTRIES_MAX * sizeof(struct dpaa2_sg_entry);
+
+       if (sgt_cache->count == 0)
+               sgt_buf = napi_alloc_frag_align(sgt_buf_size, DPAA2_ETH_TX_BUF_ALIGN);
+       else
+               sgt_buf = sgt_cache->buf[--sgt_cache->count];
+       if (!sgt_buf)
+               return NULL;
+
+       memset(sgt_buf, 0, sgt_buf_size);
+
+       return sgt_buf;
+}
+
+static void dpaa2_eth_sgt_recycle(struct dpaa2_eth_priv *priv, void *sgt_buf)
+{
+       struct dpaa2_eth_sgt_cache *sgt_cache;
+
+       sgt_cache = this_cpu_ptr(priv->sgt_cache);
+       if (sgt_cache->count >= DPAA2_ETH_SGT_CACHE_SIZE)
+               skb_free_frag(sgt_buf);
+       else
+               sgt_cache->buf[sgt_cache->count++] = sgt_buf;
+}
+
 /* Create a frame descriptor based on a fragmented skb */
 static int dpaa2_eth_build_sg_fd(struct dpaa2_eth_priv *priv,
                                 struct sk_buff *skb,
@@ -805,12 +839,11 @@ static int dpaa2_eth_build_sg_fd(struct dpaa2_eth_priv *priv,
        /* Prepare the HW SGT structure */
        sgt_buf_size = priv->tx_data_offset +
                       sizeof(struct dpaa2_sg_entry) *  num_dma_bufs;
-       sgt_buf = napi_alloc_frag_align(sgt_buf_size, DPAA2_ETH_TX_BUF_ALIGN);
+       sgt_buf = dpaa2_eth_sgt_get(priv);
        if (unlikely(!sgt_buf)) {
                err = -ENOMEM;
                goto sgt_buf_alloc_failed;
        }
-       memset(sgt_buf, 0, sgt_buf_size);
 
        sgt = (struct dpaa2_sg_entry *)(sgt_buf + priv->tx_data_offset);
 
@@ -846,6 +879,7 @@ static int dpaa2_eth_build_sg_fd(struct dpaa2_eth_priv *priv,
                err = -ENOMEM;
                goto dma_map_single_failed;
        }
+       memset(fd, 0, sizeof(struct dpaa2_fd));
        dpaa2_fd_set_offset(fd, priv->tx_data_offset);
        dpaa2_fd_set_format(fd, dpaa2_fd_sg);
        dpaa2_fd_set_addr(fd, addr);
@@ -855,7 +889,7 @@ static int dpaa2_eth_build_sg_fd(struct dpaa2_eth_priv *priv,
        return 0;
 
 dma_map_single_failed:
-       skb_free_frag(sgt_buf);
+       dpaa2_eth_sgt_recycle(priv, sgt_buf);
 sgt_buf_alloc_failed:
        dma_unmap_sg(dev, scl, num_sg, DMA_BIDIRECTIONAL);
 dma_map_sg_failed:
@@ -875,7 +909,6 @@ static int dpaa2_eth_build_sg_fd_single_buf(struct dpaa2_eth_priv *priv,
                                            void **swa_addr)
 {
        struct device *dev = priv->net_dev->dev.parent;
-       struct dpaa2_eth_sgt_cache *sgt_cache;
        struct dpaa2_sg_entry *sgt;
        struct dpaa2_eth_swa *swa;
        dma_addr_t addr, sgt_addr;
@@ -884,18 +917,10 @@ static int dpaa2_eth_build_sg_fd_single_buf(struct dpaa2_eth_priv *priv,
        int err;
 
        /* Prepare the HW SGT structure */
-       sgt_cache = this_cpu_ptr(priv->sgt_cache);
        sgt_buf_size = priv->tx_data_offset + sizeof(struct dpaa2_sg_entry);
-
-       if (sgt_cache->count == 0)
-               sgt_buf = kzalloc(sgt_buf_size + DPAA2_ETH_TX_BUF_ALIGN,
-                                 GFP_ATOMIC);
-       else
-               sgt_buf = sgt_cache->buf[--sgt_cache->count];
+       sgt_buf = dpaa2_eth_sgt_get(priv);
        if (unlikely(!sgt_buf))
                return -ENOMEM;
-
-       sgt_buf = PTR_ALIGN(sgt_buf, DPAA2_ETH_TX_BUF_ALIGN);
        sgt = (struct dpaa2_sg_entry *)(sgt_buf + priv->tx_data_offset);
 
        addr = dma_map_single(dev, skb->data, skb->len, DMA_BIDIRECTIONAL);
@@ -923,6 +948,7 @@ static int dpaa2_eth_build_sg_fd_single_buf(struct dpaa2_eth_priv *priv,
                goto sgt_map_failed;
        }
 
+       memset(fd, 0, sizeof(struct dpaa2_fd));
        dpaa2_fd_set_offset(fd, priv->tx_data_offset);
        dpaa2_fd_set_format(fd, dpaa2_fd_sg);
        dpaa2_fd_set_addr(fd, sgt_addr);
@@ -934,10 +960,7 @@ static int dpaa2_eth_build_sg_fd_single_buf(struct dpaa2_eth_priv *priv,
 sgt_map_failed:
        dma_unmap_single(dev, addr, skb->len, DMA_BIDIRECTIONAL);
 data_map_failed:
-       if (sgt_cache->count >= DPAA2_ETH_SGT_CACHE_SIZE)
-               kfree(sgt_buf);
-       else
-               sgt_cache->buf[sgt_cache->count++] = sgt_buf;
+       dpaa2_eth_sgt_recycle(priv, sgt_buf);
 
        return err;
 }
@@ -978,6 +1001,7 @@ static int dpaa2_eth_build_single_fd(struct dpaa2_eth_priv *priv,
        if (unlikely(dma_mapping_error(dev, addr)))
                return -ENOMEM;
 
+       memset(fd, 0, sizeof(struct dpaa2_fd));
        dpaa2_fd_set_addr(fd, addr);
        dpaa2_fd_set_offset(fd, (u16)(skb->data - buffer_start));
        dpaa2_fd_set_len(fd, skb->len);
@@ -1005,9 +1029,9 @@ static void dpaa2_eth_free_tx_fd(struct dpaa2_eth_priv *priv,
        struct dpaa2_eth_swa *swa;
        u8 fd_format = dpaa2_fd_get_format(fd);
        u32 fd_len = dpaa2_fd_get_len(fd);
-
-       struct dpaa2_eth_sgt_cache *sgt_cache;
        struct dpaa2_sg_entry *sgt;
+       int should_free_skb = 1;
+       int i;
 
        fd_addr = dpaa2_fd_get_addr(fd);
        buffer_start = dpaa2_iova_to_virt(priv->iommu_domain, fd_addr);
@@ -1039,6 +1063,28 @@ static void dpaa2_eth_free_tx_fd(struct dpaa2_eth_priv *priv,
                        /* Unmap the SGT buffer */
                        dma_unmap_single(dev, fd_addr, swa->sg.sgt_size,
                                         DMA_BIDIRECTIONAL);
+               } else if (swa->type == DPAA2_ETH_SWA_SW_TSO) {
+                       skb = swa->tso.skb;
+
+                       sgt = (struct dpaa2_sg_entry *)(buffer_start +
+                                                       priv->tx_data_offset);
+
+                       /* Unmap and free the header */
+                       dma_unmap_single(dev, dpaa2_sg_get_addr(sgt), TSO_HEADER_SIZE,
+                                        DMA_TO_DEVICE);
+                       kfree(dpaa2_iova_to_virt(priv->iommu_domain, dpaa2_sg_get_addr(sgt)));
+
+                       /* Unmap the other SG entries for the data */
+                       for (i = 1; i < swa->tso.num_sg; i++)
+                               dma_unmap_single(dev, dpaa2_sg_get_addr(&sgt[i]),
+                                                dpaa2_sg_get_len(&sgt[i]), DMA_TO_DEVICE);
+
+                       /* Unmap the SGT buffer */
+                       dma_unmap_single(dev, fd_addr, swa->sg.sgt_size,
+                                        DMA_BIDIRECTIONAL);
+
+                       if (!swa->tso.is_last_fd)
+                               should_free_skb = 0;
                } else {
                        skb = swa->single.skb;
 
@@ -1067,55 +1113,195 @@ static void dpaa2_eth_free_tx_fd(struct dpaa2_eth_priv *priv,
        }
 
        /* Get the timestamp value */
-       if (skb->cb[0] == TX_TSTAMP) {
-               struct skb_shared_hwtstamps shhwtstamps;
-               __le64 *ts = dpaa2_get_ts(buffer_start, true);
-               u64 ns;
-
-               memset(&shhwtstamps, 0, sizeof(shhwtstamps));
-
-               ns = DPAA2_PTP_CLK_PERIOD_NS * le64_to_cpup(ts);
-               shhwtstamps.hwtstamp = ns_to_ktime(ns);
-               skb_tstamp_tx(skb, &shhwtstamps);
-       } else if (skb->cb[0] == TX_TSTAMP_ONESTEP_SYNC) {
-               mutex_unlock(&priv->onestep_tstamp_lock);
+       if (swa->type != DPAA2_ETH_SWA_SW_TSO) {
+               if (skb->cb[0] == TX_TSTAMP) {
+                       struct skb_shared_hwtstamps shhwtstamps;
+                       __le64 *ts = dpaa2_get_ts(buffer_start, true);
+                       u64 ns;
+
+                       memset(&shhwtstamps, 0, sizeof(shhwtstamps));
+
+                       ns = DPAA2_PTP_CLK_PERIOD_NS * le64_to_cpup(ts);
+                       shhwtstamps.hwtstamp = ns_to_ktime(ns);
+                       skb_tstamp_tx(skb, &shhwtstamps);
+               } else if (skb->cb[0] == TX_TSTAMP_ONESTEP_SYNC) {
+                       mutex_unlock(&priv->onestep_tstamp_lock);
+               }
        }
 
        /* Free SGT buffer allocated on tx */
-       if (fd_format != dpaa2_fd_single) {
-               sgt_cache = this_cpu_ptr(priv->sgt_cache);
-               if (swa->type == DPAA2_ETH_SWA_SG) {
-                       skb_free_frag(buffer_start);
-               } else {
-                       if (sgt_cache->count >= DPAA2_ETH_SGT_CACHE_SIZE)
-                               kfree(buffer_start);
-                       else
-                               sgt_cache->buf[sgt_cache->count++] = buffer_start;
+       if (fd_format != dpaa2_fd_single)
+               dpaa2_eth_sgt_recycle(priv, buffer_start);
+
+       /* Move on with skb release. If we are just confirming multiple FDs
+        * from the same TSO skb, only the last one needs to free the
+        * skb.
+        */
+       if (should_free_skb)
+               napi_consume_skb(skb, in_napi);
+}
+
+static int dpaa2_eth_build_gso_fd(struct dpaa2_eth_priv *priv,
+                                 struct sk_buff *skb, struct dpaa2_fd *fd,
+                                 int *num_fds, u32 *total_fds_len)
+{
+       struct device *dev = priv->net_dev->dev.parent;
+       int hdr_len, total_len, data_left, fd_len;
+       int num_sge, err, i, sgt_buf_size;
+       struct dpaa2_fd *fd_start = fd;
+       struct dpaa2_sg_entry *sgt;
+       struct dpaa2_eth_swa *swa;
+       dma_addr_t sgt_addr, addr;
+       dma_addr_t tso_hdr_dma;
+       unsigned int index = 0;
+       struct tso_t tso;
+       char *tso_hdr;
+       void *sgt_buf;
+
+       /* Initialize the TSO handler, and prepare the first payload */
+       hdr_len = tso_start(skb, &tso);
+       *total_fds_len = 0;
+
+       total_len = skb->len - hdr_len;
+       while (total_len > 0) {
+               /* Prepare the HW SGT structure for this frame */
+               sgt_buf = dpaa2_eth_sgt_get(priv);
+               if (unlikely(!sgt_buf)) {
+                       netdev_err(priv->net_dev, "dpaa2_eth_sgt_get() failed\n");
+                       err = -ENOMEM;
+                       goto err_sgt_get;
                }
+               sgt = (struct dpaa2_sg_entry *)(sgt_buf + priv->tx_data_offset);
+
+               /* Determine the data length of this frame */
+               data_left = min_t(int, skb_shinfo(skb)->gso_size, total_len);
+               total_len -= data_left;
+               fd_len = data_left + hdr_len;
+
+               /* Prepare packet headers: MAC + IP + TCP */
+               tso_hdr = kmalloc(TSO_HEADER_SIZE, GFP_ATOMIC);
+               if (!tso_hdr) {
+                       err = -ENOMEM;
+                       goto err_alloc_tso_hdr;
+               }
+
+               tso_build_hdr(skb, tso_hdr, &tso, data_left, total_len == 0);
+               tso_hdr_dma = dma_map_single(dev, tso_hdr, TSO_HEADER_SIZE, DMA_TO_DEVICE);
+               if (dma_mapping_error(dev, tso_hdr_dma)) {
+                       netdev_err(priv->net_dev, "dma_map_single(tso_hdr) failed\n");
+                       err = -ENOMEM;
+                       goto err_map_tso_hdr;
+               }
+
+               /* Setup the SG entry for the header */
+               dpaa2_sg_set_addr(sgt, tso_hdr_dma);
+               dpaa2_sg_set_len(sgt, hdr_len);
+               dpaa2_sg_set_final(sgt, data_left <= 0);
+
+               /* Compose the SG entries for each fragment of data */
+               num_sge = 1;
+               while (data_left > 0) {
+                       int size;
+
+                       /* Move to the next SG entry */
+                       sgt++;
+                       size = min_t(int, tso.size, data_left);
+
+                       addr = dma_map_single(dev, tso.data, size, DMA_TO_DEVICE);
+                       if (dma_mapping_error(dev, addr)) {
+                               netdev_err(priv->net_dev, "dma_map_single(tso.data) failed\n");
+                               err = -ENOMEM;
+                               goto err_map_data;
+                       }
+                       dpaa2_sg_set_addr(sgt, addr);
+                       dpaa2_sg_set_len(sgt, size);
+                       dpaa2_sg_set_final(sgt, size == data_left);
+
+                       num_sge++;
+
+                       /* Build the data for the __next__ fragment */
+                       data_left -= size;
+                       tso_build_data(skb, &tso, size);
+               }
+
+               /* Store the skb backpointer in the SGT buffer */
+               sgt_buf_size = priv->tx_data_offset + num_sge * sizeof(struct dpaa2_sg_entry);
+               swa = (struct dpaa2_eth_swa *)sgt_buf;
+               swa->type = DPAA2_ETH_SWA_SW_TSO;
+               swa->tso.skb = skb;
+               swa->tso.num_sg = num_sge;
+               swa->tso.sgt_size = sgt_buf_size;
+               swa->tso.is_last_fd = total_len == 0 ? 1 : 0;
+
+               /* Separately map the SGT buffer */
+               sgt_addr = dma_map_single(dev, sgt_buf, sgt_buf_size, DMA_BIDIRECTIONAL);
+               if (unlikely(dma_mapping_error(dev, sgt_addr))) {
+                       netdev_err(priv->net_dev, "dma_map_single(sgt_buf) failed\n");
+                       err = -ENOMEM;
+                       goto err_map_sgt;
+               }
+
+               /* Setup the frame descriptor */
+               memset(fd, 0, sizeof(struct dpaa2_fd));
+               dpaa2_fd_set_offset(fd, priv->tx_data_offset);
+               dpaa2_fd_set_format(fd, dpaa2_fd_sg);
+               dpaa2_fd_set_addr(fd, sgt_addr);
+               dpaa2_fd_set_len(fd, fd_len);
+               dpaa2_fd_set_ctrl(fd, FD_CTRL_PTA);
+
+               *total_fds_len += fd_len;
+               /* Advance to the next frame descriptor */
+               fd++;
+               index++;
        }
 
-       /* Move on with skb release */
-       napi_consume_skb(skb, in_napi);
+       *num_fds = index;
+
+       return 0;
+
+err_map_sgt:
+err_map_data:
+       /* Unmap all the data S/G entries for the current FD */
+       sgt = (struct dpaa2_sg_entry *)(sgt_buf + priv->tx_data_offset);
+       for (i = 1; i < num_sge; i++)
+               dma_unmap_single(dev, dpaa2_sg_get_addr(&sgt[i]),
+                                dpaa2_sg_get_len(&sgt[i]), DMA_TO_DEVICE);
+
+       /* Unmap the header entry */
+       dma_unmap_single(dev, tso_hdr_dma, TSO_HEADER_SIZE, DMA_TO_DEVICE);
+err_map_tso_hdr:
+       kfree(tso_hdr);
+err_alloc_tso_hdr:
+       dpaa2_eth_sgt_recycle(priv, sgt_buf);
+err_sgt_get:
+       /* Free all the other FDs that were already fully created */
+       for (i = 0; i < index; i++)
+               dpaa2_eth_free_tx_fd(priv, NULL, &fd_start[i], false);
+
+       return err;
 }
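
dpaa2_eth_build_gso_fd() is a driver-specific instance of the kernel's software-TSO helpers from <net/tso.h>: tso_start() returns the L2+L3+L4 header length and initializes the payload cursor, tso_build_hdr() rewrites the headers (sequence number, length, flags) for each segment, and tso_build_data() advances the cursor. A condensed sketch of that canonical loop, with emit_sg_entry() as a hypothetical placeholder for the hardware-specific descriptor setup (the SGT and FD construction above):

#include <linux/skbuff.h>
#include <net/tso.h>

static void emit_sg_entry(void *data, int size)
{
	/* hypothetical: map 'data' and fill one hardware SG entry */
}

static void sw_tso_walk(struct sk_buff *skb, char *hdr_buf)
{
	int hdr_len, total_len, data_left;
	struct tso_t tso;

	hdr_len = tso_start(skb, &tso);
	total_len = skb->len - hdr_len;

	while (total_len > 0) {
		data_left = min_t(int, skb_shinfo(skb)->gso_size, total_len);
		total_len -= data_left;

		/* rebuild MAC + IP + TCP headers for this segment */
		tso_build_hdr(skb, hdr_buf, &tso, data_left, total_len == 0);
		emit_sg_entry(hdr_buf, hdr_len);

		while (data_left > 0) {
			int size = min_t(int, tso.size, data_left);

			emit_sg_entry(tso.data, size);
			data_left -= size;
			tso_build_data(skb, &tso, size);
		}
	}
}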
 
 static netdev_tx_t __dpaa2_eth_tx(struct sk_buff *skb,
                                  struct net_device *net_dev)
 {
        struct dpaa2_eth_priv *priv = netdev_priv(net_dev);
-       struct dpaa2_fd fd;
-       struct rtnl_link_stats64 *percpu_stats;
+       int total_enqueued = 0, retries = 0, enqueued;
        struct dpaa2_eth_drv_stats *percpu_extras;
+       struct rtnl_link_stats64 *percpu_stats;
+       unsigned int needed_headroom;
+       int num_fds = 1, max_retries;
        struct dpaa2_eth_fq *fq;
        struct netdev_queue *nq;
+       struct dpaa2_fd *fd;
        u16 queue_mapping;
-       unsigned int needed_headroom;
-       u32 fd_len;
+       void *swa = NULL;
        u8 prio = 0;
        int err, i;
-       void *swa;
+       u32 fd_len;
 
        percpu_stats = this_cpu_ptr(priv->percpu_stats);
        percpu_extras = this_cpu_ptr(priv->percpu_extras);
+       fd = (this_cpu_ptr(priv->fd))->array;
 
        needed_headroom = dpaa2_eth_needed_headroom(skb);
 
@@ -1130,20 +1316,28 @@ static netdev_tx_t __dpaa2_eth_tx(struct sk_buff *skb,
        }
 
        /* Setup the FD fields */
-       memset(&fd, 0, sizeof(fd));
 
-       if (skb_is_nonlinear(skb)) {
-               err = dpaa2_eth_build_sg_fd(priv, skb, &fd, &swa);
+       if (skb_is_gso(skb)) {
+               err = dpaa2_eth_build_gso_fd(priv, skb, fd, &num_fds, &fd_len);
+               percpu_extras->tx_sg_frames += num_fds;
+               percpu_extras->tx_sg_bytes += fd_len;
+               percpu_extras->tx_tso_frames += num_fds;
+               percpu_extras->tx_tso_bytes += fd_len;
+       } else if (skb_is_nonlinear(skb)) {
+               err = dpaa2_eth_build_sg_fd(priv, skb, fd, &swa);
                percpu_extras->tx_sg_frames++;
                percpu_extras->tx_sg_bytes += skb->len;
+               fd_len = dpaa2_fd_get_len(fd);
        } else if (skb_headroom(skb) < needed_headroom) {
-               err = dpaa2_eth_build_sg_fd_single_buf(priv, skb, &fd, &swa);
+               err = dpaa2_eth_build_sg_fd_single_buf(priv, skb, fd, &swa);
                percpu_extras->tx_sg_frames++;
                percpu_extras->tx_sg_bytes += skb->len;
                percpu_extras->tx_converted_sg_frames++;
                percpu_extras->tx_converted_sg_bytes += skb->len;
+               fd_len = dpaa2_fd_get_len(fd);
        } else {
-               err = dpaa2_eth_build_single_fd(priv, skb, &fd, &swa);
+               err = dpaa2_eth_build_single_fd(priv, skb, fd, &swa);
+               fd_len = dpaa2_fd_get_len(fd);
        }
 
        if (unlikely(err)) {
@@ -1151,11 +1345,12 @@ static netdev_tx_t __dpaa2_eth_tx(struct sk_buff *skb,
                goto err_build_fd;
        }
 
-       if (skb->cb[0])
-               dpaa2_eth_enable_tx_tstamp(priv, &fd, swa, skb);
+       if (swa && skb->cb[0])
+               dpaa2_eth_enable_tx_tstamp(priv, fd, swa, skb);
 
        /* Tracing point */
-       trace_dpaa2_tx_fd(net_dev, &fd);
+       for (i = 0; i < num_fds; i++)
+               trace_dpaa2_tx_fd(net_dev, &fd[i]);
 
        /* TxConf FQ selection relies on queue id from the stack.
         * In case of a forwarded frame from another DPNI interface, we choose
@@ -1175,27 +1370,32 @@ static netdev_tx_t __dpaa2_eth_tx(struct sk_buff *skb,
                queue_mapping %= dpaa2_eth_queue_count(priv);
        }
        fq = &priv->fq[queue_mapping];
-
-       fd_len = dpaa2_fd_get_len(&fd);
        nq = netdev_get_tx_queue(net_dev, queue_mapping);
        netdev_tx_sent_queue(nq, fd_len);
 
        /* Everything that happens after this enqueues might race with
         * the Tx confirmation callback for this frame
         */
-       for (i = 0; i < DPAA2_ETH_ENQUEUE_RETRIES; i++) {
-               err = priv->enqueue(priv, fq, &fd, prio, 1, NULL);
-               if (err != -EBUSY)
-                       break;
+       max_retries = num_fds * DPAA2_ETH_ENQUEUE_RETRIES;
+       while (total_enqueued < num_fds && retries < max_retries) {
+               err = priv->enqueue(priv, fq, &fd[total_enqueued],
+                                   prio, num_fds - total_enqueued, &enqueued);
+               if (err == -EBUSY) {
+                       retries++;
+                       continue;
+               }
+
+               total_enqueued += enqueued;
        }
-       percpu_extras->tx_portal_busy += i;
+       percpu_extras->tx_portal_busy += retries;
+
        if (unlikely(err < 0)) {
                percpu_stats->tx_errors++;
                /* Clean up everything, including freeing the skb */
-               dpaa2_eth_free_tx_fd(priv, fq, &fd, false);
+               dpaa2_eth_free_tx_fd(priv, fq, fd, false);
                netdev_tx_completed_queue(nq, 1, fd_len);
        } else {
-               percpu_stats->tx_packets++;
+               percpu_stats->tx_packets += total_enqueued;
                percpu_stats->tx_bytes += fd_len;
        }
 
@@ -1523,7 +1723,7 @@ static void dpaa2_eth_sgt_cache_drain(struct dpaa2_eth_priv *priv)
                count = sgt_cache->count;
 
                for (i = 0; i < count; i++)
-                       kfree(sgt_cache->buf[i]);
+                       skb_free_frag(sgt_cache->buf[i]);
                sgt_cache->count = 0;
        }
 }
@@ -4115,7 +4315,8 @@ static int dpaa2_eth_netdev_init(struct net_device *net_dev)
        net_dev->features = NETIF_F_RXCSUM |
                            NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM |
                            NETIF_F_SG | NETIF_F_HIGHDMA |
-                           NETIF_F_LLTX | NETIF_F_HW_TC;
+                           NETIF_F_LLTX | NETIF_F_HW_TC | NETIF_F_TSO;
+       net_dev->gso_max_segs = DPAA2_ETH_ENQUEUE_MAX_FDS;
        net_dev->hw_features = net_dev->features;
 
        if (priv->dpni_attrs.vlan_filter_entries)
@@ -4397,6 +4598,13 @@ static int dpaa2_eth_probe(struct fsl_mc_device *dpni_dev)
                goto err_alloc_sgt_cache;
        }
 
+       priv->fd = alloc_percpu(*priv->fd);
+       if (!priv->fd) {
+               dev_err(dev, "alloc_percpu(fds) failed\n");
+               err = -ENOMEM;
+               goto err_alloc_fds;
+       }
+
        err = dpaa2_eth_netdev_init(net_dev);
        if (err)
                goto err_netdev_init;
@@ -4484,6 +4692,8 @@ err_poll_thread:
 err_alloc_rings:
 err_csum:
 err_netdev_init:
+       free_percpu(priv->fd);
+err_alloc_fds:
        free_percpu(priv->sgt_cache);
 err_alloc_sgt_cache:
        free_percpu(priv->percpu_extras);
@@ -4539,6 +4749,7 @@ static int dpaa2_eth_remove(struct fsl_mc_device *ls_dev)
                fsl_mc_free_irqs(ls_dev);
 
        dpaa2_eth_free_rings(priv);
+       free_percpu(priv->fd);
        free_percpu(priv->sgt_cache);
        free_percpu(priv->percpu_stats);
        free_percpu(priv->percpu_extras);
index e54e70e..b79831c 100644 (file)
@@ -122,6 +122,7 @@ enum dpaa2_eth_swa_type {
        DPAA2_ETH_SWA_SINGLE,
        DPAA2_ETH_SWA_SG,
        DPAA2_ETH_SWA_XDP,
+       DPAA2_ETH_SWA_SW_TSO,
 };
 
 /* Must keep this struct smaller than DPAA2_ETH_SWA_SIZE */
@@ -142,6 +143,12 @@ struct dpaa2_eth_swa {
                        int dma_size;
                        struct xdp_frame *xdpf;
                } xdp;
+               struct {
+                       struct sk_buff *skb;
+                       int num_sg;
+                       int sgt_size;
+                       int is_last_fd;
+               } tso;
        };
 };
 
@@ -354,6 +361,8 @@ struct dpaa2_eth_drv_stats {
        __u64   tx_conf_bytes;
        __u64   tx_sg_frames;
        __u64   tx_sg_bytes;
+       __u64   tx_tso_frames;
+       __u64   tx_tso_bytes;
        __u64   rx_sg_frames;
        __u64   rx_sg_bytes;
        /* Linear skbs sent as a S/G FD due to insufficient headroom */
@@ -493,8 +502,15 @@ struct dpaa2_eth_trap_data {
        struct dpaa2_eth_priv *priv;
 };
 
+#define DPAA2_ETH_SG_ENTRIES_MAX       (PAGE_SIZE / sizeof(struct scatterlist))
+
 #define DPAA2_ETH_DEFAULT_COPYBREAK    512
 
+#define DPAA2_ETH_ENQUEUE_MAX_FDS      200
+struct dpaa2_eth_fds {
+       struct dpaa2_fd array[DPAA2_ETH_ENQUEUE_MAX_FDS];
+};
+
 /* Driver private data */
 struct dpaa2_eth_priv {
        struct net_device *net_dev;
@@ -577,6 +593,8 @@ struct dpaa2_eth_priv {
        struct devlink_port devlink_port;
 
        u32 rx_copybreak;
+
+       struct dpaa2_eth_fds __percpu *fd;
 };
 
 struct dpaa2_eth_devlink_priv {
index 3fdbf87..eea7d7a 100644 (file)
@@ -44,6 +44,8 @@ static char dpaa2_ethtool_extras[][ETH_GSTRING_LEN] = {
        "[drv] tx conf bytes",
        "[drv] tx sg frames",
        "[drv] tx sg bytes",
+       "[drv] tx tso frames",
+       "[drv] tx tso bytes",
        "[drv] rx sg frames",
        "[drv] rx sg bytes",
        "[drv] tx converted sg frames",
index 623d113..521f036 100644 (file)
@@ -100,6 +100,14 @@ static int dpaa2_mac_get_if_mode(struct fwnode_handle *dpmac_node,
        return err;
 }
 
+static struct phylink_pcs *dpaa2_mac_select_pcs(struct phylink_config *config,
+                                               phy_interface_t interface)
+{
+       struct dpaa2_mac *mac = phylink_to_dpaa2_mac(config);
+
+       return mac->pcs;
+}
+
 static void dpaa2_mac_config(struct phylink_config *config, unsigned int mode,
                             const struct phylink_link_state *state)
 {
@@ -172,6 +180,7 @@ static void dpaa2_mac_link_down(struct phylink_config *config,
 
 static const struct phylink_mac_ops dpaa2_mac_phylink_ops = {
        .validate = phylink_generic_validate,
+       .mac_select_pcs = dpaa2_mac_select_pcs,
        .mac_config = dpaa2_mac_config,
        .mac_link_up = dpaa2_mac_link_up,
        .mac_link_down = dpaa2_mac_link_down,
@@ -303,9 +312,6 @@ int dpaa2_mac_connect(struct dpaa2_mac *mac)
        }
        mac->phylink = phylink;
 
-       if (mac->pcs)
-               phylink_set_pcs(mac->phylink, mac->pcs);
-
        err = phylink_fwnode_phy_connect(mac->phylink, dpmac_node, 0);
        if (err) {
                netdev_err(net_dev, "phylink_fwnode_phy_connect() = %d\n", err);
index fb39e40..68d806d 100644 (file)
@@ -18,6 +18,8 @@
 #define ENETC_MAX_MTU          (ENETC_MAC_MAXFRM_SIZE - \
                                (ETH_FCS_LEN + ETH_HLEN + VLAN_HLEN))
 
+#define ENETC_CBD_DATA_MEM_ALIGN 64
+
 struct enetc_tx_swbd {
        union {
                struct sk_buff *skb;
@@ -415,6 +417,42 @@ int enetc_get_rss_table(struct enetc_si *si, u32 *table, int count);
 int enetc_set_rss_table(struct enetc_si *si, const u32 *table, int count);
 int enetc_send_cmd(struct enetc_si *si, struct enetc_cbd *cbd);
 
+static inline void *enetc_cbd_alloc_data_mem(struct enetc_si *si,
+                                            struct enetc_cbd *cbd,
+                                            int size, dma_addr_t *dma,
+                                            void **data_align)
+{
+       struct enetc_cbdr *ring = &si->cbd_ring;
+       dma_addr_t dma_align;
+       void *data;
+
+       data = dma_alloc_coherent(ring->dma_dev,
+                                 size + ENETC_CBD_DATA_MEM_ALIGN,
+                                 dma, GFP_KERNEL);
+       if (!data) {
+               dev_err(ring->dma_dev, "CBD alloc data memory failed!\n");
+               return NULL;
+       }
+
+       dma_align = ALIGN(*dma, ENETC_CBD_DATA_MEM_ALIGN);
+       *data_align = PTR_ALIGN(data, ENETC_CBD_DATA_MEM_ALIGN);
+
+       cbd->addr[0] = cpu_to_le32(lower_32_bits(dma_align));
+       cbd->addr[1] = cpu_to_le32(upper_32_bits(dma_align));
+       cbd->length = cpu_to_le16(size);
+
+       return data;
+}
+
+static inline void enetc_cbd_free_data_mem(struct enetc_si *si, int size,
+                                          void *data, dma_addr_t *dma)
+{
+       struct enetc_cbdr *ring = &si->cbd_ring;
+
+       dma_free_coherent(ring->dma_dev, size + ENETC_CBD_DATA_MEM_ALIGN,
+                         data, *dma);
+}
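
The helper pair above centralizes a pattern repeated in the call sites converted below: over-allocate by ENETC_CBD_DATA_MEM_ALIGN, hand the caller both the raw pointer (needed for freeing) and the 64-byte-aligned pointer (used for filling the payload), and program the aligned DMA address and length into the command descriptor. A hedged usage sketch, where struct my_data is an illustrative payload rather than a driver structure:

/* illustrative payload; real callers pass e.g. struct streamid_data */
struct my_data { u8 bytes[64]; };

static int cbd_roundtrip(struct enetc_si *si)
{
	struct enetc_cbd cbd = {.cmd = 0};
	struct my_data *data;
	dma_addr_t dma;
	void *tmp;
	int err;

	tmp = enetc_cbd_alloc_data_mem(si, &cbd, sizeof(*data), &dma,
				       (void *)&data);
	if (!tmp)
		return -ENOMEM;

	/* fill *data through the aligned pointer, then send the command */
	err = enetc_send_cmd(si, &cbd);

	enetc_cbd_free_data_mem(si, sizeof(*data), tmp, &dma);
	return err;
}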
+
 #ifdef CONFIG_FSL_ENETC_QOS
 int enetc_setup_tc_taprio(struct net_device *ndev, void *type_data);
 void enetc_sched_speed_set(struct enetc_ndev_priv *priv, int speed);
index 073e56d..af68dc4 100644 (file)
@@ -166,70 +166,55 @@ int enetc_set_mac_flt_entry(struct enetc_si *si, int index,
        return enetc_send_cmd(si, &cbd);
 }
 
-#define RFSE_ALIGN     64
 /* Set entry in RFS table */
 int enetc_set_fs_entry(struct enetc_si *si, struct enetc_cmd_rfse *rfse,
                       int index)
 {
        struct enetc_cbdr *ring = &si->cbd_ring;
        struct enetc_cbd cbd = {.cmd = 0};
-       dma_addr_t dma, dma_align;
        void *tmp, *tmp_align;
+       dma_addr_t dma;
        int err;
 
        /* fill up the "set" descriptor */
        cbd.cmd = 0;
        cbd.cls = 4;
        cbd.index = cpu_to_le16(index);
-       cbd.length = cpu_to_le16(sizeof(*rfse));
        cbd.opt[3] = cpu_to_le32(0); /* SI */
 
-       tmp = dma_alloc_coherent(ring->dma_dev, sizeof(*rfse) + RFSE_ALIGN,
-                                &dma, GFP_KERNEL);
-       if (!tmp) {
-               dev_err(ring->dma_dev, "DMA mapping of RFS entry failed!\n");
+       tmp = enetc_cbd_alloc_data_mem(si, &cbd, sizeof(*rfse),
+                                      &dma, &tmp_align);
+       if (!tmp)
                return -ENOMEM;
-       }
 
-       dma_align = ALIGN(dma, RFSE_ALIGN);
-       tmp_align = PTR_ALIGN(tmp, RFSE_ALIGN);
        memcpy(tmp_align, rfse, sizeof(*rfse));
 
-       cbd.addr[0] = cpu_to_le32(lower_32_bits(dma_align));
-       cbd.addr[1] = cpu_to_le32(upper_32_bits(dma_align));
-
        err = enetc_send_cmd(si, &cbd);
        if (err)
                dev_err(ring->dma_dev, "FS entry add failed (%d)!", err);
 
-       dma_free_coherent(ring->dma_dev, sizeof(*rfse) + RFSE_ALIGN,
-                         tmp, dma);
+       enetc_cbd_free_data_mem(si, sizeof(*rfse), tmp, &dma);
 
        return err;
 }
 
-#define RSSE_ALIGN     64
 static int enetc_cmd_rss_table(struct enetc_si *si, u32 *table, int count,
                               bool read)
 {
        struct enetc_cbdr *ring = &si->cbd_ring;
        struct enetc_cbd cbd = {.cmd = 0};
-       dma_addr_t dma, dma_align;
        u8 *tmp, *tmp_align;
+       dma_addr_t dma;
        int err, i;
 
-       if (count < RSSE_ALIGN)
+       if (count < ENETC_CBD_DATA_MEM_ALIGN)
                /* HW only takes in a full 64 entry table */
                return -EINVAL;
 
-       tmp = dma_alloc_coherent(ring->dma_dev, count + RSSE_ALIGN,
-                                &dma, GFP_KERNEL);
-       if (!tmp) {
-               dev_err(ring->dma_dev, "DMA mapping of RSS table failed!\n");
+       tmp = enetc_cbd_alloc_data_mem(si, &cbd, count,
+                                      &dma, (void *)&tmp_align);
+       if (!tmp)
                return -ENOMEM;
-       }
-       dma_align = ALIGN(dma, RSSE_ALIGN);
-       tmp_align = PTR_ALIGN(tmp, RSSE_ALIGN);
 
        if (!read)
                for (i = 0; i < count; i++)
@@ -238,10 +223,6 @@ static int enetc_cmd_rss_table(struct enetc_si *si, u32 *table, int count,
        /* fill up the descriptor */
        cbd.cmd = read ? 2 : 1;
        cbd.cls = 3;
-       cbd.length = cpu_to_le16(count);
-
-       cbd.addr[0] = cpu_to_le32(lower_32_bits(dma_align));
-       cbd.addr[1] = cpu_to_le32(upper_32_bits(dma_align));
 
        err = enetc_send_cmd(si, &cbd);
        if (err)
@@ -251,7 +232,7 @@ static int enetc_cmd_rss_table(struct enetc_si *si, u32 *table, int count,
                for (i = 0; i < count; i++)
                        table[i] = tmp_align[i];
 
-       dma_free_coherent(ring->dma_dev, count + RSSE_ALIGN, tmp, dma);
+       enetc_cbd_free_data_mem(si, count, tmp, &dma);
 
        return err;
 }
index ed16a5a..a0c75c7 100644 (file)
@@ -934,18 +934,21 @@ static void enetc_mdiobus_destroy(struct enetc_pf *pf)
        enetc_imdio_remove(pf);
 }
 
+static struct phylink_pcs *
+enetc_pl_mac_select_pcs(struct phylink_config *config, phy_interface_t iface)
+{
+       struct enetc_pf *pf = phylink_to_enetc_pf(config);
+
+       return pf->pcs;
+}
+
 static void enetc_pl_mac_config(struct phylink_config *config,
                                unsigned int mode,
                                const struct phylink_link_state *state)
 {
        struct enetc_pf *pf = phylink_to_enetc_pf(config);
-       struct enetc_ndev_priv *priv;
 
        enetc_mac_config(&pf->si->hw, state->interface);
-
-       priv = netdev_priv(pf->si->ndev);
-       if (pf->pcs)
-               phylink_set_pcs(priv->phylink, pf->pcs);
 }
 
 static void enetc_force_rgmii_mac(struct enetc_hw *hw, int speed, int duplex)
@@ -1062,6 +1065,7 @@ static void enetc_pl_mac_link_down(struct phylink_config *config,
 
 static const struct phylink_mac_ops enetc_mac_phylink_ops = {
        .validate = phylink_generic_validate,
+       .mac_select_pcs = enetc_pl_mac_select_pcs,
        .mac_config = enetc_pl_mac_config,
        .mac_link_up = enetc_pl_mac_link_up,
        .mac_link_down = enetc_pl_mac_link_down,
index 3555c12..5a3eea1 100644 (file)
@@ -52,10 +52,11 @@ static int enetc_setup_taprio(struct net_device *ndev,
        struct enetc_cbd cbd = {.cmd = 0};
        struct tgs_gcl_conf *gcl_config;
        struct tgs_gcl_data *gcl_data;
-       struct gce *gce;
        dma_addr_t dma;
+       struct gce *gce;
        u16 data_size;
        u16 gcl_len;
+       void *tmp;
        u32 tge;
        int err;
        int i;
@@ -82,8 +83,9 @@ static int enetc_setup_taprio(struct net_device *ndev,
        gcl_config = &cbd.gcl_conf;
 
        data_size = struct_size(gcl_data, entry, gcl_len);
-       gcl_data = kzalloc(data_size, __GFP_DMA | GFP_KERNEL);
-       if (!gcl_data)
+       tmp = enetc_cbd_alloc_data_mem(priv->si, &cbd, data_size,
+                                      &dma, (void *)&gcl_data);
+       if (!tmp)
                return -ENOMEM;
 
        gce = (struct gce *)(gcl_data + 1);
@@ -107,19 +109,8 @@ static int enetc_setup_taprio(struct net_device *ndev,
                temp_gce->period = cpu_to_le32(temp_entry->interval);
        }
 
-       cbd.length = cpu_to_le16(data_size);
        cbd.status_flags = 0;
 
-       dma = dma_map_single(&priv->si->pdev->dev, gcl_data,
-                            data_size, DMA_TO_DEVICE);
-       if (dma_mapping_error(&priv->si->pdev->dev, dma)) {
-               netdev_err(priv->si->ndev, "DMA mapping failed!\n");
-               kfree(gcl_data);
-               return -ENOMEM;
-       }
-
-       cbd.addr[0] = cpu_to_le32(lower_32_bits(dma));
-       cbd.addr[1] = cpu_to_le32(upper_32_bits(dma));
        cbd.cls = BDCR_CMD_PORT_GCL;
        cbd.status_flags = 0;
 
@@ -132,8 +123,7 @@ static int enetc_setup_taprio(struct net_device *ndev,
                         ENETC_QBV_PTGCR_OFFSET,
                         tge & (~ENETC_QBV_TGE));
 
-       dma_unmap_single(&priv->si->pdev->dev, dma, data_size, DMA_TO_DEVICE);
-       kfree(gcl_data);
+       enetc_cbd_free_data_mem(priv->si, data_size, tmp, &dma);
 
        return err;
 }
@@ -450,6 +440,7 @@ static struct actions_fwd enetc_act_fwd[] = {
 };
 
 static struct enetc_psfp epsfp = {
+       .dev_bitmap = 0,
        .psfp_sfi_bitmap = NULL,
 };
 
@@ -463,8 +454,9 @@ static int enetc_streamid_hw_set(struct enetc_ndev_priv *priv,
        struct enetc_cbd cbd = {.cmd = 0};
        struct streamid_data *si_data;
        struct streamid_conf *si_conf;
-       u16 data_size;
        dma_addr_t dma;
+       u16 data_size;
+       void *tmp;
        int port;
        int err;
 
@@ -485,21 +477,11 @@ static int enetc_streamid_hw_set(struct enetc_ndev_priv *priv,
        cbd.status_flags = 0;
 
        data_size = sizeof(struct streamid_data);
-       si_data = kzalloc(data_size, __GFP_DMA | GFP_KERNEL);
-       if (!si_data)
+       tmp = enetc_cbd_alloc_data_mem(priv->si, &cbd, data_size,
+                                      &dma, (void *)&si_data);
+       if (!tmp)
                return -ENOMEM;
-       cbd.length = cpu_to_le16(data_size);
 
-       dma = dma_map_single(&priv->si->pdev->dev, si_data,
-                            data_size, DMA_FROM_DEVICE);
-       if (dma_mapping_error(&priv->si->pdev->dev, dma)) {
-               netdev_err(priv->si->ndev, "DMA mapping failed!\n");
-               err = -ENOMEM;
-               goto out;
-       }
-
-       cbd.addr[0] = cpu_to_le32(lower_32_bits(dma));
-       cbd.addr[1] = cpu_to_le32(upper_32_bits(dma));
        eth_broadcast_addr(si_data->dmac);
        si_data->vid_vidm_tg = (ENETC_CBDR_SID_VID_MASK
                               + ((0x3 << 14) | ENETC_CBDR_SID_VIDM));
@@ -520,11 +502,6 @@ static int enetc_streamid_hw_set(struct enetc_ndev_priv *priv,
                goto out;
 
        /* Enable the entry overwrite again in case the space was flushed by hardware */
-       memset(&cbd, 0, sizeof(cbd));
-
-       cbd.index = cpu_to_le16((u16)sid->index);
-       cbd.cmd = 0;
-       cbd.cls = BDCR_CMD_STREAM_IDENTIFY;
        cbd.status_flags = 0;
 
        si_conf->en = 0x80;
@@ -537,11 +514,6 @@ static int enetc_streamid_hw_set(struct enetc_ndev_priv *priv,
 
        memset(si_data, 0, data_size);
 
-       cbd.length = cpu_to_le16(data_size);
-
-       cbd.addr[0] = cpu_to_le32(lower_32_bits(dma));
-       cbd.addr[1] = cpu_to_le32(upper_32_bits(dma));
-
        /* VIDM default to be 1.
         * VID Match. If set (b1) then the VID must match, otherwise
         * any VID is considered a match. VIDM setting is only used
@@ -561,10 +533,7 @@ static int enetc_streamid_hw_set(struct enetc_ndev_priv *priv,
 
        err = enetc_send_cmd(priv->si, &cbd);
 out:
-       if (!dma_mapping_error(&priv->si->pdev->dev, dma))
-               dma_unmap_single(&priv->si->pdev->dev, dma, data_size, DMA_FROM_DEVICE);
-
-       kfree(si_data);
+       enetc_cbd_free_data_mem(priv->si, data_size, tmp, &dma);
 
        return err;
 }
@@ -635,6 +604,7 @@ static int enetc_streamcounter_hw_get(struct enetc_ndev_priv *priv,
        struct sfi_counter_data *data_buf;
        dma_addr_t dma;
        u16 data_size;
+       void *tmp;
        int err;
 
        cbd.index = cpu_to_le16((u16)index);
@@ -643,21 +613,11 @@ static int enetc_streamcounter_hw_get(struct enetc_ndev_priv *priv,
        cbd.status_flags = 0;
 
        data_size = sizeof(struct sfi_counter_data);
-       data_buf = kzalloc(data_size, __GFP_DMA | GFP_KERNEL);
-       if (!data_buf)
-               return -ENOMEM;
-
-       dma = dma_map_single(&priv->si->pdev->dev, data_buf,
-                            data_size, DMA_FROM_DEVICE);
-       if (dma_mapping_error(&priv->si->pdev->dev, dma)) {
-               netdev_err(priv->si->ndev, "DMA mapping failed!\n");
-               err = -ENOMEM;
-               goto exit;
-       }
-       cbd.addr[0] = cpu_to_le32(lower_32_bits(dma));
-       cbd.addr[1] = cpu_to_le32(upper_32_bits(dma));
 
-       cbd.length = cpu_to_le16(data_size);
+       tmp = enetc_cbd_alloc_data_mem(priv->si, &cbd, data_size,
+                                      &dma, (void *)&data_buf);
+       if (!tmp)
+               return -ENOMEM;
 
        err = enetc_send_cmd(priv->si, &cbd);
        if (err)
@@ -684,7 +644,8 @@ static int enetc_streamcounter_hw_get(struct enetc_ndev_priv *priv,
                                data_buf->flow_meter_dropl;
 
 exit:
-       kfree(data_buf);
+       enetc_cbd_free_data_mem(priv->si, data_size, tmp, &dma);
+
        return err;
 }
 
@@ -726,6 +687,7 @@ static int enetc_streamgate_hw_set(struct enetc_ndev_priv *priv,
        dma_addr_t dma;
        u16 data_size;
        int err, i;
+       void *tmp;
        u64 now;
 
        cbd.index = cpu_to_le16(sgi->index);
@@ -772,24 +734,10 @@ static int enetc_streamgate_hw_set(struct enetc_ndev_priv *priv,
        sgcl_config->acl_len = (sgi->num_entries - 1) & 0x3;
 
        data_size = struct_size(sgcl_data, sgcl, sgi->num_entries);
-
-       sgcl_data = kzalloc(data_size, __GFP_DMA | GFP_KERNEL);
-       if (!sgcl_data)
-               return -ENOMEM;
-
-       cbd.length = cpu_to_le16(data_size);
-
-       dma = dma_map_single(&priv->si->pdev->dev,
-                            sgcl_data, data_size,
-                            DMA_FROM_DEVICE);
-       if (dma_mapping_error(&priv->si->pdev->dev, dma)) {
-               netdev_err(priv->si->ndev, "DMA mapping failed!\n");
-               kfree(sgcl_data);
+       tmp = enetc_cbd_alloc_data_mem(priv->si, &cbd, data_size,
+                                      &dma, (void *)&sgcl_data);
+       if (!tmp)
                return -ENOMEM;
-       }
-
-       cbd.addr[0] = cpu_to_le32(lower_32_bits(dma));
-       cbd.addr[1] = cpu_to_le32(upper_32_bits(dma));
 
        sgce = &sgcl_data->sgcl[0];
 
@@ -844,8 +792,7 @@ static int enetc_streamgate_hw_set(struct enetc_ndev_priv *priv,
        err = enetc_send_cmd(priv->si, &cbd);
 
 exit:
-       kfree(sgcl_data);
-
+       enetc_cbd_free_data_mem(priv->si, data_size, tmp, &dma);
        return err;
 }
 
index 796133d..11227f5 100644 (file)
@@ -2797,7 +2797,7 @@ static int fec_enet_eee_mode_set(struct net_device *ndev, bool enable)
        int ret = 0;
 
        if (enable) {
-               ret = phy_init_eee(ndev->phydev, 0);
+               ret = phy_init_eee(ndev->phydev, false);
                if (ret)
                        return ret;
 
index af99017..7d49c28 100644 (file)
@@ -101,7 +101,6 @@ static int fec_ptp_enable_pps(struct fec_enet_private *fep, uint enable)
        u32 val, tempval;
        struct timespec64 ts;
        u64 ns;
-       val = 0;
 
        if (fep->pps_enable == enable)
                return 0;
index 266e562..ef8058a 100644 (file)
@@ -14,6 +14,7 @@
 
 #include <linux/acpi.h>
 #include <linux/acpi_mdio.h>
+#include <linux/clk.h>
 #include <linux/interrupt.h>
 #include <linux/kernel.h>
 #include <linux/mdio.h>
@@ -36,9 +37,10 @@ struct tgec_mdio_controller {
 } __packed;
 
 #define MDIO_STAT_ENC          BIT(6)
-#define MDIO_STAT_CLKDIV(x)    (((x>>1) & 0xff) << 8)
+#define MDIO_STAT_CLKDIV(x)    (((x) & 0x1ff) << 7)
 #define MDIO_STAT_BSY          BIT(0)
 #define MDIO_STAT_RD_ER                BIT(1)
+#define MDIO_STAT_PRE_DIS      BIT(5)
 #define MDIO_CTL_DEV_ADDR(x)   (x & 0x1f)
 #define MDIO_CTL_PORT_ADDR(x)  ((x & 0x1f) << 5)
 #define MDIO_CTL_PRE_DIS       BIT(10)
@@ -50,6 +52,8 @@ struct tgec_mdio_controller {
 
 struct mdio_fsl_priv {
        struct  tgec_mdio_controller __iomem *mdio_base;
+       struct  clk *enet_clk;
+       u32     mdc_freq;
        bool    is_little_endian;
        bool    has_a009885;
        bool    has_a011043;
@@ -254,6 +258,50 @@ irq_restore:
        return ret;
 }
 
+static int xgmac_mdio_set_mdc_freq(struct mii_bus *bus)
+{
+       struct mdio_fsl_priv *priv = (struct mdio_fsl_priv *)bus->priv;
+       struct tgec_mdio_controller __iomem *regs = priv->mdio_base;
+       struct device *dev = bus->parent;
+       u32 mdio_stat, div;
+
+       if (device_property_read_u32(dev, "clock-frequency", &priv->mdc_freq))
+               return 0;
+
+       priv->enet_clk = devm_clk_get(dev, NULL);
+       if (IS_ERR(priv->enet_clk)) {
+               dev_err(dev, "Input clock unknown, not changing MDC frequency\n");
+               return PTR_ERR(priv->enet_clk);
+       }
+
+       div = ((clk_get_rate(priv->enet_clk) / priv->mdc_freq) - 1) / 2;
+       if (div < 5 || div > 0x1ff) {
+               dev_err(dev, "Requested MDC frequency is out of range, ignoring\n");
+               return -EINVAL;
+       }
+
+       mdio_stat = xgmac_read32(&regs->mdio_stat, priv->is_little_endian);
+       mdio_stat &= ~MDIO_STAT_CLKDIV(0x1ff);
+       mdio_stat |= MDIO_STAT_CLKDIV(div);
+       xgmac_write32(mdio_stat, &regs->mdio_stat, priv->is_little_endian);
+       return 0;
+}
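
The divisor math follows directly from the code above: div = ((rate / mdc) - 1) / 2, accepted only in the range [5, 0x1ff]; inverting that expression, the achieved frequency is roughly rate / (2 * div + 1). A small stand-alone check with assumed example values (a 500 MHz input clock and a 2.5 MHz requested MDC):

#include <stdio.h>

int main(void)
{
	unsigned long rate = 500000000UL;	/* assumed enet clock rate */
	unsigned long mdc = 2500000UL;		/* requested "clock-frequency" */
	unsigned long div = ((rate / mdc) - 1) / 2;	/* 199 / 2 = 99 */
	unsigned long actual = rate / (2 * div + 1);	/* ~2.51 MHz */

	printf("div=%lu (valid range 5..0x1ff), actual MDC ~= %lu Hz\n",
	       div, actual);
	return 0;
}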
+
+static void xgmac_mdio_set_suppress_preamble(struct mii_bus *bus)
+{
+       struct mdio_fsl_priv *priv = (struct mdio_fsl_priv *)bus->priv;
+       struct tgec_mdio_controller __iomem *regs = priv->mdio_base;
+       struct device *dev = bus->parent;
+       u32 mdio_stat;
+
+       if (!device_property_read_bool(dev, "suppress-preamble"))
+               return;
+
+       mdio_stat = xgmac_read32(&regs->mdio_stat, priv->is_little_endian);
+       mdio_stat |= MDIO_STAT_PRE_DIS;
+       xgmac_write32(mdio_stat, &regs->mdio_stat, priv->is_little_endian);
+}
+
 static int xgmac_mdio_probe(struct platform_device *pdev)
 {
        struct fwnode_handle *fwnode;
@@ -273,7 +321,7 @@ static int xgmac_mdio_probe(struct platform_device *pdev)
                return -EINVAL;
        }
 
-       bus = mdiobus_alloc_size(sizeof(struct mdio_fsl_priv));
+       bus = devm_mdiobus_alloc_size(&pdev->dev, sizeof(struct mdio_fsl_priv));
        if (!bus)
                return -ENOMEM;
 
@@ -284,13 +332,11 @@ static int xgmac_mdio_probe(struct platform_device *pdev)
        bus->probe_capabilities = MDIOBUS_C22_C45;
        snprintf(bus->id, MII_BUS_ID_SIZE, "%pa", &res->start);
 
-       /* Set the PHY base address */
        priv = bus->priv;
-       priv->mdio_base = ioremap(res->start, resource_size(res));
-       if (!priv->mdio_base) {
-               ret = -ENOMEM;
-               goto err_ioremap;
-       }
+       priv->mdio_base = devm_ioremap(&pdev->dev, res->start,
+                                      resource_size(res));
+       if (!priv->mdio_base)
+               return -ENOMEM;
 
        /* For both ACPI and DT cases, endianness of MDIO controller
         * needs to be specified using "little-endian" property.
@@ -303,6 +349,12 @@ static int xgmac_mdio_probe(struct platform_device *pdev)
        priv->has_a011043 = device_property_read_bool(&pdev->dev,
                                                      "fsl,erratum-a011043");
 
+       xgmac_mdio_set_suppress_preamble(bus);
+
+       ret = xgmac_mdio_set_mdc_freq(bus);
+       if (ret)
+               return ret;
+
        fwnode = pdev->dev.fwnode;
        if (is_of_node(fwnode))
                ret = of_mdiobus_register(bus, to_of_node(fwnode));
@@ -312,32 +364,12 @@ static int xgmac_mdio_probe(struct platform_device *pdev)
                ret = -EINVAL;
        if (ret) {
                dev_err(&pdev->dev, "cannot register MDIO bus\n");
-               goto err_registration;
+               return ret;
        }
 
        platform_set_drvdata(pdev, bus);
 
        return 0;
-
-err_registration:
-       iounmap(priv->mdio_base);
-
-err_ioremap:
-       mdiobus_free(bus);
-
-       return ret;
-}
-
-static int xgmac_mdio_remove(struct platform_device *pdev)
-{
-       struct mii_bus *bus = platform_get_drvdata(pdev);
-       struct mdio_fsl_priv *priv = bus->priv;
-
-       mdiobus_unregister(bus);
-       iounmap(priv->mdio_base);
-       mdiobus_free(bus);
-
-       return 0;
 }
 
 static const struct of_device_id xgmac_mdio_match[] = {
@@ -364,7 +396,6 @@ static struct platform_driver xgmac_mdio_driver = {
                .acpi_match_table = xgmac_acpi_match,
        },
        .probe = xgmac_mdio_probe,
-       .remove = xgmac_mdio_remove,
 };
 
 module_platform_driver(xgmac_mdio_driver);
index 9298fbe..6f18c9a 100644 (file)
@@ -167,6 +167,7 @@ struct hnae3_handle;
 
 struct hnae3_queue {
        void __iomem *io_base;
+       void __iomem *mem_base;
        struct hnae3_ae_algo *ae_algo;
        struct hnae3_handle *handle;
        int tqp_index;          /* index in a handle */
index babc5d7..0b8a73c 100644 (file)
@@ -2028,9 +2028,73 @@ static int hns3_fill_skb_to_desc(struct hns3_enet_ring *ring,
        return bd_num;
 }
 
+static void hns3_tx_push_bd(struct hns3_enet_ring *ring, int num)
+{
+#define HNS3_BYTES_PER_64BIT           8
+
+       struct hns3_desc desc[HNS3_MAX_PUSH_BD_NUM] = {};
+       int offset = 0;
+
+       /* make sure everything is visible to device before
+        * executing tx push or updating doorbell
+        */
+       dma_wmb();
+
+       do {
+               int idx = (ring->next_to_use - num + ring->desc_num) %
+                         ring->desc_num;
+
+               u64_stats_update_begin(&ring->syncp);
+               ring->stats.tx_push++;
+               u64_stats_update_end(&ring->syncp);
+               memcpy(&desc[offset], &ring->desc[idx],
+                      sizeof(struct hns3_desc));
+               offset++;
+       } while (--num);
+
+       __iowrite64_copy(ring->tqp->mem_base, desc,
+                        (sizeof(struct hns3_desc) * HNS3_MAX_PUSH_BD_NUM) /
+                        HNS3_BYTES_PER_64BIT);
+
+       io_stop_wc();
+}
+
+static void hns3_tx_mem_doorbell(struct hns3_enet_ring *ring)
+{
+#define HNS3_MEM_DOORBELL_OFFSET       64
+
+       __le64 bd_num = cpu_to_le64((u64)ring->pending_buf);
+
+       /* make sure everything is visible to device before
+        * executing tx push or updating doorbell
+        */
+       dma_wmb();
+
+       __iowrite64_copy(ring->tqp->mem_base + HNS3_MEM_DOORBELL_OFFSET,
+                        &bd_num, 1);
+       u64_stats_update_begin(&ring->syncp);
+       ring->stats.tx_mem_doorbell += ring->pending_buf;
+       u64_stats_update_end(&ring->syncp);
+
+       io_stop_wc();
+}
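
The __iowrite64_copy() counts above are in 64-bit words: assuming sizeof(struct hns3_desc) is 32 bytes (an 8-byte buffer address plus 24 bytes of per-BD fields), the two-descriptor push writes 64 bytes as eight 64-bit stores, while the doorbell path writes a single 64-bit word. A hedged compile-time restatement of that sizing:

#include <linux/build_bug.h>

/* assumption: each BD is 32 bytes, so a 2-BD push is eight u64 stores */
static_assert(sizeof(struct hns3_desc) == 32);
static_assert(sizeof(struct hns3_desc) * HNS3_MAX_PUSH_BD_NUM /
	      HNS3_BYTES_PER_64BIT == 8);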
+
 static void hns3_tx_doorbell(struct hns3_enet_ring *ring, int num,
                             bool doorbell)
 {
+       struct net_device *netdev = ring_to_netdev(ring);
+       struct hns3_nic_priv *priv = netdev_priv(netdev);
+
+       /* when tx push is enabled, a packet whose BD count is no more than
+        * HNS3_MAX_PUSH_BD_NUM can be pushed directly.
+        */
+       if (test_bit(HNS3_NIC_STATE_TX_PUSH_ENABLE, &priv->state) && num &&
+           !ring->pending_buf && num <= HNS3_MAX_PUSH_BD_NUM && doorbell) {
+               hns3_tx_push_bd(ring, num);
+               WRITE_ONCE(ring->last_to_use, ring->next_to_use);
+               return;
+       }
+
        ring->pending_buf += num;
 
        if (!doorbell) {
@@ -2038,11 +2102,12 @@ static void hns3_tx_doorbell(struct hns3_enet_ring *ring, int num,
                return;
        }
 
-       if (!ring->pending_buf)
-               return;
+       if (ring->tqp->mem_base)
+               hns3_tx_mem_doorbell(ring);
+       else
+               writel(ring->pending_buf,
+                      ring->tqp->io_base + HNS3_RING_TX_RING_TAIL_REG);
 
-       writel(ring->pending_buf,
-              ring->tqp->io_base + HNS3_RING_TX_RING_TAIL_REG);
        ring->pending_buf = 0;
        WRITE_ONCE(ring->last_to_use, ring->next_to_use);
 }
@@ -2732,6 +2797,9 @@ static void hns3_dump_queue_stats(struct net_device *ndev,
                    "seg_pkt_cnt: %llu, tx_more: %llu, restart_queue: %llu, tx_busy: %llu\n",
                    tx_ring->stats.seg_pkt_cnt, tx_ring->stats.tx_more,
                    tx_ring->stats.restart_queue, tx_ring->stats.tx_busy);
+
+       netdev_info(ndev, "tx_push: %llu, tx_mem_doorbell: %llu\n",
+                   tx_ring->stats.tx_push, tx_ring->stats.tx_mem_doorbell);
 }
 
 static void hns3_dump_queue_reg(struct net_device *ndev,
@@ -5094,6 +5162,9 @@ static void hns3_state_init(struct hnae3_handle *handle)
 
        set_bit(HNS3_NIC_STATE_INITED, &priv->state);
 
+       if (test_bit(HNAE3_DEV_SUPPORT_TX_PUSH_B, ae_dev->caps))
+               set_bit(HNS3_NIC_STATE_TX_PUSH_ENABLE, &priv->state);
+
        if (ae_dev->dev_version >= HNAE3_DEVICE_VERSION_V3)
                set_bit(HNAE3_PFLAG_LIMIT_PROMISC, &handle->supported_pflags);
 
index a05a0c7..4a32536 100644 (file)
@@ -7,6 +7,7 @@
 #include <linux/dim.h>
 #include <linux/if_vlan.h>
 #include <net/page_pool.h>
+#include <asm/barrier.h>
 
 #include "hnae3.h"
 
@@ -25,9 +26,12 @@ enum hns3_nic_state {
        HNS3_NIC_STATE2_RESET_REQUESTED,
        HNS3_NIC_STATE_HW_TX_CSUM_ENABLE,
        HNS3_NIC_STATE_RXD_ADV_LAYOUT_ENABLE,
+       HNS3_NIC_STATE_TX_PUSH_ENABLE,
        HNS3_NIC_STATE_MAX
 };
 
+#define HNS3_MAX_PUSH_BD_NUM           2
+
 #define HNS3_RING_RX_RING_BASEADDR_L_REG       0x00000
 #define HNS3_RING_RX_RING_BASEADDR_H_REG       0x00004
 #define HNS3_RING_RX_RING_BD_NUM_REG           0x00008
@@ -410,6 +414,8 @@ struct ring_stats {
                        u64 tx_pkts;
                        u64 tx_bytes;
                        u64 tx_more;
+                       u64 tx_push;
+                       u64 tx_mem_doorbell;
                        u64 restart_queue;
                        u64 tx_busy;
                        u64 tx_copy;
index c06c39e..6469238 100644 (file)
@@ -23,6 +23,8 @@ static const struct hns3_stats hns3_txq_stats[] = {
        HNS3_TQP_STAT("packets", tx_pkts),
        HNS3_TQP_STAT("bytes", tx_bytes),
        HNS3_TQP_STAT("more", tx_more),
+       HNS3_TQP_STAT("push", tx_push),
+       HNS3_TQP_STAT("mem_doorbell", tx_mem_doorbell),
        HNS3_TQP_STAT("wake", restart_queue),
        HNS3_TQP_STAT("busy", tx_busy),
        HNS3_TQP_STAT("copy", tx_copy),
index 24f7afa..78d0498 100644 (file)
@@ -1643,6 +1643,7 @@ static int hclge_config_gro(struct hclge_dev *hdev)
 
 static int hclge_alloc_tqps(struct hclge_dev *hdev)
 {
+       struct hnae3_ae_dev *ae_dev = pci_get_drvdata(hdev->pdev);
        struct hclge_comm_tqp *tqp;
        int i;
 
@@ -1676,6 +1677,14 @@ static int hclge_alloc_tqps(struct hclge_dev *hdev)
                                         (i - HCLGE_TQP_MAX_SIZE_DEV_V2) *
                                         HCLGE_TQP_REG_SIZE;
 
+               /* when the device supports tx push and has device memory,
+                * the queue can run in push mode or doorbell mode against
+                * that memory.
+                */
+               if (test_bit(HNAE3_DEV_SUPPORT_TX_PUSH_B, ae_dev->caps))
+                       tqp->q.mem_base = hdev->hw.hw.mem_base +
+                                         HCLGE_TQP_MEM_OFFSET(hdev, i);
+
                tqp++;
        }
 
@@ -11008,8 +11017,6 @@ static void hclge_uninit_client_instance(struct hnae3_client *client,
 
 static int hclge_dev_mem_map(struct hclge_dev *hdev)
 {
-#define HCLGE_MEM_BAR          4
-
        struct pci_dev *pdev = hdev->pdev;
        struct hclge_hw *hw = &hdev->hw;
 
index adfb26e..f7f5a4b 100644 (file)
@@ -169,6 +169,14 @@ enum HLCGE_PORT_TYPE {
 #define HCLGE_VECTOR0_ALL_MSIX_ERR_B   6U
 #define HCLGE_TRIGGER_IMP_RESET_B      7U
 
+#define HCLGE_TQP_MEM_SIZE             0x10000
+#define HCLGE_MEM_BAR                  4
+/* In BAR4, the first half is for RoCE and the second half is for the NIC */
+#define HCLGE_NIC_MEM_OFFSET(hdev)     \
+       (pci_resource_len((hdev)->pdev, HCLGE_MEM_BAR) >> 1)
+#define HCLGE_TQP_MEM_OFFSET(hdev, i)  \
+       (HCLGE_NIC_MEM_OFFSET(hdev) + HCLGE_TQP_MEM_SIZE * (i))
+
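
A hedged worked example of the layout these macros encode: with an assumed 16 MiB BAR4, the RoCE half covers the first 8 MiB, the NIC half starts at 8 MiB, and TQP i owns a 64 KiB window inside the NIC half, so queue 3 lands at offset 0x830000:

#include <linux/types.h>

/* mirrors HCLGE_NIC_MEM_OFFSET()/HCLGE_TQP_MEM_OFFSET() for a BAR length */
static u64 tqp_mem_offset(u64 bar4_len, unsigned int i)
{
	return (bar4_len >> 1) + 0x10000ULL * i;
}

/* tqp_mem_offset(16 << 20, 3) == 0x800000 + 0x30000 == 0x830000 */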
 #define HCLGE_MAC_DEFAULT_FRAME \
        (ETH_HLEN + ETH_FCS_LEN + 2 * VLAN_HLEN + ETH_DATA_LEN)
 #define HCLGE_MAC_MIN_FRAME            64
index 21442a9..93389be 100644 (file)
@@ -321,6 +321,7 @@ static int hclgevf_get_pf_media_type(struct hclgevf_dev *hdev)
 
 static int hclgevf_alloc_tqps(struct hclgevf_dev *hdev)
 {
+       struct hnae3_ae_dev *ae_dev = pci_get_drvdata(hdev->pdev);
        struct hclge_comm_tqp *tqp;
        int i;
 
@@ -354,6 +355,14 @@ static int hclgevf_alloc_tqps(struct hclgevf_dev *hdev)
                                         (i - HCLGEVF_TQP_MAX_SIZE_DEV_V2) *
                                         HCLGEVF_TQP_REG_SIZE;
 
+               /* when the device supports tx push and has device memory,
+                * the queue can run in push mode or doorbell mode against
+                * that memory.
+                */
+               if (test_bit(HNAE3_DEV_SUPPORT_TX_PUSH_B, ae_dev->caps))
+                       tqp->q.mem_base = hdev->hw.hw.mem_base +
+                                         HCLGEVF_TQP_MEM_OFFSET(hdev, i);
+
                tqp++;
        }
 
@@ -2546,8 +2555,6 @@ static void hclgevf_uninit_client_instance(struct hnae3_client *client,
 
 static int hclgevf_dev_mem_map(struct hclgevf_dev *hdev)
 {
-#define HCLGEVF_MEM_BAR                4
-
        struct pci_dev *pdev = hdev->pdev;
        struct hclgevf_hw *hw = &hdev->hw;
 
index 502ca1c..4b00fd4 100644 (file)
 
 #define HCLGEVF_RSS_IND_TBL_SIZE               512
 
+#define HCLGEVF_TQP_MEM_SIZE           0x10000
+#define HCLGEVF_MEM_BAR                        4
+/* In BAR4, the first half is for RoCE and the second half is for the NIC */
+#define HCLGEVF_NIC_MEM_OFFSET(hdev)   \
+       (pci_resource_len((hdev)->pdev, HCLGEVF_MEM_BAR) >> 1)
+#define HCLGEVF_TQP_MEM_OFFSET(hdev, i)                \
+       (HCLGEVF_NIC_MEM_OFFSET(hdev) + HCLGEVF_TQP_MEM_SIZE * (i))
+
 #define HCLGEVF_MAC_MAX_FRAME          9728
 
 #define HCLGEVF_STATS_TIMER_INTERVAL   36U
index a42aeb5..6fb3437 100644 (file)
@@ -7388,9 +7388,9 @@ static int e1000_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
        resource_size_t flash_start, flash_len;
        static int cards_found;
        u16 aspm_disable_flag = 0;
-       int bars, i, err, pci_using_dac;
        u16 eeprom_data = 0;
        u16 eeprom_apme_mask = E1000_EEPROM_APME;
+       int bars, i, err;
        s32 ret_val = 0;
 
        if (ei->flags2 & FLAG2_DISABLE_ASPM_L0S)
@@ -7404,17 +7404,11 @@ static int e1000_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
        if (err)
                return err;
 
-       pci_using_dac = 0;
        err = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(64));
-       if (!err) {
-               pci_using_dac = 1;
-       } else {
-               err = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(32));
-               if (err) {
-                       dev_err(&pdev->dev,
-                               "No usable DMA configuration, aborting\n");
-                       goto err_dma;
-               }
+       if (err) {
+               dev_err(&pdev->dev,
+                       "No usable DMA configuration, aborting\n");
+               goto err_dma;
        }
 
        bars = pci_select_bars(pdev, IORESOURCE_MEM);
@@ -7550,10 +7544,8 @@ static int e1000_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 
        netdev->priv_flags |= IFF_UNICAST_FLT;
 
-       if (pci_using_dac) {
-               netdev->features |= NETIF_F_HIGHDMA;
-               netdev->vlan_features |= NETIF_F_HIGHDMA;
-       }
+       netdev->features |= NETIF_F_HIGHDMA;
+       netdev->vlan_features |= NETIF_F_HIGHDMA;
 
        /* MTU range: 68 - max_hw_frame_size */
        netdev->min_mtu = ETH_MIN_MTU;
index 80c5cec..55c6bce 100644 (file)
@@ -854,6 +854,10 @@ struct i40e_vsi {
        u64 tx_force_wb;
        u64 rx_buf_failed;
        u64 rx_page_failed;
+       u64 rx_page_reuse;
+       u64 rx_page_alloc;
+       u64 rx_page_waive;
+       u64 rx_page_busy;
 
        /* These are containers of ring pointers, allocated at run-time */
        struct i40e_ring **rx_rings;
index 7abef88..42439f7 100644 (file)
@@ -769,7 +769,7 @@ static bool i40e_asq_done(struct i40e_hw *hw)
 }
 
 /**
- *  i40e_asq_send_command_atomic - send command to Admin Queue
+ *  i40e_asq_send_command_atomic_exec - send command to Admin Queue
  *  @hw: pointer to the hw struct
  *  @desc: prefilled descriptor describing the command (non DMA mem)
  *  @buff: buffer to use for indirect commands
@@ -780,11 +780,13 @@ static bool i40e_asq_done(struct i40e_hw *hw)
  *  This is the main send command driver routine for the Admin Queue send
  *  queue.  It runs the queue, cleans the queue, etc
  **/
-i40e_status
-i40e_asq_send_command_atomic(struct i40e_hw *hw, struct i40e_aq_desc *desc,
-                            void *buff, /* can be NULL */ u16  buff_size,
-                            struct i40e_asq_cmd_details *cmd_details,
-                            bool is_atomic_context)
+static i40e_status
+i40e_asq_send_command_atomic_exec(struct i40e_hw *hw,
+                                 struct i40e_aq_desc *desc,
+                                 void *buff, /* can be NULL */
+                                 u16  buff_size,
+                                 struct i40e_asq_cmd_details *cmd_details,
+                                 bool is_atomic_context)
 {
        i40e_status status = 0;
        struct i40e_dma_mem *dma_buff = NULL;
@@ -794,8 +796,6 @@ i40e_asq_send_command_atomic(struct i40e_hw *hw, struct i40e_aq_desc *desc,
        u16  retval = 0;
        u32  val = 0;
 
-       mutex_lock(&hw->aq.asq_mutex);
-
        if (hw->aq.asq.count == 0) {
                i40e_debug(hw, I40E_DEBUG_AQ_MESSAGE,
                           "AQTX: Admin queue not initialized.\n");
@@ -969,6 +969,36 @@ i40e_asq_send_command_atomic(struct i40e_hw *hw, struct i40e_aq_desc *desc,
        }
 
 asq_send_command_error:
+       return status;
+}
+
+/**
+ *  i40e_asq_send_command_atomic - send command to Admin Queue
+ *  @hw: pointer to the hw struct
+ *  @desc: prefilled descriptor describing the command (non DMA mem)
+ *  @buff: buffer to use for indirect commands
+ *  @buff_size: size of buffer for indirect commands
+ *  @cmd_details: pointer to command details structure
+ *  @is_atomic_context: is the function called in an atomic context?
+ *
+ *  Acquires the lock and calls the main send command execution
+ *  routine.
+ **/
+i40e_status
+i40e_asq_send_command_atomic(struct i40e_hw *hw,
+                            struct i40e_aq_desc *desc,
+                            void *buff, /* can be NULL */
+                            u16  buff_size,
+                            struct i40e_asq_cmd_details *cmd_details,
+                            bool is_atomic_context)
+{
+       i40e_status status;
+
+       mutex_lock(&hw->aq.asq_mutex);
+       status = i40e_asq_send_command_atomic_exec(hw, desc, buff, buff_size,
+                                                  cmd_details,
+                                                  is_atomic_context);
+
        mutex_unlock(&hw->aq.asq_mutex);
        return status;
 }
@@ -983,6 +1013,52 @@ i40e_asq_send_command(struct i40e_hw *hw, struct i40e_aq_desc *desc,
 }
 
 /**
+ *  i40e_asq_send_command_atomic_v2 - send command to Admin Queue
+ *  @hw: pointer to the hw struct
+ *  @desc: prefilled descriptor describing the command (non DMA mem)
+ *  @buff: buffer to use for indirect commands
+ *  @buff_size: size of buffer for indirect commands
+ *  @cmd_details: pointer to command details structure
+ *  @is_atomic_context: is the function called in an atomic context?
+ *  @aq_status: pointer to Admin Queue status return value
+ *
+ *  Acquires the lock and calls the main send command execution
+ *  routine. Returns the last Admin Queue status in aq_status
+ *  to avoid race conditions in access to hw->aq.asq_last_status.
+ **/
+i40e_status
+i40e_asq_send_command_atomic_v2(struct i40e_hw *hw,
+                               struct i40e_aq_desc *desc,
+                               void *buff, /* can be NULL */
+                               u16  buff_size,
+                               struct i40e_asq_cmd_details *cmd_details,
+                               bool is_atomic_context,
+                               enum i40e_admin_queue_err *aq_status)
+{
+       i40e_status status;
+
+       mutex_lock(&hw->aq.asq_mutex);
+       status = i40e_asq_send_command_atomic_exec(hw, desc, buff,
+                                                  buff_size,
+                                                  cmd_details,
+                                                  is_atomic_context);
+       if (aq_status)
+               *aq_status = hw->aq.asq_last_status;
+       mutex_unlock(&hw->aq.asq_mutex);
+       return status;
+}
+
+i40e_status
+i40e_asq_send_command_v2(struct i40e_hw *hw, struct i40e_aq_desc *desc,
+                        void *buff, /* can be NULL */ u16  buff_size,
+                        struct i40e_asq_cmd_details *cmd_details,
+                        enum i40e_admin_queue_err *aq_status)
+{
+       return i40e_asq_send_command_atomic_v2(hw, desc, buff, buff_size,
+                                              cmd_details, true, aq_status);
+}
+
+/**
  *  i40e_fill_default_direct_cmd_desc - AQ descriptor helper function
  *  @desc:     pointer to the temp descriptor (non DMA mem)
  *  @opcode:   the opcode can be used to decide which flags to turn off or on
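
The refactor above is a lock-hoisting split: the former body of i40e_asq_send_command_atomic() becomes a static _exec helper that assumes the caller holds asq_mutex, while thin public wrappers take the mutex, run the helper, and, in the _v2 variants, snapshot hw->aq.asq_last_status before unlocking. A compilable user-space analogue of the pattern, using a pthread mutex and illustrative names:

    #include <pthread.h>

    static pthread_mutex_t asq_mutex = PTHREAD_MUTEX_INITIALIZER;
    static int last_status;        /* stands in for hw->aq.asq_last_status */

    /* Unlocked worker: callers must hold asq_mutex. */
    static int send_cmd_exec(int cmd)
    {
            last_status = cmd ? 0 : -1;     /* pretend to talk to hardware */
            return last_status;
    }

    /* Original entry point: lock, run, unlock. */
    int send_cmd(int cmd)
    {
            int ret;

            pthread_mutex_lock(&asq_mutex);
            ret = send_cmd_exec(cmd);
            pthread_mutex_unlock(&asq_mutex);
            return ret;
    }

    /* _v2 entry point: snapshot the status under the same lock, so the
     * caller never reads a status clobbered by another thread.
     */
    int send_cmd_v2(int cmd, int *status)
    {
            int ret;

            pthread_mutex_lock(&asq_mutex);
            ret = send_cmd_exec(cmd);
            if (status)
                    *status = last_status;
            pthread_mutex_unlock(&asq_mutex);
            return ret;
    }
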
index 9ddeb01..6aefffd 100644 (file)
@@ -1899,8 +1899,9 @@ i40e_status i40e_aq_add_vsi(struct i40e_hw *hw,
 
        desc.flags |= cpu_to_le16((u16)(I40E_AQ_FLAG_BUF | I40E_AQ_FLAG_RD));
 
-       status = i40e_asq_send_command(hw, &desc, &vsi_ctx->info,
-                                   sizeof(vsi_ctx->info), cmd_details);
+       status = i40e_asq_send_command_atomic(hw, &desc, &vsi_ctx->info,
+                                             sizeof(vsi_ctx->info),
+                                             cmd_details, true);
 
        if (status)
                goto aq_add_vsi_exit;
@@ -2287,8 +2288,9 @@ i40e_status i40e_aq_update_vsi_params(struct i40e_hw *hw,
 
        desc.flags |= cpu_to_le16((u16)(I40E_AQ_FLAG_BUF | I40E_AQ_FLAG_RD));
 
-       status = i40e_asq_send_command(hw, &desc, &vsi_ctx->info,
-                                   sizeof(vsi_ctx->info), cmd_details);
+       status = i40e_asq_send_command_atomic(hw, &desc, &vsi_ctx->info,
+                                             sizeof(vsi_ctx->info),
+                                             cmd_details, true);
 
        vsi_ctx->vsis_allocated = le16_to_cpu(resp->vsi_used);
        vsi_ctx->vsis_unallocated = le16_to_cpu(resp->vsi_free);
@@ -2632,33 +2634,28 @@ get_veb_exit:
 }
 
 /**
- * i40e_aq_add_macvlan
- * @hw: pointer to the hw struct
- * @seid: VSI for the mac address
+ * i40e_prepare_add_macvlan
  * @mv_list: list of macvlans to be added
+ * @desc: pointer to AQ descriptor structure
  * @count: length of the list
- * @cmd_details: pointer to command details structure or NULL
+ * @seid: VSI for the mac address
  *
- * Add MAC/VLAN addresses to the HW filtering
+ * Internal helper function that prepares the add macvlan request
+ * and returns the buffer size.
  **/
-i40e_status i40e_aq_add_macvlan(struct i40e_hw *hw, u16 seid,
-                       struct i40e_aqc_add_macvlan_element_data *mv_list,
-                       u16 count, struct i40e_asq_cmd_details *cmd_details)
+static u16
+i40e_prepare_add_macvlan(struct i40e_aqc_add_macvlan_element_data *mv_list,
+                        struct i40e_aq_desc *desc, u16 count, u16 seid)
 {
-       struct i40e_aq_desc desc;
        struct i40e_aqc_macvlan *cmd =
-               (struct i40e_aqc_macvlan *)&desc.params.raw;
-       i40e_status status;
+               (struct i40e_aqc_macvlan *)&desc->params.raw;
        u16 buf_size;
        int i;
 
-       if (count == 0 || !mv_list || !hw)
-               return I40E_ERR_PARAM;
-
        buf_size = count * sizeof(*mv_list);
 
        /* prep the rest of the request */
-       i40e_fill_default_direct_cmd_desc(&desc, i40e_aqc_opc_add_macvlan);
+       i40e_fill_default_direct_cmd_desc(desc, i40e_aqc_opc_add_macvlan);
        cmd->num_addresses = cpu_to_le16(count);
        cmd->seid[0] = cpu_to_le16(I40E_AQC_MACVLAN_CMD_SEID_VALID | seid);
        cmd->seid[1] = 0;
@@ -2669,14 +2666,71 @@ i40e_status i40e_aq_add_macvlan(struct i40e_hw *hw, u16 seid,
                        mv_list[i].flags |=
                               cpu_to_le16(I40E_AQC_MACVLAN_ADD_USE_SHARED_MAC);
 
-       desc.flags |= cpu_to_le16((u16)(I40E_AQ_FLAG_BUF | I40E_AQ_FLAG_RD));
+       desc->flags |= cpu_to_le16((u16)(I40E_AQ_FLAG_BUF | I40E_AQ_FLAG_RD));
        if (buf_size > I40E_AQ_LARGE_BUF)
-               desc.flags |= cpu_to_le16((u16)I40E_AQ_FLAG_LB);
+               desc->flags |= cpu_to_le16((u16)I40E_AQ_FLAG_LB);
 
-       status = i40e_asq_send_command(hw, &desc, mv_list, buf_size,
-                                      cmd_details);
+       return buf_size;
+}
 
-       return status;
+/**
+ * i40e_aq_add_macvlan
+ * @hw: pointer to the hw struct
+ * @seid: VSI for the mac address
+ * @mv_list: list of macvlans to be added
+ * @count: length of the list
+ * @cmd_details: pointer to command details structure or NULL
+ *
+ * Add MAC/VLAN addresses to the HW filtering
+ **/
+i40e_status
+i40e_aq_add_macvlan(struct i40e_hw *hw, u16 seid,
+                   struct i40e_aqc_add_macvlan_element_data *mv_list,
+                   u16 count, struct i40e_asq_cmd_details *cmd_details)
+{
+       struct i40e_aq_desc desc;
+       u16 buf_size;
+
+       if (count == 0 || !mv_list || !hw)
+               return I40E_ERR_PARAM;
+
+       buf_size = i40e_prepare_add_macvlan(mv_list, &desc, count, seid);
+
+       return i40e_asq_send_command_atomic(hw, &desc, mv_list, buf_size,
+                                           cmd_details, true);
+}
+
+/**
+ * i40e_aq_add_macvlan_v2
+ * @hw: pointer to the hw struct
+ * @seid: VSI for the mac address
+ * @mv_list: list of macvlans to be added
+ * @count: length of the list
+ * @cmd_details: pointer to command details structure or NULL
+ * @aq_status: pointer to Admin Queue status return value
+ *
+ * Add MAC/VLAN addresses to the HW filtering.
+ * The _v2 version returns the last Admin Queue status in aq_status
+ * to avoid race conditions in access to hw->aq.asq_last_status.
+ * It also calls _v2 versions of asq_send_command functions to
+ * get the aq_status on the stack.
+ **/
+i40e_status
+i40e_aq_add_macvlan_v2(struct i40e_hw *hw, u16 seid,
+                      struct i40e_aqc_add_macvlan_element_data *mv_list,
+                      u16 count, struct i40e_asq_cmd_details *cmd_details,
+                      enum i40e_admin_queue_err *aq_status)
+{
+       struct i40e_aq_desc desc;
+       u16 buf_size;
+
+       if (count == 0 || !mv_list || !hw)
+               return I40E_ERR_PARAM;
+
+       buf_size = i40e_prepare_add_macvlan(mv_list, &desc, count, seid);
+
+       return i40e_asq_send_command_atomic_v2(hw, &desc, mv_list, buf_size,
+                                              cmd_details, true, aq_status);
 }
 
 /**
@@ -2715,13 +2769,59 @@ i40e_status i40e_aq_remove_macvlan(struct i40e_hw *hw, u16 seid,
        if (buf_size > I40E_AQ_LARGE_BUF)
                desc.flags |= cpu_to_le16((u16)I40E_AQ_FLAG_LB);
 
-       status = i40e_asq_send_command(hw, &desc, mv_list, buf_size,
-                                      cmd_details);
+       status = i40e_asq_send_command_atomic(hw, &desc, mv_list, buf_size,
+                                             cmd_details, true);
 
        return status;
 }
 
 /**
+ * i40e_aq_remove_macvlan_v2
+ * @hw: pointer to the hw struct
+ * @seid: VSI for the mac address
+ * @mv_list: list of macvlans to be removed
+ * @count: length of the list
+ * @cmd_details: pointer to command details structure or NULL
+ * @aq_status: pointer to Admin Queue status return value
+ *
+ * Remove MAC/VLAN addresses from the HW filtering.
+ * The _v2 version returns the last Admin Queue status in aq_status
+ * to avoid race conditions in access to hw->aq.asq_last_status.
+ * It also calls _v2 versions of asq_send_command functions to
+ * get the aq_status on the stack.
+ **/
+i40e_status
+i40e_aq_remove_macvlan_v2(struct i40e_hw *hw, u16 seid,
+                         struct i40e_aqc_remove_macvlan_element_data *mv_list,
+                         u16 count, struct i40e_asq_cmd_details *cmd_details,
+                         enum i40e_admin_queue_err *aq_status)
+{
+       struct i40e_aqc_macvlan *cmd;
+       struct i40e_aq_desc desc;
+       u16 buf_size;
+
+       if (count == 0 || !mv_list || !hw)
+               return I40E_ERR_PARAM;
+
+       buf_size = count * sizeof(*mv_list);
+
+       /* prep the rest of the request */
+       i40e_fill_default_direct_cmd_desc(&desc, i40e_aqc_opc_remove_macvlan);
+       cmd = (struct i40e_aqc_macvlan *)&desc.params.raw;
+       cmd->num_addresses = cpu_to_le16(count);
+       cmd->seid[0] = cpu_to_le16(I40E_AQC_MACVLAN_CMD_SEID_VALID | seid);
+       cmd->seid[1] = 0;
+       cmd->seid[2] = 0;
+
+       desc.flags |= cpu_to_le16((u16)(I40E_AQ_FLAG_BUF | I40E_AQ_FLAG_RD));
+       if (buf_size > I40E_AQ_LARGE_BUF)
+               desc.flags |= cpu_to_le16((u16)I40E_AQ_FLAG_LB);
+
+       return i40e_asq_send_command_atomic_v2(hw, &desc, mv_list, buf_size,
+                                              cmd_details, true, aq_status);
+}
+
+/**
  * i40e_mirrorrule_op - Internal helper function to add/delete mirror rule
  * @hw: pointer to the hw struct
  * @opcode: AQ opcode for add or delete mirror rule
@@ -3868,7 +3968,8 @@ i40e_status i40e_aq_delete_element(struct i40e_hw *hw, u16 seid,
 
        cmd->seid = cpu_to_le16(seid);
 
-       status = i40e_asq_send_command(hw, &desc, NULL, 0, cmd_details);
+       status = i40e_asq_send_command_atomic(hw, &desc, NULL, 0,
+                                             cmd_details, true);
 
        return status;
 }
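
The payoff of the _v2 variants shows at the call sites converted later in this patch: the Admin Queue status comes back through an out-parameter captured while asq_mutex is still held, instead of being read from hw->aq.asq_last_status after the unlock, where a concurrent command may already have overwritten it. A hedged usage sketch (driver context assumed; names as introduced above):

    /* Sketch: deleting filters with the race-free _v2 API. */
    enum i40e_admin_queue_err aq_status;
    i40e_status aq_ret;

    aq_ret = i40e_aq_remove_macvlan_v2(hw, vsi->seid, list, num_del, NULL,
                                       &aq_status);
    /* aq_status was captured under asq_mutex, so it really belongs to
     * this command, not to whichever command happened to run last.
     */
    if (aq_ret && aq_status != I40E_AQ_RC_ENOENT)
            dev_info(&pf->pdev->dev, "delete failed: aq_err %d\n", aq_status);
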
index 1e57cc8..90fff05 100644 (file)
@@ -275,9 +275,8 @@ static void i40e_dbg_dump_vsi_seid(struct i40e_pf *pf, int seid)
                         rx_ring->rx_stats.alloc_page_failed,
                         rx_ring->rx_stats.alloc_buff_failed);
                dev_info(&pf->pdev->dev,
-                        "    rx_rings[%i]: rx_stats: realloc_count = %lld, page_reuse_count = %lld\n",
+                        "    rx_rings[%i]: rx_stats: realloc_count = 0, page_reuse_count = %lld\n",
                         i,
-                        rx_ring->rx_stats.realloc_count,
                         rx_ring->rx_stats.page_reuse_count);
                dev_info(&pf->pdev->dev,
                         "    rx_rings[%i]: size = %i\n",
index 091f36a..e484996 100644 (file)
@@ -295,6 +295,10 @@ static const struct i40e_stats i40e_gstrings_misc_stats[] = {
        I40E_VSI_STAT("tx_busy", tx_busy),
        I40E_VSI_STAT("rx_alloc_fail", rx_buf_failed),
        I40E_VSI_STAT("rx_pg_alloc_fail", rx_page_failed),
+       I40E_VSI_STAT("rx_cache_reuse", rx_page_reuse),
+       I40E_VSI_STAT("rx_cache_alloc", rx_page_alloc),
+       I40E_VSI_STAT("rx_cache_waive", rx_page_waive),
+       I40E_VSI_STAT("rx_cache_busy", rx_page_busy),
 };
 
 /* These PF_STATs might look like duplicates of some NETDEV_STATs,
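
Note the deliberate renaming between layers: the VSI fields added earlier are rx_page_{reuse,alloc,waive,busy}, while ethtool exposes them as rx_cache_*, describing the page-recycling cache from the user's point of view. Each I40E_VSI_STAT() entry just pairs a display string with a field offset, roughly as below (the real macro goes through I40E_STAT(), which also records the field size):

    /* Sketch of what an I40E_VSI_STAT() entry boils down to. */
    { .stat_string = "rx_cache_reuse",
      .stat_offset = offsetof(struct i40e_vsi, rx_page_reuse) }
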
index 0c4b7df..9b7ce6d 100644 (file)
@@ -773,6 +773,7 @@ void i40e_update_veb_stats(struct i40e_veb *veb)
  **/
 static void i40e_update_vsi_stats(struct i40e_vsi *vsi)
 {
+       u64 rx_page, rx_buf, rx_reuse, rx_alloc, rx_waive, rx_busy;
        struct i40e_pf *pf = vsi->back;
        struct rtnl_link_stats64 *ons;
        struct rtnl_link_stats64 *ns;   /* netdev stats */
@@ -780,7 +781,6 @@ static void i40e_update_vsi_stats(struct i40e_vsi *vsi)
        struct i40e_eth_stats *es;     /* device's eth stats */
        u64 tx_restart, tx_busy;
        struct i40e_ring *p;
-       u64 rx_page, rx_buf;
        u64 bytes, packets;
        unsigned int start;
        u64 tx_linearize;
@@ -806,6 +806,10 @@ static void i40e_update_vsi_stats(struct i40e_vsi *vsi)
        tx_restart = tx_busy = tx_linearize = tx_force_wb = 0;
        rx_page = 0;
        rx_buf = 0;
+       rx_reuse = 0;
+       rx_alloc = 0;
+       rx_waive = 0;
+       rx_busy = 0;
        rcu_read_lock();
        for (q = 0; q < vsi->num_queue_pairs; q++) {
                /* locate Tx ring */
@@ -839,6 +843,10 @@ static void i40e_update_vsi_stats(struct i40e_vsi *vsi)
                rx_p += packets;
                rx_buf += p->rx_stats.alloc_buff_failed;
                rx_page += p->rx_stats.alloc_page_failed;
+               rx_reuse += p->rx_stats.page_reuse_count;
+               rx_alloc += p->rx_stats.page_alloc_count;
+               rx_waive += p->rx_stats.page_waive_count;
+               rx_busy += p->rx_stats.page_busy_count;
 
                if (i40e_enabled_xdp_vsi(vsi)) {
                        /* locate XDP ring */
@@ -866,6 +874,10 @@ static void i40e_update_vsi_stats(struct i40e_vsi *vsi)
        vsi->tx_force_wb = tx_force_wb;
        vsi->rx_page_failed = rx_page;
        vsi->rx_buf_failed = rx_buf;
+       vsi->rx_page_reuse = rx_reuse;
+       vsi->rx_page_alloc = rx_alloc;
+       vsi->rx_page_waive = rx_waive;
+       vsi->rx_page_busy = rx_busy;
 
        ns->rx_packets = rx_p;
        ns->rx_bytes = rx_b;
@@ -2143,19 +2155,19 @@ void i40e_aqc_del_filters(struct i40e_vsi *vsi, const char *vsi_name,
                          int num_del, int *retval)
 {
        struct i40e_hw *hw = &vsi->back->hw;
+       enum i40e_admin_queue_err aq_status;
        i40e_status aq_ret;
-       int aq_err;
 
-       aq_ret = i40e_aq_remove_macvlan(hw, vsi->seid, list, num_del, NULL);
-       aq_err = hw->aq.asq_last_status;
+       aq_ret = i40e_aq_remove_macvlan_v2(hw, vsi->seid, list, num_del, NULL,
+                                          &aq_status);
 
        /* Explicitly ignore and do not report when firmware returns ENOENT */
-       if (aq_ret && !(aq_err == I40E_AQ_RC_ENOENT)) {
+       if (aq_ret && !(aq_status == I40E_AQ_RC_ENOENT)) {
                *retval = -EIO;
                dev_info(&vsi->back->pdev->dev,
                         "ignoring delete macvlan error on %s, err %s, aq_err %s\n",
                         vsi_name, i40e_stat_str(hw, aq_ret),
-                        i40e_aq_str(hw, aq_err));
+                        i40e_aq_str(hw, aq_status));
        }
 }
 
@@ -2178,10 +2190,10 @@ void i40e_aqc_add_filters(struct i40e_vsi *vsi, const char *vsi_name,
                          int num_add)
 {
        struct i40e_hw *hw = &vsi->back->hw;
-       int aq_err, fcnt;
+       enum i40e_admin_queue_err aq_status;
+       int fcnt;
 
-       i40e_aq_add_macvlan(hw, vsi->seid, list, num_add, NULL);
-       aq_err = hw->aq.asq_last_status;
+       i40e_aq_add_macvlan_v2(hw, vsi->seid, list, num_add, NULL, &aq_status);
        fcnt = i40e_update_filter_state(num_add, list, add_head);
 
        if (fcnt != num_add) {
@@ -2189,17 +2201,19 @@ void i40e_aqc_add_filters(struct i40e_vsi *vsi, const char *vsi_name,
                        set_bit(__I40E_VSI_OVERFLOW_PROMISC, vsi->state);
                        dev_warn(&vsi->back->pdev->dev,
                                 "Error %s adding RX filters on %s, promiscuous mode forced on\n",
-                                i40e_aq_str(hw, aq_err), vsi_name);
+                                i40e_aq_str(hw, aq_status), vsi_name);
                } else if (vsi->type == I40E_VSI_SRIOV ||
                           vsi->type == I40E_VSI_VMDQ1 ||
                           vsi->type == I40E_VSI_VMDQ2) {
                        dev_warn(&vsi->back->pdev->dev,
                                 "Error %s adding RX filters on %s, please set promiscuous on manually for %s\n",
-                                i40e_aq_str(hw, aq_err), vsi_name, vsi_name);
+                                i40e_aq_str(hw, aq_status), vsi_name,
+                                vsi_name);
                } else {
                        dev_warn(&vsi->back->pdev->dev,
                                 "Error %s adding RX filters on %s, incorrect VSI type: %i.\n",
-                                i40e_aq_str(hw, aq_err), vsi_name, vsi->type);
+                                i40e_aq_str(hw, aq_status), vsi_name,
+                                vsi->type);
                }
        }
 }
@@ -12722,7 +12736,8 @@ static int i40e_set_features(struct net_device *netdev,
        else
                i40e_vlan_stripping_disable(vsi);
 
-       if (!(features & NETIF_F_HW_TC) && pf->num_cloud_filters) {
+       if (!(features & NETIF_F_HW_TC) &&
+           (netdev->features & NETIF_F_HW_TC) && pf->num_cloud_filters) {
                dev_err(&pf->pdev->dev,
                        "Offloaded tc filters active, can't turn hw_tc_offload off");
                return -EINVAL;
@@ -13478,6 +13493,8 @@ static int i40e_config_netdev(struct i40e_vsi *vsi)
        netdev->features |= hw_features | NETIF_F_HW_VLAN_CTAG_FILTER;
        netdev->hw_enc_features |= NETIF_F_TSO_MANGLEID;
 
+       netdev->features &= ~NETIF_F_HW_TC;
+
        if (vsi->type == I40E_VSI_MAIN) {
                SET_NETDEV_DEV(netdev, &pf->pdev->dev);
                ether_addr_copy(mac_addr, hw->mac.perm_addr);
@@ -15341,12 +15358,9 @@ static int i40e_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
        /* set up for high or low dma */
        err = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(64));
        if (err) {
-               err = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(32));
-               if (err) {
-                       dev_err(&pdev->dev,
-                               "DMA configuration failed: 0x%x\n", err);
-                       goto err_dma;
-               }
+               dev_err(&pdev->dev,
+                       "DMA configuration failed: 0x%x\n", err);
+               goto err_dma;
        }
 
        /* set up pci connections */
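
Two behavioural notes on the i40e_main.c hunks: the VSI stats loop folds the four new page counters into per-VSI totals inside the same RCU walk as the existing counters, and the NETIF_F_HW_TC check now only rejects the transition that actually disables the feature. Previously, any feature change made while cloud filters existed would fail even with hw-tc-offload already off; the added `netdev->features &= ~NETIF_F_HW_TC` also makes the flag default to off. A sketch of the corrected transition test, with names as in the hunk above:

    /* Sketch: only reject the transition that actually turns the
     * feature off; "features" is the requested set, netdev->features
     * the currently active one.
     */
    bool turning_off = !(features & NETIF_F_HW_TC) &&
                       (netdev->features & NETIF_F_HW_TC);

    if (turning_off && pf->num_cloud_filters) {
            dev_err(&pf->pdev->dev,
                    "Offloaded tc filters active, can't turn hw_tc_offload off");
            return -EINVAL;
    }
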
index 9241b60..ebdcde6 100644 (file)
@@ -27,10 +27,25 @@ i40e_asq_send_command(struct i40e_hw *hw, struct i40e_aq_desc *desc,
                      void *buff, /* can be NULL */ u16  buff_size,
                      struct i40e_asq_cmd_details *cmd_details);
 i40e_status
+i40e_asq_send_command_v2(struct i40e_hw *hw,
+                        struct i40e_aq_desc *desc,
+                        void *buff, /* can be NULL */
+                        u16  buff_size,
+                        struct i40e_asq_cmd_details *cmd_details,
+                        enum i40e_admin_queue_err *aq_status);
+i40e_status
 i40e_asq_send_command_atomic(struct i40e_hw *hw, struct i40e_aq_desc *desc,
                             void *buff, /* can be NULL */ u16  buff_size,
                             struct i40e_asq_cmd_details *cmd_details,
                             bool is_atomic_context);
+i40e_status
+i40e_asq_send_command_atomic_v2(struct i40e_hw *hw,
+                               struct i40e_aq_desc *desc,
+                               void *buff, /* can be NULL */
+                               u16  buff_size,
+                               struct i40e_asq_cmd_details *cmd_details,
+                               bool is_atomic_context,
+                               enum i40e_admin_queue_err *aq_status);
 
 /* debug function for adminq */
 void i40e_debug_aq(struct i40e_hw *hw, enum i40e_debug_mask mask,
@@ -150,9 +165,19 @@ i40e_status i40e_aq_get_veb_parameters(struct i40e_hw *hw,
 i40e_status i40e_aq_add_macvlan(struct i40e_hw *hw, u16 vsi_id,
                        struct i40e_aqc_add_macvlan_element_data *mv_list,
                        u16 count, struct i40e_asq_cmd_details *cmd_details);
+i40e_status
+i40e_aq_add_macvlan_v2(struct i40e_hw *hw, u16 seid,
+                      struct i40e_aqc_add_macvlan_element_data *mv_list,
+                      u16 count, struct i40e_asq_cmd_details *cmd_details,
+                      enum i40e_admin_queue_err *aq_status);
 i40e_status i40e_aq_remove_macvlan(struct i40e_hw *hw, u16 vsi_id,
                        struct i40e_aqc_remove_macvlan_element_data *mv_list,
                        u16 count, struct i40e_asq_cmd_details *cmd_details);
+i40e_status
+i40e_aq_remove_macvlan_v2(struct i40e_hw *hw, u16 seid,
+                         struct i40e_aqc_remove_macvlan_element_data *mv_list,
+                         u16 count, struct i40e_asq_cmd_details *cmd_details,
+                         enum i40e_admin_queue_err *aq_status);
 i40e_status i40e_aq_add_mirrorrule(struct i40e_hw *hw, u16 sw_seid,
                        u16 rule_type, u16 dest_vsi, u16 count, __le16 *mr_list,
                        struct i40e_asq_cmd_details *cmd_details,
index 66cc795..0eae585 100644 (file)
@@ -830,8 +830,6 @@ void i40e_free_tx_resources(struct i40e_ring *tx_ring)
        i40e_clean_tx_ring(tx_ring);
        kfree(tx_ring->tx_bi);
        tx_ring->tx_bi = NULL;
-       kfree(tx_ring->xsk_descs);
-       tx_ring->xsk_descs = NULL;
 
        if (tx_ring->desc) {
                dma_free_coherent(tx_ring->dev, tx_ring->size,
@@ -1382,8 +1380,6 @@ static void i40e_reuse_rx_page(struct i40e_ring *rx_ring,
        new_buff->page_offset   = old_buff->page_offset;
        new_buff->pagecnt_bias  = old_buff->pagecnt_bias;
 
-       rx_ring->rx_stats.page_reuse_count++;
-
        /* clear contents of buffer_info */
        old_buff->page = NULL;
 }
@@ -1433,13 +1429,6 @@ int i40e_setup_tx_descriptors(struct i40e_ring *tx_ring)
        if (!tx_ring->tx_bi)
                goto err;
 
-       if (ring_is_xdp(tx_ring)) {
-               tx_ring->xsk_descs = kcalloc(I40E_MAX_NUM_DESCRIPTORS, sizeof(*tx_ring->xsk_descs),
-                                            GFP_KERNEL);
-               if (!tx_ring->xsk_descs)
-                       goto err;
-       }
-
        u64_stats_init(&tx_ring->syncp);
 
        /* round up to nearest 4K */
@@ -1463,8 +1452,6 @@ int i40e_setup_tx_descriptors(struct i40e_ring *tx_ring)
        return 0;
 
 err:
-       kfree(tx_ring->xsk_descs);
-       tx_ring->xsk_descs = NULL;
        kfree(tx_ring->tx_bi);
        tx_ring->tx_bi = NULL;
        return -ENOMEM;
@@ -1675,6 +1662,8 @@ static bool i40e_alloc_mapped_page(struct i40e_ring *rx_ring,
                return false;
        }
 
+       rx_ring->rx_stats.page_alloc_count++;
+
        /* map page for use */
        dma = dma_map_page_attrs(rx_ring->dev, page, 0,
                                 i40e_rx_pg_size(rx_ring),
@@ -1982,32 +1971,43 @@ static bool i40e_cleanup_headers(struct i40e_ring *rx_ring, struct sk_buff *skb,
 /**
  * i40e_can_reuse_rx_page - Determine if page can be reused for another Rx
  * @rx_buffer: buffer containing the page
+ * @rx_stats: rx stats structure for the rx ring
  * @rx_buffer_pgcnt: buffer page refcount pre xdp_do_redirect() call
  *
  * If page is reusable, we have a green light for calling i40e_reuse_rx_page,
  * which will assign the current buffer to the buffer that next_to_alloc is
  * pointing to; otherwise, the DMA mapping needs to be destroyed and
- * page freed
+ * page freed.
+ *
+ * rx_stats will be updated to indicate whether the page was waived
+ * or busy if it could not be reused.
  */
 static bool i40e_can_reuse_rx_page(struct i40e_rx_buffer *rx_buffer,
+                                  struct i40e_rx_queue_stats *rx_stats,
                                   int rx_buffer_pgcnt)
 {
        unsigned int pagecnt_bias = rx_buffer->pagecnt_bias;
        struct page *page = rx_buffer->page;
 
        /* Is any reuse possible? */
-       if (!dev_page_is_reusable(page))
+       if (!dev_page_is_reusable(page)) {
+               rx_stats->page_waive_count++;
                return false;
+       }
 
 #if (PAGE_SIZE < 8192)
        /* if we are only owner of page we can reuse it */
-       if (unlikely((rx_buffer_pgcnt - pagecnt_bias) > 1))
+       if (unlikely((rx_buffer_pgcnt - pagecnt_bias) > 1)) {
+               rx_stats->page_busy_count++;
                return false;
+       }
 #else
 #define I40E_LAST_OFFSET \
        (SKB_WITH_OVERHEAD(PAGE_SIZE) - I40E_RXBUFFER_2048)
-       if (rx_buffer->page_offset > I40E_LAST_OFFSET)
+       if (rx_buffer->page_offset > I40E_LAST_OFFSET) {
+               rx_stats->page_busy_count++;
                return false;
+       }
 #endif
 
        /* If we have drained the page fragment pool we need to update
@@ -2237,7 +2237,7 @@ static void i40e_put_rx_buffer(struct i40e_ring *rx_ring,
                               struct i40e_rx_buffer *rx_buffer,
                               int rx_buffer_pgcnt)
 {
-       if (i40e_can_reuse_rx_page(rx_buffer, rx_buffer_pgcnt)) {
+       if (i40e_can_reuse_rx_page(rx_buffer, &rx_ring->rx_stats, rx_buffer_pgcnt)) {
                /* hand second half of page back to the ring */
                i40e_reuse_rx_page(rx_ring, rx_buffer);
        } else {
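
With this hunk every page that cannot be recycled is attributed a reason: waived when the page itself is unusable for the ring (dev_page_is_reusable() fails, e.g. a remote-NUMA or pfmemalloc page) and busy when refcount or offset accounting shows another owner; successful allocations are counted in i40e_alloc_mapped_page(). A standalone sketch of the classification, with illustrative types:

    /* Sketch of the accounting split introduced above: every "no
     * reuse" decision is attributed to one reason.
     */
    enum reuse_verdict { REUSE, WAIVED, BUSY };

    static enum reuse_verdict classify_page(bool reusable, int pgcnt, int bias)
    {
            if (!reusable)
                    return WAIVED;  /* page itself unusable: waived */
            if (pgcnt - bias > 1)
                    return BUSY;    /* another owner still holds it */
            return REUSE;
    }
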
index bfc2845..c471c2d 100644 (file)
@@ -298,7 +298,9 @@ struct i40e_rx_queue_stats {
        u64 alloc_page_failed;
        u64 alloc_buff_failed;
        u64 page_reuse_count;
-       u64 realloc_count;
+       u64 page_alloc_count;
+       u64 page_waive_count;
+       u64 page_busy_count;
 };
 
 enum i40e_ring_state_t {
@@ -390,7 +392,6 @@ struct i40e_ring {
        u16 rx_offset;
        struct xdp_rxq_info xdp_rxq;
        struct xsk_buff_pool *xsk_pool;
-       struct xdp_desc *xsk_descs;      /* For storing descriptors in the AF_XDP ZC path */
 } ____cacheline_internodealigned_in_smp;
 
 static inline bool ring_uses_build_skb(struct i40e_ring *ring)
index 945b1bb..5a997b0 100644 (file)
@@ -241,21 +241,25 @@ bool i40e_alloc_rx_buffers_zc(struct i40e_ring *rx_ring, u16 count)
 static struct sk_buff *i40e_construct_skb_zc(struct i40e_ring *rx_ring,
                                             struct xdp_buff *xdp)
 {
+       unsigned int totalsize = xdp->data_end - xdp->data_meta;
        unsigned int metasize = xdp->data - xdp->data_meta;
-       unsigned int datasize = xdp->data_end - xdp->data;
        struct sk_buff *skb;
 
+       net_prefetch(xdp->data_meta);
+
        /* allocate a skb to store the frags */
-       skb = __napi_alloc_skb(&rx_ring->q_vector->napi,
-                              xdp->data_end - xdp->data_hard_start,
+       skb = __napi_alloc_skb(&rx_ring->q_vector->napi, totalsize,
                               GFP_ATOMIC | __GFP_NOWARN);
        if (unlikely(!skb))
                goto out;
 
-       skb_reserve(skb, xdp->data - xdp->data_hard_start);
-       memcpy(__skb_put(skb, datasize), xdp->data, datasize);
-       if (metasize)
+       memcpy(__skb_put(skb, totalsize), xdp->data_meta,
+              ALIGN(totalsize, sizeof(long)));
+
+       if (metasize) {
                skb_metadata_set(skb, metasize);
+               __skb_pull(skb, metasize);
+       }
 
 out:
        xsk_buff_free(xdp);
@@ -467,11 +471,11 @@ static void i40e_set_rs_bit(struct i40e_ring *xdp_ring)
  **/
 static bool i40e_xmit_zc(struct i40e_ring *xdp_ring, unsigned int budget)
 {
-       struct xdp_desc *descs = xdp_ring->xsk_descs;
+       struct xdp_desc *descs = xdp_ring->xsk_pool->tx_descs;
        u32 nb_pkts, nb_processed = 0;
        unsigned int total_bytes = 0;
 
-       nb_pkts = xsk_tx_peek_release_desc_batch(xdp_ring->xsk_pool, descs, budget);
+       nb_pkts = xsk_tx_peek_release_desc_batch(xdp_ring->xsk_pool, budget);
        if (!nb_pkts)
                return true;
 
index 8125b91..b0bd95c 100644 (file)
@@ -4368,12 +4368,9 @@ static int iavf_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 
        err = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(64));
        if (err) {
-               err = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(32));
-               if (err) {
-                       dev_err(&pdev->dev,
-                               "DMA configuration failed: 0x%x\n", err);
-                       goto err_dma;
-               }
+               dev_err(&pdev->dev,
+                       "DMA configuration failed: 0x%x\n", err);
+               goto err_dma;
        }
 
        err = pci_request_regions(pdev, iavf_driver_name);
index c36faa7..389fff7 100644 (file)
@@ -18,8 +18,12 @@ ice-y := ice_main.o  \
         ice_txrx_lib.o \
         ice_txrx.o     \
         ice_fltr.o     \
+        ice_pf_vsi_vlan_ops.o \
+        ice_vsi_vlan_ops.o \
+        ice_vsi_vlan_lib.o \
         ice_fdir.o     \
         ice_ethtool_fdir.o \
+        ice_vlan_mode.o \
         ice_flex_pipe.o \
         ice_flow.o     \
         ice_idc.o      \
@@ -29,8 +33,12 @@ ice-y := ice_main.o  \
         ice_ethtool.o  \
         ice_repr.o     \
         ice_tc_lib.o
-ice-$(CONFIG_PCI_IOV) += ice_virtchnl_allowlist.o
-ice-$(CONFIG_PCI_IOV) += ice_virtchnl_pf.o ice_sriov.o ice_virtchnl_fdir.o
+ice-$(CONFIG_PCI_IOV) +=       \
+       ice_virtchnl_allowlist.o \
+       ice_virtchnl_fdir.o     \
+       ice_sriov.o             \
+       ice_vf_vsi_vlan_ops.o   \
+       ice_virtchnl_pf.o
 ice-$(CONFIG_PTP_1588_CLOCK) += ice_ptp.o ice_ptp_hw.o
 ice-$(CONFIG_DCB) += ice_dcb.o ice_dcb_nl.o ice_dcb_lib.o
 ice-$(CONFIG_RFS_ACCEL) += ice_arfs.o
index a9fa701..827fcb5 100644 (file)
@@ -72,6 +72,7 @@
 #include "ice_repr.h"
 #include "ice_eswitch.h"
 #include "ice_lag.h"
+#include "ice_vsi_vlan_ops.h"
 
 #define ICE_BAR0               0
 #define ICE_REQ_DESC_MULTIPLE  32
@@ -368,6 +369,8 @@ struct ice_vsi {
        u8 irqs_ready:1;
        u8 current_isup:1;               /* Sync 'link up' logging */
        u8 stat_offsets_loaded:1;
+       struct ice_vsi_vlan_ops inner_vlan_ops;
+       struct ice_vsi_vlan_ops outer_vlan_ops;
        u16 num_vlan;
 
        /* queue information */
@@ -482,6 +485,7 @@ enum ice_pf_flags {
        ICE_FLAG_LEGACY_RX,
        ICE_FLAG_VF_TRUE_PROMISC_ENA,
        ICE_FLAG_MDD_AUTO_RESET_VF,
+       ICE_FLAG_VF_VLAN_PRUNING,
        ICE_FLAG_LINK_LENIENT_MODE_ENA,
        ICE_FLAG_PLUG_AUX_DEV,
        ICE_PF_FLAGS_NBITS              /* must be last */
index ad1dcfa..fd8ee5b 100644 (file)
@@ -226,6 +226,15 @@ struct ice_aqc_get_sw_cfg_resp_elem {
 #define ICE_AQC_GET_SW_CONF_RESP_IS_VF         BIT(15)
 };
 
+/* Set Port parameters, (direct, 0x0203) */
+struct ice_aqc_set_port_params {
+       __le16 cmd_flags;
+#define ICE_AQC_SET_P_PARAMS_DOUBLE_VLAN_ENA   BIT(2)
+       __le16 bad_frame_vsi;
+       __le16 swid;
+       u8 reserved[10];
+};
+
 /* These resource type defines are used for all switch resource
  * commands where a resource type is required, such as:
  * Get Resource Allocation command (indirect 0x0204)
@@ -283,6 +292,40 @@ struct ice_aqc_alloc_free_res_elem {
        struct ice_aqc_res_elem elem[];
 };
 
+/* Request buffer for Set VLAN Mode AQ command (indirect 0x020C) */
+struct ice_aqc_set_vlan_mode {
+       u8 reserved;
+       u8 l2tag_prio_tagging;
+#define ICE_AQ_VLAN_PRIO_TAG_S                 0
+#define ICE_AQ_VLAN_PRIO_TAG_M                 (0x7 << ICE_AQ_VLAN_PRIO_TAG_S)
+#define ICE_AQ_VLAN_PRIO_TAG_NOT_SUPPORTED     0x0
+#define ICE_AQ_VLAN_PRIO_TAG_STAG              0x1
+#define ICE_AQ_VLAN_PRIO_TAG_OUTER_CTAG                0x2
+#define ICE_AQ_VLAN_PRIO_TAG_OUTER_VLAN                0x3
+#define ICE_AQ_VLAN_PRIO_TAG_INNER_CTAG                0x4
+#define ICE_AQ_VLAN_PRIO_TAG_MAX               0x4
+#define ICE_AQ_VLAN_PRIO_TAG_ERROR             0x7
+       u8 l2tag_reserved[64];
+       u8 rdma_packet;
+#define ICE_AQ_VLAN_RDMA_TAG_S                 0
+#define ICE_AQ_VLAN_RDMA_TAG_M                 (0x3F << ICE_AQ_VLAN_RDMA_TAG_S)
+#define ICE_AQ_SVM_VLAN_RDMA_PKT_FLAG_SETTING  0x10
+#define ICE_AQ_DVM_VLAN_RDMA_PKT_FLAG_SETTING  0x1A
+       u8 rdma_reserved[2];
+       u8 mng_vlan_prot_id;
+#define ICE_AQ_VLAN_MNG_PROTOCOL_ID_OUTER      0x10
+#define ICE_AQ_VLAN_MNG_PROTOCOL_ID_INNER      0x11
+       u8 prot_id_reserved[30];
+};
+
+/* Response buffer for Get VLAN Mode AQ command (indirect 0x020D) */
+struct ice_aqc_get_vlan_mode {
+       u8 vlan_mode;
+#define ICE_AQ_VLAN_MODE_DVM_ENA       BIT(0)
+       u8 l2tag_prio_tagging;
+       u8 reserved[98];
+};
+
 /* Add VSI (indirect 0x0210)
  * Update VSI (indirect 0x0211)
  * Get VSI (indirect 0x0212)
@@ -343,108 +386,113 @@ struct ice_aqc_vsi_props {
 #define ICE_AQ_VSI_SW_FLAG_SRC_PRUNE           BIT(7)
        u8 sw_flags2;
 #define ICE_AQ_VSI_SW_FLAG_RX_PRUNE_EN_S       0
-#define ICE_AQ_VSI_SW_FLAG_RX_PRUNE_EN_M       \
-                               (0xF << ICE_AQ_VSI_SW_FLAG_RX_PRUNE_EN_S)
+#define ICE_AQ_VSI_SW_FLAG_RX_PRUNE_EN_M       (0xF << ICE_AQ_VSI_SW_FLAG_RX_PRUNE_EN_S)
 #define ICE_AQ_VSI_SW_FLAG_RX_VLAN_PRUNE_ENA   BIT(0)
 #define ICE_AQ_VSI_SW_FLAG_LAN_ENA             BIT(4)
        u8 veb_stat_id;
 #define ICE_AQ_VSI_SW_VEB_STAT_ID_S            0
-#define ICE_AQ_VSI_SW_VEB_STAT_ID_M    (0x1F << ICE_AQ_VSI_SW_VEB_STAT_ID_S)
+#define ICE_AQ_VSI_SW_VEB_STAT_ID_M            (0x1F << ICE_AQ_VSI_SW_VEB_STAT_ID_S)
 #define ICE_AQ_VSI_SW_VEB_STAT_ID_VALID                BIT(5)
        /* security section */
        u8 sec_flags;
 #define ICE_AQ_VSI_SEC_FLAG_ALLOW_DEST_OVRD    BIT(0)
 #define ICE_AQ_VSI_SEC_FLAG_ENA_MAC_ANTI_SPOOF BIT(2)
-#define ICE_AQ_VSI_SEC_TX_PRUNE_ENA_S  4
-#define ICE_AQ_VSI_SEC_TX_PRUNE_ENA_M  (0xF << ICE_AQ_VSI_SEC_TX_PRUNE_ENA_S)
+#define ICE_AQ_VSI_SEC_TX_PRUNE_ENA_S          4
+#define ICE_AQ_VSI_SEC_TX_PRUNE_ENA_M          (0xF << ICE_AQ_VSI_SEC_TX_PRUNE_ENA_S)
 #define ICE_AQ_VSI_SEC_TX_VLAN_PRUNE_ENA       BIT(0)
        u8 sec_reserved;
        /* VLAN section */
-       __le16 pvid; /* VLANS include priority bits */
-       u8 pvlan_reserved[2];
-       u8 vlan_flags;
-#define ICE_AQ_VSI_VLAN_MODE_S 0
-#define ICE_AQ_VSI_VLAN_MODE_M (0x3 << ICE_AQ_VSI_VLAN_MODE_S)
-#define ICE_AQ_VSI_VLAN_MODE_UNTAGGED  0x1
-#define ICE_AQ_VSI_VLAN_MODE_TAGGED    0x2
-#define ICE_AQ_VSI_VLAN_MODE_ALL       0x3
-#define ICE_AQ_VSI_PVLAN_INSERT_PVID   BIT(2)
-#define ICE_AQ_VSI_VLAN_EMOD_S         3
-#define ICE_AQ_VSI_VLAN_EMOD_M         (0x3 << ICE_AQ_VSI_VLAN_EMOD_S)
-#define ICE_AQ_VSI_VLAN_EMOD_STR_BOTH  (0x0 << ICE_AQ_VSI_VLAN_EMOD_S)
-#define ICE_AQ_VSI_VLAN_EMOD_STR_UP    (0x1 << ICE_AQ_VSI_VLAN_EMOD_S)
-#define ICE_AQ_VSI_VLAN_EMOD_STR       (0x2 << ICE_AQ_VSI_VLAN_EMOD_S)
-#define ICE_AQ_VSI_VLAN_EMOD_NOTHING   (0x3 << ICE_AQ_VSI_VLAN_EMOD_S)
-       u8 pvlan_reserved2[3];
+       __le16 port_based_inner_vlan; /* VLANS include priority bits */
+       u8 inner_vlan_reserved[2];
+       u8 inner_vlan_flags;
+#define ICE_AQ_VSI_INNER_VLAN_TX_MODE_S                0
+#define ICE_AQ_VSI_INNER_VLAN_TX_MODE_M                (0x3 << ICE_AQ_VSI_INNER_VLAN_TX_MODE_S)
+#define ICE_AQ_VSI_INNER_VLAN_TX_MODE_ACCEPTUNTAGGED   0x1
+#define ICE_AQ_VSI_INNER_VLAN_TX_MODE_ACCEPTTAGGED     0x2
+#define ICE_AQ_VSI_INNER_VLAN_TX_MODE_ALL      0x3
+#define ICE_AQ_VSI_INNER_VLAN_INSERT_PVID      BIT(2)
+#define ICE_AQ_VSI_INNER_VLAN_EMODE_S          3
+#define ICE_AQ_VSI_INNER_VLAN_EMODE_M          (0x3 << ICE_AQ_VSI_INNER_VLAN_EMODE_S)
+#define ICE_AQ_VSI_INNER_VLAN_EMODE_STR_BOTH   (0x0 << ICE_AQ_VSI_INNER_VLAN_EMODE_S)
+#define ICE_AQ_VSI_INNER_VLAN_EMODE_STR_UP     (0x1 << ICE_AQ_VSI_INNER_VLAN_EMODE_S)
+#define ICE_AQ_VSI_INNER_VLAN_EMODE_STR                (0x2 << ICE_AQ_VSI_INNER_VLAN_EMODE_S)
+#define ICE_AQ_VSI_INNER_VLAN_EMODE_NOTHING    (0x3 << ICE_AQ_VSI_INNER_VLAN_EMODE_S)
+       u8 inner_vlan_reserved2[3];
        /* ingress egress up sections */
        __le32 ingress_table; /* bitmap, 3 bits per up */
-#define ICE_AQ_VSI_UP_TABLE_UP0_S      0
-#define ICE_AQ_VSI_UP_TABLE_UP0_M      (0x7 << ICE_AQ_VSI_UP_TABLE_UP0_S)
-#define ICE_AQ_VSI_UP_TABLE_UP1_S      3
-#define ICE_AQ_VSI_UP_TABLE_UP1_M      (0x7 << ICE_AQ_VSI_UP_TABLE_UP1_S)
-#define ICE_AQ_VSI_UP_TABLE_UP2_S      6
-#define ICE_AQ_VSI_UP_TABLE_UP2_M      (0x7 << ICE_AQ_VSI_UP_TABLE_UP2_S)
-#define ICE_AQ_VSI_UP_TABLE_UP3_S      9
-#define ICE_AQ_VSI_UP_TABLE_UP3_M      (0x7 << ICE_AQ_VSI_UP_TABLE_UP3_S)
-#define ICE_AQ_VSI_UP_TABLE_UP4_S      12
-#define ICE_AQ_VSI_UP_TABLE_UP4_M      (0x7 << ICE_AQ_VSI_UP_TABLE_UP4_S)
-#define ICE_AQ_VSI_UP_TABLE_UP5_S      15
-#define ICE_AQ_VSI_UP_TABLE_UP5_M      (0x7 << ICE_AQ_VSI_UP_TABLE_UP5_S)
-#define ICE_AQ_VSI_UP_TABLE_UP6_S      18
-#define ICE_AQ_VSI_UP_TABLE_UP6_M      (0x7 << ICE_AQ_VSI_UP_TABLE_UP6_S)
-#define ICE_AQ_VSI_UP_TABLE_UP7_S      21
-#define ICE_AQ_VSI_UP_TABLE_UP7_M      (0x7 << ICE_AQ_VSI_UP_TABLE_UP7_S)
+#define ICE_AQ_VSI_UP_TABLE_UP0_S              0
+#define ICE_AQ_VSI_UP_TABLE_UP0_M              (0x7 << ICE_AQ_VSI_UP_TABLE_UP0_S)
+#define ICE_AQ_VSI_UP_TABLE_UP1_S              3
+#define ICE_AQ_VSI_UP_TABLE_UP1_M              (0x7 << ICE_AQ_VSI_UP_TABLE_UP1_S)
+#define ICE_AQ_VSI_UP_TABLE_UP2_S              6
+#define ICE_AQ_VSI_UP_TABLE_UP2_M              (0x7 << ICE_AQ_VSI_UP_TABLE_UP2_S)
+#define ICE_AQ_VSI_UP_TABLE_UP3_S              9
+#define ICE_AQ_VSI_UP_TABLE_UP3_M              (0x7 << ICE_AQ_VSI_UP_TABLE_UP3_S)
+#define ICE_AQ_VSI_UP_TABLE_UP4_S              12
+#define ICE_AQ_VSI_UP_TABLE_UP4_M              (0x7 << ICE_AQ_VSI_UP_TABLE_UP4_S)
+#define ICE_AQ_VSI_UP_TABLE_UP5_S              15
+#define ICE_AQ_VSI_UP_TABLE_UP5_M              (0x7 << ICE_AQ_VSI_UP_TABLE_UP5_S)
+#define ICE_AQ_VSI_UP_TABLE_UP6_S              18
+#define ICE_AQ_VSI_UP_TABLE_UP6_M              (0x7 << ICE_AQ_VSI_UP_TABLE_UP6_S)
+#define ICE_AQ_VSI_UP_TABLE_UP7_S              21
+#define ICE_AQ_VSI_UP_TABLE_UP7_M              (0x7 << ICE_AQ_VSI_UP_TABLE_UP7_S)
        __le32 egress_table;   /* same defines as for ingress table */
        /* outer tags section */
-       __le16 outer_tag;
-       u8 outer_tag_flags;
-#define ICE_AQ_VSI_OUTER_TAG_MODE_S    0
-#define ICE_AQ_VSI_OUTER_TAG_MODE_M    (0x3 << ICE_AQ_VSI_OUTER_TAG_MODE_S)
-#define ICE_AQ_VSI_OUTER_TAG_NOTHING   0x0
-#define ICE_AQ_VSI_OUTER_TAG_REMOVE    0x1
-#define ICE_AQ_VSI_OUTER_TAG_COPY      0x2
-#define ICE_AQ_VSI_OUTER_TAG_TYPE_S    2
-#define ICE_AQ_VSI_OUTER_TAG_TYPE_M    (0x3 << ICE_AQ_VSI_OUTER_TAG_TYPE_S)
-#define ICE_AQ_VSI_OUTER_TAG_NONE      0x0
-#define ICE_AQ_VSI_OUTER_TAG_STAG      0x1
-#define ICE_AQ_VSI_OUTER_TAG_VLAN_8100 0x2
-#define ICE_AQ_VSI_OUTER_TAG_VLAN_9100 0x3
-#define ICE_AQ_VSI_OUTER_TAG_INSERT    BIT(4)
-#define ICE_AQ_VSI_OUTER_TAG_ACCEPT_HOST BIT(6)
-       u8 outer_tag_reserved;
+       __le16 port_based_outer_vlan;
+       u8 outer_vlan_flags;
+#define ICE_AQ_VSI_OUTER_VLAN_EMODE_S          0
+#define ICE_AQ_VSI_OUTER_VLAN_EMODE_M          (0x3 << ICE_AQ_VSI_OUTER_VLAN_EMODE_S)
+#define ICE_AQ_VSI_OUTER_VLAN_EMODE_SHOW_BOTH  0x0
+#define ICE_AQ_VSI_OUTER_VLAN_EMODE_SHOW_UP    0x1
+#define ICE_AQ_VSI_OUTER_VLAN_EMODE_SHOW       0x2
+#define ICE_AQ_VSI_OUTER_VLAN_EMODE_NOTHING    0x3
+#define ICE_AQ_VSI_OUTER_TAG_TYPE_S            2
+#define ICE_AQ_VSI_OUTER_TAG_TYPE_M            (0x3 << ICE_AQ_VSI_OUTER_TAG_TYPE_S)
+#define ICE_AQ_VSI_OUTER_TAG_NONE              0x0
+#define ICE_AQ_VSI_OUTER_TAG_STAG              0x1
+#define ICE_AQ_VSI_OUTER_TAG_VLAN_8100         0x2
+#define ICE_AQ_VSI_OUTER_TAG_VLAN_9100         0x3
+#define ICE_AQ_VSI_OUTER_VLAN_PORT_BASED_INSERT                BIT(4)
+#define ICE_AQ_VSI_OUTER_VLAN_TX_MODE_S                        5
+#define ICE_AQ_VSI_OUTER_VLAN_TX_MODE_M                        (0x3 << ICE_AQ_VSI_OUTER_VLAN_TX_MODE_S)
+#define ICE_AQ_VSI_OUTER_VLAN_TX_MODE_ACCEPTUNTAGGED   0x1
+#define ICE_AQ_VSI_OUTER_VLAN_TX_MODE_ACCEPTTAGGED     0x2
+#define ICE_AQ_VSI_OUTER_VLAN_TX_MODE_ALL              0x3
+#define ICE_AQ_VSI_OUTER_VLAN_BLOCK_TX_DESC            BIT(7)
+       u8 outer_vlan_reserved;
        /* queue mapping section */
        __le16 mapping_flags;
-#define ICE_AQ_VSI_Q_MAP_CONTIG        0x0
-#define ICE_AQ_VSI_Q_MAP_NONCONTIG     BIT(0)
+#define ICE_AQ_VSI_Q_MAP_CONTIG                        0x0
+#define ICE_AQ_VSI_Q_MAP_NONCONTIG             BIT(0)
        __le16 q_mapping[16];
-#define ICE_AQ_VSI_Q_S         0
-#define ICE_AQ_VSI_Q_M         (0x7FF << ICE_AQ_VSI_Q_S)
+#define ICE_AQ_VSI_Q_S                         0
+#define ICE_AQ_VSI_Q_M                         (0x7FF << ICE_AQ_VSI_Q_S)
        __le16 tc_mapping[8];
-#define ICE_AQ_VSI_TC_Q_OFFSET_S       0
-#define ICE_AQ_VSI_TC_Q_OFFSET_M       (0x7FF << ICE_AQ_VSI_TC_Q_OFFSET_S)
-#define ICE_AQ_VSI_TC_Q_NUM_S          11
-#define ICE_AQ_VSI_TC_Q_NUM_M          (0xF << ICE_AQ_VSI_TC_Q_NUM_S)
+#define ICE_AQ_VSI_TC_Q_OFFSET_S               0
+#define ICE_AQ_VSI_TC_Q_OFFSET_M               (0x7FF << ICE_AQ_VSI_TC_Q_OFFSET_S)
+#define ICE_AQ_VSI_TC_Q_NUM_S                  11
+#define ICE_AQ_VSI_TC_Q_NUM_M                  (0xF << ICE_AQ_VSI_TC_Q_NUM_S)
        /* queueing option section */
        u8 q_opt_rss;
-#define ICE_AQ_VSI_Q_OPT_RSS_LUT_S     0
-#define ICE_AQ_VSI_Q_OPT_RSS_LUT_M     (0x3 << ICE_AQ_VSI_Q_OPT_RSS_LUT_S)
-#define ICE_AQ_VSI_Q_OPT_RSS_LUT_VSI   0x0
-#define ICE_AQ_VSI_Q_OPT_RSS_LUT_PF    0x2
-#define ICE_AQ_VSI_Q_OPT_RSS_LUT_GBL   0x3
-#define ICE_AQ_VSI_Q_OPT_RSS_GBL_LUT_S 2
-#define ICE_AQ_VSI_Q_OPT_RSS_GBL_LUT_M (0xF << ICE_AQ_VSI_Q_OPT_RSS_GBL_LUT_S)
-#define ICE_AQ_VSI_Q_OPT_RSS_HASH_S    6
-#define ICE_AQ_VSI_Q_OPT_RSS_HASH_M    (0x3 << ICE_AQ_VSI_Q_OPT_RSS_HASH_S)
-#define ICE_AQ_VSI_Q_OPT_RSS_TPLZ      (0x0 << ICE_AQ_VSI_Q_OPT_RSS_HASH_S)
-#define ICE_AQ_VSI_Q_OPT_RSS_SYM_TPLZ  (0x1 << ICE_AQ_VSI_Q_OPT_RSS_HASH_S)
-#define ICE_AQ_VSI_Q_OPT_RSS_XOR       (0x2 << ICE_AQ_VSI_Q_OPT_RSS_HASH_S)
-#define ICE_AQ_VSI_Q_OPT_RSS_JHASH     (0x3 << ICE_AQ_VSI_Q_OPT_RSS_HASH_S)
+#define ICE_AQ_VSI_Q_OPT_RSS_LUT_S             0
+#define ICE_AQ_VSI_Q_OPT_RSS_LUT_M             (0x3 << ICE_AQ_VSI_Q_OPT_RSS_LUT_S)
+#define ICE_AQ_VSI_Q_OPT_RSS_LUT_VSI           0x0
+#define ICE_AQ_VSI_Q_OPT_RSS_LUT_PF            0x2
+#define ICE_AQ_VSI_Q_OPT_RSS_LUT_GBL           0x3
+#define ICE_AQ_VSI_Q_OPT_RSS_GBL_LUT_S         2
+#define ICE_AQ_VSI_Q_OPT_RSS_GBL_LUT_M         (0xF << ICE_AQ_VSI_Q_OPT_RSS_GBL_LUT_S)
+#define ICE_AQ_VSI_Q_OPT_RSS_HASH_S            6
+#define ICE_AQ_VSI_Q_OPT_RSS_HASH_M            (0x3 << ICE_AQ_VSI_Q_OPT_RSS_HASH_S)
+#define ICE_AQ_VSI_Q_OPT_RSS_TPLZ              (0x0 << ICE_AQ_VSI_Q_OPT_RSS_HASH_S)
+#define ICE_AQ_VSI_Q_OPT_RSS_SYM_TPLZ          (0x1 << ICE_AQ_VSI_Q_OPT_RSS_HASH_S)
+#define ICE_AQ_VSI_Q_OPT_RSS_XOR               (0x2 << ICE_AQ_VSI_Q_OPT_RSS_HASH_S)
+#define ICE_AQ_VSI_Q_OPT_RSS_JHASH             (0x3 << ICE_AQ_VSI_Q_OPT_RSS_HASH_S)
        u8 q_opt_tc;
-#define ICE_AQ_VSI_Q_OPT_TC_OVR_S      0
-#define ICE_AQ_VSI_Q_OPT_TC_OVR_M      (0x1F << ICE_AQ_VSI_Q_OPT_TC_OVR_S)
-#define ICE_AQ_VSI_Q_OPT_PROF_TC_OVR   BIT(7)
+#define ICE_AQ_VSI_Q_OPT_TC_OVR_S              0
+#define ICE_AQ_VSI_Q_OPT_TC_OVR_M              (0x1F << ICE_AQ_VSI_Q_OPT_TC_OVR_S)
+#define ICE_AQ_VSI_Q_OPT_PROF_TC_OVR           BIT(7)
        u8 q_opt_flags;
-#define ICE_AQ_VSI_Q_OPT_PE_FLTR_EN    BIT(0)
+#define ICE_AQ_VSI_Q_OPT_PE_FLTR_EN            BIT(0)
        u8 q_opt_reserved[3];
        /* outer up section */
        __le32 outer_up_table; /* same structure and defines as ingress tbl */
@@ -452,27 +500,27 @@ struct ice_aqc_vsi_props {
        __le16 sect_10_reserved;
        /* flow director section */
        __le16 fd_options;
-#define ICE_AQ_VSI_FD_ENABLE           BIT(0)
-#define ICE_AQ_VSI_FD_TX_AUTO_ENABLE   BIT(1)
-#define ICE_AQ_VSI_FD_PROG_ENABLE      BIT(3)
+#define ICE_AQ_VSI_FD_ENABLE                   BIT(0)
+#define ICE_AQ_VSI_FD_TX_AUTO_ENABLE           BIT(1)
+#define ICE_AQ_VSI_FD_PROG_ENABLE              BIT(3)
        __le16 max_fd_fltr_dedicated;
        __le16 max_fd_fltr_shared;
        __le16 fd_def_q;
-#define ICE_AQ_VSI_FD_DEF_Q_S          0
-#define ICE_AQ_VSI_FD_DEF_Q_M          (0x7FF << ICE_AQ_VSI_FD_DEF_Q_S)
-#define ICE_AQ_VSI_FD_DEF_GRP_S        12
-#define ICE_AQ_VSI_FD_DEF_GRP_M        (0x7 << ICE_AQ_VSI_FD_DEF_GRP_S)
+#define ICE_AQ_VSI_FD_DEF_Q_S                  0
+#define ICE_AQ_VSI_FD_DEF_Q_M                  (0x7FF << ICE_AQ_VSI_FD_DEF_Q_S)
+#define ICE_AQ_VSI_FD_DEF_GRP_S                        12
+#define ICE_AQ_VSI_FD_DEF_GRP_M                        (0x7 << ICE_AQ_VSI_FD_DEF_GRP_S)
        __le16 fd_report_opt;
-#define ICE_AQ_VSI_FD_REPORT_Q_S       0
-#define ICE_AQ_VSI_FD_REPORT_Q_M       (0x7FF << ICE_AQ_VSI_FD_REPORT_Q_S)
-#define ICE_AQ_VSI_FD_DEF_PRIORITY_S   12
-#define ICE_AQ_VSI_FD_DEF_PRIORITY_M   (0x7 << ICE_AQ_VSI_FD_DEF_PRIORITY_S)
-#define ICE_AQ_VSI_FD_DEF_DROP         BIT(15)
+#define ICE_AQ_VSI_FD_REPORT_Q_S               0
+#define ICE_AQ_VSI_FD_REPORT_Q_M               (0x7FF << ICE_AQ_VSI_FD_REPORT_Q_S)
+#define ICE_AQ_VSI_FD_DEF_PRIORITY_S           12
+#define ICE_AQ_VSI_FD_DEF_PRIORITY_M           (0x7 << ICE_AQ_VSI_FD_DEF_PRIORITY_S)
+#define ICE_AQ_VSI_FD_DEF_DROP                 BIT(15)
        /* PASID section */
        __le32 pasid_id;
-#define ICE_AQ_VSI_PASID_ID_S          0
-#define ICE_AQ_VSI_PASID_ID_M          (0xFFFFF << ICE_AQ_VSI_PASID_ID_S)
-#define ICE_AQ_VSI_PASID_ID_VALID      BIT(31)
+#define ICE_AQ_VSI_PASID_ID_S                  0
+#define ICE_AQ_VSI_PASID_ID_M                  (0xFFFFF << ICE_AQ_VSI_PASID_ID_S)
+#define ICE_AQ_VSI_PASID_ID_VALID              BIT(31)
        u8 reserved[24];
 };
 
@@ -489,9 +537,13 @@ struct ice_aqc_add_get_recipe {
 
 struct ice_aqc_recipe_content {
        u8 rid;
+#define ICE_AQ_RECIPE_ID_S             0
+#define ICE_AQ_RECIPE_ID_M             (0x3F << ICE_AQ_RECIPE_ID_S)
 #define ICE_AQ_RECIPE_ID_IS_ROOT       BIT(7)
 #define ICE_AQ_SW_ID_LKUP_IDX          0
        u8 lkup_indx[5];
+#define ICE_AQ_RECIPE_LKUP_DATA_S      0
+#define ICE_AQ_RECIPE_LKUP_DATA_M      (0x3F << ICE_AQ_RECIPE_LKUP_DATA_S)
 #define ICE_AQ_RECIPE_LKUP_IGNORE      BIT(7)
 #define ICE_AQ_SW_ID_LKUP_MASK         0x00FF
        __le16 mask[5];
@@ -502,15 +554,25 @@ struct ice_aqc_recipe_content {
        u8 rsvd0[3];
        u8 act_ctrl_join_priority;
        u8 act_ctrl_fwd_priority;
+#define ICE_AQ_RECIPE_FWD_PRIORITY_S   0
+#define ICE_AQ_RECIPE_FWD_PRIORITY_M   (0xF << ICE_AQ_RECIPE_FWD_PRIORITY_S)
        u8 act_ctrl;
+#define ICE_AQ_RECIPE_ACT_NEED_PASS_L2 BIT(0)
+#define ICE_AQ_RECIPE_ACT_ALLOW_PASS_L2        BIT(1)
 #define ICE_AQ_RECIPE_ACT_INV_ACT      BIT(2)
+#define ICE_AQ_RECIPE_ACT_PRUNE_INDX_S 4
+#define ICE_AQ_RECIPE_ACT_PRUNE_INDX_M (0x3 << ICE_AQ_RECIPE_ACT_PRUNE_INDX_S)
        u8 rsvd1;
        __le32 dflt_act;
+#define ICE_AQ_RECIPE_DFLT_ACT_S       0
+#define ICE_AQ_RECIPE_DFLT_ACT_M       (0x7FFFF << ICE_AQ_RECIPE_DFLT_ACT_S)
+#define ICE_AQ_RECIPE_DFLT_ACT_VALID   BIT(31)
 };
 
 struct ice_aqc_recipe_data_elem {
        u8 recipe_indx;
        u8 resp_bits;
+#define ICE_AQ_RECIPE_WAS_UPDATED      BIT(0)
        u8 rsvd0[2];
        u8 recipe_bitmap[8];
        u8 rsvd1[4];
@@ -1883,7 +1945,7 @@ struct ice_aqc_get_clear_fw_log {
 };
 
 /* Download Package (indirect 0x0C40) */
-/* Also used for Update Package (indirect 0x0C42) */
+/* Also used for Update Package (indirect 0x0C41 and 0x0C42) */
 struct ice_aqc_download_pkg {
        u8 flags;
 #define ICE_AQC_DOWNLOAD_PKG_LAST_BUF  0x01
@@ -2009,6 +2071,7 @@ struct ice_aq_desc {
                struct ice_aqc_sff_eeprom read_write_sff_param;
                struct ice_aqc_set_port_id_led set_port_id_led;
                struct ice_aqc_get_sw_cfg get_sw_conf;
+               struct ice_aqc_set_port_params set_port_params;
                struct ice_aqc_sw_rules sw_rules;
                struct ice_aqc_add_get_recipe add_get_recipe;
                struct ice_aqc_recipe_to_profile recipe_to_profile;
@@ -2110,10 +2173,13 @@ enum ice_adminq_opc {
 
        /* internal switch commands */
        ice_aqc_opc_get_sw_cfg                          = 0x0200,
+       ice_aqc_opc_set_port_params                     = 0x0203,
 
        /* Alloc/Free/Get Resources */
        ice_aqc_opc_alloc_res                           = 0x0208,
        ice_aqc_opc_free_res                            = 0x0209,
+       ice_aqc_opc_set_vlan_mode_parameters            = 0x020C,
+       ice_aqc_opc_get_vlan_mode_parameters            = 0x020D,
 
        /* VSI commands */
        ice_aqc_opc_add_vsi                             = 0x0210,
@@ -2204,6 +2270,7 @@ enum ice_adminq_opc {
 
        /* package commands */
        ice_aqc_opc_download_pkg                        = 0x0C40,
+       ice_aqc_opc_upload_section                      = 0x0C41,
        ice_aqc_opc_update_pkg                          = 0x0C42,
        ice_aqc_opc_get_pkg_info_list                   = 0x0C43,
 
index 1a5ece3..2360e6a 100644 (file)
@@ -5,6 +5,7 @@
 #include "ice_base.h"
 #include "ice_lib.h"
 #include "ice_dcb_lib.h"
+#include "ice_virtchnl_pf.h"
 
 static bool ice_alloc_rx_buf_zc(struct ice_rx_ring *rx_ring)
 {
@@ -418,8 +419,22 @@ static int ice_setup_rx_ctx(struct ice_rx_ring *ring)
         */
        rlan_ctx.crcstrip = 1;
 
-       /* L2TSEL flag defines the reported L2 Tags in the receive descriptor */
-       rlan_ctx.l2tsel = 1;
+       /* L2TSEL flag defines the reported L2 Tags in the receive descriptor
+        * and it needs to remain 1 for non-DVM capable configurations to not
+        * break backward compatibility for VF drivers. Setting this field to 0
+        * will cause the single/outer VLAN tag to be stripped to the L2TAG2_2ND
+        * field in the Rx descriptor. Setting it to 1 allows the VLAN tag to
+        * be stripped in L2TAG1 of the Rx descriptor, which is where VFs will
+        * check for the tag
+        */
+       if (ice_is_dvm_ena(hw))
+               if (vsi->type == ICE_VSI_VF &&
+                   ice_vf_is_port_vlan_ena(&vsi->back->vf[vsi->vf_id]))
+                       rlan_ctx.l2tsel = 1;
+               else
+                       rlan_ctx.l2tsel = 0;
+       else
+               rlan_ctx.l2tsel = 1;
 
        rlan_ctx.dtype = ICE_RX_DTYPE_NO_SPLIT;
        rlan_ctx.hsplit_0 = ICE_RLAN_RX_HSPLIT_0_NO_SPLIT;
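
The nested conditional above encodes a small decision table: the stripped VLAN tag is reported in L2TAG1 (l2tsel = 1) in all single VLAN mode configurations and for DVM VF VSIs with a port VLAN, and in L2TAG2_2ND (l2tsel = 0) for everything else under DVM. An equivalent single assignment, shown only to make the truth table explicit:

    /* Sketch: same decision as the nested if/else above. */
    rlan_ctx.l2tsel = !ice_is_dvm_ena(hw) ||
                      (vsi->type == ICE_VSI_VF &&
                       ice_vf_is_port_vlan_ena(&vsi->back->vf[vsi->vf_id]));
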
index a6d7d3e..c57e5fc 100644 (file)
@@ -1518,16 +1518,27 @@ ice_aq_send_cmd(struct ice_hw *hw, struct ice_aq_desc *desc, void *buf,
 
        /* When a package download is in process (i.e. when the firmware's
         * Global Configuration Lock resource is held), only the Download
-        * Package, Get Version, Get Package Info List and Release Resource
-        * (with resource ID set to Global Config Lock) AdminQ commands are
-        * allowed; all others must block until the package download completes
-        * and the Global Config Lock is released.  See also
-        * ice_acquire_global_cfg_lock().
+        * Package, Get Version, Get Package Info List, Upload Section,
+        * Update Package, Set Port Parameters, Get/Set VLAN Mode Parameters,
+        * Add Recipe, Set Recipes to Profile Association, Get Recipe, Get
+        * Recipes to Profile Association, and Release Resource (with resource
+        * ID set to Global Config Lock) AdminQ commands are allowed; all others
+        * must block until the package download completes and the Global Config
+        * Lock is released.  See also ice_acquire_global_cfg_lock().
         */
        switch (le16_to_cpu(desc->opcode)) {
        case ice_aqc_opc_download_pkg:
        case ice_aqc_opc_get_pkg_info_list:
        case ice_aqc_opc_get_ver:
+       case ice_aqc_opc_upload_section:
+       case ice_aqc_opc_update_pkg:
+       case ice_aqc_opc_set_port_params:
+       case ice_aqc_opc_get_vlan_mode_parameters:
+       case ice_aqc_opc_set_vlan_mode_parameters:
+       case ice_aqc_opc_add_recipe:
+       case ice_aqc_opc_recipe_to_profile:
+       case ice_aqc_opc_get_recipe:
+       case ice_aqc_opc_get_recipe_to_profile:
                break;
        case ice_aqc_opc_release_res:
                if (le16_to_cpu(cmd->res_id) == ICE_AQC_RES_ID_GLBL_LOCK)
@@ -2737,6 +2748,33 @@ void ice_clear_pxe_mode(struct ice_hw *hw)
 }
 
 /**
+ * ice_aq_set_port_params - set physical port parameters.
+ * @pi: pointer to the port info struct
+ * @double_vlan: if set double VLAN is enabled
+ * @cd: pointer to command details structure or NULL
+ *
+ * Set Physical port parameters (0x0203)
+ */
+int
+ice_aq_set_port_params(struct ice_port_info *pi, bool double_vlan,
+                      struct ice_sq_cd *cd)
+{
+       struct ice_aqc_set_port_params *cmd;
+       struct ice_hw *hw = pi->hw;
+       struct ice_aq_desc desc;
+       u16 cmd_flags = 0;
+
+       cmd = &desc.params.set_port_params;
+
+       ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_set_port_params);
+       if (double_vlan)
+               cmd_flags |= ICE_AQC_SET_P_PARAMS_DOUBLE_VLAN_ENA;
+       cmd->cmd_flags = cpu_to_le16(cmd_flags);
+
+       return ice_aq_send_cmd(hw, &desc, NULL, 0, cd);
+}
+
+/**
  * ice_get_link_speed_based_on_phy_type - returns link speed
  * @phy_type_low: lower part of phy_type
  * @phy_type_high: higher part of phy_type
index 1c57097..d28749e 100644 (file)
@@ -85,6 +85,9 @@ int
 ice_aq_send_driver_ver(struct ice_hw *hw, struct ice_driver_ver *dv,
                       struct ice_sq_cd *cd);
 int
+ice_aq_set_port_params(struct ice_port_info *pi, bool double_vlan,
+                      struct ice_sq_cd *cd);
+int
 ice_aq_get_phy_caps(struct ice_port_info *pi, bool qual_mods, u8 report_mode,
                    struct ice_aqc_get_phy_caps_data *caps,
                    struct ice_sq_cd *cd);
index b94d8da..add90e7 100644 (file)
@@ -916,7 +916,8 @@ ice_tx_prepare_vlan_flags_dcb(struct ice_tx_ring *tx_ring,
                return;
 
        /* Insert 802.1p priority into VLAN header */
-       if ((first->tx_flags & ICE_TX_FLAGS_HW_VLAN) ||
+       if ((first->tx_flags & ICE_TX_FLAGS_HW_VLAN ||
+            first->tx_flags & ICE_TX_FLAGS_HW_OUTER_SINGLE_VLAN) ||
            skb->priority != TC_PRIO_CONTROL) {
                first->tx_flags &= ~ICE_TX_FLAGS_VLAN_PR_M;
                /* Mask the lower 3 bits to set the 802.1p priority */
@@ -925,7 +926,10 @@ ice_tx_prepare_vlan_flags_dcb(struct ice_tx_ring *tx_ring,
                /* if this is not already set it means a VLAN 0 + priority needs
                 * to be offloaded
                 */
-               first->tx_flags |= ICE_TX_FLAGS_HW_VLAN;
+               if (tx_ring->flags & ICE_TX_FLAGS_RING_VLAN_L2TAG2)
+                       first->tx_flags |= ICE_TX_FLAGS_HW_OUTER_SINGLE_VLAN;
+               else
+                       first->tx_flags |= ICE_TX_FLAGS_HW_VLAN;
        }
 }
 
index 864692b..e1cb668 100644 (file)
@@ -115,9 +115,12 @@ static int ice_eswitch_setup_env(struct ice_pf *pf)
        struct ice_vsi *uplink_vsi = pf->switchdev.uplink_vsi;
        struct net_device *uplink_netdev = uplink_vsi->netdev;
        struct ice_vsi *ctrl_vsi = pf->switchdev.control_vsi;
+       struct ice_vsi_vlan_ops *vlan_ops;
        bool rule_added = false;
 
-       ice_vsi_manage_vlan_stripping(ctrl_vsi, false);
+       vlan_ops = ice_get_compat_vsi_vlan_ops(ctrl_vsi);
+       if (vlan_ops->dis_stripping(ctrl_vsi))
+               return -ENODEV;
 
        ice_remove_vsi_fltr(&pf->hw, uplink_vsi->idx);
 
@@ -126,7 +129,7 @@ static int ice_eswitch_setup_env(struct ice_pf *pf)
        __dev_mc_unsync(uplink_netdev, NULL);
        netif_addr_unlock_bh(uplink_netdev);
 
-       if (ice_vsi_add_vlan(uplink_vsi, 0, ICE_FWD_TO_VSI))
+       if (ice_vsi_add_vlan_zero(uplink_vsi))
                goto err_def_rx;
 
        if (!ice_is_dflt_vsi_in_use(uplink_vsi->vsw)) {
@@ -230,7 +233,7 @@ static int ice_eswitch_setup_reprs(struct ice_pf *pf)
                        goto err;
                }
 
-               if (ice_vsi_add_vlan(vsi, 0, ICE_FWD_TO_VSI)) {
+               if (ice_vsi_add_vlan_zero(vsi)) {
                        ice_fltr_add_mac_and_broadcast(vsi,
                                                       vf->hw_lan_addr.addr,
                                                       ICE_FWD_TO_VSI);
index e2e3ef7..a349275 100644 (file)
@@ -164,6 +164,7 @@ static const struct ice_priv_flag ice_gstrings_priv_flags[] = {
        ICE_PRIV_FLAG("vf-true-promisc-support",
                      ICE_FLAG_VF_TRUE_PROMISC_ENA),
        ICE_PRIV_FLAG("mdd-auto-reset-vf", ICE_FLAG_MDD_AUTO_RESET_VF),
+       ICE_PRIV_FLAG("vf-vlan-pruning", ICE_FLAG_VF_VLAN_PRUNING),
        ICE_PRIV_FLAG("legacy-rx", ICE_FLAG_LEGACY_RX),
 };
 
@@ -1295,6 +1296,14 @@ static int ice_set_priv_flags(struct net_device *netdev, u32 flags)
                change_bit(ICE_FLAG_VF_TRUE_PROMISC_ENA, pf->flags);
                ret = -EAGAIN;
        }
+
+       if (test_bit(ICE_FLAG_VF_VLAN_PRUNING, change_flags) &&
+           pf->num_alloc_vfs) {
+               dev_err(dev, "vf-vlan-pruning: VLAN pruning cannot be changed while VFs are active.\n");
+               /* toggle bit back to previous state */
+               change_bit(ICE_FLAG_VF_VLAN_PRUNING, pf->flags);
+               ret = -EOPNOTSUPP;
+       }
 ethtool_exit:
        clear_bit(ICE_FLAG_ETHTOOL_CTXT, pf->flags);
        return ret;
@@ -2803,6 +2812,8 @@ ice_set_ringparam(struct net_device *netdev, struct ethtool_ringparam *ring,
                /* clone ring and setup updated count */
                xdp_rings[i] = *vsi->xdp_rings[i];
                xdp_rings[i].count = new_tx_cnt;
+               xdp_rings[i].next_dd = ICE_RING_QUARTER(&xdp_rings[i]) - 1;
+               xdp_rings[i].next_rs = ICE_RING_QUARTER(&xdp_rings[i]) - 1;
                xdp_rings[i].desc = NULL;
                xdp_rings[i].tx_buf = NULL;
                err = ice_setup_tx_ring(&xdp_rings[i]);
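The cloned XDP rings above now seed their next_dd/next_rs markers at the last
descriptor of the first ring quarter instead of a fixed threshold. A minimal
standalone sketch of that arithmetic, assuming ICE_RING_QUARTER is the ring
count divided by four (consistent with its use in this patch):

    #include <stdio.h>

    /* assumed to mirror the driver macro: one quarter of the ring size */
    #define RING_QUARTER(count) ((count) >> 2)

    int main(void)
    {
            unsigned int counts[] = { 64, 256, 1024 };
            unsigned int i;

            for (i = 0; i < 3; i++)
                    /* next_dd/next_rs start at the end of the first quarter */
                    printf("count=%4u -> next_dd = next_rs = %u\n",
                           counts[i], RING_QUARTER(counts[i]) - 1);
            return 0;
    }

For a default 256-descriptor ring both markers land on descriptor 63, so
descriptor cleanup and report-status requests occur four times per ring pass.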
index 4deb2c9..38fe0a7 100644 (file)
@@ -5,9 +5,17 @@
 #include "ice_flex_pipe.h"
 #include "ice_flow.h"
 
+/* For supporting double VLAN mode, it is necessary to enable or disable certain
+ * boost TCAM entries. The metadata label names that match the following
+ * prefixes will be saved to allow enabling double VLAN mode.
+ */
+#define ICE_DVM_PRE    "BOOST_MAC_VLAN_DVM"    /* enable these entries */
+#define ICE_SVM_PRE    "BOOST_MAC_VLAN_SVM"    /* disable these entries */
+
 /* To support tunneling entries by PF, the package will append the PF number to
  * the label; for example TNL_VXLAN_PF0, TNL_VXLAN_PF1, TNL_VXLAN_PF2, etc.
  */
+#define ICE_TNL_PRE    "TNL_"
 static const struct ice_tunnel_type_scan tnls[] = {
        { TNL_VXLAN,            "TNL_VXLAN_PF" },
        { TNL_GENEVE,           "TNL_GENEVE_PF" },
@@ -523,6 +531,55 @@ ice_enum_labels(struct ice_seg *ice_seg, u32 type, struct ice_pkg_enum *state,
 }
 
 /**
+ * ice_add_tunnel_hint
+ * @hw: pointer to the HW structure
+ * @label_name: label text
+ * @val: value of the tunnel port boost entry
+ */
+static void ice_add_tunnel_hint(struct ice_hw *hw, char *label_name, u16 val)
+{
+       if (hw->tnl.count < ICE_TUNNEL_MAX_ENTRIES) {
+               u16 i;
+
+               for (i = 0; tnls[i].type != TNL_LAST; i++) {
+                       size_t len = strlen(tnls[i].label_prefix);
+
+                       /* Look for matching label start, before continuing */
+                       if (strncmp(label_name, tnls[i].label_prefix, len))
+                               continue;
+
+                       /* Make sure this label matches our PF. Note that the PF
+                        * character ('0' - '7') will be located where our
+                        * prefix string's null terminator is located.
+                        */
+                       if ((label_name[len] - '0') == hw->pf_id) {
+                               hw->tnl.tbl[hw->tnl.count].type = tnls[i].type;
+                               hw->tnl.tbl[hw->tnl.count].valid = false;
+                               hw->tnl.tbl[hw->tnl.count].boost_addr = val;
+                               hw->tnl.tbl[hw->tnl.count].port = 0;
+                               hw->tnl.count++;
+                               break;
+                       }
+               }
+       }
+}
+
+/**
+ * ice_add_dvm_hint
+ * @hw: pointer to the HW structure
+ * @val: value of the boost entry
+ * @enable: true to enable the entry, false to disable it
+ */
+static void ice_add_dvm_hint(struct ice_hw *hw, u16 val, bool enable)
+{
+       if (hw->dvm_upd.count < ICE_DVM_MAX_ENTRIES) {
+               hw->dvm_upd.tbl[hw->dvm_upd.count].boost_addr = val;
+               hw->dvm_upd.tbl[hw->dvm_upd.count].enable = enable;
+               hw->dvm_upd.count++;
+       }
+}
+
+/**
  * ice_init_pkg_hints
  * @hw: pointer to the HW structure
  * @ice_seg: pointer to the segment of the package scan (non-NULL)
@@ -548,32 +605,23 @@ static void ice_init_pkg_hints(struct ice_hw *hw, struct ice_seg *ice_seg)
        label_name = ice_enum_labels(ice_seg, ICE_SID_LBL_RXPARSER_TMEM, &state,
                                     &val);
 
-       while (label_name && hw->tnl.count < ICE_TUNNEL_MAX_ENTRIES) {
-               for (i = 0; tnls[i].type != TNL_LAST; i++) {
-                       size_t len = strlen(tnls[i].label_prefix);
+       while (label_name) {
+               if (!strncmp(label_name, ICE_TNL_PRE, strlen(ICE_TNL_PRE)))
+                       /* check for a tunnel entry */
+                       ice_add_tunnel_hint(hw, label_name, val);
 
-                       /* Look for matching label start, before continuing */
-                       if (strncmp(label_name, tnls[i].label_prefix, len))
-                               continue;
+               /* check for a DVM mode entry */
+               else if (!strncmp(label_name, ICE_DVM_PRE, strlen(ICE_DVM_PRE)))
+                       ice_add_dvm_hint(hw, val, true);
 
-                       /* Make sure this label matches our PF. Note that the PF
-                        * character ('0' - '7') will be located where our
-                        * prefix string's null terminator is located.
-                        */
-                       if ((label_name[len] - '0') == hw->pf_id) {
-                               hw->tnl.tbl[hw->tnl.count].type = tnls[i].type;
-                               hw->tnl.tbl[hw->tnl.count].valid = false;
-                               hw->tnl.tbl[hw->tnl.count].boost_addr = val;
-                               hw->tnl.tbl[hw->tnl.count].port = 0;
-                               hw->tnl.count++;
-                               break;
-                       }
-               }
+               /* check for an SVM mode entry */
+               else if (!strncmp(label_name, ICE_SVM_PRE, strlen(ICE_SVM_PRE)))
+                       ice_add_dvm_hint(hw, val, false);
 
                label_name = ice_enum_labels(NULL, 0, &state, &val);
        }
 
-       /* Cache the appropriate boost TCAM entry pointers */
+       /* Cache the appropriate boost TCAM entry pointers for tunnels */
        for (i = 0; i < hw->tnl.count; i++) {
                ice_find_boost_entry(ice_seg, hw->tnl.tbl[i].boost_addr,
                                     &hw->tnl.tbl[i].boost_entry);
@@ -583,6 +631,11 @@ static void ice_init_pkg_hints(struct ice_hw *hw, struct ice_seg *ice_seg)
                                hw->tnl.valid_count[hw->tnl.tbl[i].type]++;
                }
        }
+
+       /* Cache the appropriate boost TCAM entry pointers for DVM and SVM */
+       for (i = 0; i < hw->dvm_upd.count; i++)
+               ice_find_boost_entry(ice_seg, hw->dvm_upd.tbl[i].boost_addr,
+                                    &hw->dvm_upd.tbl[i].boost_entry);
 }
 
 /* Key creation */
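The rewritten scan dispatches each package label purely on its string prefix
before handing it to the matching hint helper. A standalone sketch of that
classification using the prefixes defined in this hunk (the sample labels are
illustrative, not taken from a real package):

    #include <stdio.h>
    #include <string.h>

    #define ICE_TNL_PRE     "TNL_"
    #define ICE_DVM_PRE     "BOOST_MAC_VLAN_DVM"
    #define ICE_SVM_PRE     "BOOST_MAC_VLAN_SVM"

    static const char *classify(const char *label)
    {
            if (!strncmp(label, ICE_TNL_PRE, strlen(ICE_TNL_PRE)))
                    return "tunnel hint";
            if (!strncmp(label, ICE_DVM_PRE, strlen(ICE_DVM_PRE)))
                    return "DVM hint (enable entry)";
            if (!strncmp(label, ICE_SVM_PRE, strlen(ICE_SVM_PRE)))
                    return "SVM hint (disable entry)";
            return "ignored";
    }

    int main(void)
    {
            const char *labels[] = {
                    "TNL_VXLAN_PF0", "BOOST_MAC_VLAN_DVM_X",
                    "BOOST_MAC_VLAN_SVM_X", "OTHER_LABEL",
            };
            unsigned int i;

            for (i = 0; i < 4; i++)
                    printf("%-22s -> %s\n", labels[i], classify(labels[i]));
            return 0;
    }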
@@ -874,6 +927,27 @@ ice_aq_download_pkg(struct ice_hw *hw, struct ice_buf_hdr *pkg_buf,
 }
 
 /**
+ * ice_aq_upload_section
+ * @hw: pointer to the hardware structure
+ * @pkg_buf: the package buffer which will receive the section
+ * @buf_size: the size of the package buffer
+ * @cd: pointer to command details structure or NULL
+ *
+ * Upload Section (0x0C41)
+ */
+int
+ice_aq_upload_section(struct ice_hw *hw, struct ice_buf_hdr *pkg_buf,
+                     u16 buf_size, struct ice_sq_cd *cd)
+{
+       struct ice_aq_desc desc;
+
+       ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_upload_section);
+       desc.flags |= cpu_to_le16(ICE_AQ_FLAG_RD);
+
+       return ice_aq_send_cmd(hw, &desc, pkg_buf, buf_size, cd);
+}
+
+/**
  * ice_aq_update_pkg
  * @hw: pointer to the hardware structure
  * @pkg_buf: the package cmd buffer
@@ -957,25 +1031,21 @@ ice_find_seg_in_pkg(struct ice_hw *hw, u32 seg_type,
 }
 
 /**
- * ice_update_pkg
+ * ice_update_pkg_no_lock
  * @hw: pointer to the hardware structure
  * @bufs: pointer to an array of buffers
  * @count: the number of buffers in the array
- *
- * Obtains change lock and updates package.
  */
-static int ice_update_pkg(struct ice_hw *hw, struct ice_buf *bufs, u32 count)
+static int
+ice_update_pkg_no_lock(struct ice_hw *hw, struct ice_buf *bufs, u32 count)
 {
-       u32 offset, info, i;
-       int status;
-
-       status = ice_acquire_change_lock(hw, ICE_RES_WRITE);
-       if (status)
-               return status;
+       int status = 0;
+       u32 i;
 
        for (i = 0; i < count; i++) {
                struct ice_buf_hdr *bh = (struct ice_buf_hdr *)(bufs + i);
                bool last = ((i + 1) == count);
+               u32 offset, info;
 
                status = ice_aq_update_pkg(hw, bh, le16_to_cpu(bh->data_end),
                                           last, &offset, &info, NULL);
@@ -987,6 +1057,27 @@ static int ice_update_pkg(struct ice_hw *hw, struct ice_buf *bufs, u32 count)
                }
        }
 
+       return status;
+}
+
+/**
+ * ice_update_pkg
+ * @hw: pointer to the hardware structure
+ * @bufs: pointer to an array of buffers
+ * @count: the number of buffers in the array
+ *
+ * Obtains change lock and updates package.
+ */
+static int ice_update_pkg(struct ice_hw *hw, struct ice_buf *bufs, u32 count)
+{
+       int status;
+
+       status = ice_acquire_change_lock(hw, ICE_RES_WRITE);
+       if (status)
+               return status;
+
+       status = ice_update_pkg_no_lock(hw, bufs, count);
+
        ice_release_change_lock(hw);
 
        return status;
@@ -1080,6 +1171,13 @@ ice_dwnld_cfg_bufs(struct ice_hw *hw, struct ice_buf *bufs, u32 count)
                        break;
        }
 
+       if (!status) {
+               status = ice_set_vlan_mode(hw);
+               if (status)
+                       ice_debug(hw, ICE_DBG_PKG, "Failed to set VLAN mode: err %d\n",
+                                 status);
+       }
+
        ice_release_global_cfg_lock(hw);
 
        return state;
@@ -1117,6 +1215,7 @@ static enum ice_ddp_state
 ice_download_pkg(struct ice_hw *hw, struct ice_seg *ice_seg)
 {
        struct ice_buf_table *ice_buf_tbl;
+       int status;
 
        ice_debug(hw, ICE_DBG_PKG, "Segment format version: %d.%d.%d.%d\n",
                  ice_seg->hdr.seg_format_ver.major,
@@ -1133,8 +1232,12 @@ ice_download_pkg(struct ice_hw *hw, struct ice_seg *ice_seg)
        ice_debug(hw, ICE_DBG_PKG, "Seg buf count: %d\n",
                  le32_to_cpu(ice_buf_tbl->buf_count));
 
-       return ice_dwnld_cfg_bufs(hw, ice_buf_tbl->buf_array,
-                                 le32_to_cpu(ice_buf_tbl->buf_count));
+       status = ice_dwnld_cfg_bufs(hw, ice_buf_tbl->buf_array,
+                                   le32_to_cpu(ice_buf_tbl->buf_count));
+
+       ice_post_pkg_dwnld_vlan_mode_cfg(hw);
+
+       return status;
 }
 
 /**
@@ -1897,7 +2000,7 @@ void ice_init_prof_result_bm(struct ice_hw *hw)
  *
  * Frees a package buffer
  */
-static void ice_pkg_buf_free(struct ice_hw *hw, struct ice_buf_build *bld)
+void ice_pkg_buf_free(struct ice_hw *hw, struct ice_buf_build *bld)
 {
        devm_kfree(ice_hw_to_dev(hw), bld);
 }
@@ -1997,6 +2100,43 @@ ice_pkg_buf_alloc_section(struct ice_buf_build *bld, u32 type, u16 size)
 }
 
 /**
+ * ice_pkg_buf_alloc_single_section
+ * @hw: pointer to the HW structure
+ * @type: the section type value
+ * @size: the size of the section to reserve (in bytes)
+ * @section: returns pointer to the section
+ *
+ * Allocates a package buffer with a single section.
+ * Note: all package contents must be in Little Endian form.
+ */
+struct ice_buf_build *
+ice_pkg_buf_alloc_single_section(struct ice_hw *hw, u32 type, u16 size,
+                                void **section)
+{
+       struct ice_buf_build *buf;
+
+       if (!section)
+               return NULL;
+
+       buf = ice_pkg_buf_alloc(hw);
+       if (!buf)
+               return NULL;
+
+       if (ice_pkg_buf_reserve_section(buf, 1))
+               goto ice_pkg_buf_alloc_single_section_err;
+
+       *section = ice_pkg_buf_alloc_section(buf, type, size);
+       if (!*section)
+               goto ice_pkg_buf_alloc_single_section_err;
+
+       return buf;
+
+ice_pkg_buf_alloc_single_section_err:
+       ice_pkg_buf_free(hw, buf);
+       return NULL;
+}
+
+/**
  * ice_pkg_buf_get_active_sections
  * @bld: pointer to pkg build (allocated by ice_pkg_buf_alloc())
  *
@@ -2023,7 +2163,7 @@ static u16 ice_pkg_buf_get_active_sections(struct ice_buf_build *bld)
  *
  * Return a pointer to the buffer's header
  */
-static struct ice_buf *ice_pkg_buf(struct ice_buf_build *bld)
+struct ice_buf *ice_pkg_buf(struct ice_buf_build *bld)
 {
        if (!bld)
                return NULL;
@@ -2060,6 +2200,89 @@ ice_get_open_tunnel_port(struct ice_hw *hw, u16 *port,
 }
 
 /**
+ * ice_upd_dvm_boost_entry
+ * @hw: pointer to the HW structure
+ * @entry: pointer to double VLAN boost entry info
+ */
+static int
+ice_upd_dvm_boost_entry(struct ice_hw *hw, struct ice_dvm_entry *entry)
+{
+       struct ice_boost_tcam_section *sect_rx, *sect_tx;
+       int status = -ENOSPC;
+       struct ice_buf_build *bld;
+       u8 val, dc, nm;
+
+       bld = ice_pkg_buf_alloc(hw);
+       if (!bld)
+               return -ENOMEM;
+
+       /* allocate 2 sections, one for Rx parser, one for Tx parser */
+       if (ice_pkg_buf_reserve_section(bld, 2))
+               goto ice_upd_dvm_boost_entry_err;
+
+       sect_rx = ice_pkg_buf_alloc_section(bld, ICE_SID_RXPARSER_BOOST_TCAM,
+                                           struct_size(sect_rx, tcam, 1));
+       if (!sect_rx)
+               goto ice_upd_dvm_boost_entry_err;
+       sect_rx->count = cpu_to_le16(1);
+
+       sect_tx = ice_pkg_buf_alloc_section(bld, ICE_SID_TXPARSER_BOOST_TCAM,
+                                           struct_size(sect_tx, tcam, 1));
+       if (!sect_tx)
+               goto ice_upd_dvm_boost_entry_err;
+       sect_tx->count = cpu_to_le16(1);
+
+       /* copy original boost entry to update package buffer */
+       memcpy(sect_rx->tcam, entry->boost_entry, sizeof(*sect_rx->tcam));
+
+       /* re-write the don't care and never match bits accordingly */
+       if (entry->enable) {
+               /* all bits are don't care */
+               val = 0x00;
+               dc = 0xFF;
+               nm = 0x00;
+       } else {
+               /* disable, one never match bit, the rest are don't care */
+               val = 0x00;
+               dc = 0xF7;
+               nm = 0x08;
+       }
+
+       ice_set_key((u8 *)&sect_rx->tcam[0].key, sizeof(sect_rx->tcam[0].key),
+                   &val, NULL, &dc, &nm, 0, sizeof(u8));
+
+       /* exact copy of entry to Tx section entry */
+       memcpy(sect_tx->tcam, sect_rx->tcam, sizeof(*sect_tx->tcam));
+
+       status = ice_update_pkg_no_lock(hw, ice_pkg_buf(bld), 1);
+
+ice_upd_dvm_boost_entry_err:
+       ice_pkg_buf_free(hw, bld);
+
+       return status;
+}
+
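The val/dc/nm triple above encodes one TCAM key byte: a don't-care bit matches
anything and a never-match bit matches nothing, so dc=0xFF/nm=0x00 enables the
entry for all traffic while dc=0xF7/nm=0x08 disables it outright. A standalone
sketch of those semantics (an illustration of the convention, not the driver's
ice_set_key encoding):

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    /* evaluate one key byte under value/don't-care/never-match masks */
    static bool tcam_byte_matches(uint8_t in, uint8_t val, uint8_t dc, uint8_t nm)
    {
            int bit;

            for (bit = 0; bit < 8; bit++) {
                    uint8_t m = 1u << bit;

                    if (nm & m)
                            return false;   /* never-match bit always fails */
                    if (dc & m)
                            continue;       /* don't-care bit always passes */
                    if ((in & m) != (val & m))
                            return false;   /* remaining bits must equal val */
            }
            return true;
    }

    int main(void)
    {
            /* enabled entry: everything don't-care -> matches any input */
            printf("enable:  %d\n", tcam_byte_matches(0xAB, 0x00, 0xFF, 0x00));
            /* disabled entry: bit 3 is never-match -> can never hit */
            printf("disable: %d\n", tcam_byte_matches(0xAB, 0x00, 0xF7, 0x08));
            return 0;
    }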
+/**
+ * ice_set_dvm_boost_entries
+ * @hw: pointer to the HW structure
+ *
+ * Enable double VLAN by updating the appropriate boost TCAM entries.
+ */
+int ice_set_dvm_boost_entries(struct ice_hw *hw)
+{
+       int status;
+       u16 i;
+
+       for (i = 0; i < hw->dvm_upd.count; i++) {
+               status = ice_upd_dvm_boost_entry(hw, &hw->dvm_upd.tbl[i]);
+               if (status)
+                       return status;
+       }
+
+       return 0;
+}
+
+/**
  * ice_tunnel_idx_to_entry - convert linear index to the sparse one
  * @hw: pointer to the HW structure
  * @type: type of tunnel
index 6cbc29b..2fd5312 100644 (file)
@@ -89,6 +89,12 @@ ice_init_prof_result_bm(struct ice_hw *hw);
 int
 ice_get_sw_fv_list(struct ice_hw *hw, u8 *prot_ids, u16 ids_cnt,
                   unsigned long *bm, struct list_head *fv_list);
+int
+ice_pkg_buf_unreserve_section(struct ice_buf_build *bld, u16 count);
+u16 ice_pkg_buf_get_free_space(struct ice_buf_build *bld);
+int
+ice_aq_upload_section(struct ice_hw *hw, struct ice_buf_hdr *pkg_buf,
+                     u16 buf_size, struct ice_sq_cd *cd);
 bool
 ice_get_open_tunnel_port(struct ice_hw *hw, u16 *port,
                         enum ice_tunnel_type type);
@@ -96,6 +102,7 @@ int ice_udp_tunnel_set_port(struct net_device *netdev, unsigned int table,
                            unsigned int idx, struct udp_tunnel_info *ti);
 int ice_udp_tunnel_unset_port(struct net_device *netdev, unsigned int table,
                              unsigned int idx, struct udp_tunnel_info *ti);
+int ice_set_dvm_boost_entries(struct ice_hw *hw);
 
 /* Rx parser PTYPE functions */
 bool ice_hw_ptype_ena(struct ice_hw *hw, u16 ptype);
@@ -119,4 +126,10 @@ void ice_fill_blk_tbls(struct ice_hw *hw);
 void ice_clear_hw_tbls(struct ice_hw *hw);
 void ice_free_hw_tbls(struct ice_hw *hw);
 int ice_rem_prof(struct ice_hw *hw, enum ice_block blk, u64 id);
+struct ice_buf_build *
+ice_pkg_buf_alloc_single_section(struct ice_hw *hw, u32 type, u16 size,
+                                void **section);
+struct ice_buf *ice_pkg_buf(struct ice_buf_build *bld);
+void ice_pkg_buf_free(struct ice_hw *hw, struct ice_buf_build *bld);
+
 #endif /* _ICE_FLEX_PIPE_H_ */
index fc087e0..5735e95 100644 (file)
@@ -162,6 +162,7 @@ struct ice_meta_sect {
 
 #define ICE_SID_RXPARSER_MARKER_PTYPE  55
 #define ICE_SID_RXPARSER_BOOST_TCAM    56
+#define ICE_SID_RXPARSER_METADATA_INIT 58
 #define ICE_SID_TXPARSER_BOOST_TCAM    66
 
 #define ICE_SID_XLT0_PE                        80
@@ -442,6 +443,19 @@ struct ice_tunnel_table {
        u16 valid_count[__TNL_TYPE_CNT];
 };
 
+struct ice_dvm_entry {
+       u16 boost_addr;
+       u16 enable;
+       struct ice_boost_tcam_entry *boost_entry;
+};
+
+#define ICE_DVM_MAX_ENTRIES    48
+
+struct ice_dvm_table {
+       struct ice_dvm_entry tbl[ICE_DVM_MAX_ENTRIES];
+       u16 count;
+};
+
 struct ice_pkg_es {
        __le16 count;
        __le16 offset;
@@ -662,4 +676,30 @@ enum ice_prof_type {
        ICE_PROF_TUN_ALL = 0x6,
        ICE_PROF_ALL = 0xFF,
 };
+
+/* Number of bits in a meta init entry, and the derived dword count. Note,
+ * the bit count should be a multiple of 32.
+ */
+#define ICE_META_INIT_BITS     192
+#define ICE_META_INIT_DW_CNT   (ICE_META_INIT_BITS / (sizeof(__le32) * \
+                                BITS_PER_BYTE))
+
+/* The meta init Flag field starts at this bit */
+#define ICE_META_FLAGS_ST              123
+
+/* The entry and bit to check for Double VLAN Mode (DVM) support */
+#define ICE_META_VLAN_MODE_ENTRY       0
+#define ICE_META_FLAG_VLAN_MODE                60
+#define ICE_META_VLAN_MODE_BIT         (ICE_META_FLAGS_ST + \
+                                        ICE_META_FLAG_VLAN_MODE)
+
+struct ice_meta_init_entry {
+       __le32 bm[ICE_META_INIT_DW_CNT];
+};
+
+struct ice_meta_init_section {
+       __le16 count;
+       __le16 offset;
+       struct ice_meta_init_entry entry;
+};
 #endif /* _ICE_FLEX_TYPE_H_ */
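The DVM capability flag therefore sits at bit 123 + 60 = 183 of the 192-bit
meta init entry. A standalone sketch of the dword/bit decomposition a reader of
ice_meta_init_entry::bm would apply (the usual little-endian bitmap layout,
which is an assumption rather than something this hunk spells out):

    #include <stdio.h>

    #define ICE_META_INIT_BITS      192
    #define ICE_META_FLAGS_ST       123
    #define ICE_META_FLAG_VLAN_MODE 60
    #define ICE_META_VLAN_MODE_BIT  (ICE_META_FLAGS_ST + ICE_META_FLAG_VLAN_MODE)

    int main(void)
    {
            unsigned int bit = ICE_META_VLAN_MODE_BIT;      /* 183 */

            /* locate the flag inside the entry's __le32 bm[] array */
            printf("bit %u of %u -> bm[%u], bit %u within that dword\n",
                   bit, ICE_META_INIT_BITS, bit / 32, bit % 32);
            return 0;
    }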
index c29177c..af57eb1 100644 (file)
@@ -203,21 +203,22 @@ ice_fltr_add_mac_to_list(struct ice_vsi *vsi, struct list_head *list,
  * ice_fltr_add_vlan_to_list - add VLAN filter info to existing list
  * @vsi: pointer to VSI struct
  * @list: list to add filter info to
- * @vlan_id: VLAN ID to add
- * @action: filter action
+ * @vlan: VLAN filter details
  */
 static int
 ice_fltr_add_vlan_to_list(struct ice_vsi *vsi, struct list_head *list,
-                         u16 vlan_id, enum ice_sw_fwd_act_type action)
+                         struct ice_vlan *vlan)
 {
        struct ice_fltr_info info = { 0 };
 
        info.flag = ICE_FLTR_TX;
        info.src_id = ICE_SRC_ID_VSI;
        info.lkup_type = ICE_SW_LKUP_VLAN;
-       info.fltr_act = action;
+       info.fltr_act = ICE_FWD_TO_VSI;
        info.vsi_handle = vsi->idx;
-       info.l_data.vlan.vlan_id = vlan_id;
+       info.l_data.vlan.vlan_id = vlan->vid;
+       info.l_data.vlan.tpid = vlan->tpid;
+       info.l_data.vlan.tpid_valid = true;
 
        return ice_fltr_add_entry_to_list(ice_pf_to_dev(vsi->back), &info,
                                          list);
@@ -310,19 +311,17 @@ ice_fltr_prepare_mac_and_broadcast(struct ice_vsi *vsi, const u8 *mac,
 /**
  * ice_fltr_prepare_vlan - add or remove VLAN filter
  * @vsi: pointer to VSI struct
- * @vlan_id: VLAN ID to add
- * @action: action to be performed on filter match
+ * @vlan: VLAN filter details
  * @vlan_action: pointer to add or remove VLAN function
  */
 static int
-ice_fltr_prepare_vlan(struct ice_vsi *vsi, u16 vlan_id,
-                     enum ice_sw_fwd_act_type action,
+ice_fltr_prepare_vlan(struct ice_vsi *vsi, struct ice_vlan *vlan,
                      int (*vlan_action)(struct ice_vsi *, struct list_head *))
 {
        LIST_HEAD(tmp_list);
        int result;
 
-       if (ice_fltr_add_vlan_to_list(vsi, &tmp_list, vlan_id, action))
+       if (ice_fltr_add_vlan_to_list(vsi, &tmp_list, vlan))
                return -ENOMEM;
 
        result = vlan_action(vsi, &tmp_list);
@@ -395,27 +394,21 @@ int ice_fltr_remove_mac(struct ice_vsi *vsi, const u8 *mac,
 /**
  * ice_fltr_add_vlan - add single VLAN filter
  * @vsi: pointer to VSI struct
- * @vlan_id: VLAN ID to add
- * @action: action to be performed on filter match
+ * @vlan: VLAN filter details
  */
-int ice_fltr_add_vlan(struct ice_vsi *vsi, u16 vlan_id,
-                     enum ice_sw_fwd_act_type action)
+int ice_fltr_add_vlan(struct ice_vsi *vsi, struct ice_vlan *vlan)
 {
-       return ice_fltr_prepare_vlan(vsi, vlan_id, action,
-                                    ice_fltr_add_vlan_list);
+       return ice_fltr_prepare_vlan(vsi, vlan, ice_fltr_add_vlan_list);
 }
 
 /**
  * ice_fltr_remove_vlan - remove VLAN filter
  * @vsi: pointer to VSI struct
- * @vlan_id: filter VLAN to remove
- * @action: action to remove
+ * @vlan: VLAN filter details
  */
-int ice_fltr_remove_vlan(struct ice_vsi *vsi, u16 vlan_id,
-                        enum ice_sw_fwd_act_type action)
+int ice_fltr_remove_vlan(struct ice_vsi *vsi, struct ice_vlan *vlan)
 {
-       return ice_fltr_prepare_vlan(vsi, vlan_id, action,
-                                    ice_fltr_remove_vlan_list);
+       return ice_fltr_prepare_vlan(vsi, vlan, ice_fltr_remove_vlan_list);
 }
 
 /**
index 3eb4247..0f3dbc3 100644 (file)
@@ -4,6 +4,8 @@
 #ifndef _ICE_FLTR_H_
 #define _ICE_FLTR_H_
 
+#include "ice_vlan.h"
+
 void ice_fltr_free_list(struct device *dev, struct list_head *h);
 int
 ice_fltr_set_vlan_vsi_promisc(struct ice_hw *hw, struct ice_vsi *vsi,
@@ -32,12 +34,8 @@ ice_fltr_remove_mac(struct ice_vsi *vsi, const u8 *mac,
                    enum ice_sw_fwd_act_type action);
 int ice_fltr_remove_mac_list(struct ice_vsi *vsi, struct list_head *list);
 
-int
-ice_fltr_add_vlan(struct ice_vsi *vsi, u16 vid,
-                 enum ice_sw_fwd_act_type action);
-int
-ice_fltr_remove_vlan(struct ice_vsi *vsi, u16 vid,
-                    enum ice_sw_fwd_act_type action);
+int ice_fltr_add_vlan(struct ice_vsi *vsi, struct ice_vlan *vlan);
+int ice_fltr_remove_vlan(struct ice_vsi *vsi, struct ice_vlan *vlan);
 
 int
 ice_fltr_add_eth(struct ice_vsi *vsi, u16 ethertype, u16 flag,
index fc35801..263a2e7 100644 (file)
@@ -227,6 +227,11 @@ void ice_get_qos_params(struct ice_pf *pf, struct iidc_qos_params *qos)
 
        for (i = 0; i < IEEE_8021QAZ_MAX_TCS; i++)
                qos->tc_info[i].rel_bw = dcbx_cfg->etscfg.tcbwtable[i];
+
+       qos->pfc_mode = dcbx_cfg->pfc_mode;
+       if (qos->pfc_mode == IIDC_DSCP_PFC_MODE)
+               for (i = 0; i < IIDC_MAX_DSCP_MAPPING; i++)
+                       qos->dscp_map[i] = dcbx_cfg->dscp_map[i];
 }
 EXPORT_SYMBOL_GPL(ice_get_qos_params);
 
index 85a6128..b3baf7c 100644 (file)
@@ -424,6 +424,8 @@ enum ice_rx_flex_desc_status_error_0_bits {
 enum ice_rx_flex_desc_status_error_1_bits {
        /* Note: These are predefined bit offsets */
        ICE_RX_FLEX_DESC_STATUS1_NAT_S = 4,
+       /* [10:5] reserved */
+       ICE_RX_FLEX_DESC_STATUS1_L2TAG2P_S = 11,
        ICE_RX_FLEX_DESC_STATUS1_LAST /* this entry must be last!!! */
 };
 
index 0c187cf..f23917d 100644 (file)
@@ -8,6 +8,7 @@
 #include "ice_fltr.h"
 #include "ice_dcb_lib.h"
 #include "ice_devlink.h"
+#include "ice_vsi_vlan_ops.h"
 
 /**
  * ice_vsi_type_str - maps VSI type enum to string equivalents
@@ -838,11 +839,12 @@ static void ice_vsi_set_rss_params(struct ice_vsi *vsi)
 
 /**
  * ice_set_dflt_vsi_ctx - Set default VSI context before adding a VSI
+ * @hw: HW structure used to determine the VLAN mode of the device
  * @ctxt: the VSI context being set
  *
  * This initializes a default VSI context for all sections except the Queues.
  */
-static void ice_set_dflt_vsi_ctx(struct ice_vsi_ctx *ctxt)
+static void ice_set_dflt_vsi_ctx(struct ice_hw *hw, struct ice_vsi_ctx *ctxt)
 {
        u32 table = 0;
 
@@ -853,13 +855,27 @@ static void ice_set_dflt_vsi_ctx(struct ice_vsi_ctx *ctxt)
        ctxt->info.sw_flags = ICE_AQ_VSI_SW_FLAG_SRC_PRUNE;
        /* Traffic from VSI can be sent to LAN */
        ctxt->info.sw_flags2 = ICE_AQ_VSI_SW_FLAG_LAN_ENA;
-       /* By default bits 3 and 4 in vlan_flags are 0's which results in legacy
-        * behavior (show VLAN, DEI, and UP) in descriptor. Also, allow all
-        * packets untagged/tagged.
+       /* allow all untagged/tagged packets by default on Tx */
+       ctxt->info.inner_vlan_flags = ((ICE_AQ_VSI_INNER_VLAN_TX_MODE_ALL &
+                                 ICE_AQ_VSI_INNER_VLAN_TX_MODE_M) >>
+                                ICE_AQ_VSI_INNER_VLAN_TX_MODE_S);
+       /* SVM - by default bits 3 and 4 in inner_vlan_flags are 0's which
+        * results in legacy behavior (show VLAN, DEI, and UP) in descriptor.
+        *
+        * DVM - leave inner VLAN in packet by default
         */
-       ctxt->info.vlan_flags = ((ICE_AQ_VSI_VLAN_MODE_ALL &
-                                 ICE_AQ_VSI_VLAN_MODE_M) >>
-                                ICE_AQ_VSI_VLAN_MODE_S);
+       if (ice_is_dvm_ena(hw)) {
+               ctxt->info.inner_vlan_flags |=
+                       ICE_AQ_VSI_INNER_VLAN_EMODE_NOTHING;
+               ctxt->info.outer_vlan_flags =
+                       (ICE_AQ_VSI_OUTER_VLAN_TX_MODE_ALL <<
+                        ICE_AQ_VSI_OUTER_VLAN_TX_MODE_S) &
+                       ICE_AQ_VSI_OUTER_VLAN_TX_MODE_M;
+               ctxt->info.outer_vlan_flags |=
+                       (ICE_AQ_VSI_OUTER_TAG_VLAN_8100 <<
+                        ICE_AQ_VSI_OUTER_TAG_TYPE_S) &
+                       ICE_AQ_VSI_OUTER_TAG_TYPE_M;
+       }
        /* Have 1:1 UP mapping for both ingress/egress tables */
        table |= ICE_UP_TABLE_TRANSLATE(0, 0);
        table |= ICE_UP_TABLE_TRANSLATE(1, 1);
@@ -1136,7 +1152,7 @@ static int ice_vsi_init(struct ice_vsi *vsi, bool init_vsi)
                                ~ICE_AQ_VSI_SW_FLAG_RX_VLAN_PRUNE_ENA;
        }
 
-       ice_set_dflt_vsi_ctx(ctxt);
+       ice_set_dflt_vsi_ctx(hw, ctxt);
        if (test_bit(ICE_FLAG_FD_ENA, pf->flags))
                ice_set_fd_vsi_ctx(ctxt, vsi);
        /* if the switch is in VEB mode, allow VSI loopback */
@@ -1168,25 +1184,6 @@ static int ice_vsi_init(struct ice_vsi *vsi, bool init_vsi)
                                cpu_to_le16(ICE_AQ_VSI_PROP_RXQ_MAP_VALID);
        }
 
-       /* enable/disable MAC and VLAN anti-spoof when spoofchk is on/off
-        * respectively
-        */
-       if (vsi->type == ICE_VSI_VF) {
-               ctxt->info.valid_sections |=
-                       cpu_to_le16(ICE_AQ_VSI_PROP_SECURITY_VALID);
-               if (pf->vf[vsi->vf_id].spoofchk) {
-                       ctxt->info.sec_flags |=
-                               ICE_AQ_VSI_SEC_FLAG_ENA_MAC_ANTI_SPOOF |
-                               (ICE_AQ_VSI_SEC_TX_VLAN_PRUNE_ENA <<
-                                ICE_AQ_VSI_SEC_TX_PRUNE_ENA_S);
-               } else {
-                       ctxt->info.sec_flags &=
-                               ~(ICE_AQ_VSI_SEC_FLAG_ENA_MAC_ANTI_SPOOF |
-                                 (ICE_AQ_VSI_SEC_TX_VLAN_PRUNE_ENA <<
-                                  ICE_AQ_VSI_SEC_TX_PRUNE_ENA_S));
-               }
-       }
-
        /* Allow control frames out of main VSI */
        if (vsi->type == ICE_VSI_PF) {
                ctxt->info.sec_flags |= ICE_AQ_VSI_SEC_FLAG_ALLOW_DEST_OVRD;
@@ -1431,6 +1428,7 @@ static void ice_vsi_clear_rings(struct ice_vsi *vsi)
  */
 static int ice_vsi_alloc_rings(struct ice_vsi *vsi)
 {
+       bool dvm_ena = ice_is_dvm_ena(&vsi->back->hw);
        struct ice_pf *pf = vsi->back;
        struct device *dev;
        u16 i;
@@ -1452,6 +1450,10 @@ static int ice_vsi_alloc_rings(struct ice_vsi *vsi)
                ring->tx_tstamps = &pf->ptp.port.tx;
                ring->dev = dev;
                ring->count = vsi->num_tx_desc;
+               if (dvm_ena)
+                       ring->flags |= ICE_TX_FLAGS_RING_VLAN_L2TAG2;
+               else
+                       ring->flags |= ICE_TX_FLAGS_RING_VLAN_L2TAG1;
                WRITE_ONCE(vsi->tx_rings[i], ring);
        }
 
@@ -1757,62 +1759,6 @@ void ice_update_eth_stats(struct ice_vsi *vsi)
 }
 
 /**
- * ice_vsi_add_vlan - Add VSI membership for given VLAN
- * @vsi: the VSI being configured
- * @vid: VLAN ID to be added
- * @action: filter action to be performed on match
- */
-int
-ice_vsi_add_vlan(struct ice_vsi *vsi, u16 vid, enum ice_sw_fwd_act_type action)
-{
-       struct ice_pf *pf = vsi->back;
-       struct device *dev;
-       int err = 0;
-
-       dev = ice_pf_to_dev(pf);
-
-       if (!ice_fltr_add_vlan(vsi, vid, action)) {
-               vsi->num_vlan++;
-       } else {
-               err = -ENODEV;
-               dev_err(dev, "Failure Adding VLAN %d on VSI %i\n", vid,
-                       vsi->vsi_num);
-       }
-
-       return err;
-}
-
-/**
- * ice_vsi_kill_vlan - Remove VSI membership for a given VLAN
- * @vsi: the VSI being configured
- * @vid: VLAN ID to be removed
- *
- * Returns 0 on success and negative on failure
- */
-int ice_vsi_kill_vlan(struct ice_vsi *vsi, u16 vid)
-{
-       struct ice_pf *pf = vsi->back;
-       struct device *dev;
-       int err;
-
-       dev = ice_pf_to_dev(pf);
-
-       err = ice_fltr_remove_vlan(vsi, vid, ICE_FWD_TO_VSI);
-       if (!err) {
-               vsi->num_vlan--;
-       } else if (err == -ENOENT) {
-               dev_dbg(dev, "Failed to remove VLAN %d on VSI %i, it does not exist, error: %d\n",
-                       vid, vsi->vsi_num, err);
-               err = 0;
-       } else {
-               dev_err(dev, "Error removing VLAN %d on vsi %i error: %d\n",
-                       vid, vsi->vsi_num, err);
-       }
-
-       return err;
-}
-
-/**
  * ice_vsi_cfg_frame_size - setup max frame size and Rx buffer length
  * @vsi: VSI
  */
@@ -2140,95 +2086,6 @@ void ice_vsi_cfg_msix(struct ice_vsi *vsi)
 }
 
 /**
- * ice_vsi_manage_vlan_insertion - Manage VLAN insertion for the VSI for Tx
- * @vsi: the VSI being changed
- */
-int ice_vsi_manage_vlan_insertion(struct ice_vsi *vsi)
-{
-       struct ice_hw *hw = &vsi->back->hw;
-       struct ice_vsi_ctx *ctxt;
-       int ret;
-
-       ctxt = kzalloc(sizeof(*ctxt), GFP_KERNEL);
-       if (!ctxt)
-               return -ENOMEM;
-
-       /* Here we are configuring the VSI to let the driver add VLAN tags by
-        * setting vlan_flags to ICE_AQ_VSI_VLAN_MODE_ALL. The actual VLAN tag
-        * insertion happens in the Tx hot path, in ice_tx_map.
-        */
-       ctxt->info.vlan_flags = ICE_AQ_VSI_VLAN_MODE_ALL;
-
-       /* Preserve existing VLAN strip setting */
-       ctxt->info.vlan_flags |= (vsi->info.vlan_flags &
-                                 ICE_AQ_VSI_VLAN_EMOD_M);
-
-       ctxt->info.valid_sections = cpu_to_le16(ICE_AQ_VSI_PROP_VLAN_VALID);
-
-       ret = ice_update_vsi(hw, vsi->idx, ctxt, NULL);
-       if (ret) {
-               dev_err(ice_pf_to_dev(vsi->back), "update VSI for VLAN insert failed, err %d aq_err %s\n",
-                       ret, ice_aq_str(hw->adminq.sq_last_status));
-               goto out;
-       }
-
-       vsi->info.vlan_flags = ctxt->info.vlan_flags;
-out:
-       kfree(ctxt);
-       return ret;
-}
-
-/**
- * ice_vsi_manage_vlan_stripping - Manage VLAN stripping for the VSI for Rx
- * @vsi: the VSI being changed
- * @ena: boolean value indicating if this is a enable or disable request
- */
-int ice_vsi_manage_vlan_stripping(struct ice_vsi *vsi, bool ena)
-{
-       struct ice_hw *hw = &vsi->back->hw;
-       struct ice_vsi_ctx *ctxt;
-       int ret;
-
-       /* do not allow modifying VLAN stripping when a port VLAN is configured
-        * on this VSI
-        */
-       if (vsi->info.pvid)
-               return 0;
-
-       ctxt = kzalloc(sizeof(*ctxt), GFP_KERNEL);
-       if (!ctxt)
-               return -ENOMEM;
-
-       /* Here we are configuring what the VSI should do with the VLAN tag in
-        * the Rx packet. We can either leave the tag in the packet or put it in
-        * the Rx descriptor.
-        */
-       if (ena)
-               /* Strip VLAN tag from Rx packet and put it in the desc */
-               ctxt->info.vlan_flags = ICE_AQ_VSI_VLAN_EMOD_STR_BOTH;
-       else
-               /* Disable stripping. Leave tag in packet */
-               ctxt->info.vlan_flags = ICE_AQ_VSI_VLAN_EMOD_NOTHING;
-
-       /* Allow all packets untagged/tagged */
-       ctxt->info.vlan_flags |= ICE_AQ_VSI_VLAN_MODE_ALL;
-
-       ctxt->info.valid_sections = cpu_to_le16(ICE_AQ_VSI_PROP_VLAN_VALID);
-
-       ret = ice_update_vsi(hw, vsi->idx, ctxt, NULL);
-       if (ret) {
-               dev_err(ice_pf_to_dev(vsi->back), "update VSI for VLAN strip failed, ena = %d err %d aq_err %s\n",
-                       ena, ret, ice_aq_str(hw->adminq.sq_last_status));
-               goto out;
-       }
-
-       vsi->info.vlan_flags = ctxt->info.vlan_flags;
-out:
-       kfree(ctxt);
-       return ret;
-}
-
-/**
  * ice_vsi_start_all_rx_rings - start/enable all of a VSI's Rx rings
  * @vsi: the VSI whose rings are to be enabled
  *
@@ -2321,61 +2178,6 @@ bool ice_vsi_is_vlan_pruning_ena(struct ice_vsi *vsi)
        return (vsi->info.sw_flags2 & ICE_AQ_VSI_SW_FLAG_RX_VLAN_PRUNE_ENA);
 }
 
-/**
- * ice_cfg_vlan_pruning - enable or disable VLAN pruning on the VSI
- * @vsi: VSI to enable or disable VLAN pruning on
- * @ena: set to true to enable VLAN pruning and false to disable it
- *
- * returns 0 if VSI is updated, negative otherwise
- */
-int ice_cfg_vlan_pruning(struct ice_vsi *vsi, bool ena)
-{
-       struct ice_vsi_ctx *ctxt;
-       struct ice_pf *pf;
-       int status;
-
-       if (!vsi)
-               return -EINVAL;
-
-       /* Don't enable VLAN pruning if the netdev is currently in promiscuous
-        * mode. VLAN pruning will be enabled when the interface exits
-        * promiscuous mode if any VLAN filters are active.
-        */
-       if (vsi->netdev && vsi->netdev->flags & IFF_PROMISC && ena)
-               return 0;
-
-       pf = vsi->back;
-       ctxt = kzalloc(sizeof(*ctxt), GFP_KERNEL);
-       if (!ctxt)
-               return -ENOMEM;
-
-       ctxt->info = vsi->info;
-
-       if (ena)
-               ctxt->info.sw_flags2 |= ICE_AQ_VSI_SW_FLAG_RX_VLAN_PRUNE_ENA;
-       else
-               ctxt->info.sw_flags2 &= ~ICE_AQ_VSI_SW_FLAG_RX_VLAN_PRUNE_ENA;
-
-       ctxt->info.valid_sections = cpu_to_le16(ICE_AQ_VSI_PROP_SW_VALID);
-
-       status = ice_update_vsi(&pf->hw, vsi->idx, ctxt, NULL);
-       if (status) {
-               netdev_err(vsi->netdev, "%sabling VLAN pruning on VSI handle: %d, VSI HW ID: %d failed, err = %d, aq_err = %s\n",
-                          ena ? "En" : "Dis", vsi->idx, vsi->vsi_num,
-                          status, ice_aq_str(pf->hw.adminq.sq_last_status));
-               goto err_out;
-       }
-
-       vsi->info.sw_flags2 = ctxt->info.sw_flags2;
-
-       kfree(ctxt);
-       return 0;
-
-err_out:
-       kfree(ctxt);
-       return -EIO;
-}
-
 static void ice_vsi_set_tc_cfg(struct ice_vsi *vsi)
 {
        if (!test_bit(ICE_FLAG_DCB_ENA, vsi->back->flags)) {
@@ -2655,6 +2457,8 @@ ice_vsi_setup(struct ice_pf *pf, struct ice_port_info *pi,
        if (ret)
                goto unroll_get_qs;
 
+       ice_vsi_init_vlan_ops(vsi);
+
        switch (vsi->type) {
        case ICE_VSI_CTRL:
        case ICE_VSI_SWITCHDEV_CTRL:
@@ -2675,17 +2479,6 @@ ice_vsi_setup(struct ice_pf *pf, struct ice_port_info *pi,
                if (ret)
                        goto unroll_vector_base;
 
-               /* Always add VLAN ID 0 switch rule by default. This is needed
-                * in order to allow all untagged and 0 tagged priority traffic
-                * if Rx VLAN pruning is enabled. Also there are cases where we
-                * don't get the call to add VLAN 0 via ice_vlan_rx_add_vid()
-                * so this handles those cases (i.e. adding the PF to a bridge
-                * without the 8021q module loaded).
-                */
-               ret = ice_vsi_add_vlan(vsi, 0, ICE_FWD_TO_VSI);
-               if (ret)
-                       goto unroll_clear_rings;
-
                ice_vsi_map_rings_to_vectors(vsi);
 
                /* ICE_VSI_CTRL does not need RSS so skip RSS processing */
@@ -3318,6 +3111,8 @@ int ice_vsi_rebuild(struct ice_vsi *vsi, bool init_vsi)
        if (vtype == ICE_VSI_VF)
                vf = &pf->vf[vsi->vf_id];
 
+       ice_vsi_init_vlan_ops(vsi);
+
        coalesce = kcalloc(vsi->num_q_vectors,
                           sizeof(struct ice_coalesce_stored), GFP_KERNEL);
        if (!coalesce)
@@ -4131,6 +3926,115 @@ int ice_set_link(struct ice_vsi *vsi, bool ena)
 }
 
 /**
+ * ice_vsi_add_vlan_zero - add VLAN 0 filter(s) for this VSI
+ * @vsi: VSI used to add VLAN filters
+ *
+ * In Single VLAN Mode (SVM), single VLAN filters via ICE_SW_LKUP_VLAN are based
+ * on the inner VLAN ID, so the VLAN TPID (i.e. 0x8100 or 0x88a8) doesn't
+ * matter. In Double VLAN Mode (DVM), outer/single VLAN filters via
+ * ICE_SW_LKUP_VLAN are based on the outer/single VLAN ID + VLAN TPID.
+ *
+ * For both modes add a VLAN 0 + no VLAN TPID filter to handle untagged traffic
+ * when VLAN pruning is enabled. Also, this handles VLAN 0 priority tagged
+ * traffic in SVM, since the VLAN TPID isn't part of filtering.
+ *
+ * If DVM is enabled then an explicit VLAN 0 + VLAN TPID filter needs to be
+ * added to allow VLAN 0 priority tagged traffic in DVM, since the VLAN TPID is
+ * part of filtering.
+ */
+int ice_vsi_add_vlan_zero(struct ice_vsi *vsi)
+{
+       struct ice_vsi_vlan_ops *vlan_ops = ice_get_compat_vsi_vlan_ops(vsi);
+       struct ice_vlan vlan;
+       int err;
+
+       vlan = ICE_VLAN(0, 0, 0);
+       err = vlan_ops->add_vlan(vsi, &vlan);
+       if (err && err != -EEXIST)
+               return err;
+
+       /* in SVM both VLAN 0 filters are identical */
+       if (!ice_is_dvm_ena(&vsi->back->hw))
+               return 0;
+
+       vlan = ICE_VLAN(ETH_P_8021Q, 0, 0);
+       err = vlan_ops->add_vlan(vsi, &vlan);
+       if (err && err != -EEXIST)
+               return err;
+
+       return 0;
+}
+
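Concretely, SVM ends up with a single TPID-less VLAN 0 filter, while DVM adds a
second VLAN 0 filter carrying the 0x8100 TPID for priority-tagged traffic. A
standalone sketch of the two filter tuples; the struct shape only mirrors how
ICE_VLAN() is invoked above and is an assumption, not the driver definition:

    #include <stdint.h>
    #include <stdio.h>

    #define ETH_P_8021Q 0x8100

    /* assumed mirror of struct ice_vlan / ICE_VLAN() as used above */
    struct vlan { uint16_t tpid; uint16_t vid; uint8_t prio; };
    #define VLAN(t, v, p) ((struct vlan){ .tpid = (t), .vid = (v), .prio = (p) })

    int main(void)
    {
            struct vlan untagged = VLAN(0, 0, 0);           /* SVM and DVM */
            struct vlan prio_tag = VLAN(ETH_P_8021Q, 0, 0); /* DVM only */

            printf("filter 1: tpid=0x%04x vid=%u\n", untagged.tpid, untagged.vid);
            printf("filter 2: tpid=0x%04x vid=%u\n", prio_tag.tpid, prio_tag.vid);
            return 0;
    }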
+/**
+ * ice_vsi_del_vlan_zero - delete VLAN 0 filter(s) for this VSI
+ * @vsi: VSI used to remove VLAN filters
+ *
+ * Delete the VLAN 0 filters in the same manner that they were added in
+ * ice_vsi_add_vlan_zero.
+ */
+int ice_vsi_del_vlan_zero(struct ice_vsi *vsi)
+{
+       struct ice_vsi_vlan_ops *vlan_ops = ice_get_compat_vsi_vlan_ops(vsi);
+       struct ice_vlan vlan;
+       int err;
+
+       vlan = ICE_VLAN(0, 0, 0);
+       err = vlan_ops->del_vlan(vsi, &vlan);
+       if (err && err != -EEXIST)
+               return err;
+
+       /* in SVM both VLAN 0 filters are identical */
+       if (!ice_is_dvm_ena(&vsi->back->hw))
+               return 0;
+
+       vlan = ICE_VLAN(ETH_P_8021Q, 0, 0);
+       err = vlan_ops->del_vlan(vsi, &vlan);
+       if (err && err != -EEXIST)
+               return err;
+
+       return 0;
+}
+
+/**
+ * ice_vsi_num_zero_vlans - get number of VLAN 0 filters based on VLAN mode
+ * @vsi: VSI used to get the VLAN mode
+ *
+ * If DVM is enabled, then 2 VLAN 0 filters are added; if SVM is enabled, then
+ * 1 VLAN 0 filter is added. See ice_vsi_add_vlan_zero for more details.
+ */
+static u16 ice_vsi_num_zero_vlans(struct ice_vsi *vsi)
+{
+#define ICE_DVM_NUM_ZERO_VLAN_FLTRS    2
+#define ICE_SVM_NUM_ZERO_VLAN_FLTRS    1
+       /* no VLAN 0 filter is created when a port VLAN is active */
+       if (vsi->type == ICE_VSI_VF &&
+           ice_vf_is_port_vlan_ena(&vsi->back->vf[vsi->vf_id]))
+               return 0;
+       if (ice_is_dvm_ena(&vsi->back->hw))
+               return ICE_DVM_NUM_ZERO_VLAN_FLTRS;
+       else
+               return ICE_SVM_NUM_ZERO_VLAN_FLTRS;
+}
+
+/**
+ * ice_vsi_has_non_zero_vlans - check if VSI has any non-zero VLANs
+ * @vsi: VSI used to determine if any non-zero VLANs have been added
+ */
+bool ice_vsi_has_non_zero_vlans(struct ice_vsi *vsi)
+{
+       return (vsi->num_vlan > ice_vsi_num_zero_vlans(vsi));
+}
+
+/**
+ * ice_vsi_num_non_zero_vlans - get the number of non-zero VLANs for this VSI
+ * @vsi: VSI used to get the number of non-zero VLANs added
+ */
+u16 ice_vsi_num_non_zero_vlans(struct ice_vsi *vsi)
+{
+       return (vsi->num_vlan - ice_vsi_num_zero_vlans(vsi));
+}
+
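With that bookkeeping, the non-zero VLAN count is simply num_vlan minus the
mode-dependent number of VLAN 0 filters. A standalone sketch of the arithmetic
(the counts mirror the helpers above; the example VLANs are made up):

    #include <stdbool.h>
    #include <stdio.h>

    /* mirrors ice_vsi_num_zero_vlans: 2 VLAN 0 filters in DVM, 1 in SVM */
    static unsigned int num_zero_vlans(bool dvm_ena)
    {
            return dvm_ena ? 2 : 1;
    }

    int main(void)
    {
            bool dvm = true;
            /* the VLAN 0 filters plus, say, VLANs 10 and 20 */
            unsigned int num_vlan = num_zero_vlans(dvm) + 2;

            printf("non-zero VLANs: %u, has non-zero: %s\n",
                   num_vlan - num_zero_vlans(dvm),
                   num_vlan > num_zero_vlans(dvm) ? "yes" : "no");
            return 0;
    }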
+/**
  * ice_is_feature_supported
  * @pf: pointer to the struct ice_pf instance
  * @f: feature enum to be checked
index b2ed189..133fc23 100644 (file)
@@ -5,6 +5,7 @@
 #define _ICE_LIB_H_
 
 #include "ice.h"
+#include "ice_vlan.h"
 
 const char *ice_vsi_type_str(enum ice_vsi_type vsi_type);
 
@@ -22,15 +23,6 @@ int ice_vsi_cfg_lan_txqs(struct ice_vsi *vsi);
 
 void ice_vsi_cfg_msix(struct ice_vsi *vsi);
 
-int
-ice_vsi_add_vlan(struct ice_vsi *vsi, u16 vid, enum ice_sw_fwd_act_type action);
-
-int ice_vsi_kill_vlan(struct ice_vsi *vsi, u16 vid);
-
-int ice_vsi_manage_vlan_insertion(struct ice_vsi *vsi);
-
-int ice_vsi_manage_vlan_stripping(struct ice_vsi *vsi, bool ena);
-
 int ice_vsi_start_all_rx_rings(struct ice_vsi *vsi);
 
 int ice_vsi_stop_all_rx_rings(struct ice_vsi *vsi);
@@ -45,8 +37,6 @@ int ice_vsi_stop_xdp_tx_rings(struct ice_vsi *vsi);
 
 bool ice_vsi_is_vlan_pruning_ena(struct ice_vsi *vsi);
 
-int ice_cfg_vlan_pruning(struct ice_vsi *vsi, bool ena);
-
 void ice_cfg_sw_lldp(struct ice_vsi *vsi, bool tx, bool create);
 
 int ice_set_link(struct ice_vsi *vsi, bool ena);
@@ -132,7 +122,10 @@ void ice_vsi_ctx_clear_antispoof(struct ice_vsi_ctx *ctx);
 void ice_vsi_ctx_set_allow_override(struct ice_vsi_ctx *ctx);
 
 void ice_vsi_ctx_clear_allow_override(struct ice_vsi_ctx *ctx);
-
+int ice_vsi_add_vlan_zero(struct ice_vsi *vsi);
+int ice_vsi_del_vlan_zero(struct ice_vsi *vsi);
+bool ice_vsi_has_non_zero_vlans(struct ice_vsi *vsi);
+u16 ice_vsi_num_non_zero_vlans(struct ice_vsi *vsi);
 bool ice_is_feature_supported(struct ice_pf *pf, enum ice_feature f);
 void ice_clear_feature_support(struct ice_pf *pf, enum ice_feature f);
 void ice_init_feature_support(struct ice_pf *pf);
index 17a9bb4..ce90ebf 100644 (file)
@@ -21,6 +21,7 @@
 #include "ice_trace.h"
 #include "ice_eswitch.h"
 #include "ice_tc_lib.h"
+#include "ice_vsi_vlan_ops.h"
 
 #define DRV_SUMMARY    "Intel(R) Ethernet Connection E800 Series Linux Driver"
 static const char ice_driver_string[] = DRV_SUMMARY;
@@ -244,7 +245,7 @@ static int ice_set_promisc(struct ice_vsi *vsi, u8 promisc_m)
        if (vsi->type != ICE_VSI_PF)
                return 0;
 
-       if (vsi->num_vlan > 1)
+       if (ice_vsi_has_non_zero_vlans(vsi))
                status = ice_fltr_set_vlan_vsi_promisc(&vsi->back->hw, vsi, promisc_m);
        else
                status = ice_fltr_set_vsi_promisc(&vsi->back->hw, vsi->idx, promisc_m, 0);
@@ -264,7 +265,7 @@ static int ice_clear_promisc(struct ice_vsi *vsi, u8 promisc_m)
        if (vsi->type != ICE_VSI_PF)
                return 0;
 
-       if (vsi->num_vlan > 1)
+       if (ice_vsi_has_non_zero_vlans(vsi))
                status = ice_fltr_clear_vlan_vsi_promisc(&vsi->back->hw, vsi, promisc_m);
        else
                status = ice_fltr_clear_vsi_promisc(&vsi->back->hw, vsi->idx, promisc_m, 0);
@@ -279,6 +280,7 @@ static int ice_clear_promisc(struct ice_vsi *vsi, u8 promisc_m)
  */
 static int ice_vsi_sync_fltr(struct ice_vsi *vsi)
 {
+       struct ice_vsi_vlan_ops *vlan_ops = ice_get_compat_vsi_vlan_ops(vsi);
        struct device *dev = ice_pf_to_dev(vsi->back);
        struct net_device *netdev = vsi->netdev;
        bool promisc_forced_on = false;
@@ -352,7 +354,7 @@ static int ice_vsi_sync_fltr(struct ice_vsi *vsi)
        /* check for changes in promiscuous modes */
        if (changed_flags & IFF_ALLMULTI) {
                if (vsi->current_netdev_flags & IFF_ALLMULTI) {
-                       if (vsi->num_vlan > 1)
+                       if (ice_vsi_has_non_zero_vlans(vsi))
                                promisc_m = ICE_MCAST_VLAN_PROMISC_BITS;
                        else
                                promisc_m = ICE_MCAST_PROMISC_BITS;
@@ -366,7 +368,7 @@ static int ice_vsi_sync_fltr(struct ice_vsi *vsi)
                        }
                } else {
                        /* !(vsi->current_netdev_flags & IFF_ALLMULTI) */
-                       if (vsi->num_vlan > 1)
+                       if (ice_vsi_has_non_zero_vlans(vsi))
                                promisc_m = ICE_MCAST_VLAN_PROMISC_BITS;
                        else
                                promisc_m = ICE_MCAST_PROMISC_BITS;
@@ -396,7 +398,7 @@ static int ice_vsi_sync_fltr(struct ice_vsi *vsi)
                                        goto out_promisc;
                                }
                                err = 0;
-                               ice_cfg_vlan_pruning(vsi, false);
+                               vlan_ops->dis_rx_filtering(vsi);
                        }
                } else {
                        /* Clear Rx filter to remove traffic from wire */
@@ -409,8 +411,9 @@ static int ice_vsi_sync_fltr(struct ice_vsi *vsi)
                                                IFF_PROMISC;
                                        goto out_promisc;
                                }
-                               if (vsi->num_vlan > 1)
-                                       ice_cfg_vlan_pruning(vsi, true);
+                               if (vsi->current_netdev_flags &
+                                   NETIF_F_HW_VLAN_CTAG_FILTER)
+                                       vlan_ops->ena_rx_filtering(vsi);
                        }
                }
        }
@@ -2498,10 +2501,10 @@ static int ice_xdp_alloc_setup_rings(struct ice_vsi *vsi)
                xdp_ring->reg_idx = vsi->txq_map[xdp_q_idx];
                xdp_ring->vsi = vsi;
                xdp_ring->netdev = NULL;
-               xdp_ring->next_dd = ICE_TX_THRESH - 1;
-               xdp_ring->next_rs = ICE_TX_THRESH - 1;
                xdp_ring->dev = dev;
                xdp_ring->count = vsi->num_tx_desc;
+               xdp_ring->next_dd = ICE_RING_QUARTER(xdp_ring) - 1;
+               xdp_ring->next_rs = ICE_RING_QUARTER(xdp_ring) - 1;
                WRITE_ONCE(vsi->xdp_rings[i], xdp_ring);
                if (ice_setup_tx_ring(xdp_ring))
                        goto free_xdp_rings;
@@ -3233,6 +3236,7 @@ static void ice_set_ops(struct net_device *netdev)
 static void ice_set_netdev_features(struct net_device *netdev)
 {
        struct ice_pf *pf = ice_netdev_to_pf(netdev);
+       bool is_dvm_ena = ice_is_dvm_ena(&pf->hw);
        netdev_features_t csumo_features;
        netdev_features_t vlano_features;
        netdev_features_t dflt_features;
@@ -3259,6 +3263,10 @@ static void ice_set_netdev_features(struct net_device *netdev)
                         NETIF_F_HW_VLAN_CTAG_TX     |
                         NETIF_F_HW_VLAN_CTAG_RX;
 
+       /* Enable CTAG/STAG filtering by default in Double VLAN Mode (DVM) */
+       if (is_dvm_ena)
+               vlano_features |= NETIF_F_HW_VLAN_STAG_FILTER;
+
        tso_features = NETIF_F_TSO                      |
                       NETIF_F_TSO_ECN                  |
                       NETIF_F_TSO6                     |
@@ -3290,6 +3298,15 @@ static void ice_set_netdev_features(struct net_device *netdev)
                                   tso_features;
        netdev->vlan_features |= dflt_features | csumo_features |
                                 tso_features;
+
+       /* advertise support but don't enable by default since only one type of
+        * VLAN offload can be enabled at a time (i.e. CTAG or STAG). When one
+        * type is turned on, the other has to be turned off. This is enforced
+        * by the ice_fix_features() ndo callback.
+        */
+       if (is_dvm_ena)
+               netdev->hw_features |= NETIF_F_HW_VLAN_STAG_RX |
+                       NETIF_F_HW_VLAN_STAG_TX;
 }
 
 /**
@@ -3405,34 +3422,31 @@ ice_lb_vsi_setup(struct ice_pf *pf, struct ice_port_info *pi)
 /**
  * ice_vlan_rx_add_vid - Add a VLAN ID filter to HW offload
  * @netdev: network interface to be adjusted
- * @proto: unused protocol
+ * @proto: VLAN TPID
  * @vid: VLAN ID to be added
  *
  * net_device_ops implementation for adding VLAN IDs
  */
 static int
-ice_vlan_rx_add_vid(struct net_device *netdev, __always_unused __be16 proto,
-                   u16 vid)
+ice_vlan_rx_add_vid(struct net_device *netdev, __be16 proto, u16 vid)
 {
        struct ice_netdev_priv *np = netdev_priv(netdev);
+       struct ice_vsi_vlan_ops *vlan_ops;
        struct ice_vsi *vsi = np->vsi;
+       struct ice_vlan vlan;
        int ret;
 
        /* VLAN 0 is added by default during load/reset */
        if (!vid)
                return 0;
 
-       /* Enable VLAN pruning when a VLAN other than 0 is added */
-       if (!ice_vsi_is_vlan_pruning_ena(vsi)) {
-               ret = ice_cfg_vlan_pruning(vsi, true);
-               if (ret)
-                       return ret;
-       }
+       vlan_ops = ice_get_compat_vsi_vlan_ops(vsi);
 
        /* Add a switch rule for this VLAN ID so its corresponding VLAN tagged
         * packets aren't pruned by the device's internal switch on Rx
         */
-       ret = ice_vsi_add_vlan(vsi, vid, ICE_FWD_TO_VSI);
+       vlan = ICE_VLAN(be16_to_cpu(proto), vid, 0);
+       ret = vlan_ops->add_vlan(vsi, &vlan);
        if (!ret)
                set_bit(ICE_VSI_VLAN_FLTR_CHANGED, vsi->state);
 
@@ -3442,36 +3456,36 @@ ice_vlan_rx_add_vid(struct net_device *netdev, __always_unused __be16 proto,
 /**
  * ice_vlan_rx_kill_vid - Remove a VLAN ID filter from HW offload
  * @netdev: network interface to be adjusted
- * @proto: unused protocol
+ * @proto: VLAN TPID
  * @vid: VLAN ID to be removed
  *
  * net_device_ops implementation for removing VLAN IDs
  */
 static int
-ice_vlan_rx_kill_vid(struct net_device *netdev, __always_unused __be16 proto,
-                    u16 vid)
+ice_vlan_rx_kill_vid(struct net_device *netdev, __be16 proto, u16 vid)
 {
        struct ice_netdev_priv *np = netdev_priv(netdev);
+       struct ice_vsi_vlan_ops *vlan_ops;
        struct ice_vsi *vsi = np->vsi;
+       struct ice_vlan vlan;
        int ret;
 
        /* don't allow removal of VLAN 0 */
        if (!vid)
                return 0;
 
-       /* Make sure ice_vsi_kill_vlan is successful before updating VLAN
+       vlan_ops = ice_get_compat_vsi_vlan_ops(vsi);
+
+       /* Make sure VLAN delete is successful before updating VLAN
         * information
         */
-       ret = ice_vsi_kill_vlan(vsi, vid);
+       vlan = ICE_VLAN(be16_to_cpu(proto), vid, 0);
+       ret = vlan_ops->del_vlan(vsi, &vlan);
        if (ret)
                return ret;
 
-       /* Disable pruning when VLAN 0 is the only VLAN rule */
-       if (vsi->num_vlan == 1 && ice_vsi_is_vlan_pruning_ena(vsi))
-               ret = ice_cfg_vlan_pruning(vsi, false);
-
        set_bit(ICE_VSI_VLAN_FLTR_CHANGED, vsi->state);
-       return ret;
+       return 0;
 }
 
 /**
@@ -3540,12 +3554,17 @@ static int ice_tc_indir_block_register(struct ice_vsi *vsi)
 static int ice_setup_pf_sw(struct ice_pf *pf)
 {
        struct device *dev = ice_pf_to_dev(pf);
+       bool dvm = ice_is_dvm_ena(&pf->hw);
        struct ice_vsi *vsi;
        int status;
 
        if (ice_is_reset_in_progress(pf->state))
                return -EBUSY;
 
+       status = ice_aq_set_port_params(pf->hw.port_info, dvm, NULL);
+       if (status)
+               return -EIO;
+
        vsi = ice_pf_vsi_setup(pf, pf->hw.port_info);
        if (!vsi)
                return -ENOMEM;
@@ -4067,8 +4086,8 @@ static void ice_set_safe_mode_vlan_cfg(struct ice_pf *pf)
        ctxt->info.sw_flags2 &= ~ICE_AQ_VSI_SW_FLAG_RX_VLAN_PRUNE_ENA;
 
        /* allow all VLANs on Tx and don't strip on Rx */
-       ctxt->info.vlan_flags = ICE_AQ_VSI_VLAN_MODE_ALL |
-               ICE_AQ_VSI_VLAN_EMOD_NOTHING;
+       ctxt->info.inner_vlan_flags = ICE_AQ_VSI_INNER_VLAN_TX_MODE_ALL |
+               ICE_AQ_VSI_INNER_VLAN_EMODE_NOTHING;
 
        status = ice_update_vsi(hw, vsi->idx, ctxt, NULL);
        if (status) {
@@ -4077,7 +4096,7 @@ static void ice_set_safe_mode_vlan_cfg(struct ice_pf *pf)
        } else {
                vsi->info.sec_flags = ctxt->info.sec_flags;
                vsi->info.sw_flags2 = ctxt->info.sw_flags2;
-               vsi->info.vlan_flags = ctxt->info.vlan_flags;
+               vsi->info.inner_vlan_flags = ctxt->info.inner_vlan_flags;
        }
 
        kfree(ctxt);
@@ -4462,8 +4481,6 @@ ice_probe(struct pci_dev *pdev, const struct pci_device_id __always_unused *ent)
 
        /* set up for high or low DMA */
        err = dma_set_mask_and_coherent(dev, DMA_BIT_MASK(64));
-       if (err)
-               err = dma_set_mask_and_coherent(dev, DMA_BIT_MASK(32));
        if (err) {
                dev_err(dev, "DMA configuration failed: 0x%x\n", err);
                return err;
@@ -5573,6 +5590,194 @@ ice_fdb_del(struct ndmsg *ndm, __always_unused struct nlattr *tb[],
        return err;
 }
 
+#define NETIF_VLAN_OFFLOAD_FEATURES    (NETIF_F_HW_VLAN_CTAG_RX | \
+                                        NETIF_F_HW_VLAN_CTAG_TX | \
+                                        NETIF_F_HW_VLAN_STAG_RX | \
+                                        NETIF_F_HW_VLAN_STAG_TX)
+
+#define NETIF_VLAN_FILTERING_FEATURES  (NETIF_F_HW_VLAN_CTAG_FILTER | \
+                                        NETIF_F_HW_VLAN_STAG_FILTER)
+
+/**
+ * ice_fix_features - fix the netdev features flags based on device limitations
+ * @netdev: ptr to the netdev that flags are being fixed on
+ * @features: features that need to be checked and possibly fixed
+ *
+ * Make sure any fixups are made to features in this callback. This enables the
+ * driver to avoid checking for unsupported configurations elsewhere, since
+ * that is the responsibility of this callback.
+ *
+ * Single VLAN Mode (SVM) Supported Features:
+ *     NETIF_F_HW_VLAN_CTAG_FILTER
+ *     NETIF_F_HW_VLAN_CTAG_RX
+ *     NETIF_F_HW_VLAN_CTAG_TX
+ *
+ * Double VLAN Mode (DVM) Supported Features:
+ *     NETIF_F_HW_VLAN_CTAG_FILTER
+ *     NETIF_F_HW_VLAN_CTAG_RX
+ *     NETIF_F_HW_VLAN_CTAG_TX
+ *
+ *     NETIF_F_HW_VLAN_STAG_FILTER
+ *     NETIF_F_HW_VLAN_STAG_RX
+ *     NETIF_F_HW_VLAN_STAG_TX
+ *
+ * Features that need fixing:
+ *     Cannot simultaneously enable CTAG and STAG stripping and/or insertion.
+ *     These are mutually exclusive as the VSI context cannot support multiple
+ *     VLAN ethertypes simultaneously for stripping and/or insertion. If a
+ *     request violates this, default to clearing the requested STAG offload
+ *     settings.
+ *
+ *     All supported filtering has to be enabled or disabled together. For
+ *     example, in DVM, CTAG and STAG filtering have to be enabled and disabled
+ *     together. If a request violates this, default to disabling VLAN
+ *     filtering, since there is currently no way to enable/disable VLAN
+ *     filtering per VLAN ethertype when using VLAN prune rules.
+ */
+static netdev_features_t
+ice_fix_features(struct net_device *netdev, netdev_features_t features)
+{
+       struct ice_netdev_priv *np = netdev_priv(netdev);
+       netdev_features_t supported_vlan_filtering;
+       netdev_features_t requested_vlan_filtering;
+       struct ice_vsi *vsi = np->vsi;
+
+       requested_vlan_filtering = features & NETIF_VLAN_FILTERING_FEATURES;
+
+       /* make sure supported_vlan_filtering works for both SVM and DVM */
+       supported_vlan_filtering = NETIF_F_HW_VLAN_CTAG_FILTER;
+       if (ice_is_dvm_ena(&vsi->back->hw))
+               supported_vlan_filtering |= NETIF_F_HW_VLAN_STAG_FILTER;
+
+       if (requested_vlan_filtering &&
+           requested_vlan_filtering != supported_vlan_filtering) {
+               if (requested_vlan_filtering & NETIF_F_HW_VLAN_CTAG_FILTER) {
+                       netdev_warn(netdev, "cannot support requested VLAN filtering settings, enabling all supported VLAN filtering settings\n");
+                       features |= supported_vlan_filtering;
+               } else {
+                       netdev_warn(netdev, "cannot support requested VLAN filtering settings, clearing all supported VLAN filtering settings\n");
+                       features &= ~supported_vlan_filtering;
+               }
+       }
+
+       if ((features & (NETIF_F_HW_VLAN_CTAG_RX | NETIF_F_HW_VLAN_CTAG_TX)) &&
+           (features & (NETIF_F_HW_VLAN_STAG_RX | NETIF_F_HW_VLAN_STAG_TX))) {
+               netdev_warn(netdev, "cannot support CTAG and STAG VLAN stripping and/or insertion simultaneously since CTAG and STAG offloads are mutually exclusive, clearing STAG offload settings\n");
+               features &= ~(NETIF_F_HW_VLAN_STAG_RX |
+                             NETIF_F_HW_VLAN_STAG_TX);
+       }
+
+       return features;
+}
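
To make the precedence above concrete, here is a minimal standalone C sketch of the two fixups ice_fix_features() applies, using hypothetical feature bits rather than the kernel's NETIF_F_* values: filtering is all-or-nothing and follows the CTAG request, and CTAG offloads win over STAG offloads.

#include <stdint.h>
#include <stdio.h>

/* hypothetical stand-ins for the NETIF_F_* bits used above */
#define F_CTAG_RX     (1u << 0)
#define F_CTAG_TX     (1u << 1)
#define F_STAG_RX     (1u << 2)
#define F_STAG_TX     (1u << 3)
#define F_CTAG_FILTER (1u << 4)
#define F_STAG_FILTER (1u << 5)

static uint32_t fix_features(uint32_t features, int dvm)
{
        uint32_t supported_filtering = F_CTAG_FILTER | (dvm ? F_STAG_FILTER : 0);
        uint32_t requested_filtering = features & (F_CTAG_FILTER | F_STAG_FILTER);

        /* filtering is all-or-nothing: follow the CTAG request */
        if (requested_filtering && requested_filtering != supported_filtering) {
                if (requested_filtering & F_CTAG_FILTER)
                        features |= supported_filtering;
                else
                        features &= ~supported_filtering;
        }

        /* CTAG and STAG stripping/insertion are mutually exclusive; CTAG wins */
        if ((features & (F_CTAG_RX | F_CTAG_TX)) &&
            (features & (F_STAG_RX | F_STAG_TX)))
                features &= ~(F_STAG_RX | F_STAG_TX);

        return features;
}

int main(void)
{
        /* request CTAG+STAG stripping in DVM: the STAG offload is cleared */
        printf("%#x\n", fix_features(F_CTAG_RX | F_STAG_RX, 1)); /* 0x1 */
        return 0;
}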
+
+/**
+ * ice_set_vlan_offload_features - set VLAN offload features for the PF VSI
+ * @vsi: PF's VSI
+ * @features: features used to determine VLAN offload settings
+ *
+ * First, determine the vlan_ethertype based on the VLAN offload bits in
+ * features. Then determine if stripping and insertion should be enabled or
+ * disabled. Finally enable or disable VLAN stripping and insertion.
+ */
+static int
+ice_set_vlan_offload_features(struct ice_vsi *vsi, netdev_features_t features)
+{
+       bool enable_stripping = true, enable_insertion = true;
+       struct ice_vsi_vlan_ops *vlan_ops;
+       int strip_err = 0, insert_err = 0;
+       u16 vlan_ethertype = 0;
+
+       vlan_ops = ice_get_compat_vsi_vlan_ops(vsi);
+
+       if (features & (NETIF_F_HW_VLAN_STAG_RX | NETIF_F_HW_VLAN_STAG_TX))
+               vlan_ethertype = ETH_P_8021AD;
+       else if (features & (NETIF_F_HW_VLAN_CTAG_RX | NETIF_F_HW_VLAN_CTAG_TX))
+               vlan_ethertype = ETH_P_8021Q;
+
+       if (!(features & (NETIF_F_HW_VLAN_STAG_RX | NETIF_F_HW_VLAN_CTAG_RX)))
+               enable_stripping = false;
+       if (!(features & (NETIF_F_HW_VLAN_STAG_TX | NETIF_F_HW_VLAN_CTAG_TX)))
+               enable_insertion = false;
+
+       if (enable_stripping)
+               strip_err = vlan_ops->ena_stripping(vsi, vlan_ethertype);
+       else
+               strip_err = vlan_ops->dis_stripping(vsi);
+
+       if (enable_insertion)
+               insert_err = vlan_ops->ena_insertion(vsi, vlan_ethertype);
+       else
+               insert_err = vlan_ops->dis_insertion(vsi);
+
+       if (strip_err || insert_err)
+               return -EIO;
+
+       return 0;
+}
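
The ethertype selection above is easy to model in isolation. A sketch, again with hypothetical feature bits, of how one feature word collapses into a single TPID plus strip/insert booleans; it assumes ice_fix_features() has already rejected mixed CTAG/STAG requests.

#include <stdint.h>
#include <stdio.h>

#define ETH_P_8021Q  0x8100
#define ETH_P_8021AD 0x88A8

/* hypothetical feature bits for illustration */
#define F_CTAG_RX (1u << 0)
#define F_CTAG_TX (1u << 1)
#define F_STAG_RX (1u << 2)
#define F_STAG_TX (1u << 3)

struct vlan_offload_cfg {
        uint16_t ethertype;   /* 0 means neither offload requested */
        int strip;            /* enable Rx stripping */
        int insert;           /* enable Tx insertion */
};

static struct vlan_offload_cfg derive_cfg(uint32_t features)
{
        struct vlan_offload_cfg cfg = { 0, 1, 1 };

        /* STAG bits take priority; the fix-features step guarantees
         * CTAG and STAG offloads are never both set
         */
        if (features & (F_STAG_RX | F_STAG_TX))
                cfg.ethertype = ETH_P_8021AD;
        else if (features & (F_CTAG_RX | F_CTAG_TX))
                cfg.ethertype = ETH_P_8021Q;

        if (!(features & (F_STAG_RX | F_CTAG_RX)))
                cfg.strip = 0;
        if (!(features & (F_STAG_TX | F_CTAG_TX)))
                cfg.insert = 0;

        return cfg;
}

int main(void)
{
        struct vlan_offload_cfg cfg = derive_cfg(F_STAG_RX); /* stripping only */

        printf("tpid=0x%04x strip=%d insert=%d\n",
               cfg.ethertype, cfg.strip, cfg.insert); /* 0x88a8 1 0 */
        return 0;
}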
+
+/**
+ * ice_set_vlan_filtering_features - set VLAN filtering features for the PF VSI
+ * @vsi: PF's VSI
+ * @features: features used to determine VLAN filtering settings
+ *
+ * Enable or disable Rx VLAN filtering based on the VLAN filtering bits in the
+ * features.
+ */
+static int
+ice_set_vlan_filtering_features(struct ice_vsi *vsi, netdev_features_t features)
+{
+       struct ice_vsi_vlan_ops *vlan_ops = ice_get_compat_vsi_vlan_ops(vsi);
+       int err = 0;
+
+       /* support Single VLAN Mode (SVM) and Double VLAN Mode (DVM) by checking
+        * if either bit is set
+        */
+       if (features &
+           (NETIF_F_HW_VLAN_CTAG_FILTER | NETIF_F_HW_VLAN_STAG_FILTER))
+               err = vlan_ops->ena_rx_filtering(vsi);
+       else
+               err = vlan_ops->dis_rx_filtering(vsi);
+
+       return err;
+}
+
+/**
+ * ice_set_vlan_features - set VLAN settings based on suggested feature set
+ * @netdev: ptr to the netdev being adjusted
+ * @features: the feature set that the stack is suggesting
+ *
+ * Only update VLAN settings if the requested_vlan_features are different from
+ * the current_vlan_features.
+ */
+static int
+ice_set_vlan_features(struct net_device *netdev, netdev_features_t features)
+{
+       netdev_features_t current_vlan_features, requested_vlan_features;
+       struct ice_netdev_priv *np = netdev_priv(netdev);
+       struct ice_vsi *vsi = np->vsi;
+       int err;
+
+       current_vlan_features = netdev->features & NETIF_VLAN_OFFLOAD_FEATURES;
+       requested_vlan_features = features & NETIF_VLAN_OFFLOAD_FEATURES;
+       if (current_vlan_features ^ requested_vlan_features) {
+               err = ice_set_vlan_offload_features(vsi, features);
+               if (err)
+                       return err;
+       }
+
+       current_vlan_features = netdev->features &
+               NETIF_VLAN_FILTERING_FEATURES;
+       requested_vlan_features = features & NETIF_VLAN_FILTERING_FEATURES;
+       if (current_vlan_features ^ requested_vlan_features) {
+               err = ice_set_vlan_filtering_features(vsi, features);
+               if (err)
+                       return err;
+       }
+
+       return 0;
+}
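
Masking a feature word and comparing old against new is the idiom used above to ask whether any bit of a given subset changed. A tiny self-contained sketch of the pattern:

#include <assert.h>
#include <stdint.h>

static inline uint64_t feature_subset_changed(uint64_t old_feat,
                                              uint64_t new_feat, uint64_t mask)
{
        /* nonzero iff any bit covered by mask differs between old and new;
         * equivalent to (old & mask) ^ (new & mask)
         */
        return (old_feat ^ new_feat) & mask;
}

int main(void)
{
        assert(feature_subset_changed(0x3, 0x1, 0x2));  /* bit 1 changed */
        assert(!feature_subset_changed(0x3, 0x7, 0x2)); /* bit 1 unchanged */
        return 0;
}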
+
 /**
  * ice_set_features - set the netdev feature flags
  * @netdev: ptr to the netdev being adjusted
@@ -5607,26 +5812,9 @@ ice_set_features(struct net_device *netdev, netdev_features_t features)
                 netdev->features & NETIF_F_RXHASH)
                ice_vsi_manage_rss_lut(vsi, false);
 
-       if ((features & NETIF_F_HW_VLAN_CTAG_RX) &&
-           !(netdev->features & NETIF_F_HW_VLAN_CTAG_RX))
-               ret = ice_vsi_manage_vlan_stripping(vsi, true);
-       else if (!(features & NETIF_F_HW_VLAN_CTAG_RX) &&
-                (netdev->features & NETIF_F_HW_VLAN_CTAG_RX))
-               ret = ice_vsi_manage_vlan_stripping(vsi, false);
-
-       if ((features & NETIF_F_HW_VLAN_CTAG_TX) &&
-           !(netdev->features & NETIF_F_HW_VLAN_CTAG_TX))
-               ret = ice_vsi_manage_vlan_insertion(vsi);
-       else if (!(features & NETIF_F_HW_VLAN_CTAG_TX) &&
-                (netdev->features & NETIF_F_HW_VLAN_CTAG_TX))
-               ret = ice_vsi_manage_vlan_insertion(vsi);
-
-       if ((features & NETIF_F_HW_VLAN_CTAG_FILTER) &&
-           !(netdev->features & NETIF_F_HW_VLAN_CTAG_FILTER))
-               ret = ice_cfg_vlan_pruning(vsi, true);
-       else if (!(features & NETIF_F_HW_VLAN_CTAG_FILTER) &&
-                (netdev->features & NETIF_F_HW_VLAN_CTAG_FILTER))
-               ret = ice_cfg_vlan_pruning(vsi, false);
+       ret = ice_set_vlan_features(netdev, features);
+       if (ret)
+               return ret;
 
        if ((features & NETIF_F_NTUPLE) &&
            !(netdev->features & NETIF_F_NTUPLE)) {
@@ -5650,23 +5838,26 @@ ice_set_features(struct net_device *netdev, netdev_features_t features)
        else
                clear_bit(ICE_FLAG_CLS_FLOWER, pf->flags);
 
-       return ret;
+       return 0;
 }
 
 /**
- * ice_vsi_vlan_setup - Setup VLAN offload properties on a VSI
+ * ice_vsi_vlan_setup - Setup VLAN offload properties on a PF VSI
  * @vsi: VSI to setup VLAN properties for
  */
 static int ice_vsi_vlan_setup(struct ice_vsi *vsi)
 {
-       int ret = 0;
+       int err;
 
-       if (vsi->netdev->features & NETIF_F_HW_VLAN_CTAG_RX)
-               ret = ice_vsi_manage_vlan_stripping(vsi, true);
-       if (vsi->netdev->features & NETIF_F_HW_VLAN_CTAG_TX)
-               ret = ice_vsi_manage_vlan_insertion(vsi);
+       err = ice_set_vlan_offload_features(vsi, vsi->netdev->features);
+       if (err)
+               return err;
 
-       return ret;
+       err = ice_set_vlan_filtering_features(vsi, vsi->netdev->features);
+       if (err)
+               return err;
+
+       return ice_vsi_add_vlan_zero(vsi);
 }
 
 /**
@@ -6267,11 +6458,12 @@ static void ice_napi_disable_all(struct ice_vsi *vsi)
  */
 int ice_down(struct ice_vsi *vsi)
 {
-       int i, tx_err, rx_err, link_err = 0;
+       int i, tx_err, rx_err, link_err = 0, vlan_err = 0;
 
        WARN_ON(!test_bit(ICE_VSI_DOWN, vsi->state));
 
        if (vsi->netdev && vsi->type == ICE_VSI_PF) {
+               vlan_err = ice_vsi_del_vlan_zero(vsi);
                if (!ice_is_e810(&vsi->back->hw))
                        ice_ptp_link_change(vsi->back, vsi->back->hw.pf_id, false);
                netif_carrier_off(vsi->netdev);
@@ -6313,7 +6505,7 @@ int ice_down(struct ice_vsi *vsi)
        ice_for_each_rxq(vsi, i)
                ice_clean_rx_ring(vsi->rx_rings[i]);
 
-       if (tx_err || rx_err || link_err) {
+       if (tx_err || rx_err || link_err || vlan_err) {
                netdev_err(vsi->netdev, "Failed to close VSI 0x%04X on switch 0x%04X\n",
                           vsi->vsi_num, vsi->vsw->sw_id);
                return -EIO;
@@ -6623,6 +6815,7 @@ static void ice_rebuild(struct ice_pf *pf, enum ice_reset_req reset_type)
 {
        struct device *dev = ice_pf_to_dev(pf);
        struct ice_hw *hw = &pf->hw;
+       bool dvm;
        int err;
 
        if (test_bit(ICE_DOWN, pf->state))
@@ -6686,6 +6879,12 @@ static void ice_rebuild(struct ice_pf *pf, enum ice_reset_req reset_type)
                goto err_init_ctrlq;
        }
 
+       dvm = ice_is_dvm_ena(hw);
+
+       err = ice_aq_set_port_params(pf->hw.port_info, dvm, NULL);
+       if (err)
+               goto err_init_ctrlq;
+
        err = ice_sched_init_port(hw->port_info);
        if (err)
                goto err_sched_init_port;
@@ -8594,6 +8793,7 @@ static const struct net_device_ops ice_netdev_ops = {
        .ndo_start_xmit = ice_start_xmit,
        .ndo_select_queue = ice_select_queue,
        .ndo_features_check = ice_features_check,
+       .ndo_fix_features = ice_fix_features,
        .ndo_set_rx_mode = ice_set_rx_mode,
        .ndo_set_mac_address = ice_set_mac_address,
        .ndo_validate_addr = eth_validate_addr,
index f57c414..380e8ae 100644 (file)
@@ -9,6 +9,7 @@
 #ifndef CONFIG_64BIT
 #include <linux/io-64-nonatomic-lo-hi.h>
 #endif
+#include <net/udp_tunnel.h>
 
 #define wr32(a, reg, value)    writel((value), ((a)->hw_addr + (reg)))
 #define rd32(a, reg)           readl((a)->hw_addr + (reg))
diff --git a/drivers/net/ethernet/intel/ice/ice_pf_vsi_vlan_ops.c b/drivers/net/ethernet/intel/ice/ice_pf_vsi_vlan_ops.c
new file mode 100644 (file)
index 0000000..976a03d
--- /dev/null
@@ -0,0 +1,38 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (C) 2019-2021, Intel Corporation. */
+
+#include "ice_vsi_vlan_ops.h"
+#include "ice_vsi_vlan_lib.h"
+#include "ice_vlan_mode.h"
+#include "ice.h"
+#include "ice_pf_vsi_vlan_ops.h"
+
+void ice_pf_vsi_init_vlan_ops(struct ice_vsi *vsi)
+{
+       struct ice_vsi_vlan_ops *vlan_ops;
+
+       if (ice_is_dvm_ena(&vsi->back->hw)) {
+               vlan_ops = &vsi->outer_vlan_ops;
+
+               vlan_ops->add_vlan = ice_vsi_add_vlan;
+               vlan_ops->del_vlan = ice_vsi_del_vlan;
+               vlan_ops->ena_stripping = ice_vsi_ena_outer_stripping;
+               vlan_ops->dis_stripping = ice_vsi_dis_outer_stripping;
+               vlan_ops->ena_insertion = ice_vsi_ena_outer_insertion;
+               vlan_ops->dis_insertion = ice_vsi_dis_outer_insertion;
+               vlan_ops->ena_rx_filtering = ice_vsi_ena_rx_vlan_filtering;
+               vlan_ops->dis_rx_filtering = ice_vsi_dis_rx_vlan_filtering;
+       } else {
+               vlan_ops = &vsi->inner_vlan_ops;
+
+               vlan_ops->add_vlan = ice_vsi_add_vlan;
+               vlan_ops->del_vlan = ice_vsi_del_vlan;
+               vlan_ops->ena_stripping = ice_vsi_ena_inner_stripping;
+               vlan_ops->dis_stripping = ice_vsi_dis_inner_stripping;
+               vlan_ops->ena_insertion = ice_vsi_ena_inner_insertion;
+               vlan_ops->dis_insertion = ice_vsi_dis_inner_insertion;
+               vlan_ops->ena_rx_filtering = ice_vsi_ena_rx_vlan_filtering;
+               vlan_ops->dis_rx_filtering = ice_vsi_dis_rx_vlan_filtering;
+       }
+}
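
The file above follows a standard kernel pattern: select a table of function pointers once at init time, based on the device's VLAN mode, so callers never branch on DVM vs. SVM themselves. A reduced sketch of the idea with illustrative names:

#include <stdio.h>

struct vsi;

struct vlan_ops {
        int (*ena_stripping)(struct vsi *vsi, unsigned short ethertype);
        int (*dis_stripping)(struct vsi *vsi);
};

static int ena_outer(struct vsi *v, unsigned short tpid)
{ printf("outer strip on, tpid 0x%x\n", tpid); return 0; }
static int dis_outer(struct vsi *v)
{ printf("outer strip off\n"); return 0; }
static int ena_inner(struct vsi *v, unsigned short tpid)
{ printf("inner strip on, tpid 0x%x\n", tpid); return 0; }
static int dis_inner(struct vsi *v)
{ printf("inner strip off\n"); return 0; }

/* chosen once, based on the device's VLAN mode */
static void init_vlan_ops(struct vlan_ops *ops, int dvm)
{
        if (dvm) {
                ops->ena_stripping = ena_outer;
                ops->dis_stripping = dis_outer;
        } else {
                ops->ena_stripping = ena_inner;
                ops->dis_stripping = dis_inner;
        }
}

int main(void)
{
        struct vlan_ops ops;

        init_vlan_ops(&ops, 1);          /* DVM */
        ops.ena_stripping(NULL, 0x88A8); /* callers never test the mode */
        return 0;
}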
+
diff --git a/drivers/net/ethernet/intel/ice/ice_pf_vsi_vlan_ops.h b/drivers/net/ethernet/intel/ice/ice_pf_vsi_vlan_ops.h
new file mode 100644 (file)
index 0000000..6741ec8
--- /dev/null
@@ -0,0 +1,13 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (C) 2019-2021, Intel Corporation. */
+
+#ifndef _ICE_PF_VSI_VLAN_OPS_H_
+#define _ICE_PF_VSI_VLAN_OPS_H_
+
+#include "ice_vsi_vlan_ops.h"
+
+struct ice_vsi;
+
+void ice_pf_vsi_init_vlan_ops(struct ice_vsi *vsi);
+
+#endif /* _ICE_PF_VSI_VLAN_OPS_H_ */
index 11ae0be..4143728 100644 (file)
@@ -1097,6 +1097,64 @@ ice_aq_get_recipe(struct ice_hw *hw,
 }
 
 /**
+ * ice_update_recipe_lkup_idx - update a default recipe based on the lkup_idx
+ * @hw: pointer to the HW struct
+ * @params: parameters used to update the default recipe
+ *
+ * This function only supports updating default recipes, and it updates only
+ * a single recipe (selected by lkup_idx) at a time.
+ *
+ * This is done as a read-modify-write operation. First, get the current recipe
+ * contents based on the recipe's ID. Then modify the field vector index and,
+ * if valid, the mask at the lkup_idx. Finally, use the add-recipe AQ to update
+ * the pre-existing recipe with the modifications.
+ */
+int
+ice_update_recipe_lkup_idx(struct ice_hw *hw,
+                          struct ice_update_recipe_lkup_idx_params *params)
+{
+       struct ice_aqc_recipe_data_elem *rcp_list;
+       u16 num_recps = ICE_MAX_NUM_RECIPES;
+       int status;
+
+       rcp_list = kcalloc(num_recps, sizeof(*rcp_list), GFP_KERNEL);
+       if (!rcp_list)
+               return -ENOMEM;
+
+       /* read current recipe list from firmware */
+       rcp_list->recipe_indx = params->rid;
+       status = ice_aq_get_recipe(hw, rcp_list, &num_recps, params->rid, NULL);
+       if (status) {
+               ice_debug(hw, ICE_DBG_SW, "Failed to get recipe %d, status %d\n",
+                         params->rid, status);
+               goto error_out;
+       }
+
+       /* only modify the existing recipe's lkup_idx and, if valid, its mask,
+        * leaving all other fields the same, then update the recipe in firmware
+        */
+       rcp_list->content.lkup_indx[params->lkup_idx] = params->fv_idx;
+       if (params->mask_valid)
+               rcp_list->content.mask[params->lkup_idx] =
+                       cpu_to_le16(params->mask);
+
+       if (params->ignore_valid)
+               rcp_list->content.lkup_indx[params->lkup_idx] |=
+                       ICE_AQ_RECIPE_LKUP_IGNORE;
+
+       status = ice_aq_add_recipe(hw, &rcp_list[0], 1, NULL);
+       if (status)
+               ice_debug(hw, ICE_DBG_SW, "Failed to update recipe %d lkup_idx %d fv_idx %d mask %d mask_valid %s, status %d\n",
+                         params->rid, params->lkup_idx, params->fv_idx,
+                         params->mask, params->mask_valid ? "true" : "false",
+                         status);
+
+error_out:
+       kfree(rcp_list);
+       return status;
+}
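
Stripped of the admin-queue plumbing, the update is a plain read-modify-write. A self-contained sketch of the same shape against a hypothetical in-memory table standing in for the firmware recipe storage:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* hypothetical 64-byte device table, standing in for recipe storage */
static uint8_t table[16][64];

static int dev_read_entry(uint16_t id, uint8_t *buf, size_t len)
{ memcpy(buf, table[id], len); return 0; }
static int dev_write_entry(uint16_t id, const uint8_t *buf, size_t len)
{ memcpy(table[id], buf, len); return 0; }

/* read-modify-write: fetch the whole entry, patch one field, write back */
static int update_entry_byte(uint16_t id, size_t off, uint8_t val)
{
        uint8_t buf[64];
        int err = dev_read_entry(id, buf, sizeof(buf));

        if (err)
                return err;
        buf[off] = val;
        return dev_write_entry(id, buf, sizeof(buf));
}

int main(void)
{
        update_entry_byte(3, 5, 0xAA);
        printf("%#x\n", table[3][5]); /* 0xaa */
        return 0;
}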
+
+/**
  * ice_aq_map_recipe_to_profile - Map recipe to packet profile
  * @hw: pointer to the HW struct
  * @profile_id: package profile ID to associate the recipe with
@@ -1539,6 +1597,7 @@ ice_fill_sw_rule(struct ice_hw *hw, struct ice_fltr_info *f_info,
                 struct ice_aqc_sw_rules_elem *s_rule, enum ice_adminq_opc opc)
 {
        u16 vlan_id = ICE_MAX_VLAN_ID + 1;
+       u16 vlan_tpid = ETH_P_8021Q;
        void *daddr = NULL;
        u16 eth_hdr_sz;
        u8 *eth_hdr;
@@ -1611,6 +1670,8 @@ ice_fill_sw_rule(struct ice_hw *hw, struct ice_fltr_info *f_info,
                break;
        case ICE_SW_LKUP_VLAN:
                vlan_id = f_info->l_data.vlan.vlan_id;
+               if (f_info->l_data.vlan.tpid_valid)
+                       vlan_tpid = f_info->l_data.vlan.tpid;
                if (f_info->fltr_act == ICE_FWD_TO_VSI ||
                    f_info->fltr_act == ICE_FWD_TO_VSI_LIST) {
                        act |= ICE_SINGLE_ACT_PRUNE;
@@ -1653,6 +1714,8 @@ ice_fill_sw_rule(struct ice_hw *hw, struct ice_fltr_info *f_info,
        if (!(vlan_id > ICE_MAX_VLAN_ID)) {
                off = (__force __be16 *)(eth_hdr + ICE_ETH_VLAN_TCI_OFFSET);
                *off = cpu_to_be16(vlan_id);
+               off = (__force __be16 *)(eth_hdr + ICE_ETH_ETHTYPE_OFFSET);
+               *off = cpu_to_be16(vlan_tpid);
        }
 
        /* Create the switch rule with the final dummy Ethernet header */
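
The two stores above (VLAN ID into the TCI field, TPID into the ethertype field) are just big-endian 16-bit writes at fixed offsets into the dummy header. A host-order sketch with illustrative offsets (the driver uses its own ICE_ETH_* offset constants):

#include <stdint.h>
#include <stdio.h>

/* illustrative offsets into a dummy Ethernet header */
#define ETHTYPE_OFF 12  /* where the TPID goes */
#define TCI_OFF     14  /* where the VLAN TCI goes */

static void put_be16(uint8_t *p, uint16_t v)
{
        p[0] = v >> 8;   /* network byte order is big-endian */
        p[1] = v & 0xff;
}

int main(void)
{
        uint8_t hdr[18] = { 0 };

        put_be16(hdr + ETHTYPE_OFF, 0x88A8); /* 802.1ad TPID */
        put_be16(hdr + TCI_OFF, 42);         /* VLAN ID 42 */

        for (int i = 12; i < 16; i++)
                printf("%02x ", hdr[i]);     /* 88 a8 00 2a */
        printf("\n");
        return 0;
}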
@@ -3868,6 +3931,23 @@ ice_find_recp(struct ice_hw *hw, struct ice_prot_lkup_ext *lkup_exts,
 }
 
 /**
+ * ice_change_proto_id_to_dvm - change proto id in prot_id_tbl
+ *
+ * The protocol ID for the outer VLAN differs between DVM and SVM. If DVM is
+ * supported, the protocol array record for the outer VLAN has to be modified
+ * to reflect the proper value for DVM.
+ */
+void ice_change_proto_id_to_dvm(void)
+{
+       u8 i;
+
+       for (i = 0; i < ARRAY_SIZE(ice_prot_id_tbl); i++)
+               if (ice_prot_id_tbl[i].type == ICE_VLAN_OFOS &&
+                   ice_prot_id_tbl[i].protocol_id != ICE_VLAN_OF_HW)
+                       ice_prot_id_tbl[i].protocol_id = ICE_VLAN_OF_HW;
+}
+
+/**
  * ice_prot_type_to_id - get protocol ID from protocol type
  * @type: protocol type
  * @id: pointer to variable that will receive the ID
index d8334be..7b42c51 100644 (file)
@@ -33,15 +33,6 @@ struct ice_vsi_ctx {
        struct ice_q_ctx *rdma_q_ctx[ICE_MAX_TRAFFIC_CLASS];
 };
 
-enum ice_sw_fwd_act_type {
-       ICE_FWD_TO_VSI = 0,
-       ICE_FWD_TO_VSI_LIST, /* Do not use this when adding filter */
-       ICE_FWD_TO_Q,
-       ICE_FWD_TO_QGRP,
-       ICE_DROP_PACKET,
-       ICE_INVAL_ACT
-};
-
 /* Switch recipe ID enum values are specific to hardware */
 enum ice_sw_lkup_type {
        ICE_SW_LKUP_ETHERTYPE = 0,
@@ -86,6 +77,8 @@ struct ice_fltr_info {
                } mac_vlan;
                struct {
                        u16 vlan_id;
+                       u16 tpid;
+                       u8 tpid_valid;
                } vlan;
                /* Set lkup_type as ICE_SW_LKUP_ETHERTYPE
                 * if just using ethertype as filter. Set lkup_type as
@@ -125,6 +118,15 @@ struct ice_fltr_info {
        u8 lan_en;      /* Indicate if packet can be forwarded to the uplink */
 };
 
+struct ice_update_recipe_lkup_idx_params {
+       u16 rid;
+       u16 fv_idx;
+       bool ignore_valid;
+       u16 mask;
+       bool mask_valid;
+       u8 lkup_idx;
+};
+
 struct ice_adv_lkup_elem {
        enum ice_protocol_type type;
        union ice_prot_hdr h_u; /* Header values */
@@ -367,4 +369,8 @@ void ice_rm_all_sw_replay_rule_info(struct ice_hw *hw);
 int
 ice_aq_sw_rules(struct ice_hw *hw, void *rule_list, u16 rule_list_sz,
                u8 num_rules, enum ice_adminq_opc opc, struct ice_sq_cd *cd);
+int
+ice_update_recipe_lkup_idx(struct ice_hw *hw,
+                          struct ice_update_recipe_lkup_idx_params *params);
+void ice_change_proto_id_to_dvm(void);
 #endif /* _ICE_SWITCH_H_ */
index 3e38695..ff93ec7 100644 (file)
@@ -173,6 +173,8 @@ tx_skip_free:
 
        tx_ring->next_to_use = 0;
        tx_ring->next_to_clean = 0;
+       tx_ring->next_dd = ICE_RING_QUARTER(tx_ring) - 1;
+       tx_ring->next_rs = ICE_RING_QUARTER(tx_ring) - 1;
 
        if (!tx_ring->netdev)
                return;
@@ -983,15 +985,17 @@ static struct sk_buff *
 ice_construct_skb(struct ice_rx_ring *rx_ring, struct ice_rx_buf *rx_buf,
                  struct xdp_buff *xdp)
 {
+       unsigned int metasize = xdp->data - xdp->data_meta;
        unsigned int size = xdp->data_end - xdp->data;
        unsigned int headlen;
        struct sk_buff *skb;
 
        /* prefetch first cache line of first page */
-       net_prefetch(xdp->data);
+       net_prefetch(xdp->data_meta);
 
        /* allocate a skb to store the frags */
-       skb = __napi_alloc_skb(&rx_ring->q_vector->napi, ICE_RX_HDR_SIZE,
+       skb = __napi_alloc_skb(&rx_ring->q_vector->napi,
+                              ICE_RX_HDR_SIZE + metasize,
                               GFP_ATOMIC | __GFP_NOWARN);
        if (unlikely(!skb))
                return NULL;
@@ -1003,8 +1007,13 @@ ice_construct_skb(struct ice_rx_ring *rx_ring, struct ice_rx_buf *rx_buf,
                headlen = eth_get_headlen(skb->dev, xdp->data, ICE_RX_HDR_SIZE);
 
        /* align pull length to size of long to optimize memcpy performance */
-       memcpy(__skb_put(skb, headlen), xdp->data, ALIGN(headlen,
-                                                        sizeof(long)));
+       memcpy(__skb_put(skb, headlen + metasize), xdp->data_meta,
+              ALIGN(headlen + metasize, sizeof(long)));
+
+       if (metasize) {
+               skb_metadata_set(skb, metasize);
+               __skb_pull(skb, metasize);
+       }
 
        /* if we exhaust the linear part then add what is left as a frag */
        size -= headlen;
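
The metadata handling above depends on a small pointer dance: copy starting at data_meta so the metadata lands directly in front of the packet, then advance the data pointer past it, which is what skb_metadata_set() plus __skb_pull() accomplish. A userspace sketch of the buffer arithmetic:

#include <stdio.h>
#include <string.h>

int main(void)
{
        /* XDP layout: [meta][packet], with data_meta <= data */
        char frame[] = "MMMMpacketbytes";   /* 4 bytes of metadata */
        char *data_meta = frame;
        char *data = frame + 4;
        size_t metasize = (size_t)(data - data_meta);
        size_t headlen = 6;                 /* bytes of packet to pull in */

        char skb[32];
        /* one memcpy grabs the metadata and the packet head together ... */
        memcpy(skb, data_meta, headlen + metasize);
        /* ... then the head pointer is advanced past the metadata, so the
         * linear data starts at the packet while the metadata sits just
         * before it
         */
        char *skb_data = skb + metasize;

        printf("meta: %.4s, data: %.6s\n", skb, skb_data); /* MMMM, packet */
        return 0;
}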
@@ -1080,7 +1089,7 @@ ice_is_non_eop(struct ice_rx_ring *rx_ring, union ice_32b_rx_flex_desc *rx_desc)
 {
        /* if we are the last buffer then there is nothing else to do */
 #define ICE_RXD_EOF BIT(ICE_RX_FLEX_DESC_STATUS0_EOF_S)
-       if (likely(ice_test_staterr(rx_desc, ICE_RXD_EOF)))
+       if (likely(ice_test_staterr(rx_desc->wb.status_error0, ICE_RXD_EOF)))
                return false;
 
        rx_ring->rx_stats.non_eop_descs++;
@@ -1142,7 +1151,7 @@ int ice_clean_rx_irq(struct ice_rx_ring *rx_ring, int budget)
                 * hardware wrote DD then it will be non-zero
                 */
                stat_err_bits = BIT(ICE_RX_FLEX_DESC_STATUS0_DD_S);
-               if (!ice_test_staterr(rx_desc, stat_err_bits))
+               if (!ice_test_staterr(rx_desc->wb.status_error0, stat_err_bits))
                        break;
 
                /* This memory barrier is needed to keep us from reading
@@ -1228,14 +1237,13 @@ construct_skb:
                        continue;
 
                stat_err_bits = BIT(ICE_RX_FLEX_DESC_STATUS0_RXE_S);
-               if (unlikely(ice_test_staterr(rx_desc, stat_err_bits))) {
+               if (unlikely(ice_test_staterr(rx_desc->wb.status_error0,
+                                             stat_err_bits))) {
                        dev_kfree_skb_any(skb);
                        continue;
                }
 
-               stat_err_bits = BIT(ICE_RX_FLEX_DESC_STATUS0_L2TAG1P_S);
-               if (ice_test_staterr(rx_desc, stat_err_bits))
-                       vlan_tag = le16_to_cpu(rx_desc->wb.l2tag1);
+               vlan_tag = ice_get_vlan_tag_from_rx_desc(rx_desc);
 
                /* pad the skb if needed, to make a valid ethernet frame */
                if (eth_skb_pad(skb)) {
@@ -1460,7 +1468,7 @@ int ice_napi_poll(struct napi_struct *napi, int budget)
                bool wd;
 
                if (tx_ring->xsk_pool)
-                       wd = ice_clean_tx_irq_zc(tx_ring, budget);
+                       wd = ice_xmit_zc(tx_ring, ICE_DESC_UNUSED(tx_ring), budget);
                else if (ice_ring_is_xdp(tx_ring))
                        wd = true;
                else
@@ -1513,7 +1521,7 @@ int ice_napi_poll(struct napi_struct *napi, int budget)
        /* Exit the polling mode, but don't re-enable interrupts if stack might
         * poll us due to busy-polling
         */
-       if (likely(napi_complete_done(napi, work_done))) {
+       if (napi_complete_done(napi, work_done)) {
                ice_net_dim(q_vector);
                ice_enable_interrupt(q_vector);
        } else {
@@ -1917,12 +1925,16 @@ ice_tx_prepare_vlan_flags(struct ice_tx_ring *tx_ring, struct ice_tx_buf *first)
        if (!skb_vlan_tag_present(skb) && eth_type_vlan(skb->protocol))
                return;
 
-       /* currently, we always assume 802.1Q for VLAN insertion as VLAN
-        * insertion for 802.1AD is not supported
+       /* the VLAN ethertype/TPID is determined by the VSI configuration and
+        * netdev feature flags; the driver allows either 802.1Q or 802.1ad
+        * VLAN offloads, never both, so only the VLAN ID matters here
         */
        if (skb_vlan_tag_present(skb)) {
                first->tx_flags |= skb_vlan_tag_get(skb) << ICE_TX_FLAGS_VLAN_S;
-               first->tx_flags |= ICE_TX_FLAGS_HW_VLAN;
+               if (tx_ring->flags & ICE_TX_FLAGS_RING_VLAN_L2TAG2)
+                       first->tx_flags |= ICE_TX_FLAGS_HW_OUTER_SINGLE_VLAN;
+               else
+                       first->tx_flags |= ICE_TX_FLAGS_HW_VLAN;
        }
 
        ice_tx_prepare_vlan_flags_dcb(tx_ring, first);
@@ -2295,6 +2307,13 @@ ice_xmit_frame_ring(struct sk_buff *skb, struct ice_tx_ring *tx_ring)
 
        /* prepare the VLAN tagging flags for Tx */
        ice_tx_prepare_vlan_flags(tx_ring, first);
+       if (first->tx_flags & ICE_TX_FLAGS_HW_OUTER_SINGLE_VLAN) {
+               offload.cd_qw1 |= (u64)(ICE_TX_DESC_DTYPE_CTX |
+                                       (ICE_TX_CTX_DESC_IL2TAG2 <<
+                                       ICE_TXD_CTX_QW1_CMD_S));
+               offload.cd_l2tag2 = (first->tx_flags & ICE_TX_FLAGS_VLAN_M) >>
+                       ICE_TX_FLAGS_VLAN_S;
+       }
 
        /* set up TSO offload */
        tso = ice_tso(first, &offload);
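
Packing the outer tag into the context descriptor is the usual shift-and-mask exercise: pull the VLAN ID back out of tx_flags, and OR the descriptor type and command bits into the quad word. A sketch with illustrative bit positions (only the VLAN mask/shift mirror values shown in this patch; the descriptor constants here are stand-ins):

#include <stdint.h>
#include <stdio.h>

/* illustrative field positions within a 64-bit descriptor word */
#define DTYPE_CTX     0x1ULL
#define CMD_IL2TAG2   0x4ULL
#define CMD_SHIFT     4

#define TX_FLAGS_VLAN_M 0xffff0000u
#define TX_FLAGS_VLAN_S 16

int main(void)
{
        uint32_t tx_flags = 42u << TX_FLAGS_VLAN_S; /* VLAN ID 42 */
        uint64_t qw1 = DTYPE_CTX | (CMD_IL2TAG2 << CMD_SHIFT);
        uint16_t l2tag2 = (tx_flags & TX_FLAGS_VLAN_M) >> TX_FLAGS_VLAN_S;

        printf("qw1=%#llx l2tag2=%u\n", (unsigned long long)qw1, l2tag2);
        return 0;
}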
index b7b3bd4..cead3eb 100644 (file)
@@ -13,7 +13,6 @@
 #define ICE_MAX_CHAINED_RX_BUFS        5
 #define ICE_MAX_BUF_TXD                8
 #define ICE_MIN_TX_LEN         17
-#define ICE_TX_THRESH          32
 
 /* The size limit for a transmit buffer in a descriptor is (16K - 1).
  * In order to align with the read requests we will align the value to
@@ -111,6 +110,8 @@ static inline int ice_skb_pad(void)
        (u16)((((R)->next_to_clean > (R)->next_to_use) ? 0 : (R)->count) + \
              (R)->next_to_clean - (R)->next_to_use - 1)
 
+#define ICE_RING_QUARTER(R) ((R)->count >> 2)
+
 #define ICE_TX_FLAGS_TSO       BIT(0)
 #define ICE_TX_FLAGS_HW_VLAN   BIT(1)
 #define ICE_TX_FLAGS_SW_VLAN   BIT(2)
@@ -122,6 +123,7 @@ static inline int ice_skb_pad(void)
 #define ICE_TX_FLAGS_IPV4      BIT(5)
 #define ICE_TX_FLAGS_IPV6      BIT(6)
 #define ICE_TX_FLAGS_TUNNEL    BIT(7)
+#define ICE_TX_FLAGS_HW_OUTER_SINGLE_VLAN      BIT(8)
 #define ICE_TX_FLAGS_VLAN_M    0xffff0000
 #define ICE_TX_FLAGS_VLAN_PR_M 0xe0000000
 #define ICE_TX_FLAGS_VLAN_PR_S 29
@@ -321,18 +323,21 @@ struct ice_tx_ring {
        u16 count;                      /* Number of descriptors */
        u16 q_index;                    /* Queue number of ring */
        /* stats structs */
-       struct ice_q_stats      stats;
-       struct u64_stats_sync syncp;
        struct ice_txq_stats tx_stats;
-
        /* CL3 - 3rd cacheline starts here */
+       struct ice_q_stats      stats;
+       struct u64_stats_sync syncp;
        struct rcu_head rcu;            /* to avoid race on free */
        DECLARE_BITMAP(xps_state, ICE_TX_NBITS);        /* XPS Config State */
        struct ice_channel *ch;
        struct ice_ptp_tx *tx_tstamps;
        spinlock_t tx_lock;
        u32 txq_teid;                   /* Added Tx queue TEID */
+       /* CL4 - 4th cacheline starts here */
+       u16 xdp_tx_active;
 #define ICE_TX_FLAGS_RING_XDP          BIT(0)
+#define ICE_TX_FLAGS_RING_VLAN_L2TAG1  BIT(1)
+#define ICE_TX_FLAGS_RING_VLAN_L2TAG2  BIT(2)
        u8 flags;
        u8 dcb_tc;                      /* Traffic class of ring */
        u8 ptp_tx;
index 0e87b98..7ee38d0 100644 (file)
@@ -209,9 +209,14 @@ ice_process_skb_fields(struct ice_rx_ring *rx_ring,
 void
 ice_receive_skb(struct ice_rx_ring *rx_ring, struct sk_buff *skb, u16 vlan_tag)
 {
-       if ((rx_ring->netdev->features & NETIF_F_HW_VLAN_CTAG_RX) &&
-           (vlan_tag & VLAN_VID_MASK))
+       netdev_features_t features = rx_ring->netdev->features;
+       bool non_zero_vlan = !!(vlan_tag & VLAN_VID_MASK);
+
+       if ((features & NETIF_F_HW_VLAN_CTAG_RX) && non_zero_vlan)
                __vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q), vlan_tag);
+       else if ((features & NETIF_F_HW_VLAN_STAG_RX) && non_zero_vlan)
+               __vlan_hwaccel_put_tag(skb, htons(ETH_P_8021AD), vlan_tag);
+
        napi_gro_receive(&rx_ring->q_vector->napi, skb);
 }
 
@@ -222,6 +227,7 @@ ice_receive_skb(struct ice_rx_ring *rx_ring, struct sk_buff *skb, u16 vlan_tag)
 static void ice_clean_xdp_irq(struct ice_tx_ring *xdp_ring)
 {
        unsigned int total_bytes = 0, total_pkts = 0;
+       u16 tx_thresh = ICE_RING_QUARTER(xdp_ring);
        u16 ntc = xdp_ring->next_to_clean;
        struct ice_tx_desc *next_dd_desc;
        u16 next_dd = xdp_ring->next_dd;
@@ -233,7 +239,7 @@ static void ice_clean_xdp_irq(struct ice_tx_ring *xdp_ring)
            cpu_to_le64(ICE_TX_DESC_DTYPE_DESC_DONE)))
                return;
 
-       for (i = 0; i < ICE_TX_THRESH; i++) {
+       for (i = 0; i < tx_thresh; i++) {
                tx_buf = &xdp_ring->tx_buf[ntc];
 
                total_bytes += tx_buf->bytecount;
@@ -254,9 +260,9 @@ static void ice_clean_xdp_irq(struct ice_tx_ring *xdp_ring)
        }
 
        next_dd_desc->cmd_type_offset_bsz = 0;
-       xdp_ring->next_dd = xdp_ring->next_dd + ICE_TX_THRESH;
+       xdp_ring->next_dd = xdp_ring->next_dd + tx_thresh;
        if (xdp_ring->next_dd > xdp_ring->count)
-               xdp_ring->next_dd = ICE_TX_THRESH - 1;
+               xdp_ring->next_dd = tx_thresh - 1;
        xdp_ring->next_to_clean = ntc;
        ice_update_tx_ring_stats(xdp_ring, total_pkts, total_bytes);
 }
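
With the threshold now a quarter of the ring, the next_dd bookkeeping advances one batch at a time and wraps the same way the code above does. A sketch of that arithmetic for an assumed 512-descriptor ring:

#include <stdint.h>
#include <stdio.h>

#define RING_QUARTER(count) ((count) >> 2)

int main(void)
{
        uint16_t count = 512;
        uint16_t thresh = RING_QUARTER(count);      /* 128 */
        uint16_t next_dd = thresh - 1;              /* first DD at 127 */

        for (int batch = 0; batch < 6; batch++) {
                printf("clean up through descriptor %u\n", next_dd);
                next_dd += thresh;                  /* advance one batch */
                if (next_dd > count)                /* wrap to the start */
                        next_dd = thresh - 1;
        }
        return 0;
}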
@@ -269,12 +275,13 @@ static void ice_clean_xdp_irq(struct ice_tx_ring *xdp_ring)
  */
 int ice_xmit_xdp_ring(void *data, u16 size, struct ice_tx_ring *xdp_ring)
 {
+       u16 tx_thresh = ICE_RING_QUARTER(xdp_ring);
        u16 i = xdp_ring->next_to_use;
        struct ice_tx_desc *tx_desc;
        struct ice_tx_buf *tx_buf;
        dma_addr_t dma;
 
-       if (ICE_DESC_UNUSED(xdp_ring) < ICE_TX_THRESH)
+       if (ICE_DESC_UNUSED(xdp_ring) < tx_thresh)
                ice_clean_xdp_irq(xdp_ring);
 
        if (!unlikely(ICE_DESC_UNUSED(xdp_ring))) {
@@ -300,13 +307,14 @@ int ice_xmit_xdp_ring(void *data, u16 size, struct ice_tx_ring *xdp_ring)
        tx_desc->cmd_type_offset_bsz = ice_build_ctob(ICE_TX_DESC_CMD_EOP, 0,
                                                      size, 0);
 
+       xdp_ring->xdp_tx_active++;
        i++;
        if (i == xdp_ring->count) {
                i = 0;
                tx_desc = ICE_TX_DESC(xdp_ring, xdp_ring->next_rs);
                tx_desc->cmd_type_offset_bsz |=
                        cpu_to_le64(ICE_TX_DESC_CMD_RS << ICE_TXD_QW1_CMD_S);
-               xdp_ring->next_rs = ICE_TX_THRESH - 1;
+               xdp_ring->next_rs = tx_thresh - 1;
        }
        xdp_ring->next_to_use = i;
 
@@ -314,7 +322,7 @@ int ice_xmit_xdp_ring(void *data, u16 size, struct ice_tx_ring *xdp_ring)
                tx_desc = ICE_TX_DESC(xdp_ring, xdp_ring->next_rs);
                tx_desc->cmd_type_offset_bsz |=
                        cpu_to_le64(ICE_TX_DESC_CMD_RS << ICE_TXD_QW1_CMD_S);
-               xdp_ring->next_rs += ICE_TX_THRESH;
+               xdp_ring->next_rs += tx_thresh;
        }
 
        return ICE_XDP_TX;
index 11b6c16..c7d2954 100644 (file)
@@ -7,7 +7,7 @@
 
 /**
  * ice_test_staterr - tests bits in Rx descriptor status and error fields
- * @rx_desc: pointer to receive descriptor (in le64 format)
+ * @status_err_n: Rx descriptor status_error0 or status_error1 bits
  * @stat_err_bits: value to mask
  *
  * This function does some fast chicanery in order to return the
@@ -16,9 +16,9 @@
  * at offset zero.
  */
 static inline bool
-ice_test_staterr(union ice_32b_rx_flex_desc *rx_desc, const u16 stat_err_bits)
+ice_test_staterr(__le16 status_err_n, const u16 stat_err_bits)
 {
-       return !!(rx_desc->wb.status_error0 & cpu_to_le16(stat_err_bits));
+       return !!(status_err_n & cpu_to_le16(stat_err_bits));
 }
 
 static inline __le64
@@ -32,6 +32,30 @@ ice_build_ctob(u64 td_cmd, u64 td_offset, unsigned int size, u64 td_tag)
 }
 
 /**
+ * ice_get_vlan_tag_from_rx_desc - get VLAN from Rx flex descriptor
+ * @rx_desc: Rx 32b flex descriptor with RXDID=2
+ *
+ * The OS and current PF implementation only support stripping a single VLAN tag
+ * at a time, so there should only ever be 0 or 1 tags in the l2tag* fields. If
+ * one is found return the tag, else return 0 to mean no VLAN tag was found.
+ */
+static inline u16
+ice_get_vlan_tag_from_rx_desc(union ice_32b_rx_flex_desc *rx_desc)
+{
+       u16 stat_err_bits;
+
+       stat_err_bits = BIT(ICE_RX_FLEX_DESC_STATUS0_L2TAG1P_S);
+       if (ice_test_staterr(rx_desc->wb.status_error0, stat_err_bits))
+               return le16_to_cpu(rx_desc->wb.l2tag1);
+
+       stat_err_bits = BIT(ICE_RX_FLEX_DESC_STATUS1_L2TAG2P_S);
+       if (ice_test_staterr(rx_desc->wb.status_error1, stat_err_bits))
+               return le16_to_cpu(rx_desc->wb.l2tag2_2nd);
+
+       return 0;
+}
+
+/**
  * ice_xdp_ring_update_tail - Updates the XDP Tx ring tail register
  * @xdp_ring: XDP Tx ring
  *
index 546145d..28fcab2 100644 (file)
@@ -15,6 +15,7 @@
 #include "ice_flex_type.h"
 #include "ice_protocol_type.h"
 #include "ice_sbq_cmd.h"
+#include "ice_vlan_mode.h"
 
 static inline bool ice_is_tc_ena(unsigned long bitmap, u8 tc)
 {
@@ -54,6 +55,11 @@ static inline u32 ice_round_to_num(u32 N, u32 R)
 #define ICE_DBG_AQ_DESC                BIT_ULL(25)
 #define ICE_DBG_AQ_DESC_BUF    BIT_ULL(26)
 #define ICE_DBG_AQ_CMD         BIT_ULL(27)
+#define ICE_DBG_AQ             (ICE_DBG_AQ_MSG         | \
+                                ICE_DBG_AQ_DESC        | \
+                                ICE_DBG_AQ_DESC_BUF    | \
+                                ICE_DBG_AQ_CMD)
+
 #define ICE_DBG_USER           BIT_ULL(31)
 
 enum ice_aq_res_ids {
@@ -920,6 +926,9 @@ struct ice_hw {
        struct udp_tunnel_nic_shared udp_tunnel_shared;
        struct udp_tunnel_nic_info udp_tunnel_nic;
 
+       /* dvm boost update information */
+       struct ice_dvm_table dvm_upd;
+
        /* HW block tables */
        struct ice_blk_info blk[ICE_BLK_COUNT];
        struct mutex fl_profs_locks[ICE_BLK_COUNT];     /* lock fltr profiles */
@@ -943,6 +952,7 @@ struct ice_hw {
        struct list_head rss_list_head;
        struct ice_mbx_snapshot mbx_snapshot;
        DECLARE_BITMAP(hw_ptype, ICE_FLOW_PTYPE_MAX);
+       u8 dvm_ena;
        u16 io_expander_handle;
 };
 
@@ -1008,6 +1018,15 @@ struct ice_hw_port_stats {
        u64 fd_sb_match;
 };
 
+enum ice_sw_fwd_act_type {
+       ICE_FWD_TO_VSI = 0,
+       ICE_FWD_TO_VSI_LIST, /* Do not use this when adding filter */
+       ICE_FWD_TO_Q,
+       ICE_FWD_TO_QGRP,
+       ICE_DROP_PACKET,
+       ICE_INVAL_ACT
+};
+
 struct ice_aq_get_set_rss_lut_params {
        u16 vsi_handle;         /* software VSI handle */
        u16 lut_size;           /* size of the LUT buffer */
diff --git a/drivers/net/ethernet/intel/ice/ice_vf_vsi_vlan_ops.c b/drivers/net/ethernet/intel/ice/ice_vf_vsi_vlan_ops.c
new file mode 100644 (file)
index 0000000..39f2d36
--- /dev/null
@@ -0,0 +1,202 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (C) 2019-2021, Intel Corporation. */
+
+#include "ice_vsi_vlan_ops.h"
+#include "ice_vsi_vlan_lib.h"
+#include "ice_vlan_mode.h"
+#include "ice.h"
+#include "ice_vf_vsi_vlan_ops.h"
+#include "ice_virtchnl_pf.h"
+
+static int
+noop_vlan_arg(struct ice_vsi __always_unused *vsi,
+             struct ice_vlan __always_unused *vlan)
+{
+       return 0;
+}
+
+static int
+noop_vlan(struct ice_vsi __always_unused *vsi)
+{
+       return 0;
+}
+
+/**
+ * ice_vf_vsi_init_vlan_ops - Initialize default VSI VLAN ops for VF VSI
+ * @vsi: VF's VSI being configured
+ *
+ * If Double VLAN Mode (DVM) is enabled, assume that the VF supports the new
+ * VIRTCHNL_VF_OFFLOAD_VLAN_V2 capability and set up the VLAN ops accordingly.
+ * If SVM is enabled, maintain the same level of VLAN support as before
+ * VIRTCHNL_VF_OFFLOAD_VLAN_V2.
+ */
+void ice_vf_vsi_init_vlan_ops(struct ice_vsi *vsi)
+{
+       struct ice_vsi_vlan_ops *vlan_ops;
+       struct ice_pf *pf = vsi->back;
+       struct ice_vf *vf;
+
+       vf = &pf->vf[vsi->vf_id];
+
+       if (ice_is_dvm_ena(&pf->hw)) {
+               vlan_ops = &vsi->outer_vlan_ops;
+
+               /* outer VLAN ops regardless of port VLAN config */
+               vlan_ops->add_vlan = ice_vsi_add_vlan;
+               vlan_ops->dis_rx_filtering = ice_vsi_dis_rx_vlan_filtering;
+               vlan_ops->ena_tx_filtering = ice_vsi_ena_tx_vlan_filtering;
+               vlan_ops->dis_tx_filtering = ice_vsi_dis_tx_vlan_filtering;
+
+               if (ice_vf_is_port_vlan_ena(vf)) {
+                       /* setup outer VLAN ops */
+                       vlan_ops->set_port_vlan = ice_vsi_set_outer_port_vlan;
+                       vlan_ops->ena_rx_filtering =
+                               ice_vsi_ena_rx_vlan_filtering;
+
+                       /* setup inner VLAN ops */
+                       vlan_ops = &vsi->inner_vlan_ops;
+                       vlan_ops->add_vlan = noop_vlan_arg;
+                       vlan_ops->del_vlan = noop_vlan_arg;
+                       vlan_ops->ena_stripping = ice_vsi_ena_inner_stripping;
+                       vlan_ops->dis_stripping = ice_vsi_dis_inner_stripping;
+                       vlan_ops->ena_insertion = ice_vsi_ena_inner_insertion;
+                       vlan_ops->dis_insertion = ice_vsi_dis_inner_insertion;
+               } else {
+                       if (!test_bit(ICE_FLAG_VF_VLAN_PRUNING, pf->flags))
+                               vlan_ops->ena_rx_filtering = noop_vlan;
+                       else
+                               vlan_ops->ena_rx_filtering =
+                                       ice_vsi_ena_rx_vlan_filtering;
+
+                       vlan_ops->del_vlan = ice_vsi_del_vlan;
+                       vlan_ops->ena_stripping = ice_vsi_ena_outer_stripping;
+                       vlan_ops->dis_stripping = ice_vsi_dis_outer_stripping;
+                       vlan_ops->ena_insertion = ice_vsi_ena_outer_insertion;
+                       vlan_ops->dis_insertion = ice_vsi_dis_outer_insertion;
+
+                       /* setup inner VLAN ops */
+                       vlan_ops = &vsi->inner_vlan_ops;
+
+                       vlan_ops->ena_stripping = ice_vsi_ena_inner_stripping;
+                       vlan_ops->dis_stripping = ice_vsi_dis_inner_stripping;
+                       vlan_ops->ena_insertion = ice_vsi_ena_inner_insertion;
+                       vlan_ops->dis_insertion = ice_vsi_dis_inner_insertion;
+               }
+       } else {
+               vlan_ops = &vsi->inner_vlan_ops;
+
+               /* inner VLAN ops regardless of port VLAN config */
+               vlan_ops->add_vlan = ice_vsi_add_vlan;
+               vlan_ops->dis_rx_filtering = ice_vsi_dis_rx_vlan_filtering;
+               vlan_ops->ena_tx_filtering = ice_vsi_ena_tx_vlan_filtering;
+               vlan_ops->dis_tx_filtering = ice_vsi_dis_tx_vlan_filtering;
+
+               if (ice_vf_is_port_vlan_ena(vf)) {
+                       vlan_ops->set_port_vlan = ice_vsi_set_inner_port_vlan;
+                       vlan_ops->ena_rx_filtering =
+                               ice_vsi_ena_rx_vlan_filtering;
+               } else {
+                       if (!test_bit(ICE_FLAG_VF_VLAN_PRUNING, pf->flags))
+                               vlan_ops->ena_rx_filtering = noop_vlan;
+                       else
+                               vlan_ops->ena_rx_filtering =
+                                       ice_vsi_ena_rx_vlan_filtering;
+
+                       vlan_ops->del_vlan = ice_vsi_del_vlan;
+                       vlan_ops->ena_stripping = ice_vsi_ena_inner_stripping;
+                       vlan_ops->dis_stripping = ice_vsi_dis_inner_stripping;
+                       vlan_ops->ena_insertion = ice_vsi_ena_inner_insertion;
+                       vlan_ops->dis_insertion = ice_vsi_dis_inner_insertion;
+               }
+       }
+}
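
The selection above is effectively a two-axis decision table: VLAN mode (DVM or SVM) on one axis and port-VLAN presence on the other. A sketch that reduces the four cells to labels (the labels paraphrase the assignments above; this is not a real API):

#include <stdio.h>

static const char *pick_vf_vlan_profile(int dvm_ena, int port_vlan_ena)
{
        if (dvm_ena)
                return port_vlan_ena ?
                        "outer ops = port VLAN, inner ops = guest offloads" :
                        "outer ops = guest VLANs, inner ops = offloads only";
        return port_vlan_ena ?
                "inner ops = port VLAN" :
                "inner ops = guest VLANs and offloads";
}

int main(void)
{
        for (int dvm = 0; dvm <= 1; dvm++)
                for (int pv = 0; pv <= 1; pv++)
                        printf("dvm=%d port_vlan=%d -> %s\n",
                               dvm, pv, pick_vf_vlan_profile(dvm, pv));
        return 0;
}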
+
+/**
+ * ice_vf_vsi_cfg_dvm_legacy_vlan_mode - Config VLAN mode for old VFs in DVM
+ * @vsi: VF's VSI being configured
+ *
+ * This should only be called when Double VLAN Mode (DVM) is enabled, there
+ * is not a port VLAN enabled on this VF, and the VF negotiates
+ * VIRTCHNL_VF_OFFLOAD_VLAN.
+ *
+ * This function sets up the VF VSI's inner and outer ice_vsi_vlan_ops and
+ * initializes software-only VLAN mode (i.e. allow all VLANs). It also installs
+ * no-op implementations for any functions that may be called during the
+ * lifetime of the VF, so these methods do nothing and succeed.
+ */
+void ice_vf_vsi_cfg_dvm_legacy_vlan_mode(struct ice_vsi *vsi)
+{
+       struct ice_vf *vf = &vsi->back->vf[vsi->vf_id];
+       struct device *dev = ice_pf_to_dev(vf->pf);
+       struct ice_vsi_vlan_ops *vlan_ops;
+
+       if (!ice_is_dvm_ena(&vsi->back->hw) || ice_vf_is_port_vlan_ena(vf))
+               return;
+
+       vlan_ops = &vsi->outer_vlan_ops;
+
+       /* Rx VLAN filtering always disabled to allow software offloaded VLANs
+        * for VFs that only support VIRTCHNL_VF_OFFLOAD_VLAN and don't have a
+        * port VLAN configured
+        */
+       vlan_ops->dis_rx_filtering = ice_vsi_dis_rx_vlan_filtering;
+       /* Don't fail when attempting to enable Rx VLAN filtering */
+       vlan_ops->ena_rx_filtering = noop_vlan;
+
+       /* Tx VLAN filtering always disabled to allow software offloaded VLANs
+        * for VFs that only support VIRTCHNL_VF_OFFLOAD_VLAN and don't have a
+        * port VLAN configured
+        */
+       vlan_ops->dis_tx_filtering = ice_vsi_dis_tx_vlan_filtering;
+       /* Don't fail when attempting to enable Tx VLAN filtering */
+       vlan_ops->ena_tx_filtering = noop_vlan;
+
+       if (vlan_ops->dis_rx_filtering(vsi))
+               dev_dbg(dev, "Failed to disable Rx VLAN filtering for old VF without VIRTCHNL_VF_OFFLOAD_VLAN_V2 support\n");
+       if (vlan_ops->dis_tx_filtering(vsi))
+               dev_dbg(dev, "Failed to disable Tx VLAN filtering for old VF without VIRTHCNL_VF_OFFLOAD_VLAN_V2 support\n");
+
+       /* All outer VLAN offloads must be disabled */
+       vlan_ops->dis_stripping = ice_vsi_dis_outer_stripping;
+       vlan_ops->dis_insertion = ice_vsi_dis_outer_insertion;
+
+       if (vlan_ops->dis_stripping(vsi))
+               dev_dbg(dev, "Failed to disable outer VLAN stripping for old VF without VIRTCHNL_VF_OFFLOAD_VLAN_V2 support\n");
+
+       if (vlan_ops->dis_insertion(vsi))
+               dev_dbg(dev, "Failed to disable outer VLAN insertion for old VF without VIRTCHNL_VF_OFFLOAD_VLAN_V2 support\n");
+
+       /* All inner VLAN offloads must be disabled */
+       vlan_ops = &vsi->inner_vlan_ops;
+
+       vlan_ops->dis_stripping = ice_vsi_dis_inner_stripping;
+       vlan_ops->dis_insertion = ice_vsi_dis_inner_insertion;
+
+       if (vlan_ops->dis_stripping(vsi))
+               dev_dbg(dev, "Failed to disable inner VLAN stripping for old VF without VIRTCHNL_VF_OFFLOAD_VLAN_V2 support\n");
+
+       if (vlan_ops->dis_insertion(vsi))
+               dev_dbg(dev, "Failed to disable inner VLAN insertion for old VF without VIRTCHNL_VF_OFFLOAD_VLAN_V2 support\n");
+}
+
+/**
+ * ice_vf_vsi_cfg_svm_legacy_vlan_mode - Config VLAN mode for old VFs in SVM
+ * @vsi: VF's VSI being configured
+ *
+ * This should only be called when Single VLAN Mode (SVM) is enabled, there is
+ * not a port VLAN enabled on this VF, and the VF negotiates
+ * VIRTCHNL_VF_OFFLOAD_VLAN.
+ *
+ * All of the normal SVM VLAN ops are identical for this case. However, Rx
+ * VLAN filtering should be turned off by default in this case.
+ */
+void ice_vf_vsi_cfg_svm_legacy_vlan_mode(struct ice_vsi *vsi)
+{
+       struct ice_vf *vf = &vsi->back->vf[vsi->vf_id];
+
+       if (ice_is_dvm_ena(&vsi->back->hw) || ice_vf_is_port_vlan_ena(vf))
+               return;
+
+       if (vsi->inner_vlan_ops.dis_rx_filtering(vsi))
+               dev_dbg(ice_pf_to_dev(vf->pf), "Failed to disable Rx VLAN filtering for old VF with VIRTCHNL_VF_OFFLOAD_VLAN support\n");
+}
diff --git a/drivers/net/ethernet/intel/ice/ice_vf_vsi_vlan_ops.h b/drivers/net/ethernet/intel/ice/ice_vf_vsi_vlan_ops.h
new file mode 100644 (file)
index 0000000..875a4e6
--- /dev/null
@@ -0,0 +1,19 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (C) 2019-2021, Intel Corporation. */
+
+#ifndef _ICE_VF_VSI_VLAN_OPS_H_
+#define _ICE_VF_VSI_VLAN_OPS_H_
+
+#include "ice_vsi_vlan_ops.h"
+
+struct ice_vsi;
+
+void ice_vf_vsi_cfg_dvm_legacy_vlan_mode(struct ice_vsi *vsi);
+void ice_vf_vsi_cfg_svm_legacy_vlan_mode(struct ice_vsi *vsi);
+
+#ifdef CONFIG_PCI_IOV
+void ice_vf_vsi_init_vlan_ops(struct ice_vsi *vsi);
+#else
+static inline void ice_vf_vsi_init_vlan_ops(struct ice_vsi *vsi) { }
+#endif /* CONFIG_PCI_IOV */
+#endif /* _ICE_VF_VSI_VLAN_OPS_H_ */
index 9feebe5..5a82216 100644 (file)
@@ -55,6 +55,15 @@ static const u32 vlan_allowlist_opcodes[] = {
        VIRTCHNL_OP_ENABLE_VLAN_STRIPPING, VIRTCHNL_OP_DISABLE_VLAN_STRIPPING,
 };
 
+/* VIRTCHNL_VF_OFFLOAD_VLAN_V2 */
+static const u32 vlan_v2_allowlist_opcodes[] = {
+       VIRTCHNL_OP_GET_OFFLOAD_VLAN_V2_CAPS, VIRTCHNL_OP_ADD_VLAN_V2,
+       VIRTCHNL_OP_DEL_VLAN_V2, VIRTCHNL_OP_ENABLE_VLAN_STRIPPING_V2,
+       VIRTCHNL_OP_DISABLE_VLAN_STRIPPING_V2,
+       VIRTCHNL_OP_ENABLE_VLAN_INSERTION_V2,
+       VIRTCHNL_OP_DISABLE_VLAN_INSERTION_V2,
+};
+
 /* VIRTCHNL_VF_OFFLOAD_RSS_PF */
 static const u32 rss_pf_allowlist_opcodes[] = {
        VIRTCHNL_OP_CONFIG_RSS_KEY, VIRTCHNL_OP_CONFIG_RSS_LUT,
@@ -89,6 +98,7 @@ static const struct allowlist_opcode_info allowlist_opcodes[] = {
        ALLOW_ITEM(VIRTCHNL_VF_OFFLOAD_RSS_PF, rss_pf_allowlist_opcodes),
        ALLOW_ITEM(VIRTCHNL_VF_OFFLOAD_ADV_RSS_PF, adv_rss_pf_allowlist_opcodes),
        ALLOW_ITEM(VIRTCHNL_VF_OFFLOAD_FDIR_PF, fdir_pf_allowlist_opcodes),
+       ALLOW_ITEM(VIRTCHNL_VF_OFFLOAD_VLAN_V2, vlan_v2_allowlist_opcodes),
 };
 
 /**
index 39b8012..02a8c15 100644 (file)
@@ -10,6 +10,8 @@
 #include "ice_eswitch.h"
 #include "ice_virtchnl_allowlist.h"
 #include "ice_flex_pipe.h"
+#include "ice_vf_vsi_vlan_ops.h"
+#include "ice_vlan.h"
 
 #define FIELD_SELECTOR(proto_hdr_field) \
                BIT((proto_hdr_field) & PROTO_HDR_FIELD_MASK)
@@ -643,55 +645,6 @@ static void ice_trigger_vf_reset(struct ice_vf *vf, bool is_vflr, bool is_pfr)
 }
 
 /**
- * ice_vsi_manage_pvid - Enable or disable port VLAN for VSI
- * @vsi: the VSI to update
- * @pvid_info: VLAN ID and QoS used to set the PVID VSI context field
- * @enable: true for enable PVID false for disable
- */
-static int ice_vsi_manage_pvid(struct ice_vsi *vsi, u16 pvid_info, bool enable)
-{
-       struct ice_hw *hw = &vsi->back->hw;
-       struct ice_aqc_vsi_props *info;
-       struct ice_vsi_ctx *ctxt;
-       int ret;
-
-       ctxt = kzalloc(sizeof(*ctxt), GFP_KERNEL);
-       if (!ctxt)
-               return -ENOMEM;
-
-       ctxt->info = vsi->info;
-       info = &ctxt->info;
-       if (enable) {
-               info->vlan_flags = ICE_AQ_VSI_VLAN_MODE_UNTAGGED |
-                       ICE_AQ_VSI_PVLAN_INSERT_PVID |
-                       ICE_AQ_VSI_VLAN_EMOD_STR;
-               info->sw_flags2 |= ICE_AQ_VSI_SW_FLAG_RX_VLAN_PRUNE_ENA;
-       } else {
-               info->vlan_flags = ICE_AQ_VSI_VLAN_EMOD_NOTHING |
-                       ICE_AQ_VSI_VLAN_MODE_ALL;
-               info->sw_flags2 &= ~ICE_AQ_VSI_SW_FLAG_RX_VLAN_PRUNE_ENA;
-       }
-
-       info->pvid = cpu_to_le16(pvid_info);
-       info->valid_sections = cpu_to_le16(ICE_AQ_VSI_PROP_VLAN_VALID |
-                                          ICE_AQ_VSI_PROP_SW_VALID);
-
-       ret = ice_update_vsi(hw, vsi->idx, ctxt, NULL);
-       if (ret) {
-               dev_info(ice_hw_to_dev(hw), "update VSI for port VLAN failed, err %d aq_err %s\n",
-                        ret, ice_aq_str(hw->adminq.sq_last_status));
-               goto out;
-       }
-
-       vsi->info.vlan_flags = info->vlan_flags;
-       vsi->info.sw_flags2 = info->sw_flags2;
-       vsi->info.pvid = info->pvid;
-out:
-       kfree(ctxt);
-       return ret;
-}
-
-/**
  * ice_vf_get_port_info - Get the VF's port info structure
  * @vf: VF used to get the port info structure for
  */
@@ -800,43 +753,151 @@ static int ice_vf_rebuild_host_tx_rate_cfg(struct ice_vf *vf)
        return 0;
 }
 
+static u16 ice_vf_get_port_vlan_id(struct ice_vf *vf)
+{
+       return vf->port_vlan_info.vid;
+}
+
+static u8 ice_vf_get_port_vlan_prio(struct ice_vf *vf)
+{
+       return vf->port_vlan_info.prio;
+}
+
+bool ice_vf_is_port_vlan_ena(struct ice_vf *vf)
+{
+       return (ice_vf_get_port_vlan_id(vf) || ice_vf_get_port_vlan_prio(vf));
+}
+
+static u16 ice_vf_get_port_vlan_tpid(struct ice_vf *vf)
+{
+       return vf->port_vlan_info.tpid;
+}
+
 /**
  * ice_vf_rebuild_host_vlan_cfg - add VLAN 0 filter or rebuild the Port VLAN
  * @vf: VF to rebuild host VLAN configuration for
+ * @vsi: Pointer to VSI
  *
  * Called after a VF VSI has been re-added/rebuilt during reset. The PF driver
  * always re-adds either a VLAN 0 or port VLAN based filter after reset.
  */
-static int ice_vf_rebuild_host_vlan_cfg(struct ice_vf *vf)
+static int ice_vf_rebuild_host_vlan_cfg(struct ice_vf *vf, struct ice_vsi *vsi)
 {
+       struct ice_vsi_vlan_ops *vlan_ops = ice_get_compat_vsi_vlan_ops(vsi);
        struct device *dev = ice_pf_to_dev(vf->pf);
-       struct ice_vsi *vsi = ice_get_vf_vsi(vf);
-       u16 vlan_id = 0;
        int err;
 
-       if (vf->port_vlan_info) {
-               err = ice_vsi_manage_pvid(vsi, vf->port_vlan_info, true);
+       if (ice_vf_is_port_vlan_ena(vf)) {
+               err = vlan_ops->set_port_vlan(vsi, &vf->port_vlan_info);
                if (err) {
                        dev_err(dev, "failed to configure port VLAN via VSI parameters for VF %u, error %d\n",
                                vf->vf_id, err);
                        return err;
                }
 
-               vlan_id = vf->port_vlan_info & VLAN_VID_MASK;
+               err = vlan_ops->add_vlan(vsi, &vf->port_vlan_info);
+       } else {
+               err = ice_vsi_add_vlan_zero(vsi);
        }
 
-       /* vlan_id will either be 0 or the port VLAN number */
-       err = ice_vsi_add_vlan(vsi, vlan_id, ICE_FWD_TO_VSI);
        if (err) {
-               dev_err(dev, "failed to add %s VLAN %u filter for VF %u, error %d\n",
-                       vf->port_vlan_info ? "port" : "", vlan_id, vf->vf_id,
-                       err);
+               dev_err(dev, "failed to add VLAN %u filter for VF %u during VF rebuild, error %d\n",
+                       ice_vf_is_port_vlan_ena(vf) ?
+                       ice_vf_get_port_vlan_id(vf) : 0, vf->vf_id, err);
                return err;
        }
 
+       err = vlan_ops->ena_rx_filtering(vsi);
+       if (err)
+               dev_warn(dev, "failed to enable Rx VLAN filtering for VF %d VSI %d during VF rebuild, error %d\n",
+                        vf->vf_id, vsi->idx, err);
+
        return 0;
 }
 
+static int ice_cfg_mac_antispoof(struct ice_vsi *vsi, bool enable)
+{
+       struct ice_vsi_ctx *ctx;
+       int err;
+
+       ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
+       if (!ctx)
+               return -ENOMEM;
+
+       ctx->info.sec_flags = vsi->info.sec_flags;
+       ctx->info.valid_sections = cpu_to_le16(ICE_AQ_VSI_PROP_SECURITY_VALID);
+
+       if (enable)
+               ctx->info.sec_flags |= ICE_AQ_VSI_SEC_FLAG_ENA_MAC_ANTI_SPOOF;
+       else
+               ctx->info.sec_flags &= ~ICE_AQ_VSI_SEC_FLAG_ENA_MAC_ANTI_SPOOF;
+
+       err = ice_update_vsi(&vsi->back->hw, vsi->idx, ctx, NULL);
+       if (err)
+               dev_err(ice_pf_to_dev(vsi->back), "Failed to configure Tx MAC anti-spoof %s for VSI %d, error %d\n",
+                       enable ? "ON" : "OFF", vsi->vsi_num, err);
+       else
+               vsi->info.sec_flags = ctx->info.sec_flags;
+
+       kfree(ctx);
+
+       return err;
+}
+
+/**
+ * ice_vsi_ena_spoofchk - enable Tx spoof checking for this VSI
+ * @vsi: VSI to enable Tx spoof checking for
+ */
+static int ice_vsi_ena_spoofchk(struct ice_vsi *vsi)
+{
+       struct ice_vsi_vlan_ops *vlan_ops;
+       int err;
+
+       vlan_ops = ice_get_compat_vsi_vlan_ops(vsi);
+
+       err = vlan_ops->ena_tx_filtering(vsi);
+       if (err)
+               return err;
+
+       return ice_cfg_mac_antispoof(vsi, true);
+}
+
+/**
+ * ice_vsi_dis_spoofchk - disable Tx spoof checking for this VSI
+ * @vsi: VSI to disable Tx spoof checking for
+ */
+static int ice_vsi_dis_spoofchk(struct ice_vsi *vsi)
+{
+       struct ice_vsi_vlan_ops *vlan_ops;
+       int err;
+
+       vlan_ops = ice_get_compat_vsi_vlan_ops(vsi);
+
+       err = vlan_ops->dis_tx_filtering(vsi);
+       if (err)
+               return err;
+
+       return ice_cfg_mac_antispoof(vsi, false);
+}
+
+/**
+ * ice_vf_set_spoofchk_cfg - apply Tx spoof checking setting
+ * @vf: VF to set spoofchk for
+ * @vsi: VSI associated to the VF
+ */
+static int
+ice_vf_set_spoofchk_cfg(struct ice_vf *vf, struct ice_vsi *vsi)
+{
+       int err;
+
+       if (vf->spoofchk)
+               err = ice_vsi_ena_spoofchk(vsi);
+       else
+               err = ice_vsi_dis_spoofchk(vsi);
+
+       return err;
+}
+
 /**
  * ice_vf_rebuild_host_mac_cfg - add broadcast and the VF's perm_addr/LAA
  * @vf: VF to add MAC filters for
@@ -1227,10 +1288,10 @@ ice_vf_set_vsi_promisc(struct ice_vf *vf, struct ice_vsi *vsi, u8 promisc_m)
        struct ice_hw *hw = &vsi->back->hw;
        int status;
 
-       if (vf->port_vlan_info)
+       if (ice_vf_is_port_vlan_ena(vf))
                status = ice_fltr_set_vsi_promisc(hw, vsi->idx, promisc_m,
-                                                 vf->port_vlan_info & VLAN_VID_MASK);
-       else if (vsi->num_vlan > 1)
+                                                 ice_vf_get_port_vlan_id(vf));
+       else if (ice_vsi_has_non_zero_vlans(vsi))
                status = ice_fltr_set_vlan_vsi_promisc(hw, vsi, promisc_m);
        else
                status = ice_fltr_set_vsi_promisc(hw, vsi->idx, promisc_m, 0);
@@ -1250,10 +1311,10 @@ ice_vf_clear_vsi_promisc(struct ice_vf *vf, struct ice_vsi *vsi, u8 promisc_m)
        struct ice_hw *hw = &vsi->back->hw;
        int status;
 
-       if (vf->port_vlan_info)
+       if (ice_vf_is_port_vlan_ena(vf))
                status = ice_fltr_clear_vsi_promisc(hw, vsi->idx, promisc_m,
-                                                   vf->port_vlan_info & VLAN_VID_MASK);
-       else if (vsi->num_vlan > 1)
+                                                   ice_vf_get_port_vlan_id(vf));
+       else if (ice_vsi_has_non_zero_vlans(vsi))
                status = ice_fltr_clear_vlan_vsi_promisc(hw, vsi, promisc_m);
        else
                status = ice_fltr_clear_vsi_promisc(hw, vsi->idx, promisc_m, 0);
@@ -1338,7 +1399,7 @@ static void ice_vf_rebuild_host_cfg(struct ice_vf *vf)
                dev_err(dev, "failed to rebuild default MAC configuration for VF %d\n",
                        vf->vf_id);
 
-       if (ice_vf_rebuild_host_vlan_cfg(vf))
+       if (ice_vf_rebuild_host_vlan_cfg(vf, vsi))
                dev_err(dev, "failed to rebuild VLAN configuration for VF %u\n",
                        vf->vf_id);
 
@@ -1346,6 +1407,10 @@ static void ice_vf_rebuild_host_cfg(struct ice_vf *vf)
                dev_err(dev, "failed to rebuild Tx rate limiting configuration for VF %u\n",
                        vf->vf_id);
 
+       if (ice_vf_set_spoofchk_cfg(vf, vsi))
+               dev_err(dev, "failed to rebuild spoofchk configuration for VF %d\n",
+                       vf->vf_id);
+
        /* rebuild aggregator node config for main VF VSI */
        ice_vf_rebuild_aggregator_node_cfg(vsi);
 }
@@ -1406,6 +1471,7 @@ static void ice_vf_set_initialized(struct ice_vf *vf)
        clear_bit(ICE_VF_STATE_UC_PROMISC, vf->vf_states);
        clear_bit(ICE_VF_STATE_DIS, vf->vf_states);
        set_bit(ICE_VF_STATE_INIT, vf->vf_states);
+       memset(&vf->vlan_v2_caps, 0, sizeof(vf->vlan_v2_caps));
 }
 
 /**
@@ -1623,7 +1689,7 @@ bool ice_reset_vf(struct ice_vf *vf, bool is_vflr)
         */
        if (test_bit(ICE_VF_STATE_UC_PROMISC, vf->vf_states) ||
            test_bit(ICE_VF_STATE_MC_PROMISC, vf->vf_states)) {
-               if (vf->port_vlan_info || vsi->num_vlan)
+               if (ice_vf_is_port_vlan_ena(vf) || vsi->num_vlan)
                        promisc_m = ICE_UCAST_VLAN_PROMISC_BITS;
                else
                        promisc_m = ICE_UCAST_PROMISC_BITS;
@@ -1732,6 +1798,7 @@ static void ice_vc_notify_vf_reset(struct ice_vf *vf)
  */
 static int ice_init_vf_vsi_res(struct ice_vf *vf)
 {
+       struct ice_vsi_vlan_ops *vlan_ops;
        struct ice_pf *pf = vf->pf;
        u8 broadcast[ETH_ALEN];
        struct ice_vsi *vsi;
@@ -1745,13 +1812,21 @@ static int ice_init_vf_vsi_res(struct ice_vf *vf)
        if (!vsi)
                return -ENOMEM;
 
-       err = ice_vsi_add_vlan(vsi, 0, ICE_FWD_TO_VSI);
+       err = ice_vsi_add_vlan_zero(vsi);
        if (err) {
                dev_warn(dev, "Failed to add VLAN 0 filter for VF %d\n",
                         vf->vf_id);
                goto release_vsi;
        }
 
+       vlan_ops = ice_get_compat_vsi_vlan_ops(vsi);
+       err = vlan_ops->ena_rx_filtering(vsi);
+       if (err) {
+               dev_warn(dev, "Failed to enable Rx VLAN filtering for VF %d\n",
+                        vf->vf_id);
+               goto release_vsi;
+       }
+
        eth_broadcast_addr(broadcast);
        err = ice_fltr_add_mac(vsi, broadcast, ICE_FWD_TO_VSI);
        if (err) {
@@ -1760,6 +1835,13 @@ static int ice_init_vf_vsi_res(struct ice_vf *vf)
                goto release_vsi;
        }
 
+       err = ice_vf_set_spoofchk_cfg(vf, vsi);
+       if (err) {
+               dev_warn(dev, "Failed to initialize spoofchk setting for VF %d\n",
+                        vf->vf_id);
+               goto release_vsi;
+       }
+
        vf->num_mac = 1;
 
        return 0;
@@ -2239,7 +2321,7 @@ static u16 ice_vc_get_max_frame_size(struct ice_vf *vf)
 
        max_frame_size = pi->phy.link_info.max_frame_size;
 
-       if (vf->port_vlan_info)
+       if (ice_vf_is_port_vlan_ena(vf))
                max_frame_size -= VLAN_HLEN;
 
        return max_frame_size;
@@ -2288,8 +2370,33 @@ static int ice_vc_get_vf_res_msg(struct ice_vf *vf, u8 *msg)
                goto err;
        }
 
-       if (!vsi->info.pvid)
-               vfres->vf_cap_flags |= VIRTCHNL_VF_OFFLOAD_VLAN;
+       if (vf->driver_caps & VIRTCHNL_VF_OFFLOAD_VLAN_V2) {
+               /* VLAN offloads based on current device configuration */
+               vfres->vf_cap_flags |= VIRTCHNL_VF_OFFLOAD_VLAN_V2;
+       } else if (vf->driver_caps & VIRTCHNL_VF_OFFLOAD_VLAN) {
+               /* allow the VF to negotiate VIRTCHNL_VF_OFFLOAD_VLAN only for
+                * these two conditions, which amounts to guest VLAN filtering
+                * and offloads being based on the inner VLAN or the
+                * inner/single VLAN respectively, and don't allow the VF to
+                * negotiate VIRTCHNL_VF_OFFLOAD_VLAN in any other case
+                */
+               if (ice_is_dvm_ena(&pf->hw) && ice_vf_is_port_vlan_ena(vf)) {
+                       vfres->vf_cap_flags |= VIRTCHNL_VF_OFFLOAD_VLAN;
+               } else if (!ice_is_dvm_ena(&pf->hw) &&
+                          !ice_vf_is_port_vlan_ena(vf)) {
+                       vfres->vf_cap_flags |= VIRTCHNL_VF_OFFLOAD_VLAN;
+                       /* configure backward compatible support for VFs that
+                        * only support VIRTCHNL_VF_OFFLOAD_VLAN, the PF is
+                        * configured in SVM, and no port VLAN is configured
+                        */
+                       ice_vf_vsi_cfg_svm_legacy_vlan_mode(vsi);
+               } else if (ice_is_dvm_ena(&pf->hw)) {
+                       /* configure software offloaded VLAN support when DVM
+                        * is enabled, but no port VLAN is enabled
+                        */
+                       ice_vf_vsi_cfg_dvm_legacy_vlan_mode(vsi);
+               }
+       }
 
        if (vf->driver_caps & VIRTCHNL_VF_OFFLOAD_RSS_PF) {
                vfres->vf_cap_flags |= VIRTCHNL_VF_OFFLOAD_RSS_PF;
@@ -2892,7 +2999,6 @@ int ice_set_vf_spoofchk(struct net_device *netdev, int vf_id, bool ena)
 {
        struct ice_netdev_priv *np = netdev_priv(netdev);
        struct ice_pf *pf = np->vsi->back;
-       struct ice_vsi_ctx *ctx;
        struct ice_vsi *vf_vsi;
        struct device *dev;
        struct ice_vf *vf;
@@ -2925,37 +3031,16 @@ int ice_set_vf_spoofchk(struct net_device *netdev, int vf_id, bool ena)
                return 0;
        }
 
-       ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
-       if (!ctx)
-               return -ENOMEM;
-
-       ctx->info.sec_flags = vf_vsi->info.sec_flags;
-       ctx->info.valid_sections = cpu_to_le16(ICE_AQ_VSI_PROP_SECURITY_VALID);
-       if (ena) {
-               ctx->info.sec_flags |=
-                       ICE_AQ_VSI_SEC_FLAG_ENA_MAC_ANTI_SPOOF |
-                       (ICE_AQ_VSI_SEC_TX_VLAN_PRUNE_ENA <<
-                        ICE_AQ_VSI_SEC_TX_PRUNE_ENA_S);
-       } else {
-               ctx->info.sec_flags &=
-                       ~(ICE_AQ_VSI_SEC_FLAG_ENA_MAC_ANTI_SPOOF |
-                         (ICE_AQ_VSI_SEC_TX_VLAN_PRUNE_ENA <<
-                          ICE_AQ_VSI_SEC_TX_PRUNE_ENA_S));
-       }
-
-       ret = ice_update_vsi(&pf->hw, vf_vsi->idx, ctx, NULL);
-       if (ret) {
-               dev_err(dev, "Failed to %sable spoofchk on VF %d VSI %d\n error %d\n",
-                       ena ? "en" : "dis", vf->vf_id, vf_vsi->vsi_num, ret);
-               goto out;
-       }
-
-       /* only update spoofchk state and VSI context on success */
-       vf_vsi->info.sec_flags = ctx->info.sec_flags;
-       vf->spoofchk = ena;
+       if (ena)
+               ret = ice_vsi_ena_spoofchk(vf_vsi);
+       else
+               ret = ice_vsi_dis_spoofchk(vf_vsi);
+       if (ret)
+               dev_err(dev, "Failed to set spoofchk %s for VF %d VSI %d, error %d\n",
+                       ena ? "ON" : "OFF", vf->vf_id, vf_vsi->vsi_num, ret);
+       else
+               vf->spoofchk = ena;
 
-out:
-       kfree(ctx);
        return ret;
 }
 
@@ -2995,6 +3080,7 @@ static int ice_vc_cfg_promiscuous_mode_msg(struct ice_vf *vf, u8 *msg)
        bool rm_promisc, alluni = false, allmulti = false;
        struct virtchnl_promisc_info *info =
            (struct virtchnl_promisc_info *)msg;
+       struct ice_vsi_vlan_ops *vlan_ops;
        int mcast_err = 0, ucast_err = 0;
        struct ice_pf *pf = vf->pf;
        struct ice_vsi *vsi;
@@ -3033,16 +3119,15 @@ static int ice_vc_cfg_promiscuous_mode_msg(struct ice_vf *vf, u8 *msg)
 
        rm_promisc = !allmulti && !alluni;
 
-       if (vsi->num_vlan || vf->port_vlan_info) {
-               if (rm_promisc)
-                       ret = ice_cfg_vlan_pruning(vsi, true);
-               else
-                       ret = ice_cfg_vlan_pruning(vsi, false);
-               if (ret) {
-                       dev_err(dev, "Failed to configure VLAN pruning in promiscuous mode\n");
-                       v_ret = VIRTCHNL_STATUS_ERR_PARAM;
-                       goto error_param;
-               }
+       vlan_ops = ice_get_compat_vsi_vlan_ops(vsi);
+       if (rm_promisc)
+               ret = vlan_ops->ena_rx_filtering(vsi);
+       else
+               ret = vlan_ops->dis_rx_filtering(vsi);
+       if (ret) {
+               dev_err(dev, "Failed to configure VLAN pruning in promiscuous mode\n");
+               v_ret = VIRTCHNL_STATUS_ERR_PARAM;
+               goto error_param;
        }
 
        if (!test_bit(ICE_FLAG_VF_TRUE_PROMISC_ENA, pf->flags)) {
@@ -3069,7 +3154,8 @@ static int ice_vc_cfg_promiscuous_mode_msg(struct ice_vf *vf, u8 *msg)
        } else {
                u8 mcast_m, ucast_m;
 
-               if (vf->port_vlan_info || vsi->num_vlan > 1) {
+               if (ice_vf_is_port_vlan_ena(vf) ||
+                   ice_vsi_has_non_zero_vlans(vsi)) {
                        mcast_m = ICE_MCAST_VLAN_PROMISC_BITS;
                        ucast_m = ICE_UCAST_VLAN_PROMISC_BITS;
                } else {
@@ -3652,7 +3738,7 @@ static int ice_vc_cfg_qs_msg(struct ice_vf *vf, u8 *msg)
                        /* add space for the port VLAN since the VF driver is not
                         * expected to account for it in the MTU calculation
                         */
-                       if (vf->port_vlan_info)
+                       if (ice_vf_is_port_vlan_ena(vf))
                                vsi->max_frame += VLAN_HLEN;
 
                        if (ice_vsi_cfg_single_rxq(vsi, q_idx)) {
@@ -4064,6 +4150,33 @@ error_param:
 }
 
 /**
+ * ice_is_supported_port_vlan_proto - make sure the vlan_proto is supported
+ * @hw: hardware structure used to check the VLAN mode
+ * @vlan_proto: VLAN TPID being checked
+ *
+ * If the device is configured in Double VLAN Mode (DVM), then both ETH_P_8021Q
+ * and ETH_P_8021AD are supported. If the device is configured in Single VLAN
+ * Mode (SVM), then only ETH_P_8021Q is supported.
+ */
+static bool
+ice_is_supported_port_vlan_proto(struct ice_hw *hw, u16 vlan_proto)
+{
+       bool is_supported = false;
+
+       switch (vlan_proto) {
+       case ETH_P_8021Q:
+               is_supported = true;
+               break;
+       case ETH_P_8021AD:
+               if (ice_is_dvm_ena(hw))
+                       is_supported = true;
+               break;
+       }
+
+       return is_supported;
+}
+
+/**
  * ice_set_vf_port_vlan
  * @netdev: network interface device structure
  * @vf_id: VF identifier
@@ -4078,9 +4191,9 @@ ice_set_vf_port_vlan(struct net_device *netdev, int vf_id, u16 vlan_id, u8 qos,
                     __be16 vlan_proto)
 {
        struct ice_pf *pf = ice_netdev_to_pf(netdev);
+       u16 local_vlan_proto = ntohs(vlan_proto);
        struct device *dev;
        struct ice_vf *vf;
-       u16 vlanprio;
        int ret;
 
        dev = ice_pf_to_dev(pf);
@@ -4093,8 +4206,9 @@ ice_set_vf_port_vlan(struct net_device *netdev, int vf_id, u16 vlan_id, u8 qos,
                return -EINVAL;
        }
 
-       if (vlan_proto != htons(ETH_P_8021Q)) {
-               dev_err(dev, "VF VLAN protocol is not supported\n");
+       if (!ice_is_supported_port_vlan_proto(&pf->hw, local_vlan_proto)) {
+               dev_err(dev, "VF VLAN protocol 0x%04x is not supported\n",
+                       local_vlan_proto);
                return -EPROTONOSUPPORT;
        }
 
@@ -4103,21 +4217,21 @@ ice_set_vf_port_vlan(struct net_device *netdev, int vf_id, u16 vlan_id, u8 qos,
        if (ret)
                return ret;
 
-       vlanprio = vlan_id | (qos << VLAN_PRIO_SHIFT);
-
-       if (vf->port_vlan_info == vlanprio) {
+       if (ice_vf_get_port_vlan_prio(vf) == qos &&
+           ice_vf_get_port_vlan_tpid(vf) == local_vlan_proto &&
+           ice_vf_get_port_vlan_id(vf) == vlan_id) {
                /* duplicate request, so just return success */
-               dev_dbg(dev, "Duplicate pvid %d request\n", vlanprio);
+               dev_dbg(dev, "Duplicate port VLAN %u, QoS %u, TPID 0x%04x request\n",
+                       vlan_id, qos, local_vlan_proto);
                return 0;
        }
 
        mutex_lock(&vf->cfg_lock);
 
-       vf->port_vlan_info = vlanprio;
-
-       if (vf->port_vlan_info)
-               dev_info(dev, "Setting VLAN %d, QoS 0x%x on VF %d\n",
-                        vlan_id, qos, vf_id);
+       vf->port_vlan_info = ICE_VLAN(local_vlan_proto, vlan_id, qos);
+       if (ice_vf_is_port_vlan_ena(vf))
+               dev_info(dev, "Setting VLAN %u, QoS %u, TPID 0x%04x on VF %d\n",
+                        vlan_id, qos, local_vlan_proto, vf_id);
        else
                dev_info(dev, "Clearing port VLAN on VF %d\n", vf_id);
 
@@ -4139,6 +4253,83 @@ static bool ice_vf_vlan_offload_ena(u32 caps)
 }
 
 /**
+ * ice_is_vlan_promisc_allowed - check if VLAN promiscuous config is allowed
+ * @vf: VF used to determine if VLAN promiscuous config is allowed
+ */
+static bool ice_is_vlan_promisc_allowed(struct ice_vf *vf)
+{
+       if ((test_bit(ICE_VF_STATE_UC_PROMISC, vf->vf_states) ||
+            test_bit(ICE_VF_STATE_MC_PROMISC, vf->vf_states)) &&
+           test_bit(ICE_FLAG_VF_TRUE_PROMISC_ENA, vf->pf->flags))
+               return true;
+
+       return false;
+}
+
+/**
+ * ice_vf_ena_vlan_promisc - Enable Tx/Rx VLAN promiscuous for the VLAN
+ * @vsi: VF's VSI used to enable VLAN promiscuous mode
+ * @vlan: VLAN used to enable VLAN promiscuous
+ *
+ * This function should only be called if VLAN promiscuous mode is allowed,
+ * which can be determined via ice_is_vlan_promisc_allowed().
+ */
+static int ice_vf_ena_vlan_promisc(struct ice_vsi *vsi, struct ice_vlan *vlan)
+{
+       u8 promisc_m = ICE_PROMISC_VLAN_TX | ICE_PROMISC_VLAN_RX;
+       int status;
+
+       status = ice_fltr_set_vsi_promisc(&vsi->back->hw, vsi->idx, promisc_m,
+                                         vlan->vid);
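+       /* tolerate -EEXIST so that re-enabling promiscuous mode for an
+        * already programmed VLAN is not treated as a failure
+        */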
+       if (status && status != -EEXIST)
+               return status;
+
+       return 0;
+}
+
+/**
+ * ice_vf_dis_vlan_promisc - Disable Tx/Rx VLAN promiscuous for the VLAN
+ * @vsi: VF's VSI used to disable VLAN promiscuous mode
+ * @vlan: VLAN used to disable VLAN promiscuous
+ *
+ * This function should only be called if VLAN promiscuous mode is allowed,
+ * which can be determined via ice_is_vlan_promisc_allowed().
+ */
+static int ice_vf_dis_vlan_promisc(struct ice_vsi *vsi, struct ice_vlan *vlan)
+{
+       u8 promisc_m = ICE_PROMISC_VLAN_TX | ICE_PROMISC_VLAN_RX;
+       int status;
+
+       status = ice_fltr_clear_vsi_promisc(&vsi->back->hw, vsi->idx, promisc_m,
+                                           vlan->vid);
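+       /* tolerate -ENOENT so that clearing promiscuous mode for a VLAN that
+        * was never programmed is not treated as a failure
+        */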
+       if (status && status != -ENOENT)
+               return status;
+
+       return 0;
+}
+
+/**
+ * ice_vf_has_max_vlans - check if VF already has the max allowed VLAN filters
+ * @vf: VF to check against
+ * @vsi: VF's VSI
+ *
+ * If the VF is trusted then the VF is allowed to add as many VLANs as it
+ * wants to, so return false.
+ *
+ * When the VF is untrusted compare the number of non-zero VLANs + 1 to the max
+ * allowed VLANs for an untrusted VF. Return the result of this comparison.
+ */
+static bool ice_vf_has_max_vlans(struct ice_vf *vf, struct ice_vsi *vsi)
+{
+       if (ice_is_vf_trusted(vf))
+               return false;
+
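+/* account for the VLAN 0 filter that is added by default for each VF */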
+#define ICE_VF_ADDED_VLAN_ZERO_FLTRS   1
+       return ((ice_vsi_num_non_zero_vlans(vsi) +
+               ICE_VF_ADDED_VLAN_ZERO_FLTRS) >= ICE_MAX_VLAN_PER_VF);
+}
+
+/**
  * ice_vc_process_vlan_msg
  * @vf: pointer to the VF info
  * @msg: pointer to the msg buffer
@@ -4155,9 +4346,7 @@ static int ice_vc_process_vlan_msg(struct ice_vf *vf, u8 *msg, bool add_v)
        bool vlan_promisc = false;
        struct ice_vsi *vsi;
        struct device *dev;
-       struct ice_hw *hw;
        int status = 0;
-       u8 promisc_m;
        int i;
 
        dev = ice_pf_to_dev(pf);
@@ -4185,15 +4374,13 @@ static int ice_vc_process_vlan_msg(struct ice_vf *vf, u8 *msg, bool add_v)
                }
        }
 
-       hw = &pf->hw;
        vsi = ice_get_vf_vsi(vf);
        if (!vsi) {
                v_ret = VIRTCHNL_STATUS_ERR_PARAM;
                goto error_param;
        }
 
-       if (add_v && !ice_is_vf_trusted(vf) &&
-           vsi->num_vlan >= ICE_MAX_VLAN_PER_VF) {
+       if (add_v && ice_vf_has_max_vlans(vf, vsi)) {
                dev_info(dev, "VF-%d is not trusted, switch the VF to trusted mode, in order to add more VLAN addresses\n",
                         vf->vf_id);
                /* There is no need to let VF know about being not trusted,
@@ -4202,22 +4389,28 @@ static int ice_vc_process_vlan_msg(struct ice_vf *vf, u8 *msg, bool add_v)
                goto error_param;
        }
 
-       if (vsi->info.pvid) {
+       /* in DVM a VF can add/delete inner VLAN filters when
+        * VIRTCHNL_VF_OFFLOAD_VLAN is negotiated, so only reject in SVM
+        */
+       if (ice_vf_is_port_vlan_ena(vf) && !ice_is_dvm_ena(&pf->hw)) {
                v_ret = VIRTCHNL_STATUS_ERR_PARAM;
                goto error_param;
        }
 
-       if ((test_bit(ICE_VF_STATE_UC_PROMISC, vf->vf_states) ||
-            test_bit(ICE_VF_STATE_MC_PROMISC, vf->vf_states)) &&
-           test_bit(ICE_FLAG_VF_TRUE_PROMISC_ENA, pf->flags))
-               vlan_promisc = true;
+       /* in DVM VLAN promiscuous is based on the outer VLAN, which would be
+        * the port VLAN if VIRTCHNL_VF_OFFLOAD_VLAN was negotiated, so only
+        * allow vlan_promisc = true in SVM and if no port VLAN is configured
+        */
+       vlan_promisc = ice_is_vlan_promisc_allowed(vf) &&
+               !ice_is_dvm_ena(&pf->hw) &&
+               !ice_vf_is_port_vlan_ena(vf);
 
        if (add_v) {
                for (i = 0; i < vfl->num_elements; i++) {
                        u16 vid = vfl->vlan_id[i];
+                       struct ice_vlan vlan;
 
-                       if (!ice_is_vf_trusted(vf) &&
-                           vsi->num_vlan >= ICE_MAX_VLAN_PER_VF) {
+                       if (ice_vf_has_max_vlans(vf, vsi)) {
                                dev_info(dev, "VF-%d is not trusted, switch the VF to trusted mode, in order to add more VLAN addresses\n",
                                         vf->vf_id);
                                /* There is no need to let VF know about being
@@ -4234,29 +4427,23 @@ static int ice_vc_process_vlan_msg(struct ice_vf *vf, u8 *msg, bool add_v)
                        if (!vid)
                                continue;
 
-                       status = ice_vsi_add_vlan(vsi, vid, ICE_FWD_TO_VSI);
+                       vlan = ICE_VLAN(ETH_P_8021Q, vid, 0);
+                       status = vsi->inner_vlan_ops.add_vlan(vsi, &vlan);
                        if (status) {
                                v_ret = VIRTCHNL_STATUS_ERR_PARAM;
                                goto error_param;
                        }
 
-                       /* Enable VLAN pruning when non-zero VLAN is added */
-                       if (!vlan_promisc && vid &&
-                           !ice_vsi_is_vlan_pruning_ena(vsi)) {
-                               status = ice_cfg_vlan_pruning(vsi, true);
-                               if (status) {
+                       /* Enable VLAN filtering on first non-zero VLAN */
+                       if (!vlan_promisc && vid && !ice_is_dvm_ena(&pf->hw)) {
+                               status = vsi->inner_vlan_ops.ena_rx_filtering(vsi);
+                               if (status) {
                                        v_ret = VIRTCHNL_STATUS_ERR_PARAM;
                                        dev_err(dev, "Enable VLAN pruning on VLAN ID: %d failed error-%d\n",
                                                vid, status);
                                        goto error_param;
                                }
                        } else if (vlan_promisc) {
-                               /* Enable Ucast/Mcast VLAN promiscuous mode */
-                               promisc_m = ICE_PROMISC_VLAN_TX |
-                                           ICE_PROMISC_VLAN_RX;
-
-                               status = ice_set_vsi_promisc(hw, vsi->idx,
-                                                            promisc_m, vid);
+                               status = ice_vf_ena_vlan_promisc(vsi, &vlan);
                                if (status) {
                                        v_ret = VIRTCHNL_STATUS_ERR_PARAM;
                                        dev_err(dev, "Enable Unicast/multicast promiscuous mode on VLAN ID:%d failed error-%d\n",
@@ -4277,6 +4464,7 @@ static int ice_vc_process_vlan_msg(struct ice_vf *vf, u8 *msg, bool add_v)
                num_vf_vlan = vsi->num_vlan;
                for (i = 0; i < vfl->num_elements && i < num_vf_vlan; i++) {
                        u16 vid = vfl->vlan_id[i];
+                       struct ice_vlan vlan;
 
                        /* we add VLAN 0 by default for each VF so we can enable
                         * Tx VLAN anti-spoof without triggering MDD events so
@@ -4285,28 +4473,19 @@ static int ice_vc_process_vlan_msg(struct ice_vf *vf, u8 *msg, bool add_v)
                        if (!vid)
                                continue;
 
-                       /* Make sure ice_vsi_kill_vlan is successful before
-                        * updating VLAN information
-                        */
-                       status = ice_vsi_kill_vlan(vsi, vid);
+                       vlan = ICE_VLAN(ETH_P_8021Q, vid, 0);
+                       status = vsi->inner_vlan_ops.del_vlan(vsi, &vlan);
                        if (status) {
                                v_ret = VIRTCHNL_STATUS_ERR_PARAM;
                                goto error_param;
                        }
 
-                       /* Disable VLAN pruning when only VLAN 0 is left */
-                       if (vsi->num_vlan == 1 &&
-                           ice_vsi_is_vlan_pruning_ena(vsi))
-                               ice_cfg_vlan_pruning(vsi, false);
-
-                       /* Disable Unicast/Multicast VLAN promiscuous mode */
-                       if (vlan_promisc) {
-                               promisc_m = ICE_PROMISC_VLAN_TX |
-                                           ICE_PROMISC_VLAN_RX;
+                       /* Disable VLAN filtering when only VLAN 0 is left */
+                       if (!ice_vsi_has_non_zero_vlans(vsi))
+                               vsi->inner_vlan_ops.dis_rx_filtering(vsi);
 
-                               ice_clear_vsi_promisc(hw, vsi->idx,
-                                                     promisc_m, vid);
-                       }
+                       if (vlan_promisc)
+                               ice_vf_dis_vlan_promisc(vsi, &vlan);
                }
        }
 
@@ -4366,7 +4545,7 @@ static int ice_vc_ena_vlan_stripping(struct ice_vf *vf)
        }
 
        vsi = ice_get_vf_vsi(vf);
-       if (ice_vsi_manage_vlan_stripping(vsi, true))
+       if (vsi->inner_vlan_ops.ena_stripping(vsi, ETH_P_8021Q))
                v_ret = VIRTCHNL_STATUS_ERR_PARAM;
 
 error_param:
@@ -4401,7 +4580,7 @@ static int ice_vc_dis_vlan_stripping(struct ice_vf *vf)
                goto error_param;
        }
 
-       if (ice_vsi_manage_vlan_stripping(vsi, false))
+       if (vsi->inner_vlan_ops.dis_stripping(vsi))
                v_ret = VIRTCHNL_STATUS_ERR_PARAM;
 
 error_param:
@@ -4413,11 +4592,8 @@ error_param:
  * ice_vf_init_vlan_stripping - enable/disable VLAN stripping on initialization
  * @vf: VF to enable/disable VLAN stripping for on initialization
  *
- * If the VIRTCHNL_VF_OFFLOAD_VLAN flag is set enable VLAN stripping, else if
- * the flag is cleared then we want to disable stripping. For example, the flag
- * will be cleared when port VLANs are configured by the administrator before
- * passing the VF to the guest or if the AVF driver doesn't support VLAN
- * offloads.
+ * Set the default for VLAN stripping based on whether a port VLAN is configured
+ * and the current VLAN mode of the device.
  */
 static int ice_vf_init_vlan_stripping(struct ice_vf *vf)
 {
@@ -4426,68 +4602,1026 @@ static int ice_vf_init_vlan_stripping(struct ice_vf *vf)
        if (!vsi)
                return -EINVAL;
 
-       /* don't modify stripping if port VLAN is configured */
-       if (vsi->info.pvid)
+       /* don't modify stripping if port VLAN is configured in SVM since the
+        * port VLAN is based on the inner/single VLAN in SVM
+        */
+       if (ice_vf_is_port_vlan_ena(vf) && !ice_is_dvm_ena(&vsi->back->hw))
                return 0;
 
        if (ice_vf_vlan_offload_ena(vf->driver_caps))
-               return ice_vsi_manage_vlan_stripping(vsi, true);
+               return vsi->inner_vlan_ops.ena_stripping(vsi, ETH_P_8021Q);
        else
-               return ice_vsi_manage_vlan_stripping(vsi, false);
+               return vsi->inner_vlan_ops.dis_stripping(vsi);
 }
 
-static struct ice_vc_vf_ops ice_vc_vf_dflt_ops = {
-       .get_ver_msg = ice_vc_get_ver_msg,
-       .get_vf_res_msg = ice_vc_get_vf_res_msg,
-       .reset_vf = ice_vc_reset_vf_msg,
-       .add_mac_addr_msg = ice_vc_add_mac_addr_msg,
-       .del_mac_addr_msg = ice_vc_del_mac_addr_msg,
-       .cfg_qs_msg = ice_vc_cfg_qs_msg,
-       .ena_qs_msg = ice_vc_ena_qs_msg,
-       .dis_qs_msg = ice_vc_dis_qs_msg,
-       .request_qs_msg = ice_vc_request_qs_msg,
-       .cfg_irq_map_msg = ice_vc_cfg_irq_map_msg,
-       .config_rss_key = ice_vc_config_rss_key,
-       .config_rss_lut = ice_vc_config_rss_lut,
-       .get_stats_msg = ice_vc_get_stats_msg,
-       .cfg_promiscuous_mode_msg = ice_vc_cfg_promiscuous_mode_msg,
-       .add_vlan_msg = ice_vc_add_vlan_msg,
-       .remove_vlan_msg = ice_vc_remove_vlan_msg,
-       .ena_vlan_stripping = ice_vc_ena_vlan_stripping,
-       .dis_vlan_stripping = ice_vc_dis_vlan_stripping,
-       .handle_rss_cfg_msg = ice_vc_handle_rss_cfg,
-       .add_fdir_fltr_msg = ice_vc_add_fdir_fltr,
-       .del_fdir_fltr_msg = ice_vc_del_fdir_fltr,
-};
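+/**
+ * ice_vc_get_max_vlan_fltrs - get the maximum VLAN filters the VF may use
+ * @vf: VF to get the maximum VLAN filter count for
+ *
+ * Trusted VFs can use the full VLAN ID space; untrusted VFs are limited to
+ * ICE_MAX_VLAN_PER_VF.
+ */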
+static u16 ice_vc_get_max_vlan_fltrs(struct ice_vf *vf)
+{
+       if (vf->trusted)
+               return VLAN_N_VID;
+       else
+               return ICE_MAX_VLAN_PER_VF;
+}
 
-void ice_vc_set_dflt_vf_ops(struct ice_vc_vf_ops *ops)
+/**
+ * ice_vf_outer_vlan_not_allowed - check if the VF may not use outer VLANs in DVM
+ * @vf: VF being checked
+ */
+static bool ice_vf_outer_vlan_not_allowed(struct ice_vf *vf)
 {
-       *ops = ice_vc_vf_dflt_ops;
+       if (ice_vf_is_port_vlan_ena(vf))
+               return true;
+
+       return false;
 }
 
 /**
- * ice_vc_repr_add_mac
- * @vf: pointer to VF
- * @msg: virtchannel message
+ * ice_vc_set_dvm_caps - set VLAN capabilities when the device is in DVM
+ * @vf: VF that capabilities are being set for
+ * @caps: VLAN capabilities to populate
  *
- * When port representors are created, we do not add MAC rule
- * to firmware, we store it so that PF could report same
- * MAC as VF.
+ * Determine the supported VLAN capabilities based on whether a port VLAN is
+ * configured. If a port VLAN is configured then the VF should use the inner
+ * filtering/offload capabilities since the port VLAN is using the outer VLAN
+ * capabilities.
  */
-static int ice_vc_repr_add_mac(struct ice_vf *vf, u8 *msg)
+static void
+ice_vc_set_dvm_caps(struct ice_vf *vf, struct virtchnl_vlan_caps *caps)
 {
-       enum virtchnl_status_code v_ret = VIRTCHNL_STATUS_SUCCESS;
-       struct virtchnl_ether_addr_list *al =
-           (struct virtchnl_ether_addr_list *)msg;
-       struct ice_vsi *vsi;
-       struct ice_pf *pf;
-       int i;
+       struct virtchnl_vlan_supported_caps *supported_caps;
 
-       if (!test_bit(ICE_VF_STATE_ACTIVE, vf->vf_states) ||
-           !ice_vc_isvalid_vsi_id(vf, al->vsi_id)) {
-               v_ret = VIRTCHNL_STATUS_ERR_PARAM;
-               goto handle_mac_exit;
-       }
+       if (ice_vf_outer_vlan_not_allowed(vf)) {
+               /* until support for inner VLAN filtering is added when a port
+                * VLAN is configured, only support software offloaded inner
+                * VLANs when a port VLAN is configured in DVM
+                */
+               supported_caps = &caps->filtering.filtering_support;
+               supported_caps->inner = VIRTCHNL_VLAN_UNSUPPORTED;
+
+               supported_caps = &caps->offloads.stripping_support;
+               supported_caps->inner = VIRTCHNL_VLAN_ETHERTYPE_8100 |
+                                       VIRTCHNL_VLAN_TOGGLE |
+                                       VIRTCHNL_VLAN_TAG_LOCATION_L2TAG1;
+               supported_caps->outer = VIRTCHNL_VLAN_UNSUPPORTED;
+
+               supported_caps = &caps->offloads.insertion_support;
+               supported_caps->inner = VIRTCHNL_VLAN_ETHERTYPE_8100 |
+                                       VIRTCHNL_VLAN_TOGGLE |
+                                       VIRTCHNL_VLAN_TAG_LOCATION_L2TAG1;
+               supported_caps->outer = VIRTCHNL_VLAN_UNSUPPORTED;
+
+               caps->offloads.ethertype_init = VIRTCHNL_VLAN_ETHERTYPE_8100;
+               caps->offloads.ethertype_match =
+                       VIRTCHNL_ETHERTYPE_STRIPPING_MATCHES_INSERTION;
+       } else {
+               supported_caps = &caps->filtering.filtering_support;
+               supported_caps->inner = VIRTCHNL_VLAN_UNSUPPORTED;
+               supported_caps->outer = VIRTCHNL_VLAN_ETHERTYPE_8100 |
+                                       VIRTCHNL_VLAN_ETHERTYPE_88A8 |
+                                       VIRTCHNL_VLAN_ETHERTYPE_9100 |
+                                       VIRTCHNL_VLAN_ETHERTYPE_AND;
+               caps->filtering.ethertype_init = VIRTCHNL_VLAN_ETHERTYPE_8100 |
+                                                VIRTCHNL_VLAN_ETHERTYPE_88A8 |
+                                                VIRTCHNL_VLAN_ETHERTYPE_9100;
+
+               supported_caps = &caps->offloads.stripping_support;
+               supported_caps->inner = VIRTCHNL_VLAN_TOGGLE |
+                                       VIRTCHNL_VLAN_ETHERTYPE_8100 |
+                                       VIRTCHNL_VLAN_TAG_LOCATION_L2TAG1;
+               supported_caps->outer = VIRTCHNL_VLAN_TOGGLE |
+                                       VIRTCHNL_VLAN_ETHERTYPE_8100 |
+                                       VIRTCHNL_VLAN_ETHERTYPE_88A8 |
+                                       VIRTCHNL_VLAN_ETHERTYPE_9100 |
+                                       VIRTCHNL_VLAN_ETHERTYPE_XOR |
+                                       VIRTCHNL_VLAN_TAG_LOCATION_L2TAG2_2;
+
+               supported_caps = &caps->offloads.insertion_support;
+               supported_caps->inner = VIRTCHNL_VLAN_TOGGLE |
+                                       VIRTCHNL_VLAN_ETHERTYPE_8100 |
+                                       VIRTCHNL_VLAN_TAG_LOCATION_L2TAG1;
+               supported_caps->outer = VIRTCHNL_VLAN_TOGGLE |
+                                       VIRTCHNL_VLAN_ETHERTYPE_8100 |
+                                       VIRTCHNL_VLAN_ETHERTYPE_88A8 |
+                                       VIRTCHNL_VLAN_ETHERTYPE_9100 |
+                                       VIRTCHNL_VLAN_ETHERTYPE_XOR |
+                                       VIRTCHNL_VLAN_TAG_LOCATION_L2TAG2;
+
+               caps->offloads.ethertype_init = VIRTCHNL_VLAN_ETHERTYPE_8100;
+
+               caps->offloads.ethertype_match =
+                       VIRTCHNL_ETHERTYPE_STRIPPING_MATCHES_INSERTION;
+       }
+
+       caps->filtering.max_filters = ice_vc_get_max_vlan_fltrs(vf);
+}
+
+/**
+ * ice_vc_set_svm_caps - set VLAN capabilities when the device is in SVM
+ * @vf: VF that capabilities are being set for
+ * @caps: VLAN capabilities to populate
+ *
+ * Determine the supported VLAN capabilities based on whether a port VLAN is
+ * configured. If a port VLAN is configured then the VF does not have any VLAN
+ * filtering or offload capabilities since the port VLAN is using the inner
+ * VLAN capabilities in single VLAN mode (SVM). Otherwise allow the VF to use
+ * inner VLAN filtering and offload capabilities.
+ */
+static void
+ice_vc_set_svm_caps(struct ice_vf *vf, struct virtchnl_vlan_caps *caps)
+{
+       struct virtchnl_vlan_supported_caps *supported_caps;
+
+       if (ice_vf_is_port_vlan_ena(vf)) {
+               supported_caps = &caps->filtering.filtering_support;
+               supported_caps->inner = VIRTCHNL_VLAN_UNSUPPORTED;
+               supported_caps->outer = VIRTCHNL_VLAN_UNSUPPORTED;
+
+               supported_caps = &caps->offloads.stripping_support;
+               supported_caps->inner = VIRTCHNL_VLAN_UNSUPPORTED;
+               supported_caps->outer = VIRTCHNL_VLAN_UNSUPPORTED;
+
+               supported_caps = &caps->offloads.insertion_support;
+               supported_caps->inner = VIRTCHNL_VLAN_UNSUPPORTED;
+               supported_caps->outer = VIRTCHNL_VLAN_UNSUPPORTED;
+
+               caps->offloads.ethertype_init = VIRTCHNL_VLAN_UNSUPPORTED;
+               caps->offloads.ethertype_match = VIRTCHNL_VLAN_UNSUPPORTED;
+               caps->filtering.max_filters = 0;
+       } else {
+               supported_caps = &caps->filtering.filtering_support;
+               supported_caps->inner = VIRTCHNL_VLAN_ETHERTYPE_8100;
+               supported_caps->outer = VIRTCHNL_VLAN_UNSUPPORTED;
+               caps->filtering.ethertype_init = VIRTCHNL_VLAN_ETHERTYPE_8100;
+
+               supported_caps = &caps->offloads.stripping_support;
+               supported_caps->inner = VIRTCHNL_VLAN_ETHERTYPE_8100 |
+                                       VIRTCHNL_VLAN_TOGGLE |
+                                       VIRTCHNL_VLAN_TAG_LOCATION_L2TAG1;
+               supported_caps->outer = VIRTCHNL_VLAN_UNSUPPORTED;
+
+               supported_caps = &caps->offloads.insertion_support;
+               supported_caps->inner = VIRTCHNL_VLAN_ETHERTYPE_8100 |
+                                       VIRTCHNL_VLAN_TOGGLE |
+                                       VIRTCHNL_VLAN_TAG_LOCATION_L2TAG1;
+               supported_caps->outer = VIRTCHNL_VLAN_UNSUPPORTED;
+
+               caps->offloads.ethertype_init = VIRTCHNL_VLAN_ETHERTYPE_8100;
+               caps->offloads.ethertype_match =
+                       VIRTCHNL_ETHERTYPE_STRIPPING_MATCHES_INSERTION;
+               caps->filtering.max_filters = ice_vc_get_max_vlan_fltrs(vf);
+       }
+}
+
+/**
+ * ice_vc_get_offload_vlan_v2_caps - determine VF's VLAN capabilities
+ * @vf: VF to determine VLAN capabilities for
+ *
+ * This will only be called if the VF and PF successfully negotiated
+ * VIRTCHNL_VF_OFFLOAD_VLAN_V2.
+ *
+ * Set VLAN capabilities based on the current VLAN mode and whether a port VLAN
+ * is configured or not.
+ */
+static int ice_vc_get_offload_vlan_v2_caps(struct ice_vf *vf)
+{
+       enum virtchnl_status_code v_ret = VIRTCHNL_STATUS_SUCCESS;
+       struct virtchnl_vlan_caps *caps = NULL;
+       int err, len = 0;
+
+       if (!test_bit(ICE_VF_STATE_ACTIVE, vf->vf_states)) {
+               v_ret = VIRTCHNL_STATUS_ERR_PARAM;
+               goto out;
+       }
+
+       caps = kzalloc(sizeof(*caps), GFP_KERNEL);
+       if (!caps) {
+               v_ret = VIRTCHNL_STATUS_ERR_NO_MEMORY;
+               goto out;
+       }
+       len = sizeof(*caps);
+
+       if (ice_is_dvm_ena(&vf->pf->hw))
+               ice_vc_set_dvm_caps(vf, caps);
+       else
+               ice_vc_set_svm_caps(vf, caps);
+
+       /* store negotiated caps to prevent invalid VF messages */
+       memcpy(&vf->vlan_v2_caps, caps, sizeof(*caps));
+
+out:
+       err = ice_vc_send_msg_to_vf(vf, VIRTCHNL_OP_GET_OFFLOAD_VLAN_V2_CAPS,
+                                   v_ret, (u8 *)caps, len);
+       kfree(caps);
+       return err;
+}
+
+/**
+ * ice_vc_validate_vlan_tpid - validate VLAN TPID
+ * @filtering_caps: negotiated/supported VLAN filtering capabilities
+ * @tpid: VLAN TPID used for validation
+ *
+ * Convert the VLAN TPID to a VIRTCHNL_VLAN_ETHERTYPE_* and then compare against
+ * the negotiated/supported filtering caps to see if the VLAN TPID is valid.
+ */
+static bool ice_vc_validate_vlan_tpid(u16 filtering_caps, u16 tpid)
+{
+       enum virtchnl_vlan_support vlan_ethertype = VIRTCHNL_VLAN_UNSUPPORTED;
+
+       switch (tpid) {
+       case ETH_P_8021Q:
+               vlan_ethertype = VIRTCHNL_VLAN_ETHERTYPE_8100;
+               break;
+       case ETH_P_8021AD:
+               vlan_ethertype = VIRTCHNL_VLAN_ETHERTYPE_88A8;
+               break;
+       case ETH_P_QINQ1:
+               vlan_ethertype = VIRTCHNL_VLAN_ETHERTYPE_9100;
+               break;
+       }
+
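+       /* an unrecognized TPID leaves vlan_ethertype at
+        * VIRTCHNL_VLAN_UNSUPPORTED (0), so the check below rejects it
+        */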
+       if (!(filtering_caps & vlan_ethertype))
+               return false;
+
+       return true;
+}
+
+/**
+ * ice_vc_is_valid_vlan - validate the virtchnl_vlan
+ * @vc_vlan: virtchnl_vlan to validate
+ *
+ * If the VLAN TCI and VLAN TPID are 0, then this filter is invalid, so return
+ * false. Otherwise return true.
+ */
+static bool ice_vc_is_valid_vlan(struct virtchnl_vlan *vc_vlan)
+{
+       if (!vc_vlan->tci || !vc_vlan->tpid)
+               return false;
+
+       return true;
+}
+
+/**
+ * ice_vc_validate_vlan_filter_list - validate the filter list from the VF
+ * @vfc: negotiated/supported VLAN filtering capabilities
+ * @vfl: VLAN filter list from VF to validate
+ *
+ * Validate all of the filters in the VLAN filter list from the VF. If any of
+ * the checks fail then return false. Otherwise return true.
+ */
+static bool
+ice_vc_validate_vlan_filter_list(struct virtchnl_vlan_filtering_caps *vfc,
+                                struct virtchnl_vlan_filter_list_v2 *vfl)
+{
+       u16 i;
+
+       if (!vfl->num_elements)
+               return false;
+
+       for (i = 0; i < vfl->num_elements; i++) {
+               struct virtchnl_vlan_supported_caps *filtering_support =
+                       &vfc->filtering_support;
+               struct virtchnl_vlan_filter *vlan_fltr = &vfl->filters[i];
+               struct virtchnl_vlan *outer = &vlan_fltr->outer;
+               struct virtchnl_vlan *inner = &vlan_fltr->inner;
+
+               if ((ice_vc_is_valid_vlan(outer) &&
+                    filtering_support->outer == VIRTCHNL_VLAN_UNSUPPORTED) ||
+                   (ice_vc_is_valid_vlan(inner) &&
+                    filtering_support->inner == VIRTCHNL_VLAN_UNSUPPORTED))
+                       return false;
+
+               if ((outer->tci_mask &&
+                    !(filtering_support->outer & VIRTCHNL_VLAN_FILTER_MASK)) ||
+                   (inner->tci_mask &&
+                    !(filtering_support->inner & VIRTCHNL_VLAN_FILTER_MASK)))
+                       return false;
+
+               if (((outer->tci & VLAN_PRIO_MASK) &&
+                    !(filtering_support->outer & VIRTCHNL_VLAN_PRIO)) ||
+                   ((inner->tci & VLAN_PRIO_MASK) &&
+                    !(filtering_support->inner & VIRTCHNL_VLAN_PRIO)))
+                       return false;
+
+               if ((ice_vc_is_valid_vlan(outer) &&
+                    !ice_vc_validate_vlan_tpid(filtering_support->outer, outer->tpid)) ||
+                   (ice_vc_is_valid_vlan(inner) &&
+                    !ice_vc_validate_vlan_tpid(filtering_support->inner, inner->tpid)))
+                       return false;
+       }
+
+       return true;
+}
+
+/**
+ * ice_vc_to_vlan - transform from struct virtchnl_vlan to struct ice_vlan
+ * @vc_vlan: struct virtchnl_vlan to transform
+ */
+static struct ice_vlan ice_vc_to_vlan(struct virtchnl_vlan *vc_vlan)
+{
+       struct ice_vlan vlan = { 0 };
+
+       vlan.prio = (vc_vlan->tci & VLAN_PRIO_MASK) >> VLAN_PRIO_SHIFT;
+       vlan.vid = vc_vlan->tci & VLAN_VID_MASK;
+       vlan.tpid = vc_vlan->tpid;
+
+       return vlan;
+}
+
+/**
+ * ice_vc_vlan_action - perform the requested action on the VLAN
+ * @vsi: VF's VSI used to perform the action
+ * @vlan_action: function used to perform the action (e.g. add/del)
+ * @vlan: VLAN filter to perform the action with
+ */
+static int
+ice_vc_vlan_action(struct ice_vsi *vsi,
+                  int (*vlan_action)(struct ice_vsi *, struct ice_vlan *),
+                  struct ice_vlan *vlan)
+{
+       int err;
+
+       err = vlan_action(vsi, vlan);
+       if (err)
+               return err;
+
+       return 0;
+}
+
+/**
+ * ice_vc_del_vlans - delete VLAN(s) from the virtchnl filter list
+ * @vf: VF used to delete the VLAN(s)
+ * @vsi: VF's VSI used to delete the VLAN(s)
+ * @vfl: virtchnl filter list used to delete the filters
+ */
+static int
+ice_vc_del_vlans(struct ice_vf *vf, struct ice_vsi *vsi,
+                struct virtchnl_vlan_filter_list_v2 *vfl)
+{
+       bool vlan_promisc = ice_is_vlan_promisc_allowed(vf);
+       int err;
+       u16 i;
+
+       for (i = 0; i < vfl->num_elements; i++) {
+               struct virtchnl_vlan_filter *vlan_fltr = &vfl->filters[i];
+               struct virtchnl_vlan *vc_vlan;
+
+               vc_vlan = &vlan_fltr->outer;
+               if (ice_vc_is_valid_vlan(vc_vlan)) {
+                       struct ice_vlan vlan = ice_vc_to_vlan(vc_vlan);
+
+                       err = ice_vc_vlan_action(vsi,
+                                                vsi->outer_vlan_ops.del_vlan,
+                                                &vlan);
+                       if (err)
+                               return err;
+
+                       if (vlan_promisc)
+                               ice_vf_dis_vlan_promisc(vsi, &vlan);
+               }
+
+               vc_vlan = &vlan_fltr->inner;
+               if (ice_vc_is_valid_vlan(vc_vlan)) {
+                       struct ice_vlan vlan = ice_vc_to_vlan(vc_vlan);
+
+                       err = ice_vc_vlan_action(vsi,
+                                                vsi->inner_vlan_ops.del_vlan,
+                                                &vlan);
+                       if (err)
+                               return err;
+
+                       /* no support for VLAN promiscuous on inner VLAN unless
+                        * we are in Single VLAN Mode (SVM)
+                        */
+                       if (!ice_is_dvm_ena(&vsi->back->hw) && vlan_promisc)
+                               ice_vf_dis_vlan_promisc(vsi, &vlan);
+               }
+       }
+
+       return 0;
+}
+
+/**
+ * ice_vc_remove_vlan_v2_msg - virtchnl handler for VIRTCHNL_OP_DEL_VLAN_V2
+ * @vf: VF the message was received from
+ * @msg: message received from the VF
+ */
+static int ice_vc_remove_vlan_v2_msg(struct ice_vf *vf, u8 *msg)
+{
+       struct virtchnl_vlan_filter_list_v2 *vfl =
+               (struct virtchnl_vlan_filter_list_v2 *)msg;
+       enum virtchnl_status_code v_ret = VIRTCHNL_STATUS_SUCCESS;
+       struct ice_vsi *vsi;
+
+       if (!ice_vc_validate_vlan_filter_list(&vf->vlan_v2_caps.filtering,
+                                             vfl)) {
+               v_ret = VIRTCHNL_STATUS_ERR_PARAM;
+               goto out;
+       }
+
+       if (!ice_vc_isvalid_vsi_id(vf, vfl->vport_id)) {
+               v_ret = VIRTCHNL_STATUS_ERR_PARAM;
+               goto out;
+       }
+
+       vsi = ice_get_vf_vsi(vf);
+       if (!vsi) {
+               v_ret = VIRTCHNL_STATUS_ERR_PARAM;
+               goto out;
+       }
+
+       if (ice_vc_del_vlans(vf, vsi, vfl))
+               v_ret = VIRTCHNL_STATUS_ERR_PARAM;
+
+out:
+       return ice_vc_send_msg_to_vf(vf, VIRTCHNL_OP_DEL_VLAN_V2, v_ret, NULL,
+                                    0);
+}
+
+/**
+ * ice_vc_add_vlans - add VLAN(s) from the virtchnl filter list
+ * @vf: VF used to add the VLAN(s)
+ * @vsi: VF's VSI used to add the VLAN(s)
+ * @vfl: virtchnl filter list used to add the filters
+ */
+static int
+ice_vc_add_vlans(struct ice_vf *vf, struct ice_vsi *vsi,
+                struct virtchnl_vlan_filter_list_v2 *vfl)
+{
+       bool vlan_promisc = ice_is_vlan_promisc_allowed(vf);
+       int err;
+       u16 i;
+
+       for (i = 0; i < vfl->num_elements; i++) {
+               struct virtchnl_vlan_filter *vlan_fltr = &vfl->filters[i];
+               struct virtchnl_vlan *vc_vlan;
+
+               vc_vlan = &vlan_fltr->outer;
+               if (ice_vc_is_valid_vlan(vc_vlan)) {
+                       struct ice_vlan vlan = ice_vc_to_vlan(vc_vlan);
+
+                       err = ice_vc_vlan_action(vsi,
+                                                vsi->outer_vlan_ops.add_vlan,
+                                                &vlan);
+                       if (err)
+                               return err;
+
+                       if (vlan_promisc) {
+                               err = ice_vf_ena_vlan_promisc(vsi, &vlan);
+                               if (err)
+                                       return err;
+                       }
+               }
+
+               vc_vlan = &vlan_fltr->inner;
+               if (ice_vc_is_valid_vlan(vc_vlan)) {
+                       struct ice_vlan vlan = ice_vc_to_vlan(vc_vlan);
+
+                       err = ice_vc_vlan_action(vsi,
+                                                vsi->inner_vlan_ops.add_vlan,
+                                                &vlan);
+                       if (err)
+                               return err;
+
+                       /* no support for VLAN promiscuous on inner VLAN unless
+                        * we are in Single VLAN Mode (SVM)
+                        */
+                       if (!ice_is_dvm_ena(&vsi->back->hw) && vlan_promisc) {
+                               err = ice_vf_ena_vlan_promisc(vsi, &vlan);
+                               if (err)
+                                       return err;
+                       }
+               }
+       }
+
+       return 0;
+}
+
+/**
+ * ice_vc_validate_add_vlan_filter_list - validate add filter list from the VF
+ * @vsi: VF VSI used to get number of existing VLAN filters
+ * @vfc: negotiated/supported VLAN filtering capabilities
+ * @vfl: VLAN filter list from VF to validate
+ *
+ * Validate all of the filters in the VLAN filter list from the VF during the
+ * VIRTCHNL_OP_ADD_VLAN_V2 opcode. If any of the checks fail then return false.
+ * Otherwise return true.
+ */
+static bool
+ice_vc_validate_add_vlan_filter_list(struct ice_vsi *vsi,
+                                    struct virtchnl_vlan_filtering_caps *vfc,
+                                    struct virtchnl_vlan_filter_list_v2 *vfl)
+{
+       u16 num_requested_filters = vsi->num_vlan + vfl->num_elements;
+
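+       /* existing filters plus this request must fit the negotiated limit */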
+       if (num_requested_filters > vfc->max_filters)
+               return false;
+
+       return ice_vc_validate_vlan_filter_list(vfc, vfl);
+}
+
+/**
+ * ice_vc_add_vlan_v2_msg - virtchnl handler for VIRTCHNL_OP_ADD_VLAN_V2
+ * @vf: VF the message was received from
+ * @msg: message received from the VF
+ */
+static int ice_vc_add_vlan_v2_msg(struct ice_vf *vf, u8 *msg)
+{
+       enum virtchnl_status_code v_ret = VIRTCHNL_STATUS_SUCCESS;
+       struct virtchnl_vlan_filter_list_v2 *vfl =
+               (struct virtchnl_vlan_filter_list_v2 *)msg;
+       struct ice_vsi *vsi;
+
+       if (!test_bit(ICE_VF_STATE_ACTIVE, vf->vf_states)) {
+               v_ret = VIRTCHNL_STATUS_ERR_PARAM;
+               goto out;
+       }
+
+       if (!ice_vc_isvalid_vsi_id(vf, vfl->vport_id)) {
+               v_ret = VIRTCHNL_STATUS_ERR_PARAM;
+               goto out;
+       }
+
+       vsi = ice_get_vf_vsi(vf);
+       if (!vsi) {
+               v_ret = VIRTCHNL_STATUS_ERR_PARAM;
+               goto out;
+       }
+
+       if (!ice_vc_validate_add_vlan_filter_list(vsi,
+                                                 &vf->vlan_v2_caps.filtering,
+                                                 vfl)) {
+               v_ret = VIRTCHNL_STATUS_ERR_PARAM;
+               goto out;
+       }
+
+       if (ice_vc_add_vlans(vf, vsi, vfl))
+               v_ret = VIRTCHNL_STATUS_ERR_PARAM;
+
+out:
+       return ice_vc_send_msg_to_vf(vf, VIRTCHNL_OP_ADD_VLAN_V2, v_ret, NULL,
+                                    0);
+}
+
+/**
+ * ice_vc_valid_vlan_setting - validate VLAN setting
+ * @negotiated_settings: negotiated VLAN settings during VF init
+ * @ethertype_setting: ethertype(s) requested for the VLAN setting
+ */
+static bool
+ice_vc_valid_vlan_setting(u32 negotiated_settings, u32 ethertype_setting)
+{
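+       /* reject ethertype requests that were not negotiated at all */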
+       if (ethertype_setting && !(negotiated_settings & ethertype_setting))
+               return false;
+
+       /* only allow a single VIRTCHNL_VLAN_ETHERTYPE if
+        * VIRTCHNL_VLAN_ETHERTYPE_AND is not negotiated/supported
+        */
+       if (!(negotiated_settings & VIRTCHNL_VLAN_ETHERTYPE_AND) &&
+           hweight32(ethertype_setting) > 1)
+               return false;
+
+       /* ability to modify the VLAN setting was not negotiated */
+       if (!(negotiated_settings & VIRTCHNL_VLAN_TOGGLE))
+               return false;
+
+       return true;
+}
+
+/**
+ * ice_vc_valid_vlan_setting_msg - validate the VLAN setting message
+ * @caps: negotiated VLAN settings during VF init
+ * @msg: message to validate
+ *
+ * Used to validate any VLAN virtchnl message sent as a
+ * virtchnl_vlan_setting structure. Validates the message against the
+ * negotiated/supported caps during VF driver init.
+ */
+static bool
+ice_vc_valid_vlan_setting_msg(struct virtchnl_vlan_supported_caps *caps,
+                             struct virtchnl_vlan_setting *msg)
+{
+       if ((!msg->outer_ethertype_setting &&
+            !msg->inner_ethertype_setting) ||
+           (!caps->outer && !caps->inner))
+               return false;
+
+       if (msg->outer_ethertype_setting &&
+           !ice_vc_valid_vlan_setting(caps->outer,
+                                      msg->outer_ethertype_setting))
+               return false;
+
+       if (msg->inner_ethertype_setting &&
+           !ice_vc_valid_vlan_setting(caps->inner,
+                                      msg->inner_ethertype_setting))
+               return false;
+
+       return true;
+}
+
+/**
+ * ice_vc_get_tpid - transform from VIRTCHNL_VLAN_ETHERTYPE_* to VLAN TPID
+ * @ethertype_setting: VIRTCHNL_VLAN_ETHERTYPE_* used to get VLAN TPID
+ * @tpid: VLAN TPID to populate
+ */
+static int ice_vc_get_tpid(u32 ethertype_setting, u16 *tpid)
+{
+       switch (ethertype_setting) {
+       case VIRTCHNL_VLAN_ETHERTYPE_8100:
+               *tpid = ETH_P_8021Q;
+               break;
+       case VIRTCHNL_VLAN_ETHERTYPE_88A8:
+               *tpid = ETH_P_8021AD;
+               break;
+       case VIRTCHNL_VLAN_ETHERTYPE_9100:
+               *tpid = ETH_P_QINQ1;
+               break;
+       default:
+               *tpid = 0;
+               return -EINVAL;
+       }
+
+       return 0;
+}
+
+/**
+ * ice_vc_ena_vlan_offload - enable VLAN offload based on the ethertype_setting
+ * @vsi: VF's VSI used to enable the VLAN offload
+ * @ena_offload: function used to enable the VLAN offload
+ * @ethertype_setting: VIRTCHNL_VLAN_ETHERTYPE_* to enable offloads for
+ */
+static int
+ice_vc_ena_vlan_offload(struct ice_vsi *vsi,
+                       int (*ena_offload)(struct ice_vsi *vsi, u16 tpid),
+                       u32 ethertype_setting)
+{
+       u16 tpid;
+       int err;
+
+       err = ice_vc_get_tpid(ethertype_setting, &tpid);
+       if (err)
+               return err;
+
+       err = ena_offload(vsi, tpid);
+       if (err)
+               return err;
+
+       return 0;
+}
+
+#define ICE_L2TSEL_QRX_CONTEXT_REG_IDX 3
+#define ICE_L2TSEL_BIT_OFFSET          23
+enum ice_l2tsel {
+       ICE_L2TSEL_EXTRACT_FIRST_TAG_L2TAG2_2ND,
+       ICE_L2TSEL_EXTRACT_FIRST_TAG_L2TAG1,
+};
+
+/**
+ * ice_vsi_update_l2tsel - update l2tsel field for all Rx rings on this VSI
+ * @vsi: VSI used to update l2tsel on
+ * @l2tsel: l2tsel setting requested
+ *
+ * Use the l2tsel setting to update all of the Rx queue context bits for l2tsel.
+ * This will modify which descriptor field the first offloaded VLAN will be
+ * stripped into.
+ */
+static void ice_vsi_update_l2tsel(struct ice_vsi *vsi, enum ice_l2tsel l2tsel)
+{
+       struct ice_hw *hw = &vsi->back->hw;
+       u32 l2tsel_bit;
+       int i;
+
+       if (l2tsel == ICE_L2TSEL_EXTRACT_FIRST_TAG_L2TAG2_2ND)
+               l2tsel_bit = 0;
+       else
+               l2tsel_bit = BIT(ICE_L2TSEL_BIT_OFFSET);
+
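+       /* read-modify-write the l2tsel bit in each Rx queue's context register */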
+       for (i = 0; i < vsi->alloc_rxq; i++) {
+               u16 pfq = vsi->rxq_map[i];
+               u32 qrx_context_offset;
+               u32 regval;
+
+               qrx_context_offset =
+                       QRX_CONTEXT(ICE_L2TSEL_QRX_CONTEXT_REG_IDX, pfq);
+
+               regval = rd32(hw, qrx_context_offset);
+               regval &= ~BIT(ICE_L2TSEL_BIT_OFFSET);
+               regval |= l2tsel_bit;
+               wr32(hw, qrx_context_offset, regval);
+       }
+}
+
+/**
+ * ice_vc_ena_vlan_stripping_v2_msg
+ * @vf: VF the message was received from
+ * @msg: message received from the VF
+ *
+ * virtchnl handler for VIRTCHNL_OP_ENABLE_VLAN_STRIPPING_V2
+ */
+static int ice_vc_ena_vlan_stripping_v2_msg(struct ice_vf *vf, u8 *msg)
+{
+       enum virtchnl_status_code v_ret = VIRTCHNL_STATUS_SUCCESS;
+       struct virtchnl_vlan_supported_caps *stripping_support;
+       struct virtchnl_vlan_setting *strip_msg =
+               (struct virtchnl_vlan_setting *)msg;
+       u32 ethertype_setting;
+       struct ice_vsi *vsi;
+
+       if (!test_bit(ICE_VF_STATE_ACTIVE, vf->vf_states)) {
+               v_ret = VIRTCHNL_STATUS_ERR_PARAM;
+               goto out;
+       }
+
+       if (!ice_vc_isvalid_vsi_id(vf, strip_msg->vport_id)) {
+               v_ret = VIRTCHNL_STATUS_ERR_PARAM;
+               goto out;
+       }
+
+       vsi = ice_get_vf_vsi(vf);
+       if (!vsi) {
+               v_ret = VIRTCHNL_STATUS_ERR_PARAM;
+               goto out;
+       }
+
+       stripping_support = &vf->vlan_v2_caps.offloads.stripping_support;
+       if (!ice_vc_valid_vlan_setting_msg(stripping_support, strip_msg)) {
+               v_ret = VIRTCHNL_STATUS_ERR_PARAM;
+               goto out;
+       }
+
+       ethertype_setting = strip_msg->outer_ethertype_setting;
+       if (ethertype_setting) {
+               if (ice_vc_ena_vlan_offload(vsi,
+                                           vsi->outer_vlan_ops.ena_stripping,
+                                           ethertype_setting)) {
+                       v_ret = VIRTCHNL_STATUS_ERR_PARAM;
+                       goto out;
+               } else {
+                       enum ice_l2tsel l2tsel =
+                               ICE_L2TSEL_EXTRACT_FIRST_TAG_L2TAG2_2ND;
+
+                       /* PF tells the VF that the outer VLAN tag is always
+                        * extracted to VIRTCHNL_VLAN_TAG_LOCATION_L2TAG2_2 and
+                        * inner is always extracted to
+                        * VIRTCHNL_VLAN_TAG_LOCATION_L2TAG1. This is needed to
+                        * support outer stripping so the first tag always ends
+                        * up in L2TAG2_2ND and the second/inner tag, if
+                        * enabled, is extracted in L2TAG1.
+                        */
+                       ice_vsi_update_l2tsel(vsi, l2tsel);
+               }
+       }
+
+       ethertype_setting = strip_msg->inner_ethertype_setting;
+       if (ethertype_setting &&
+           ice_vc_ena_vlan_offload(vsi, vsi->inner_vlan_ops.ena_stripping,
+                                   ethertype_setting)) {
+               v_ret = VIRTCHNL_STATUS_ERR_PARAM;
+               goto out;
+       }
+
+out:
+       return ice_vc_send_msg_to_vf(vf, VIRTCHNL_OP_ENABLE_VLAN_STRIPPING_V2, v_ret, NULL, 0);
+}
+
+/**
+ * ice_vc_dis_vlan_stripping_v2_msg
+ * @vf: VF the message was received from
+ * @msg: message received from the VF
+ *
+ * virtchnl handler for VIRTCHNL_OP_DISABLE_VLAN_STRIPPING_V2
+ */
+static int ice_vc_dis_vlan_stripping_v2_msg(struct ice_vf *vf, u8 *msg)
+{
+       enum virtchnl_status_code v_ret = VIRTCHNL_STATUS_SUCCESS;
+       struct virtchnl_vlan_supported_caps *stripping_support;
+       struct virtchnl_vlan_setting *strip_msg =
+               (struct virtchnl_vlan_setting *)msg;
+       u32 ethertype_setting;
+       struct ice_vsi *vsi;
+
+       if (!test_bit(ICE_VF_STATE_ACTIVE, vf->vf_states)) {
+               v_ret = VIRTCHNL_STATUS_ERR_PARAM;
+               goto out;
+       }
+
+       if (!ice_vc_isvalid_vsi_id(vf, strip_msg->vport_id)) {
+               v_ret = VIRTCHNL_STATUS_ERR_PARAM;
+               goto out;
+       }
+
+       vsi = ice_get_vf_vsi(vf);
+       if (!vsi) {
+               v_ret = VIRTCHNL_STATUS_ERR_PARAM;
+               goto out;
+       }
+
+       stripping_support = &vf->vlan_v2_caps.offloads.stripping_support;
+       if (!ice_vc_valid_vlan_setting_msg(stripping_support, strip_msg)) {
+               v_ret = VIRTCHNL_STATUS_ERR_PARAM;
+               goto out;
+       }
+
+       ethertype_setting = strip_msg->outer_ethertype_setting;
+       if (ethertype_setting) {
+               if (vsi->outer_vlan_ops.dis_stripping(vsi)) {
+                       v_ret = VIRTCHNL_STATUS_ERR_PARAM;
+                       goto out;
+               } else {
+                       enum ice_l2tsel l2tsel =
+                               ICE_L2TSEL_EXTRACT_FIRST_TAG_L2TAG1;
+
+                       /* PF tells the VF that the outer VLAN tag is always
+                        * extracted to VIRTCHNL_VLAN_TAG_LOCATION_L2TAG2_2 and
+                        * inner is always extracted to
+                        * VIRTCHNL_VLAN_TAG_LOCATION_L2TAG1. This is needed to
+                        * support inner stripping while outer stripping is
+                        * disabled so that the first and only tag is extracted
+                        * in L2TAG1.
+                        */
+                       ice_vsi_update_l2tsel(vsi, l2tsel);
+               }
+       }
+
+       ethertype_setting = strip_msg->inner_ethertype_setting;
+       if (ethertype_setting && vsi->inner_vlan_ops.dis_stripping(vsi)) {
+               v_ret = VIRTCHNL_STATUS_ERR_PARAM;
+               goto out;
+       }
+
+out:
+       return ice_vc_send_msg_to_vf(vf, VIRTCHNL_OP_DISABLE_VLAN_STRIPPING_V2, v_ret, NULL, 0);
+}
+
+/**
+ * ice_vc_ena_vlan_insertion_v2_msg
+ * @vf: VF the message was received from
+ * @msg: message received from the VF
+ *
+ * virtchnl handler for VIRTCHNL_OP_ENABLE_VLAN_INSERTION_V2
+ */
+static int ice_vc_ena_vlan_insertion_v2_msg(struct ice_vf *vf, u8 *msg)
+{
+       enum virtchnl_status_code v_ret = VIRTCHNL_STATUS_SUCCESS;
+       struct virtchnl_vlan_supported_caps *insertion_support;
+       struct virtchnl_vlan_setting *insertion_msg =
+               (struct virtchnl_vlan_setting *)msg;
+       u32 ethertype_setting;
+       struct ice_vsi *vsi;
+
+       if (!test_bit(ICE_VF_STATE_ACTIVE, vf->vf_states)) {
+               v_ret = VIRTCHNL_STATUS_ERR_PARAM;
+               goto out;
+       }
+
+       if (!ice_vc_isvalid_vsi_id(vf, insertion_msg->vport_id)) {
+               v_ret = VIRTCHNL_STATUS_ERR_PARAM;
+               goto out;
+       }
+
+       vsi = ice_get_vf_vsi(vf);
+       if (!vsi) {
+               v_ret = VIRTCHNL_STATUS_ERR_PARAM;
+               goto out;
+       }
+
+       insertion_support = &vf->vlan_v2_caps.offloads.insertion_support;
+       if (!ice_vc_valid_vlan_setting_msg(insertion_support, insertion_msg)) {
+               v_ret = VIRTCHNL_STATUS_ERR_PARAM;
+               goto out;
+       }
+
+       ethertype_setting = insertion_msg->outer_ethertype_setting;
+       if (ethertype_setting &&
+           ice_vc_ena_vlan_offload(vsi, vsi->outer_vlan_ops.ena_insertion,
+                                   ethertype_setting)) {
+               v_ret = VIRTCHNL_STATUS_ERR_PARAM;
+               goto out;
+       }
+
+       ethertype_setting = insertion_msg->inner_ethertype_setting;
+       if (ethertype_setting &&
+           ice_vc_ena_vlan_offload(vsi, vsi->inner_vlan_ops.ena_insertion,
+                                   ethertype_setting)) {
+               v_ret = VIRTCHNL_STATUS_ERR_PARAM;
+               goto out;
+       }
+
+out:
+       return ice_vc_send_msg_to_vf(vf, VIRTCHNL_OP_ENABLE_VLAN_INSERTION_V2, v_ret, NULL, 0);
+}
+
+/**
+ * ice_vc_dis_vlan_insertion_v2_msg
+ * @vf: VF the message was received from
+ * @msg: message received from the VF
+ *
+ * virtchnl handler for VIRTCHNL_OP_DISABLE_VLAN_INSERTION_V2
+ */
+static int ice_vc_dis_vlan_insertion_v2_msg(struct ice_vf *vf, u8 *msg)
+{
+       enum virtchnl_status_code v_ret = VIRTCHNL_STATUS_SUCCESS;
+       struct virtchnl_vlan_supported_caps *insertion_support;
+       struct virtchnl_vlan_setting *insertion_msg =
+               (struct virtchnl_vlan_setting *)msg;
+       u32 ethertype_setting;
+       struct ice_vsi *vsi;
+
+       if (!test_bit(ICE_VF_STATE_ACTIVE, vf->vf_states)) {
+               v_ret = VIRTCHNL_STATUS_ERR_PARAM;
+               goto out;
+       }
+
+       if (!ice_vc_isvalid_vsi_id(vf, insertion_msg->vport_id)) {
+               v_ret = VIRTCHNL_STATUS_ERR_PARAM;
+               goto out;
+       }
+
+       vsi = ice_get_vf_vsi(vf);
+       if (!vsi) {
+               v_ret = VIRTCHNL_STATUS_ERR_PARAM;
+               goto out;
+       }
+
+       insertion_support = &vf->vlan_v2_caps.offloads.insertion_support;
+       if (!ice_vc_valid_vlan_setting_msg(insertion_support, insertion_msg)) {
+               v_ret = VIRTCHNL_STATUS_ERR_PARAM;
+               goto out;
+       }
+
+       ethertype_setting = insertion_msg->outer_ethertype_setting;
+       if (ethertype_setting && vsi->outer_vlan_ops.dis_insertion(vsi)) {
+               v_ret = VIRTCHNL_STATUS_ERR_PARAM;
+               goto out;
+       }
+
+       ethertype_setting = insertion_msg->inner_ethertype_setting;
+       if (ethertype_setting && vsi->inner_vlan_ops.dis_insertion(vsi)) {
+               v_ret = VIRTCHNL_STATUS_ERR_PARAM;
+               goto out;
+       }
+
+out:
+       return ice_vc_send_msg_to_vf(vf, VIRTCHNL_OP_DISABLE_VLAN_INSERTION_V2, v_ret, NULL, 0);
+}
+
+static struct ice_vc_vf_ops ice_vc_vf_dflt_ops = {
+       .get_ver_msg = ice_vc_get_ver_msg,
+       .get_vf_res_msg = ice_vc_get_vf_res_msg,
+       .reset_vf = ice_vc_reset_vf_msg,
+       .add_mac_addr_msg = ice_vc_add_mac_addr_msg,
+       .del_mac_addr_msg = ice_vc_del_mac_addr_msg,
+       .cfg_qs_msg = ice_vc_cfg_qs_msg,
+       .ena_qs_msg = ice_vc_ena_qs_msg,
+       .dis_qs_msg = ice_vc_dis_qs_msg,
+       .request_qs_msg = ice_vc_request_qs_msg,
+       .cfg_irq_map_msg = ice_vc_cfg_irq_map_msg,
+       .config_rss_key = ice_vc_config_rss_key,
+       .config_rss_lut = ice_vc_config_rss_lut,
+       .get_stats_msg = ice_vc_get_stats_msg,
+       .cfg_promiscuous_mode_msg = ice_vc_cfg_promiscuous_mode_msg,
+       .add_vlan_msg = ice_vc_add_vlan_msg,
+       .remove_vlan_msg = ice_vc_remove_vlan_msg,
+       .ena_vlan_stripping = ice_vc_ena_vlan_stripping,
+       .dis_vlan_stripping = ice_vc_dis_vlan_stripping,
+       .handle_rss_cfg_msg = ice_vc_handle_rss_cfg,
+       .add_fdir_fltr_msg = ice_vc_add_fdir_fltr,
+       .del_fdir_fltr_msg = ice_vc_del_fdir_fltr,
+       .get_offload_vlan_v2_caps = ice_vc_get_offload_vlan_v2_caps,
+       .add_vlan_v2_msg = ice_vc_add_vlan_v2_msg,
+       .remove_vlan_v2_msg = ice_vc_remove_vlan_v2_msg,
+       .ena_vlan_stripping_v2_msg = ice_vc_ena_vlan_stripping_v2_msg,
+       .dis_vlan_stripping_v2_msg = ice_vc_dis_vlan_stripping_v2_msg,
+       .ena_vlan_insertion_v2_msg = ice_vc_ena_vlan_insertion_v2_msg,
+       .dis_vlan_insertion_v2_msg = ice_vc_dis_vlan_insertion_v2_msg,
+};
+
+void ice_vc_set_dflt_vf_ops(struct ice_vc_vf_ops *ops)
+{
+       *ops = ice_vc_vf_dflt_ops;
+}
+
+/**
+ * ice_vc_repr_add_mac
+ * @vf: pointer to VF
+ * @msg: virtchannel message
+ *
+ * When port representors are created, we do not add the MAC rule
+ * to firmware; instead we store it so that the PF can report the
+ * same MAC as the VF.
+ */
+static int ice_vc_repr_add_mac(struct ice_vf *vf, u8 *msg)
+{
+       enum virtchnl_status_code v_ret = VIRTCHNL_STATUS_SUCCESS;
+       struct virtchnl_ether_addr_list *al =
+           (struct virtchnl_ether_addr_list *)msg;
+       struct ice_vsi *vsi;
+       struct ice_pf *pf;
+       int i;
+
+       if (!test_bit(ICE_VF_STATE_ACTIVE, vf->vf_states) ||
+           !ice_vc_isvalid_vsi_id(vf, al->vsi_id)) {
+               v_ret = VIRTCHNL_STATUS_ERR_PARAM;
+               goto handle_mac_exit;
+       }
 
        pf = vf->pf;
 
@@ -4686,7 +5820,7 @@ error_handler:
        case VIRTCHNL_OP_GET_VF_RESOURCES:
                err = ops->get_vf_res_msg(vf, msg);
                if (ice_vf_init_vlan_stripping(vf))
-                       dev_err(dev, "Failed to initialize VLAN stripping for VF %d\n",
+                       dev_dbg(dev, "Failed to initialize VLAN stripping for VF %d\n",
                                vf->vf_id);
                ice_vc_notify_vf_link_state(vf);
                break;
@@ -4751,6 +5885,27 @@ error_handler:
        case VIRTCHNL_OP_DEL_RSS_CFG:
                err = ops->handle_rss_cfg_msg(vf, msg, false);
                break;
+       case VIRTCHNL_OP_GET_OFFLOAD_VLAN_V2_CAPS:
+               err = ops->get_offload_vlan_v2_caps(vf);
+               break;
+       case VIRTCHNL_OP_ADD_VLAN_V2:
+               err = ops->add_vlan_v2_msg(vf, msg);
+               break;
+       case VIRTCHNL_OP_DEL_VLAN_V2:
+               err = ops->remove_vlan_v2_msg(vf, msg);
+               break;
+       case VIRTCHNL_OP_ENABLE_VLAN_STRIPPING_V2:
+               err = ops->ena_vlan_stripping_v2_msg(vf, msg);
+               break;
+       case VIRTCHNL_OP_DISABLE_VLAN_STRIPPING_V2:
+               err = ops->dis_vlan_stripping_v2_msg(vf, msg);
+               break;
+       case VIRTCHNL_OP_ENABLE_VLAN_INSERTION_V2:
+               err = ops->ena_vlan_insertion_v2_msg(vf, msg);
+               break;
+       case VIRTCHNL_OP_DISABLE_VLAN_INSERTION_V2:
+               err = ops->dis_vlan_insertion_v2_msg(vf, msg);
+               break;
        case VIRTCHNL_OP_UNKNOWN:
        default:
                dev_err(dev, "Unsupported opcode %d from VF %d\n", v_opcode,
@@ -4797,8 +5952,10 @@ ice_get_vf_cfg(struct net_device *netdev, int vf_id, struct ifla_vf_info *ivi)
        ether_addr_copy(ivi->mac, vf->hw_lan_addr.addr);
 
        /* VF configuration for VLAN and applicable QoS */
-       ivi->vlan = vf->port_vlan_info & VLAN_VID_MASK;
-       ivi->qos = (vf->port_vlan_info & VLAN_PRIO_MASK) >> VLAN_PRIO_SHIFT;
+       ivi->vlan = ice_vf_get_port_vlan_id(vf);
+       ivi->qos = ice_vf_get_port_vlan_prio(vf);
+       if (ice_vf_is_port_vlan_ena(vf))
+               ivi->vlan_proto = cpu_to_be16(ice_vf_get_port_vlan_tpid(vf));
 
        ivi->trusted = vf->trusted;
        ivi->spoofchk = vf->spoofchk;
index 752487a..4f49610 100644 (file)
@@ -5,6 +5,7 @@
 #define _ICE_VIRTCHNL_PF_H_
 #include "ice.h"
 #include "ice_virtchnl_fdir.h"
+#include "ice_vsi_vlan_ops.h"
 
 /* Restrict number of MAC Addr and VLAN that non-trusted VF can program */
 #define ICE_MAX_VLAN_PER_VF            8
@@ -94,6 +95,13 @@ struct ice_vc_vf_ops {
        int (*handle_rss_cfg_msg)(struct ice_vf *vf, u8 *msg, bool add);
        int (*add_fdir_fltr_msg)(struct ice_vf *vf, u8 *msg);
        int (*del_fdir_fltr_msg)(struct ice_vf *vf, u8 *msg);
+       int (*get_offload_vlan_v2_caps)(struct ice_vf *vf);
+       int (*add_vlan_v2_msg)(struct ice_vf *vf, u8 *msg);
+       int (*remove_vlan_v2_msg)(struct ice_vf *vf, u8 *msg);
+       int (*ena_vlan_stripping_v2_msg)(struct ice_vf *vf, u8 *msg);
+       int (*dis_vlan_stripping_v2_msg)(struct ice_vf *vf, u8 *msg);
+       int (*ena_vlan_insertion_v2_msg)(struct ice_vf *vf, u8 *msg);
+       int (*dis_vlan_insertion_v2_msg)(struct ice_vf *vf, u8 *msg);
 };
 
 /* VF information structure */
@@ -119,7 +127,8 @@ struct ice_vf {
        struct ice_time_mac legacy_last_added_umac;
        DECLARE_BITMAP(txq_ena, ICE_MAX_RSS_QS_PER_VF);
        DECLARE_BITMAP(rxq_ena, ICE_MAX_RSS_QS_PER_VF);
-       u16 port_vlan_info;             /* Port VLAN ID and QoS */
+       struct ice_vlan port_vlan_info; /* Port VLAN ID, QoS, and TPID */
+       struct virtchnl_vlan_caps vlan_v2_caps;
        u8 pf_set_mac:1;                /* VF MAC address set by VMM admin */
        u8 trusted:1;
        u8 spoofchk:1;
@@ -210,6 +219,7 @@ int
 ice_vc_send_msg_to_vf(struct ice_vf *vf, u32 v_opcode,
                      enum virtchnl_status_code v_retval, u8 *msg, u16 msglen);
 bool ice_vc_isvalid_vsi_id(struct ice_vf *vf, u16 vsi_id);
+bool ice_vf_is_port_vlan_ena(struct ice_vf *vf);
 #else /* CONFIG_PCI_IOV */
 static inline void ice_process_vflr_event(struct ice_pf *pf) { }
 static inline void ice_free_vfs(struct ice_pf *pf) { }
@@ -342,5 +352,10 @@ static inline bool ice_is_any_vf_in_promisc(struct ice_pf __always_unused *pf)
 {
        return false;
 }
+
+static inline bool ice_vf_is_port_vlan_ena(struct ice_vf __always_unused *vf)
+{
+       return false;
+}
 #endif /* CONFIG_PCI_IOV */
 #endif /* _ICE_VIRTCHNL_PF_H_ */
diff --git a/drivers/net/ethernet/intel/ice/ice_vlan.h b/drivers/net/ethernet/intel/ice/ice_vlan.h
new file mode 100644 (file)
index 0000000..bc4550a
--- /dev/null
@@ -0,0 +1,18 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (C) 2019-2021, Intel Corporation. */
+
+#ifndef _ICE_VLAN_H_
+#define _ICE_VLAN_H_
+
+#include <linux/types.h>
+#include "ice_type.h"
+
+struct ice_vlan {
+       u16 tpid;
+       u16 vid;
+       u8 prio;
+};
+
+#define ICE_VLAN(tpid, vid, prio) ((struct ice_vlan){ tpid, vid, prio })
+
+#endif /* _ICE_VLAN_H_ */
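
The ICE_VLAN() macro builds a struct ice_vlan as a C99 compound literal, so callers can construct filter arguments inline. A hedged usage sketch — the TPID constants come from <linux/if_ether.h>, and the VID/priority values are invented:

#include <linux/if_ether.h>
#include "ice_vlan.h"

static void example_build_vlans(void)
{
	/* single/inner C-tag: TPID 0x8100, VID 100, priority 0 */
	struct ice_vlan ctag = ICE_VLAN(ETH_P_8021Q, 100, 0);
	/* outer S-tag for QinQ: TPID 0x88a8, VID 200, priority 3 */
	struct ice_vlan stag = ICE_VLAN(ETH_P_8021AD, 200, 3);

	(void)ctag;
	(void)stag;
}
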
diff --git a/drivers/net/ethernet/intel/ice/ice_vlan_mode.c b/drivers/net/ethernet/intel/ice/ice_vlan_mode.c
new file mode 100644 (file)
index 0000000..1b618de
--- /dev/null
@@ -0,0 +1,439 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (C) 2019-2021, Intel Corporation. */
+
+#include "ice_common.h"
+
+/**
+ * ice_pkg_get_supported_vlan_mode - determine if DDP supports Double VLAN mode
+ * @hw: pointer to the HW struct
+ * @dvm: output variable to determine if DDP supports DVM(true) or SVM(false)
+ */
+static int
+ice_pkg_get_supported_vlan_mode(struct ice_hw *hw, bool *dvm)
+{
+       u16 meta_init_size = sizeof(struct ice_meta_init_section);
+       struct ice_meta_init_section *sect;
+       struct ice_buf_build *bld;
+       int status;
+
+       /* if anything fails, we assume there is no DVM support */
+       *dvm = false;
+
+       bld = ice_pkg_buf_alloc_single_section(hw,
+                                              ICE_SID_RXPARSER_METADATA_INIT,
+                                              meta_init_size, (void **)&sect);
+       if (!bld)
+               return -ENOMEM;
+
+       /* only need to read a single section */
+       sect->count = cpu_to_le16(1);
+       sect->offset = cpu_to_le16(ICE_META_VLAN_MODE_ENTRY);
+
+       status = ice_aq_upload_section(hw,
+                                      (struct ice_buf_hdr *)ice_pkg_buf(bld),
+                                      ICE_PKG_BUF_SIZE, NULL);
+       if (!status) {
+               DECLARE_BITMAP(entry, ICE_META_INIT_BITS);
+               u32 arr[ICE_META_INIT_DW_CNT];
+               u16 i;
+
+               /* convert to host bitmap format */
+               for (i = 0; i < ICE_META_INIT_DW_CNT; i++)
+                       arr[i] = le32_to_cpu(sect->entry.bm[i]);
+
+               bitmap_from_arr32(entry, arr, (u16)ICE_META_INIT_BITS);
+
+               /* check if DVM is supported */
+               *dvm = test_bit(ICE_META_VLAN_MODE_BIT, entry);
+       }
+
+       ice_pkg_buf_free(hw, bld);
+
+       return status;
+}
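
The section read back from firmware holds little-endian 32-bit words, so the code above converts them with le32_to_cpu() and bitmap_from_arr32() before test_bit() can be used portably. A standalone sketch of that pattern with an arbitrary 64-bit width (not driver code):

#include <linux/bitmap.h>
#include <linux/kernel.h>

#define EXAMPLE_BITS	64
#define EXAMPLE_DW_CNT	DIV_ROUND_UP(EXAMPLE_BITS, 32)

static bool example_fw_bit_set(const __le32 *fw_words, unsigned int nr)
{
	DECLARE_BITMAP(bm, EXAMPLE_BITS);
	u32 arr[EXAMPLE_DW_CNT];
	unsigned int i;

	/* convert LE32 words from firmware to host endianness */
	for (i = 0; i < EXAMPLE_DW_CNT; i++)
		arr[i] = le32_to_cpu(fw_words[i]);

	/* pack the u32 array into an unsigned-long-based bitmap */
	bitmap_from_arr32(bm, arr, EXAMPLE_BITS);

	return nr < EXAMPLE_BITS && test_bit(nr, bm);
}
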
+
+/**
+ * ice_aq_get_vlan_mode - get the VLAN mode of the device
+ * @hw: pointer to the HW structure
+ * @get_params: structure FW fills in based on the current VLAN mode config
+ *
+ * Get VLAN Mode Parameters (0x020D)
+ */
+static int
+ice_aq_get_vlan_mode(struct ice_hw *hw,
+                    struct ice_aqc_get_vlan_mode *get_params)
+{
+       struct ice_aq_desc desc;
+
+       if (!get_params)
+               return -EINVAL;
+
+       ice_fill_dflt_direct_cmd_desc(&desc,
+                                     ice_aqc_opc_get_vlan_mode_parameters);
+
+       return ice_aq_send_cmd(hw, &desc, get_params, sizeof(*get_params),
+                              NULL);
+}
+
+/**
+ * ice_aq_is_dvm_ena - query FW to check if double VLAN mode is enabled
+ * @hw: pointer to the HW structure
+ *
+ * Returns true if the hardware/firmware is configured in double VLAN mode,
+ * else return false signaling that the hardware/firmware is configured in
+ * single VLAN mode.
+ *
+ * Also, return false if this call fails for any reason (i.e. firmware doesn't
+ * support this AQ call).
+ */
+static bool ice_aq_is_dvm_ena(struct ice_hw *hw)
+{
+       struct ice_aqc_get_vlan_mode get_params = { 0 };
+       int status;
+
+       status = ice_aq_get_vlan_mode(hw, &get_params);
+       if (status) {
+               ice_debug(hw, ICE_DBG_AQ, "Failed to get VLAN mode, status %d\n",
+                         status);
+               return false;
+       }
+
+       return (get_params.vlan_mode & ICE_AQ_VLAN_MODE_DVM_ENA);
+}
+
+/**
+ * ice_is_dvm_ena - check if double VLAN mode is enabled
+ * @hw: pointer to the HW structure
+ *
+ * The device is configured in single or double VLAN mode on initialization and
+ * this cannot be dynamically changed during runtime. Based on this there is no
+ * need to make an AQ call every time the driver needs to know the VLAN mode.
+ * Instead, use the cached VLAN mode.
+ */
+bool ice_is_dvm_ena(struct ice_hw *hw)
+{
+       return hw->dvm_ena;
+}
+
+/**
+ * ice_cache_vlan_mode - cache VLAN mode after DDP is downloaded
+ * @hw: pointer to the HW structure
+ *
+ * This is only called after downloading the DDP and after the global
+ * configuration lock has been released because all ports on a device need to
+ * cache the VLAN mode.
+ */
+static void ice_cache_vlan_mode(struct ice_hw *hw)
+{
+       hw->dvm_ena = ice_aq_is_dvm_ena(hw);
+}
+
+/**
+ * ice_pkg_supports_dvm - find out if DDP supports DVM
+ * @hw: pointer to the HW structure
+ */
+static bool ice_pkg_supports_dvm(struct ice_hw *hw)
+{
+       bool pkg_supports_dvm;
+       int status;
+
+       status = ice_pkg_get_supported_vlan_mode(hw, &pkg_supports_dvm);
+       if (status) {
+               ice_debug(hw, ICE_DBG_PKG, "Failed to get supported VLAN mode, status %d\n",
+                         status);
+               return false;
+       }
+
+       return pkg_supports_dvm;
+}
+
+/**
+ * ice_fw_supports_dvm - find out if FW supports DVM
+ * @hw: pointer to the HW structure
+ */
+static bool ice_fw_supports_dvm(struct ice_hw *hw)
+{
+       struct ice_aqc_get_vlan_mode get_vlan_mode = { 0 };
+       int status;
+
+       /* If firmware returns success, then it supports DVM, else it only
+        * supports SVM
+        */
+       status = ice_aq_get_vlan_mode(hw, &get_vlan_mode);
+       if (status) {
+               ice_debug(hw, ICE_DBG_NVM, "Failed to get VLAN mode, status %d\n",
+                         status);
+               return false;
+       }
+
+       return true;
+}
+
+/**
+ * ice_is_dvm_supported - check if Double VLAN Mode is supported
+ * @hw: pointer to the hardware structure
+ *
+ * Returns true if Double VLAN Mode (DVM) is supported and false if only Single
+ * VLAN Mode (SVM) is supported. In order for DVM to be supported the DDP and
+ * firmware must support it, otherwise only SVM is supported. This function
+ * should only be called while the global config lock is held and after the
+ * package has been successfully downloaded.
+ */
+static bool ice_is_dvm_supported(struct ice_hw *hw)
+{
+       if (!ice_pkg_supports_dvm(hw)) {
+               ice_debug(hw, ICE_DBG_PKG, "DDP doesn't support DVM\n");
+               return false;
+       }
+
+       if (!ice_fw_supports_dvm(hw)) {
+               ice_debug(hw, ICE_DBG_PKG, "FW doesn't support DVM\n");
+               return false;
+       }
+
+       return true;
+}
+
+#define ICE_EXTERNAL_VLAN_ID_FV_IDX                    11
+#define ICE_SW_LKUP_VLAN_LOC_LKUP_IDX                  1
+#define ICE_SW_LKUP_VLAN_PKT_FLAGS_LKUP_IDX            2
+#define ICE_SW_LKUP_PROMISC_VLAN_LOC_LKUP_IDX          2
+#define ICE_PKT_FLAGS_0_TO_15_FV_IDX                   1
+#define ICE_PKT_FLAGS_0_TO_15_VLAN_FLAGS_MASK          0xD000
+static struct ice_update_recipe_lkup_idx_params ice_dvm_dflt_recipes[] = {
+       {
+               /* Update recipe ICE_SW_LKUP_VLAN to filter based on the
+                * outer/single VLAN in DVM
+                */
+               .rid = ICE_SW_LKUP_VLAN,
+               .fv_idx = ICE_EXTERNAL_VLAN_ID_FV_IDX,
+               .ignore_valid = true,
+               .mask = 0,
+               .mask_valid = false, /* use pre-existing mask */
+               .lkup_idx = ICE_SW_LKUP_VLAN_LOC_LKUP_IDX,
+       },
+       {
+               /* Update recipe ICE_SW_LKUP_VLAN to filter based on the VLAN
+                * packet flags to support VLAN filtering on multiple VLAN
+                * ethertypes (i.e. 0x8100 and 0x88a8) in DVM
+                */
+               .rid = ICE_SW_LKUP_VLAN,
+               .fv_idx = ICE_PKT_FLAGS_0_TO_15_FV_IDX,
+               .ignore_valid = false,
+               .mask = ICE_PKT_FLAGS_0_TO_15_VLAN_FLAGS_MASK,
+               .mask_valid = true,
+               .lkup_idx = ICE_SW_LKUP_VLAN_PKT_FLAGS_LKUP_IDX,
+       },
+       {
+               /* Update recipe ICE_SW_LKUP_PROMISC_VLAN to filter based on the
+                * outer/single VLAN in DVM
+                */
+               .rid = ICE_SW_LKUP_PROMISC_VLAN,
+               .fv_idx = ICE_EXTERNAL_VLAN_ID_FV_IDX,
+               .ignore_valid = true,
+               .mask = 0,
+               .mask_valid = false,  /* use pre-existing mask */
+               .lkup_idx = ICE_SW_LKUP_PROMISC_VLAN_LOC_LKUP_IDX,
+       },
+};
+
+/**
+ * ice_dvm_update_dflt_recipes - update default switch recipes in DVM
+ * @hw: hardware structure used to update the recipes
+ */
+static int ice_dvm_update_dflt_recipes(struct ice_hw *hw)
+{
+       unsigned long i;
+
+       for (i = 0; i < ARRAY_SIZE(ice_dvm_dflt_recipes); i++) {
+               struct ice_update_recipe_lkup_idx_params *params;
+               int status;
+
+               params = &ice_dvm_dflt_recipes[i];
+
+               status = ice_update_recipe_lkup_idx(hw, params);
+               if (status) {
+                       ice_debug(hw, ICE_DBG_INIT, "Failed to update RID %d lkup_idx %d fv_idx %d mask_valid %s mask 0x%04x\n",
+                                 params->rid, params->lkup_idx, params->fv_idx,
+                                 params->mask_valid ? "true" : "false",
+                                 params->mask);
+                       return status;
+               }
+       }
+
+       return 0;
+}
+
+/**
+ * ice_aq_set_vlan_mode - set the VLAN mode of the device
+ * @hw: pointer to the HW structure
+ * @set_params: requested VLAN mode configuration
+ *
+ * Set VLAN Mode Parameters (0x020C)
+ */
+static int
+ice_aq_set_vlan_mode(struct ice_hw *hw,
+                    struct ice_aqc_set_vlan_mode *set_params)
+{
+       u8 rdma_packet, mng_vlan_prot_id;
+       struct ice_aq_desc desc;
+
+       if (!set_params)
+               return -EINVAL;
+
+       if (set_params->l2tag_prio_tagging > ICE_AQ_VLAN_PRIO_TAG_MAX)
+               return -EINVAL;
+
+       rdma_packet = set_params->rdma_packet;
+       if (rdma_packet != ICE_AQ_SVM_VLAN_RDMA_PKT_FLAG_SETTING &&
+           rdma_packet != ICE_AQ_DVM_VLAN_RDMA_PKT_FLAG_SETTING)
+               return -EINVAL;
+
+       mng_vlan_prot_id = set_params->mng_vlan_prot_id;
+       if (mng_vlan_prot_id != ICE_AQ_VLAN_MNG_PROTOCOL_ID_OUTER &&
+           mng_vlan_prot_id != ICE_AQ_VLAN_MNG_PROTOCOL_ID_INNER)
+               return -EINVAL;
+
+       ice_fill_dflt_direct_cmd_desc(&desc,
+                                     ice_aqc_opc_set_vlan_mode_parameters);
+       desc.flags |= cpu_to_le16(ICE_AQ_FLAG_RD);
+
+       return ice_aq_send_cmd(hw, &desc, set_params, sizeof(*set_params),
+                              NULL);
+}
+
+/**
+ * ice_set_dvm - sets up software and hardware for double VLAN mode
+ * @hw: pointer to the hardware structure
+ */
+static int ice_set_dvm(struct ice_hw *hw)
+{
+       struct ice_aqc_set_vlan_mode params = { 0 };
+       int status;
+
+       params.l2tag_prio_tagging = ICE_AQ_VLAN_PRIO_TAG_OUTER_CTAG;
+       params.rdma_packet = ICE_AQ_DVM_VLAN_RDMA_PKT_FLAG_SETTING;
+       params.mng_vlan_prot_id = ICE_AQ_VLAN_MNG_PROTOCOL_ID_OUTER;
+
+       status = ice_aq_set_vlan_mode(hw, &params);
+       if (status) {
+               ice_debug(hw, ICE_DBG_INIT, "Failed to set double VLAN mode parameters, status %d\n",
+                         status);
+               return status;
+       }
+
+       status = ice_dvm_update_dflt_recipes(hw);
+       if (status) {
+               ice_debug(hw, ICE_DBG_INIT, "Failed to update default recipes for double VLAN mode, status %d\n",
+                         status);
+               return status;
+       }
+
+       status = ice_aq_set_port_params(hw->port_info, true, NULL);
+       if (status) {
+               ice_debug(hw, ICE_DBG_INIT, "Failed to set port in double VLAN mode, status %d\n",
+                         status);
+               return status;
+       }
+
+       status = ice_set_dvm_boost_entries(hw);
+       if (status) {
+               ice_debug(hw, ICE_DBG_INIT, "Failed to set boost TCAM entries for double VLAN mode, status %d\n",
+                         status);
+               return status;
+       }
+
+       return 0;
+}
+
+/**
+ * ice_set_svm - set single VLAN mode
+ * @hw: pointer to the HW structure
+ */
+static int ice_set_svm(struct ice_hw *hw)
+{
+       struct ice_aqc_set_vlan_mode *set_params;
+       int status;
+
+       status = ice_aq_set_port_params(hw->port_info, false, NULL);
+       if (status) {
+               ice_debug(hw, ICE_DBG_INIT, "Failed to set port parameters for single VLAN mode\n");
+               return status;
+       }
+
+       set_params = devm_kzalloc(ice_hw_to_dev(hw), sizeof(*set_params),
+                                 GFP_KERNEL);
+       if (!set_params)
+               return -ENOMEM;
+
+       /* default configuration for SVM */
+       set_params->l2tag_prio_tagging = ICE_AQ_VLAN_PRIO_TAG_INNER_CTAG;
+       set_params->rdma_packet = ICE_AQ_SVM_VLAN_RDMA_PKT_FLAG_SETTING;
+       set_params->mng_vlan_prot_id = ICE_AQ_VLAN_MNG_PROTOCOL_ID_INNER;
+
+       status = ice_aq_set_vlan_mode(hw, set_params);
+       if (status)
+               ice_debug(hw, ICE_DBG_INIT, "Failed to configure port in single VLAN mode\n");
+
+       devm_kfree(ice_hw_to_dev(hw), set_params);
+       return status;
+}
+
+/**
+ * ice_set_vlan_mode
+ * @hw: pointer to the HW structure
+ */
+int ice_set_vlan_mode(struct ice_hw *hw)
+{
+       if (!ice_is_dvm_supported(hw))
+               return 0;
+
+       if (!ice_set_dvm(hw))
+               return 0;
+
+       return ice_set_svm(hw);
+}
+
+/**
+ * ice_print_dvm_not_supported - print if DDP and/or FW doesn't support DVM
+ * @hw: pointer to the HW structure
+ *
+ * The purpose of this function is to print that QinQ is not supported due to
+ * an incompatibility in the DDP and/or FW. This will give a hint to the user to
+ * update one and/or both components if they expect QinQ functionality.
+ */
+static void ice_print_dvm_not_supported(struct ice_hw *hw)
+{
+       bool pkg_supports_dvm = ice_pkg_supports_dvm(hw);
+       bool fw_supports_dvm = ice_fw_supports_dvm(hw);
+
+       if (!fw_supports_dvm && !pkg_supports_dvm)
+               dev_info(ice_hw_to_dev(hw), "QinQ functionality cannot be enabled on this device. Update your DDP package and NVM to versions that support QinQ.\n");
+       else if (!pkg_supports_dvm)
+               dev_info(ice_hw_to_dev(hw), "QinQ functionality cannot be enabled on this device. Update your DDP package to a version that supports QinQ.\n");
+       else if (!fw_supports_dvm)
+               dev_info(ice_hw_to_dev(hw), "QinQ functionality cannot be enabled on this device. Update your NVM to a version that supports QinQ.\n");
+}
+
+/**
+ * ice_post_pkg_dwnld_vlan_mode_cfg - configure VLAN mode after DDP download
+ * @hw: pointer to the HW structure
+ *
+ * This function is meant to configure any VLAN mode specific functionality
+ * after the global configuration lock has been released and the DDP has been
+ * downloaded.
+ *
+ * Since only one PF downloads the DDP and configures the VLAN mode there needs
+ * to be a way to configure the other PFs after the DDP has been downloaded and
+ * the global configuration lock has been released. All such code should go in
+ * this function.
+ */
+void ice_post_pkg_dwnld_vlan_mode_cfg(struct ice_hw *hw)
+{
+       ice_cache_vlan_mode(hw);
+
+       if (ice_is_dvm_ena(hw))
+               ice_change_proto_id_to_dvm();
+       else
+               ice_print_dvm_not_supported(hw);
+}
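
Taken together, the flow is: ice_set_vlan_mode() runs under the global config lock right after DDP download and tries DVM first, falling back to SVM so the device always lands in a defined mode; once the lock is released, every PF calls ice_post_pkg_dwnld_vlan_mode_cfg() to cache the result. A hypothetical condensed init-path sketch (lock handling is left to the caller and simplified here):

static int example_post_ddp_init(struct ice_hw *hw)
{
	int err;

	/* caller is assumed to hold the global config lock here */
	err = ice_set_vlan_mode(hw);
	if (err)
		return err;	/* neither DVM nor the SVM fallback worked */

	/* ...global config lock released by the caller... */

	/* all PFs cache the negotiated mode afterwards */
	ice_post_pkg_dwnld_vlan_mode_cfg(hw);
	return 0;
}
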
diff --git a/drivers/net/ethernet/intel/ice/ice_vlan_mode.h b/drivers/net/ethernet/intel/ice/ice_vlan_mode.h
new file mode 100644 (file)
index 0000000..a0fb743
--- /dev/null
@@ -0,0 +1,13 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (C) 2019-2021, Intel Corporation. */
+
+#ifndef _ICE_VLAN_MODE_H_
+#define _ICE_VLAN_MODE_H_
+
+struct ice_hw;
+
+bool ice_is_dvm_ena(struct ice_hw *hw);
+int ice_set_vlan_mode(struct ice_hw *hw);
+void ice_post_pkg_dwnld_vlan_mode_cfg(struct ice_hw *hw);
+
+#endif /* _ICE_VLAN_MODE_H_ */
diff --git a/drivers/net/ethernet/intel/ice/ice_vsi_vlan_lib.c b/drivers/net/ethernet/intel/ice/ice_vsi_vlan_lib.c
new file mode 100644 (file)
index 0000000..5b4a0ab
--- /dev/null
@@ -0,0 +1,707 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (C) 2019-2021, Intel Corporation. */
+
+#include "ice_vsi_vlan_lib.h"
+#include "ice_lib.h"
+#include "ice_fltr.h"
+#include "ice.h"
+
+static void print_invalid_tpid(struct ice_vsi *vsi, u16 tpid)
+{
+       dev_err(ice_pf_to_dev(vsi->back), "%s %d specified invalid VLAN tpid 0x%04x\n",
+               ice_vsi_type_str(vsi->type), vsi->idx, tpid);
+}
+
+/**
+ * validate_vlan - check if the ice_vlan passed in is valid
+ * @vsi: VSI used for printing error message
+ * @vlan: ice_vlan structure to validate
+ *
+ * Return true if the VLAN TPID is one of the supported values, or if both the
+ * TPID and the VID are 0. This allows non-zero VLAN filters with a supported
+ * TPID, as well as the untagged VLAN 0 filter, to be added to the prune list.
+ */
+static bool validate_vlan(struct ice_vsi *vsi, struct ice_vlan *vlan)
+{
+       if (vlan->tpid != ETH_P_8021Q && vlan->tpid != ETH_P_8021AD &&
+           vlan->tpid != ETH_P_QINQ1 && (vlan->tpid || vlan->vid)) {
+               print_invalid_tpid(vsi, vlan->tpid);
+               return false;
+       }
+
+       return true;
+}
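
To make the compound condition above concrete, here is a hedged in-file sketch of which combinations validate_vlan() accepts, assuming ice_vlan.h and <linux/if_ether.h> are in scope: a supported TPID always passes, TPID 0 passes only together with VID 0, and TPID 0 with a non-zero VID is rejected.

static void example_validate(struct ice_vsi *vsi)
{
	struct ice_vlan ok_tag = ICE_VLAN(ETH_P_8021Q, 100, 0);	/* supported TPID */
	struct ice_vlan ok_untagged = ICE_VLAN(0, 0, 0);	/* untagged VLAN 0 */
	struct ice_vlan bad = ICE_VLAN(0, 100, 0);		/* TPID 0, VID != 0 */

	WARN_ON(!validate_vlan(vsi, &ok_tag));
	WARN_ON(!validate_vlan(vsi, &ok_untagged));
	WARN_ON(validate_vlan(vsi, &bad));
}
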
+
+/**
+ * ice_vsi_add_vlan - default add VLAN implementation for all VSI types
+ * @vsi: VSI being configured
+ * @vlan: VLAN filter to add
+ */
+int ice_vsi_add_vlan(struct ice_vsi *vsi, struct ice_vlan *vlan)
+{
+       int err;
+
+       if (!validate_vlan(vsi, vlan))
+               return -EINVAL;
+
+       err = ice_fltr_add_vlan(vsi, vlan);
+       if (err && err != -EEXIST) {
+               dev_err(ice_pf_to_dev(vsi->back), "Failure Adding VLAN %d on VSI %i, status %d\n",
+                       vlan->vid, vsi->vsi_num, err);
+               return err;
+       }
+
+       vsi->num_vlan++;
+       return 0;
+}
+
+/**
+ * ice_vsi_del_vlan - default del VLAN implementation for all VSI types
+ * @vsi: VSI being configured
+ * @vlan: VLAN filter to delete
+ */
+int ice_vsi_del_vlan(struct ice_vsi *vsi, struct ice_vlan *vlan)
+{
+       struct ice_pf *pf = vsi->back;
+       struct device *dev;
+       int err;
+
+       if (!validate_vlan(vsi, vlan))
+               return -EINVAL;
+
+       dev = ice_pf_to_dev(pf);
+
+       err = ice_fltr_remove_vlan(vsi, vlan);
+       if (!err)
+               vsi->num_vlan--;
+       else if (err == -ENOENT || err == -EBUSY)
+               err = 0;
+       else
+               dev_err(dev, "Error removing VLAN %d on VSI %i error: %d\n",
+                       vlan->vid, vsi->vsi_num, err);
+
+       return err;
+}
+
+/**
+ * ice_vsi_manage_vlan_insertion - Manage VLAN insertion for the VSI for Tx
+ * @vsi: the VSI being changed
+ */
+static int ice_vsi_manage_vlan_insertion(struct ice_vsi *vsi)
+{
+       struct ice_hw *hw = &vsi->back->hw;
+       struct ice_vsi_ctx *ctxt;
+       int err;
+
+       ctxt = kzalloc(sizeof(*ctxt), GFP_KERNEL);
+       if (!ctxt)
+               return -ENOMEM;
+
+       /* Here we are configuring the VSI to let the driver add VLAN tags by
+        * setting inner_vlan_flags to ICE_AQ_VSI_INNER_VLAN_TX_MODE_ALL. The
+        * actual VLAN tag insertion happens in the Tx hot path, in ice_tx_map.
+        */
+       ctxt->info.inner_vlan_flags = ICE_AQ_VSI_INNER_VLAN_TX_MODE_ALL;
+
+       /* Preserve existing VLAN strip setting */
+       ctxt->info.inner_vlan_flags |= (vsi->info.inner_vlan_flags &
+                                       ICE_AQ_VSI_INNER_VLAN_EMODE_M);
+
+       ctxt->info.valid_sections = cpu_to_le16(ICE_AQ_VSI_PROP_VLAN_VALID);
+
+       err = ice_update_vsi(hw, vsi->idx, ctxt, NULL);
+       if (err) {
+               dev_err(ice_pf_to_dev(vsi->back), "update VSI for VLAN insert failed, err %d aq_err %s\n",
+                       err, ice_aq_str(hw->adminq.sq_last_status));
+               goto out;
+       }
+
+       vsi->info.inner_vlan_flags = ctxt->info.inner_vlan_flags;
+out:
+       kfree(ctxt);
+       return err;
+}
+
+/**
+ * ice_vsi_manage_vlan_stripping - Manage VLAN stripping for the VSI for Rx
+ * @vsi: the VSI being changed
+ * @ena: boolean value indicating if this is an enable or disable request
+ */
+static int ice_vsi_manage_vlan_stripping(struct ice_vsi *vsi, bool ena)
+{
+       struct ice_hw *hw = &vsi->back->hw;
+       struct ice_vsi_ctx *ctxt;
+       int err;
+
+       /* do not allow modifying VLAN stripping when a port VLAN is configured
+        * on this VSI
+        */
+       if (vsi->info.port_based_inner_vlan)
+               return 0;
+
+       ctxt = kzalloc(sizeof(*ctxt), GFP_KERNEL);
+       if (!ctxt)
+               return -ENOMEM;
+
+       /* Here we are configuring what the VSI should do with the VLAN tag in
+        * the Rx packet. We can either leave the tag in the packet or put it in
+        * the Rx descriptor.
+        */
+       if (ena)
+               /* Strip VLAN tag from Rx packet and put it in the desc */
+               ctxt->info.inner_vlan_flags = ICE_AQ_VSI_INNER_VLAN_EMODE_STR_BOTH;
+       else
+               /* Disable stripping. Leave tag in packet */
+               ctxt->info.inner_vlan_flags = ICE_AQ_VSI_INNER_VLAN_EMODE_NOTHING;
+
+       /* Allow all packets untagged/tagged */
+       ctxt->info.inner_vlan_flags |= ICE_AQ_VSI_INNER_VLAN_TX_MODE_ALL;
+
+       ctxt->info.valid_sections = cpu_to_le16(ICE_AQ_VSI_PROP_VLAN_VALID);
+
+       err = ice_update_vsi(hw, vsi->idx, ctxt, NULL);
+       if (err) {
+               dev_err(ice_pf_to_dev(vsi->back), "update VSI for VLAN strip failed, ena = %d err %d aq_err %s\n",
+                       ena, err, ice_aq_str(hw->adminq.sq_last_status));
+               goto out;
+       }
+
+       vsi->info.inner_vlan_flags = ctxt->info.inner_vlan_flags;
+out:
+       kfree(ctxt);
+       return err;
+}
+
+int ice_vsi_ena_inner_stripping(struct ice_vsi *vsi, const u16 tpid)
+{
+       if (tpid != ETH_P_8021Q) {
+               print_invalid_tpid(vsi, tpid);
+               return -EINVAL;
+       }
+
+       return ice_vsi_manage_vlan_stripping(vsi, true);
+}
+
+int ice_vsi_dis_inner_stripping(struct ice_vsi *vsi)
+{
+       return ice_vsi_manage_vlan_stripping(vsi, false);
+}
+
+int ice_vsi_ena_inner_insertion(struct ice_vsi *vsi, const u16 tpid)
+{
+       if (tpid != ETH_P_8021Q) {
+               print_invalid_tpid(vsi, tpid);
+               return -EINVAL;
+       }
+
+       return ice_vsi_manage_vlan_insertion(vsi);
+}
+
+int ice_vsi_dis_inner_insertion(struct ice_vsi *vsi)
+{
+       return ice_vsi_manage_vlan_insertion(vsi);
+}
+
+/**
+ * __ice_vsi_set_inner_port_vlan - set port VLAN VSI context settings to enable a port VLAN
+ * @vsi: the VSI to update
+ * @pvid_info: VLAN ID and QoS used to set the PVID VSI context field
+ */
+static int __ice_vsi_set_inner_port_vlan(struct ice_vsi *vsi, u16 pvid_info)
+{
+       struct ice_hw *hw = &vsi->back->hw;
+       struct ice_aqc_vsi_props *info;
+       struct ice_vsi_ctx *ctxt;
+       int ret;
+
+       ctxt = kzalloc(sizeof(*ctxt), GFP_KERNEL);
+       if (!ctxt)
+               return -ENOMEM;
+
+       ctxt->info = vsi->info;
+       info = &ctxt->info;
+       info->inner_vlan_flags = ICE_AQ_VSI_INNER_VLAN_TX_MODE_ACCEPTUNTAGGED |
+               ICE_AQ_VSI_INNER_VLAN_INSERT_PVID |
+               ICE_AQ_VSI_INNER_VLAN_EMODE_STR;
+       info->sw_flags2 |= ICE_AQ_VSI_SW_FLAG_RX_VLAN_PRUNE_ENA;
+
+       info->port_based_inner_vlan = cpu_to_le16(pvid_info);
+       info->valid_sections = cpu_to_le16(ICE_AQ_VSI_PROP_VLAN_VALID |
+                                          ICE_AQ_VSI_PROP_SW_VALID);
+
+       ret = ice_update_vsi(hw, vsi->idx, ctxt, NULL);
+       if (ret) {
+               dev_info(ice_hw_to_dev(hw), "update VSI for port VLAN failed, err %d aq_err %s\n",
+                        ret, ice_aq_str(hw->adminq.sq_last_status));
+               goto out;
+       }
+
+       vsi->info.inner_vlan_flags = info->inner_vlan_flags;
+       vsi->info.sw_flags2 = info->sw_flags2;
+       vsi->info.port_based_inner_vlan = info->port_based_inner_vlan;
+out:
+       kfree(ctxt);
+       return ret;
+}
+
+int ice_vsi_set_inner_port_vlan(struct ice_vsi *vsi, struct ice_vlan *vlan)
+{
+       u16 port_vlan_info;
+
+       if (vlan->tpid != ETH_P_8021Q)
+               return -EINVAL;
+
+       if (vlan->prio > 7)
+               return -EINVAL;
+
+       port_vlan_info = vlan->vid | (vlan->prio << VLAN_PRIO_SHIFT);
+
+       return __ice_vsi_set_inner_port_vlan(vsi, port_vlan_info);
+}
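
A worked example of the packing above: the 12-bit VID occupies bits 0-11 and the 3-bit priority is shifted into bits 13-15 by VLAN_PRIO_SHIFT (13), so VID 100 with priority 5 packs to 100 | (5 << 13) = 0xA064. Illustrative sketch:

#include <linux/if_vlan.h>

static u16 example_pack_pvid(void)
{
	u16 vid = 100;	/* 0x064 */
	u16 prio = 5;

	return vid | (prio << VLAN_PRIO_SHIFT);	/* 0xA064 */
}
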
+
+/**
+ * ice_cfg_vlan_pruning - enable or disable VLAN pruning on the VSI
+ * @vsi: VSI to enable or disable VLAN pruning on
+ * @ena: set to true to enable VLAN pruning and false to disable it
+ *
+ * returns 0 if VSI is updated, negative otherwise
+ */
+static int ice_cfg_vlan_pruning(struct ice_vsi *vsi, bool ena)
+{
+       struct ice_vsi_ctx *ctxt;
+       struct ice_pf *pf;
+       int status;
+
+       if (!vsi)
+               return -EINVAL;
+
+       /* Don't enable VLAN pruning if the netdev is currently in promiscuous
+        * mode. VLAN pruning will be enabled when the interface exits
+        * promiscuous mode if any VLAN filters are active.
+        */
+       if (vsi->netdev && vsi->netdev->flags & IFF_PROMISC && ena)
+               return 0;
+
+       pf = vsi->back;
+       ctxt = kzalloc(sizeof(*ctxt), GFP_KERNEL);
+       if (!ctxt)
+               return -ENOMEM;
+
+       ctxt->info = vsi->info;
+
+       if (ena)
+               ctxt->info.sw_flags2 |= ICE_AQ_VSI_SW_FLAG_RX_VLAN_PRUNE_ENA;
+       else
+               ctxt->info.sw_flags2 &= ~ICE_AQ_VSI_SW_FLAG_RX_VLAN_PRUNE_ENA;
+
+       ctxt->info.valid_sections = cpu_to_le16(ICE_AQ_VSI_PROP_SW_VALID);
+
+       status = ice_update_vsi(&pf->hw, vsi->idx, ctxt, NULL);
+       if (status) {
+               netdev_err(vsi->netdev, "%sabling VLAN pruning on VSI handle: %d, VSI HW ID: %d failed, err = %d, aq_err = %s\n",
+                          ena ? "En" : "Dis", vsi->idx, vsi->vsi_num, status,
+                          ice_aq_str(pf->hw.adminq.sq_last_status));
+               goto err_out;
+       }
+
+       vsi->info.sw_flags2 = ctxt->info.sw_flags2;
+
+       kfree(ctxt);
+       return 0;
+
+err_out:
+       kfree(ctxt);
+       return status;
+}
+
+int ice_vsi_ena_rx_vlan_filtering(struct ice_vsi *vsi)
+{
+       return ice_cfg_vlan_pruning(vsi, true);
+}
+
+int ice_vsi_dis_rx_vlan_filtering(struct ice_vsi *vsi)
+{
+       return ice_cfg_vlan_pruning(vsi, false);
+}
+
+static int ice_cfg_vlan_antispoof(struct ice_vsi *vsi, bool enable)
+{
+       struct ice_vsi_ctx *ctx;
+       int err;
+
+       ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
+       if (!ctx)
+               return -ENOMEM;
+
+       ctx->info.sec_flags = vsi->info.sec_flags;
+       ctx->info.valid_sections = cpu_to_le16(ICE_AQ_VSI_PROP_SECURITY_VALID);
+
+       if (enable)
+               ctx->info.sec_flags |= ICE_AQ_VSI_SEC_TX_VLAN_PRUNE_ENA <<
+                       ICE_AQ_VSI_SEC_TX_PRUNE_ENA_S;
+       else
+               ctx->info.sec_flags &= ~(ICE_AQ_VSI_SEC_TX_VLAN_PRUNE_ENA <<
+                                        ICE_AQ_VSI_SEC_TX_PRUNE_ENA_S);
+
+       err = ice_update_vsi(&vsi->back->hw, vsi->idx, ctx, NULL);
+       if (err)
+               dev_err(ice_pf_to_dev(vsi->back), "Failed to configure Tx VLAN anti-spoof %s for VSI %d, error %d\n",
+                       enable ? "ON" : "OFF", vsi->vsi_num, err);
+       else
+               vsi->info.sec_flags = ctx->info.sec_flags;
+
+       kfree(ctx);
+
+       return err;
+}
+
+int ice_vsi_ena_tx_vlan_filtering(struct ice_vsi *vsi)
+{
+       return ice_cfg_vlan_antispoof(vsi, true);
+}
+
+int ice_vsi_dis_tx_vlan_filtering(struct ice_vsi *vsi)
+{
+       return ice_cfg_vlan_antispoof(vsi, false);
+}
+
+/**
+ * tpid_to_vsi_outer_vlan_type - convert from TPID to VSI context based tag_type
+ * @tpid: tpid used to translate into VSI context based tag_type
+ * @tag_type: output variable to hold the VSI context based tag type
+ */
+static int tpid_to_vsi_outer_vlan_type(u16 tpid, u8 *tag_type)
+{
+       switch (tpid) {
+       case ETH_P_8021Q:
+               *tag_type = ICE_AQ_VSI_OUTER_TAG_VLAN_8100;
+               break;
+       case ETH_P_8021AD:
+               *tag_type = ICE_AQ_VSI_OUTER_TAG_STAG;
+               break;
+       case ETH_P_QINQ1:
+               *tag_type = ICE_AQ_VSI_OUTER_TAG_VLAN_9100;
+               break;
+       default:
+               *tag_type = 0;
+               return -EINVAL;
+       }
+
+       return 0;
+}
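
A hedged sketch of using this mapping from a caller in the same file; a failed lookup leaves tag_type zeroed and the caller bails out with -EINVAL, mirroring the functions below.

static int example_outer_tag_type(void)
{
	u8 tag_type;

	if (tpid_to_vsi_outer_vlan_type(ETH_P_8021AD, &tag_type))
		return -EINVAL;

	/* tag_type is now ICE_AQ_VSI_OUTER_TAG_STAG */
	return tag_type;
}
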
+
+/**
+ * ice_vsi_ena_outer_stripping - enable outer VLAN stripping
+ * @vsi: VSI to configure
+ * @tpid: TPID to enable outer VLAN stripping for
+ *
+ * Enable outer VLAN stripping via VSI context. This function should only be
+ * used if DVM is supported. Also, this function should never be called directly
+ * as it should be part of ice_vsi_vlan_ops if it's needed.
+ *
+ * Since the VSI context only supports a single TPID for insertion and
+ * stripping, setting the TPID for stripping will affect the TPID for insertion.
+ * Callers need to be aware of this limitation.
+ *
+ * Only modify outer VLAN stripping settings and the VLAN TPID. Outer VLAN
+ * insertion settings are unmodified.
+ *
+ * This enables hardware to strip a VLAN tag with the specified TPID to be
+ * stripped from the packet and placed in the receive descriptor.
+ */
+int ice_vsi_ena_outer_stripping(struct ice_vsi *vsi, u16 tpid)
+{
+       struct ice_hw *hw = &vsi->back->hw;
+       struct ice_vsi_ctx *ctxt;
+       u8 tag_type;
+       int err;
+
+       /* do not allow modifying VLAN stripping when a port VLAN is configured
+        * on this VSI
+        */
+       if (vsi->info.port_based_outer_vlan)
+               return 0;
+
+       if (tpid_to_vsi_outer_vlan_type(tpid, &tag_type))
+               return -EINVAL;
+
+       ctxt = kzalloc(sizeof(*ctxt), GFP_KERNEL);
+       if (!ctxt)
+               return -ENOMEM;
+
+       ctxt->info.valid_sections =
+               cpu_to_le16(ICE_AQ_VSI_PROP_OUTER_TAG_VALID);
+       /* clear current outer VLAN strip settings */
+       ctxt->info.outer_vlan_flags = vsi->info.outer_vlan_flags &
+               ~(ICE_AQ_VSI_OUTER_VLAN_EMODE_M | ICE_AQ_VSI_OUTER_TAG_TYPE_M);
+       ctxt->info.outer_vlan_flags |=
+               ((ICE_AQ_VSI_OUTER_VLAN_EMODE_SHOW_BOTH <<
+                 ICE_AQ_VSI_OUTER_VLAN_EMODE_S) |
+                ((tag_type << ICE_AQ_VSI_OUTER_TAG_TYPE_S) &
+                 ICE_AQ_VSI_OUTER_TAG_TYPE_M));
+
+       err = ice_update_vsi(hw, vsi->idx, ctxt, NULL);
+       if (err)
+               dev_err(ice_pf_to_dev(vsi->back), "update VSI for enabling outer VLAN stripping failed, err %d aq_err %s\n",
+                       err, ice_aq_str(hw->adminq.sq_last_status));
+       else
+               vsi->info.outer_vlan_flags = ctxt->info.outer_vlan_flags;
+
+       kfree(ctxt);
+       return err;
+}
+
+/**
+ * ice_vsi_dis_outer_stripping - disable outer VLAN stripping
+ * @vsi: VSI to configure
+ *
+ * Disable outer VLAN stripping via VSI context. This function should only be
+ * used if DVM is supported. Also, this function should never be called directly
+ * as it should be part of ice_vsi_vlan_ops if it's needed.
+ *
+ * Only modify the outer VLAN stripping settings. The VLAN TPID and outer VLAN
+ * insertion settings are unmodified.
+ *
+ * This tells the hardware to not strip any VLAN tagged packets, thus leaving
+ * them in the packet. This enables software offloaded VLAN stripping and
+ * disables hardware offloaded VLAN stripping.
+ */
+int ice_vsi_dis_outer_stripping(struct ice_vsi *vsi)
+{
+       struct ice_hw *hw = &vsi->back->hw;
+       struct ice_vsi_ctx *ctxt;
+       int err;
+
+       if (vsi->info.port_based_outer_vlan)
+               return 0;
+
+       ctxt = kzalloc(sizeof(*ctxt), GFP_KERNEL);
+       if (!ctxt)
+               return -ENOMEM;
+
+       ctxt->info.valid_sections =
+               cpu_to_le16(ICE_AQ_VSI_PROP_OUTER_TAG_VALID);
+       /* clear current outer VLAN strip settings */
+       ctxt->info.outer_vlan_flags = vsi->info.outer_vlan_flags &
+               ~ICE_AQ_VSI_OUTER_VLAN_EMODE_M;
+       ctxt->info.outer_vlan_flags |= ICE_AQ_VSI_OUTER_VLAN_EMODE_NOTHING <<
+               ICE_AQ_VSI_OUTER_VLAN_EMODE_S;
+
+       err = ice_update_vsi(hw, vsi->idx, ctxt, NULL);
+       if (err)
+               dev_err(ice_pf_to_dev(vsi->back), "update VSI for disabling outer VLAN stripping failed, err %d aq_err %s\n",
+                       err, ice_aq_str(hw->adminq.sq_last_status));
+       else
+               vsi->info.outer_vlan_flags = ctxt->info.outer_vlan_flags;
+
+       kfree(ctxt);
+       return err;
+}
+
+/**
+ * ice_vsi_ena_outer_insertion - enable outer VLAN insertion
+ * @vsi: VSI to configure
+ * @tpid: TPID to enable outer VLAN insertion for
+ *
+ * Enable outer VLAN insertion via VSI context. This function should only be
+ * used if DVM is supported. Also, this function should never be called directly
+ * as it should be part of ice_vsi_vlan_ops if it's needed.
+ *
+ * Since the VSI context only supports a single TPID for insertion and
+ * stripping, setting the TPID for insertion will affect the TPID for stripping.
+ * Callers need to be aware of this limitation.
+ *
+ * Only modify outer VLAN insertion settings and the VLAN TPID. Outer VLAN
+ * stripping settings are unmodified.
+ *
+ * This allows a VLAN tag with the specified TPID to be inserted in the transmit
+ * descriptor.
+ */
+int ice_vsi_ena_outer_insertion(struct ice_vsi *vsi, u16 tpid)
+{
+       struct ice_hw *hw = &vsi->back->hw;
+       struct ice_vsi_ctx *ctxt;
+       u8 tag_type;
+       int err;
+
+       if (vsi->info.port_based_outer_vlan)
+               return 0;
+
+       if (tpid_to_vsi_outer_vlan_type(tpid, &tag_type))
+               return -EINVAL;
+
+       ctxt = kzalloc(sizeof(*ctxt), GFP_KERNEL);
+       if (!ctxt)
+               return -ENOMEM;
+
+       ctxt->info.valid_sections =
+               cpu_to_le16(ICE_AQ_VSI_PROP_OUTER_TAG_VALID);
+       /* clear current outer VLAN insertion settings */
+       ctxt->info.outer_vlan_flags = vsi->info.outer_vlan_flags &
+               ~(ICE_AQ_VSI_OUTER_VLAN_PORT_BASED_INSERT |
+                 ICE_AQ_VSI_OUTER_VLAN_BLOCK_TX_DESC |
+                 ICE_AQ_VSI_OUTER_VLAN_TX_MODE_M |
+                 ICE_AQ_VSI_OUTER_TAG_TYPE_M);
+       ctxt->info.outer_vlan_flags |=
+               ((ICE_AQ_VSI_OUTER_VLAN_TX_MODE_ALL <<
+                 ICE_AQ_VSI_OUTER_VLAN_TX_MODE_S) &
+                ICE_AQ_VSI_OUTER_VLAN_TX_MODE_M) |
+               ((tag_type << ICE_AQ_VSI_OUTER_TAG_TYPE_S) &
+                ICE_AQ_VSI_OUTER_TAG_TYPE_M);
+
+       err = ice_update_vsi(hw, vsi->idx, ctxt, NULL);
+       if (err)
+               dev_err(ice_pf_to_dev(vsi->back), "update VSI for enabling outer VLAN insertion failed, err %d aq_err %s\n",
+                       err, ice_aq_str(hw->adminq.sq_last_status));
+       else
+               vsi->info.outer_vlan_flags = ctxt->info.outer_vlan_flags;
+
+       kfree(ctxt);
+       return err;
+}
+
+/**
+ * ice_vsi_dis_outer_insertion - disable outer VLAN insertion
+ * @vsi: VSI to configure
+ *
+ * Disable outer VLAN insertion via VSI context. This function should only be
+ * used if DVM is supported. Also, this function should never be called directly
+ * as it should be part of ice_vsi_vlan_ops if it's needed.
+ *
+ * Only modify the outer VLAN insertion settings. The VLAN TPID and outer VLAN
+ * settings are unmodified.
+ *
+ * This tells the hardware to not allow any VLAN tagged packets in the transmit
+ * descriptor. This enables software offloaded VLAN insertion and disables
+ * hardware offloaded VLAN insertion.
+ */
+int ice_vsi_dis_outer_insertion(struct ice_vsi *vsi)
+{
+       struct ice_hw *hw = &vsi->back->hw;
+       struct ice_vsi_ctx *ctxt;
+       int err;
+
+       if (vsi->info.port_based_outer_vlan)
+               return 0;
+
+       ctxt = kzalloc(sizeof(*ctxt), GFP_KERNEL);
+       if (!ctxt)
+               return -ENOMEM;
+
+       ctxt->info.valid_sections =
+               cpu_to_le16(ICE_AQ_VSI_PROP_OUTER_TAG_VALID);
+       /* clear current outer VLAN insertion settings */
+       ctxt->info.outer_vlan_flags = vsi->info.outer_vlan_flags &
+               ~(ICE_AQ_VSI_OUTER_VLAN_PORT_BASED_INSERT |
+                 ICE_AQ_VSI_OUTER_VLAN_TX_MODE_M);
+       ctxt->info.outer_vlan_flags |=
+               ICE_AQ_VSI_OUTER_VLAN_BLOCK_TX_DESC |
+               ((ICE_AQ_VSI_OUTER_VLAN_TX_MODE_ALL <<
+                 ICE_AQ_VSI_OUTER_VLAN_TX_MODE_S) &
+                ICE_AQ_VSI_OUTER_VLAN_TX_MODE_M);
+
+       err = ice_update_vsi(hw, vsi->idx, ctxt, NULL);
+       if (err)
+               dev_err(ice_pf_to_dev(vsi->back), "update VSI for disabling outer VLAN insertion failed, err %d aq_err %s\n",
+                       err, ice_aq_str(hw->adminq.sq_last_status));
+       else
+               vsi->info.outer_vlan_flags = ctxt->info.outer_vlan_flags;
+
+       kfree(ctxt);
+       return err;
+}
+
+/**
+ * __ice_vsi_set_outer_port_vlan - set the outer port VLAN and related settings
+ * @vsi: VSI to configure
+ * @vlan_info: packed u16 that contains the VLAN prio and ID
+ * @tpid: TPID of the port VLAN
+ *
+ * Set the port VLAN prio, ID, and TPID.
+ *
+ * Enable VLAN pruning so the VSI doesn't receive any traffic that doesn't match
+ * a VLAN prune rule. The caller should take care to add a VLAN prune rule that
+ * matches the port VLAN ID and TPID.
+ *
+ * Tell hardware to strip outer VLAN tagged packets on receive and don't put
+ * them in the receive descriptor. VSI(s) in port VLANs should not be aware of
+ * the port VLAN ID or TPID they are assigned to.
+ *
+ * Tell hardware to prevent outer VLAN tag insertion on transmit and only allow
+ * untagged outer packets from the transmit descriptor.
+ *
+ * Also, tell the hardware to insert the port VLAN on transmit.
+ */
+static int
+__ice_vsi_set_outer_port_vlan(struct ice_vsi *vsi, u16 vlan_info, u16 tpid)
+{
+       struct ice_hw *hw = &vsi->back->hw;
+       struct ice_vsi_ctx *ctxt;
+       u8 tag_type;
+       int err;
+
+       if (tpid_to_vsi_outer_vlan_type(tpid, &tag_type))
+               return -EINVAL;
+
+       ctxt = kzalloc(sizeof(*ctxt), GFP_KERNEL);
+       if (!ctxt)
+               return -ENOMEM;
+
+       ctxt->info = vsi->info;
+
+       ctxt->info.sw_flags2 |= ICE_AQ_VSI_SW_FLAG_RX_VLAN_PRUNE_ENA;
+
+       ctxt->info.port_based_outer_vlan = cpu_to_le16(vlan_info);
+       ctxt->info.outer_vlan_flags =
+               (ICE_AQ_VSI_OUTER_VLAN_EMODE_SHOW <<
+                ICE_AQ_VSI_OUTER_VLAN_EMODE_S) |
+               ((tag_type << ICE_AQ_VSI_OUTER_TAG_TYPE_S) &
+                ICE_AQ_VSI_OUTER_TAG_TYPE_M) |
+               ICE_AQ_VSI_OUTER_VLAN_BLOCK_TX_DESC |
+               (ICE_AQ_VSI_OUTER_VLAN_TX_MODE_ACCEPTUNTAGGED <<
+                ICE_AQ_VSI_OUTER_VLAN_TX_MODE_S) |
+               ICE_AQ_VSI_OUTER_VLAN_PORT_BASED_INSERT;
+
+       ctxt->info.valid_sections =
+               cpu_to_le16(ICE_AQ_VSI_PROP_OUTER_TAG_VALID |
+                           ICE_AQ_VSI_PROP_SW_VALID);
+
+       err = ice_update_vsi(hw, vsi->idx, ctxt, NULL);
+       if (err) {
+               dev_err(ice_pf_to_dev(vsi->back), "update VSI for setting outer port based VLAN failed, err %d aq_err %s\n",
+                       err, ice_aq_str(hw->adminq.sq_last_status));
+       } else {
+               vsi->info.port_based_outer_vlan = ctxt->info.port_based_outer_vlan;
+               vsi->info.outer_vlan_flags = ctxt->info.outer_vlan_flags;
+               vsi->info.sw_flags2 = ctxt->info.sw_flags2;
+       }
+
+       kfree(ctxt);
+       return err;
+}
+
+/**
+ * ice_vsi_set_outer_port_vlan - public version of __ice_vsi_set_outer_port_vlan
+ * @vsi: VSI to configure
+ * @vlan: ice_vlan structure used to set the port VLAN
+ *
+ * Set the outer port VLAN via VSI context. This function should only be
+ * used if DVM is supported. Also, this function should never be called directly
+ * as it should be part of ice_vsi_vlan_ops if it's needed.
+ *
+ * This function does not support clearing the port VLAN as there is currently
+ * no use case for this.
+ *
+ * Use the ice_vlan structure passed in to set this VSI in a port VLAN.
+ */
+int ice_vsi_set_outer_port_vlan(struct ice_vsi *vsi, struct ice_vlan *vlan)
+{
+       u16 port_vlan_info;
+
+       if (vlan->prio > (VLAN_PRIO_MASK >> VLAN_PRIO_SHIFT))
+               return -EINVAL;
+
+       port_vlan_info = vlan->vid | (vlan->prio << VLAN_PRIO_SHIFT);
+
+       return __ice_vsi_set_outer_port_vlan(vsi, port_vlan_info, vlan->tpid);
+}
diff --git a/drivers/net/ethernet/intel/ice/ice_vsi_vlan_lib.h b/drivers/net/ethernet/intel/ice/ice_vsi_vlan_lib.h
new file mode 100644 (file)
index 0000000..f459909
--- /dev/null
@@ -0,0 +1,32 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (C) 2019-2021, Intel Corporation. */
+
+#ifndef _ICE_VSI_VLAN_LIB_H_
+#define _ICE_VSI_VLAN_LIB_H_
+
+#include <linux/types.h>
+#include "ice_vlan.h"
+
+struct ice_vsi;
+
+int ice_vsi_add_vlan(struct ice_vsi *vsi, struct ice_vlan *vlan);
+int ice_vsi_del_vlan(struct ice_vsi *vsi, struct ice_vlan *vlan);
+
+int ice_vsi_ena_inner_stripping(struct ice_vsi *vsi, u16 tpid);
+int ice_vsi_dis_inner_stripping(struct ice_vsi *vsi);
+int ice_vsi_ena_inner_insertion(struct ice_vsi *vsi, u16 tpid);
+int ice_vsi_dis_inner_insertion(struct ice_vsi *vsi);
+int ice_vsi_set_inner_port_vlan(struct ice_vsi *vsi, struct ice_vlan *vlan);
+
+int ice_vsi_ena_rx_vlan_filtering(struct ice_vsi *vsi);
+int ice_vsi_dis_rx_vlan_filtering(struct ice_vsi *vsi);
+int ice_vsi_ena_tx_vlan_filtering(struct ice_vsi *vsi);
+int ice_vsi_dis_tx_vlan_filtering(struct ice_vsi *vsi);
+
+int ice_vsi_ena_outer_stripping(struct ice_vsi *vsi, u16 tpid);
+int ice_vsi_dis_outer_stripping(struct ice_vsi *vsi);
+int ice_vsi_ena_outer_insertion(struct ice_vsi *vsi, u16 tpid);
+int ice_vsi_dis_outer_insertion(struct ice_vsi *vsi);
+int ice_vsi_set_outer_port_vlan(struct ice_vsi *vsi, struct ice_vlan *vlan);
+
+#endif /* _ICE_VSI_VLAN_LIB_H_ */
diff --git a/drivers/net/ethernet/intel/ice/ice_vsi_vlan_ops.c b/drivers/net/ethernet/intel/ice/ice_vsi_vlan_ops.c
new file mode 100644 (file)
index 0000000..4a6c850
--- /dev/null
@@ -0,0 +1,103 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (C) 2019-2021, Intel Corporation. */
+
+#include "ice_pf_vsi_vlan_ops.h"
+#include "ice_vf_vsi_vlan_ops.h"
+#include "ice_lib.h"
+#include "ice.h"
+
+static int
+op_unsupported_vlan_arg(struct ice_vsi * __always_unused vsi,
+                       struct ice_vlan * __always_unused vlan)
+{
+       return -EOPNOTSUPP;
+}
+
+static int
+op_unsupported_tpid_arg(struct ice_vsi *__always_unused vsi,
+                       u16 __always_unused tpid)
+{
+       return -EOPNOTSUPP;
+}
+
+static int op_unsupported(struct ice_vsi *__always_unused vsi)
+{
+       return -EOPNOTSUPP;
+}
+
+/* If any new ops are added to the VSI VLAN ops interface then an unsupported
+ * implementation should be set here.
+ */
+static struct ice_vsi_vlan_ops ops_unsupported = {
+       .add_vlan = op_unsupported_vlan_arg,
+       .del_vlan = op_unsupported_vlan_arg,
+       .ena_stripping = op_unsupported_tpid_arg,
+       .dis_stripping = op_unsupported,
+       .ena_insertion = op_unsupported_tpid_arg,
+       .dis_insertion = op_unsupported,
+       .ena_rx_filtering = op_unsupported,
+       .dis_rx_filtering = op_unsupported,
+       .ena_tx_filtering = op_unsupported,
+       .dis_tx_filtering = op_unsupported,
+       .set_port_vlan = op_unsupported_vlan_arg,
+};
+
+/**
+ * ice_vsi_init_unsupported_vlan_ops - init all VSI VLAN ops to unsupported
+ * @vsi: VSI to initialize VSI VLAN ops to unsupported for
+ *
+ * By default all inner and outer VSI VLAN ops return -EOPNOTSUPP. This was done
+ * as opposed to leaving the ops null, to prevent unexpected crashes. Instead,
+ * if an unsupported VSI VLAN op is called it will just return -EOPNOTSUPP.
+ */
+static void ice_vsi_init_unsupported_vlan_ops(struct ice_vsi *vsi)
+{
+       vsi->outer_vlan_ops = ops_unsupported;
+       vsi->inner_vlan_ops = ops_unsupported;
+}
+
+/**
+ * ice_vsi_init_vlan_ops - initialize type specific VSI VLAN ops
+ * @vsi: VSI to initialize ops for
+ *
+ * If any VSI types are added and/or require different ops than the PF or VF VSI
+ * then they will have to add a case here to handle that. Also, VSI type
+ * specific files should be added in the same manner that was done for PF VSI.
+ */
+void ice_vsi_init_vlan_ops(struct ice_vsi *vsi)
+{
+       /* Initialize all VSI types to have unsupported VSI VLAN ops */
+       ice_vsi_init_unsupported_vlan_ops(vsi);
+
+       switch (vsi->type) {
+       case ICE_VSI_PF:
+       case ICE_VSI_SWITCHDEV_CTRL:
+               ice_pf_vsi_init_vlan_ops(vsi);
+               break;
+       case ICE_VSI_VF:
+               ice_vf_vsi_init_vlan_ops(vsi);
+               break;
+       default:
+               dev_dbg(ice_pf_to_dev(vsi->back), "%s does not support VLAN operations\n",
+                       ice_vsi_type_str(vsi->type));
+               break;
+       }
+}
+
+/**
+ * ice_get_compat_vsi_vlan_ops - Get VSI VLAN ops based on VLAN mode
+ * @vsi: VSI used to get the VSI VLAN ops
+ *
+ * This function is meant to be used when the caller doesn't know which VLAN ops
+ * to use (i.e. inner or outer). This allows backward compatibility for VLANs
+ * since most of the outer VSI VLAN functions are not supported when
+ * the device is configured in Single VLAN Mode (SVM).
+ */
+struct ice_vsi_vlan_ops *ice_get_compat_vsi_vlan_ops(struct ice_vsi *vsi)
+{
+       if (ice_is_dvm_ena(&vsi->back->hw))
+               return &vsi->outer_vlan_ops;
+       else
+               return &vsi->inner_vlan_ops;
+}
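
Taken together, the ops table plus the compat getter let callers stay agnostic of the device's VLAN mode. A minimal sketch of such a caller (the two-field struct ice_vlan initializer is illustrative only; the indirection through ice_get_compat_vsi_vlan_ops() is what this patch adds):

    /* Sketch: add a VLAN without knowing whether the device runs in
     * Single VLAN Mode (SVM) or Double VLAN Mode (DVM). In DVM this
     * dispatches to outer_vlan_ops, in SVM to inner_vlan_ops, and any
     * unsupported op safely returns -EOPNOTSUPP.
     */
    static int example_add_vlan(struct ice_vsi *vsi, u16 vid)
    {
            struct ice_vsi_vlan_ops *vlan_ops = ice_get_compat_vsi_vlan_ops(vsi);
            struct ice_vlan vlan = { .tpid = ETH_P_8021Q, .vid = vid };

            return vlan_ops->add_vlan(vsi, &vlan);
    }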
diff --git a/drivers/net/ethernet/intel/ice/ice_vsi_vlan_ops.h b/drivers/net/ethernet/intel/ice/ice_vsi_vlan_ops.h
new file mode 100644 (file)
index 0000000..5b47568
--- /dev/null
@@ -0,0 +1,29 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (C) 2019-2021, Intel Corporation. */
+
+#ifndef _ICE_VSI_VLAN_OPS_H_
+#define _ICE_VSI_VLAN_OPS_H_
+
+#include "ice_type.h"
+#include "ice_vsi_vlan_lib.h"
+
+struct ice_vsi;
+
+struct ice_vsi_vlan_ops {
+       int (*add_vlan)(struct ice_vsi *vsi, struct ice_vlan *vlan);
+       int (*del_vlan)(struct ice_vsi *vsi, struct ice_vlan *vlan);
+       int (*ena_stripping)(struct ice_vsi *vsi, const u16 tpid);
+       int (*dis_stripping)(struct ice_vsi *vsi);
+       int (*ena_insertion)(struct ice_vsi *vsi, const u16 tpid);
+       int (*dis_insertion)(struct ice_vsi *vsi);
+       int (*ena_rx_filtering)(struct ice_vsi *vsi);
+       int (*dis_rx_filtering)(struct ice_vsi *vsi);
+       int (*ena_tx_filtering)(struct ice_vsi *vsi);
+       int (*dis_tx_filtering)(struct ice_vsi *vsi);
+       int (*set_port_vlan)(struct ice_vsi *vsi, struct ice_vlan *vlan);
+};
+
+void ice_vsi_init_vlan_ops(struct ice_vsi *vsi);
+struct ice_vsi_vlan_ops *ice_get_compat_vsi_vlan_ops(struct ice_vsi *vsi);
+
+#endif /* _ICE_VSI_VLAN_OPS_H_ */
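
For reference, wiring up a new VSI type means overriding only the supported slots and keeping the -EOPNOTSUPP stubs for the rest. A hypothetical sketch (ice_pf_vsi_init_vlan_ops() and ice_vf_vsi_init_vlan_ops() are added elsewhere in this series and their bodies are not shown here, so the exact assignments below are assumptions built from the ice_vsi_vlan_lib.h declarations above):

    static void example_vsi_init_vlan_ops(struct ice_vsi *vsi)
    {
            struct ice_vsi_vlan_ops *vlan_ops = &vsi->inner_vlan_ops;

            /* Override only what this VSI type supports; everything else
             * keeps the stubs from ice_vsi_init_unsupported_vlan_ops().
             */
            vlan_ops->ena_stripping = ice_vsi_ena_inner_stripping;
            vlan_ops->dis_stripping = ice_vsi_dis_inner_stripping;
            vlan_ops->ena_rx_filtering = ice_vsi_ena_rx_vlan_filtering;
            vlan_ops->dis_rx_filtering = ice_vsi_dis_rx_vlan_filtering;
    }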
index 2388837..88853a6 100644 (file)
@@ -327,6 +327,13 @@ int ice_xsk_pool_setup(struct ice_vsi *vsi, struct xsk_buff_pool *pool, u16 qid)
        bool if_running, pool_present = !!pool;
        int ret = 0, pool_failure = 0;
 
+       if (!is_power_of_2(vsi->rx_rings[qid]->count) ||
+           !is_power_of_2(vsi->tx_rings[qid]->count)) {
+               netdev_err(vsi->netdev, "Please align ring sizes to power of 2\n");
+               pool_failure = -EINVAL;
+               goto failure;
+       }
+
        if_running = netif_running(vsi->netdev) && ice_is_xdp_ena_vsi(vsi);
 
        if (if_running) {
@@ -349,6 +356,7 @@ xsk_pool_if_up:
                        netdev_err(vsi->netdev, "ice_qp_ena error = %d\n", ret);
        }
 
+failure:
        if (pool_failure) {
                netdev_err(vsi->netdev, "Could not %sable buffer pool, error = %d\n",
                           pool_present ? "en" : "dis", pool_failure);
@@ -359,33 +367,28 @@ xsk_pool_if_up:
 }
 
 /**
- * ice_alloc_rx_bufs_zc - allocate a number of Rx buffers
- * @rx_ring: Rx ring
+ * ice_fill_rx_descs - pick buffers from the XSK buffer pool and use them
+ * @pool: XSK Buffer pool to pull the buffers from
+ * @xdp: SW ring of xdp_buff that will hold the buffers
+ * @rx_desc: Pointer to Rx descriptors that will be filled
  * @count: The number of buffers to allocate
  *
  * This function allocates a number of Rx buffers from the fill ring
  * or the internal recycle mechanism and places them on the Rx ring.
  *
- * Returns true if all allocations were successful, false if any fail.
+ * Note that ring wrap should be handled by the caller of this function.
+ *
+ * Returns the number of allocated Rx descriptors
  */
-bool ice_alloc_rx_bufs_zc(struct ice_rx_ring *rx_ring, u16 count)
+static u16 ice_fill_rx_descs(struct xsk_buff_pool *pool, struct xdp_buff **xdp,
+                            union ice_32b_rx_flex_desc *rx_desc, u16 count)
 {
-       union ice_32b_rx_flex_desc *rx_desc;
-       u16 ntu = rx_ring->next_to_use;
-       struct xdp_buff **xdp;
-       u32 nb_buffs, i;
        dma_addr_t dma;
+       u16 buffs;
+       int i;
 
-       rx_desc = ICE_RX_DESC(rx_ring, ntu);
-       xdp = ice_xdp_buf(rx_ring, ntu);
-
-       nb_buffs = min_t(u16, count, rx_ring->count - ntu);
-       nb_buffs = xsk_buff_alloc_batch(rx_ring->xsk_pool, xdp, nb_buffs);
-       if (!nb_buffs)
-               return false;
-
-       i = nb_buffs;
-       while (i--) {
+       buffs = xsk_buff_alloc_batch(pool, xdp, count);
+       for (i = 0; i < buffs; i++) {
                dma = xsk_buff_xdp_get_dma(*xdp);
                rx_desc->read.pkt_addr = cpu_to_le64(dma);
                rx_desc->wb.status_error0 = 0;
@@ -394,13 +397,77 @@ bool ice_alloc_rx_bufs_zc(struct ice_rx_ring *rx_ring, u16 count)
                xdp++;
        }
 
+       return buffs;
+}
+
+/**
+ * __ice_alloc_rx_bufs_zc - allocate a number of Rx buffers
+ * @rx_ring: Rx ring
+ * @count: The number of buffers to allocate
+ *
+ * Place @count descriptors onto the Rx ring. Handle the ring wrap for the
+ * case where the space from next_to_use up to the end of the ring is less
+ * than @count. Finally, do a tail bump.
+ *
+ * Returns true if all allocations were successful, false if any fail.
+ */
+static bool __ice_alloc_rx_bufs_zc(struct ice_rx_ring *rx_ring, u16 count)
+{
+       union ice_32b_rx_flex_desc *rx_desc;
+       u32 nb_buffs_extra = 0, nb_buffs;
+       u16 ntu = rx_ring->next_to_use;
+       u16 total_count = count;
+       struct xdp_buff **xdp;
+
+       rx_desc = ICE_RX_DESC(rx_ring, ntu);
+       xdp = ice_xdp_buf(rx_ring, ntu);
+
+       if (ntu + count >= rx_ring->count) {
+               nb_buffs_extra = ice_fill_rx_descs(rx_ring->xsk_pool, xdp,
+                                                  rx_desc,
+                                                  rx_ring->count - ntu);
+               rx_desc = ICE_RX_DESC(rx_ring, 0);
+               xdp = ice_xdp_buf(rx_ring, 0);
+               ntu = 0;
+               count -= nb_buffs_extra;
+               ice_release_rx_desc(rx_ring, 0);
+       }
+
+       nb_buffs = ice_fill_rx_descs(rx_ring->xsk_pool, xdp, rx_desc, count);
+
        ntu += nb_buffs;
        if (ntu == rx_ring->count)
                ntu = 0;
 
-       ice_release_rx_desc(rx_ring, ntu);
+       if (rx_ring->next_to_use != ntu)
+               ice_release_rx_desc(rx_ring, ntu);
+
+       return total_count == (nb_buffs_extra + nb_buffs);
+}
+
+/**
+ * ice_alloc_rx_bufs_zc - allocate a number of Rx buffers
+ * @rx_ring: Rx ring
+ * @count: The number of buffers to allocate
+ *
+ * Wrapper for the internal allocation routine; figures out how many tail
+ * bumps should take place based on the given threshold.
+ *
+ * Returns true if all calls to internal alloc routine succeeded
+ */
+bool ice_alloc_rx_bufs_zc(struct ice_rx_ring *rx_ring, u16 count)
+{
+       u16 rx_thresh = ICE_RING_QUARTER(rx_ring);
+       u16 batched, leftover, i, tail_bumps;
+
+       batched = ALIGN_DOWN(count, rx_thresh);
+       tail_bumps = batched / rx_thresh;
+       leftover = count & (rx_thresh - 1);
 
-       return count == nb_buffs;
+       for (i = 0; i < tail_bumps; i++)
+               if (!__ice_alloc_rx_bufs_zc(rx_ring, rx_thresh))
+                       return false;
+       return __ice_alloc_rx_bufs_zc(rx_ring, leftover);
 }
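
The batching math above works only because ring sizes are now forced to powers of 2 by the is_power_of_2() check added to ice_xsk_pool_setup(): count & (rx_thresh - 1) is then equivalent to count % rx_thresh. A worked example, assuming a 512-descriptor ring so that ICE_RING_QUARTER() yields 128:

    /* count = 453, rx_thresh = 128 (assumed 512-entry ring):
     *   batched    = ALIGN_DOWN(453, 128) = 384  -> three full batches
     *   tail_bumps = 384 / 128            = 3    -> three intermediate tail writes
     *   leftover   = 453 & (128 - 1)      = 69   -> one final partial batch
     * Four calls to __ice_alloc_rx_bufs_zc() in total, 453 descriptors posted.
     */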
 
 /**
@@ -428,20 +495,24 @@ static void ice_bump_ntc(struct ice_rx_ring *rx_ring)
 static struct sk_buff *
 ice_construct_skb_zc(struct ice_rx_ring *rx_ring, struct xdp_buff *xdp)
 {
-       unsigned int datasize_hard = xdp->data_end - xdp->data_hard_start;
+       unsigned int totalsize = xdp->data_end - xdp->data_meta;
        unsigned int metasize = xdp->data - xdp->data_meta;
-       unsigned int datasize = xdp->data_end - xdp->data;
        struct sk_buff *skb;
 
-       skb = __napi_alloc_skb(&rx_ring->q_vector->napi, datasize_hard,
+       net_prefetch(xdp->data_meta);
+
+       skb = __napi_alloc_skb(&rx_ring->q_vector->napi, totalsize,
                               GFP_ATOMIC | __GFP_NOWARN);
        if (unlikely(!skb))
                return NULL;
 
-       skb_reserve(skb, xdp->data - xdp->data_hard_start);
-       memcpy(__skb_put(skb, datasize), xdp->data, datasize);
-       if (metasize)
+       memcpy(__skb_put(skb, totalsize), xdp->data_meta,
+              ALIGN(totalsize, sizeof(long)));
+
+       if (metasize) {
                skb_metadata_set(skb, metasize);
+               __skb_pull(skb, metasize);
+       }
 
        xsk_buff_free(xdp);
        return skb;
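
The rewritten ice_construct_skb_zc() copies metadata and payload in a single memcpy() starting at data_meta, then hides the metadata again with __skb_pull(). The pointer layout it relies on is the standard xdp_buff invariant, not something this patch changes:

    /* data_hard_start ... data_meta ........ data ............ data_end
     *                     |<-- metasize -->|<----- payload ----->|
     *                     |<------------- totalsize ------------>|
     *
     * After memcpy(__skb_put(skb, totalsize), xdp->data_meta, ...),
     * skb->data points at the metadata; __skb_pull(skb, metasize) moves
     * it to the packet start while skb_metadata_set() records where the
     * metadata sits in front of it. The copy length is rounded up via
     * ALIGN(totalsize, sizeof(long)) purely as a word-aligned-copy
     * optimization; the skb length itself stays totalsize.
     */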
@@ -528,7 +599,7 @@ int ice_clean_rx_irq_zc(struct ice_rx_ring *rx_ring, int budget)
                rx_desc = ICE_RX_DESC(rx_ring, rx_ring->next_to_clean);
 
                stat_err_bits = BIT(ICE_RX_FLEX_DESC_STATUS0_DD_S);
-               if (!ice_test_staterr(rx_desc, stat_err_bits))
+               if (!ice_test_staterr(rx_desc->wb.status_error0, stat_err_bits))
                        break;
 
                /* This memory barrier is needed to keep us from reading
@@ -583,9 +654,7 @@ construct_skb:
                total_rx_bytes += skb->len;
                total_rx_packets++;
 
-               stat_err_bits = BIT(ICE_RX_FLEX_DESC_STATUS0_L2TAG1P_S);
-               if (ice_test_staterr(rx_desc, stat_err_bits))
-                       vlan_tag = le16_to_cpu(rx_desc->wb.l2tag1);
+               vlan_tag = ice_get_vlan_tag_from_rx_desc(rx_desc);
 
                rx_ptype = le16_to_cpu(rx_desc->wb.ptype_flex_flags0) &
                                       ICE_RX_FLEX_DESC_PTYPE_M;
@@ -612,134 +681,221 @@ construct_skb:
 }
 
 /**
- * ice_xmit_zc - Completes AF_XDP entries, and cleans XDP entries
+ * ice_clean_xdp_tx_buf - Free and unmap XDP Tx buffer
  * @xdp_ring: XDP Tx ring
- * @budget: max number of frames to xmit
+ * @tx_buf: Tx buffer to clean
+ */
+static void
+ice_clean_xdp_tx_buf(struct ice_tx_ring *xdp_ring, struct ice_tx_buf *tx_buf)
+{
+       xdp_return_frame((struct xdp_frame *)tx_buf->raw_buf);
+       xdp_ring->xdp_tx_active--;
+       dma_unmap_single(xdp_ring->dev, dma_unmap_addr(tx_buf, dma),
+                        dma_unmap_len(tx_buf, len), DMA_TO_DEVICE);
+       dma_unmap_len_set(tx_buf, len, 0);
+}
+
+/**
+ * ice_clean_xdp_irq_zc - Reclaim resources after transmit completes on the XDP ring
+ * @xdp_ring: XDP ring to clean
+ * @napi_budget: number of descriptors that NAPI allows us to clean
  *
- * Returns true if cleanup/transmission is done.
+ * Returns count of cleaned descriptors
  */
-static bool ice_xmit_zc(struct ice_tx_ring *xdp_ring, int budget)
+static u16 ice_clean_xdp_irq_zc(struct ice_tx_ring *xdp_ring, int napi_budget)
 {
-       struct ice_tx_desc *tx_desc = NULL;
-       bool work_done = true;
-       struct xdp_desc desc;
-       dma_addr_t dma;
+       u16 tx_thresh = ICE_RING_QUARTER(xdp_ring);
+       int budget = napi_budget / tx_thresh;
+       u16 next_dd = xdp_ring->next_dd;
+       u16 ntc, cleared_dds = 0;
 
-       while (likely(budget-- > 0)) {
+       do {
+               struct ice_tx_desc *next_dd_desc;
+               u16 desc_cnt = xdp_ring->count;
                struct ice_tx_buf *tx_buf;
+               u32 xsk_frames;
+               u16 i;
 
-               if (unlikely(!ICE_DESC_UNUSED(xdp_ring))) {
-                       xdp_ring->tx_stats.tx_busy++;
-                       work_done = false;
+               next_dd_desc = ICE_TX_DESC(xdp_ring, next_dd);
+               if (!(next_dd_desc->cmd_type_offset_bsz &
+                   cpu_to_le64(ICE_TX_DESC_DTYPE_DESC_DONE)))
                        break;
-               }
 
-               tx_buf = &xdp_ring->tx_buf[xdp_ring->next_to_use];
-
-               if (!xsk_tx_peek_desc(xdp_ring->xsk_pool, &desc))
-                       break;
-
-               dma = xsk_buff_raw_get_dma(xdp_ring->xsk_pool, desc.addr);
-               xsk_buff_raw_dma_sync_for_device(xdp_ring->xsk_pool, dma,
-                                                desc.len);
-
-               tx_buf->bytecount = desc.len;
+               cleared_dds++;
+               xsk_frames = 0;
+               if (likely(!xdp_ring->xdp_tx_active)) {
+                       xsk_frames = tx_thresh;
+                       goto skip;
+               }
 
-               tx_desc = ICE_TX_DESC(xdp_ring, xdp_ring->next_to_use);
-               tx_desc->buf_addr = cpu_to_le64(dma);
-               tx_desc->cmd_type_offset_bsz =
-                       ice_build_ctob(ICE_TXD_LAST_DESC_CMD, 0, desc.len, 0);
+               ntc = xdp_ring->next_to_clean;
 
-               xdp_ring->next_to_use++;
-               if (xdp_ring->next_to_use == xdp_ring->count)
-                       xdp_ring->next_to_use = 0;
-       }
+               for (i = 0; i < tx_thresh; i++) {
+                       tx_buf = &xdp_ring->tx_buf[ntc];
 
-       if (tx_desc) {
-               ice_xdp_ring_update_tail(xdp_ring);
-               xsk_tx_release(xdp_ring->xsk_pool);
-       }
+                       if (tx_buf->raw_buf) {
+                               ice_clean_xdp_tx_buf(xdp_ring, tx_buf);
+                               tx_buf->raw_buf = NULL;
+                       } else {
+                               xsk_frames++;
+                       }
 
-       return budget > 0 && work_done;
+                       ntc++;
+                       if (ntc >= xdp_ring->count)
+                               ntc = 0;
+               }
+skip:
+               xdp_ring->next_to_clean += tx_thresh;
+               if (xdp_ring->next_to_clean >= desc_cnt)
+                       xdp_ring->next_to_clean -= desc_cnt;
+               if (xsk_frames)
+                       xsk_tx_completed(xdp_ring->xsk_pool, xsk_frames);
+               next_dd_desc->cmd_type_offset_bsz = 0;
+               next_dd = next_dd + tx_thresh;
+               if (next_dd >= desc_cnt)
+                       next_dd = tx_thresh - 1;
+       } while (budget--);
+
+       xdp_ring->next_dd = next_dd;
+
+       return cleared_dds * tx_thresh;
 }
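
ice_clean_xdp_irq_zc() no longer inspects every descriptor; it polls one DD (descriptor done) bit per quarter-ring batch. How next_dd advances, assuming a 512-entry ring (tx_thresh = 128; the initial next_dd value is set when the ring is configured, outside this hunk):

    /* next_dd marks the last descriptor of each batch:
     *   batch 0: descs   0..127, DD polled at 127 -> next_dd = 255
     *   batch 1: descs 128..255, DD polled at 255 -> next_dd = 383
     *   batch 2: descs 256..383, DD polled at 383 -> next_dd = 511
     *   batch 3: descs 384..511, DD polled at 511 -> wraps back to 127
     * When no XDP_TX frames are in flight (xdp_tx_active == 0), the whole
     * batch belongs to the XSK pool and is completed with a single
     * xsk_tx_completed() call instead of a per-buffer walk.
     */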
 
 /**
- * ice_clean_xdp_tx_buf - Free and unmap XDP Tx buffer
- * @xdp_ring: XDP Tx ring
- * @tx_buf: Tx buffer to clean
+ * ice_xmit_pkt - produce a single HW Tx descriptor out of an AF_XDP descriptor
+ * @xdp_ring: XDP ring to produce the HW Tx descriptor on
+ * @desc: AF_XDP descriptor to pull the DMA address and length from
+ * @total_bytes: bytes accumulator that will be used for stats update
  */
-static void
-ice_clean_xdp_tx_buf(struct ice_tx_ring *xdp_ring, struct ice_tx_buf *tx_buf)
+static void ice_xmit_pkt(struct ice_tx_ring *xdp_ring, struct xdp_desc *desc,
+                        unsigned int *total_bytes)
 {
-       xdp_return_frame((struct xdp_frame *)tx_buf->raw_buf);
-       dma_unmap_single(xdp_ring->dev, dma_unmap_addr(tx_buf, dma),
-                        dma_unmap_len(tx_buf, len), DMA_TO_DEVICE);
-       dma_unmap_len_set(tx_buf, len, 0);
+       struct ice_tx_desc *tx_desc;
+       dma_addr_t dma;
+
+       dma = xsk_buff_raw_get_dma(xdp_ring->xsk_pool, desc->addr);
+       xsk_buff_raw_dma_sync_for_device(xdp_ring->xsk_pool, dma, desc->len);
+
+       tx_desc = ICE_TX_DESC(xdp_ring, xdp_ring->next_to_use++);
+       tx_desc->buf_addr = cpu_to_le64(dma);
+       tx_desc->cmd_type_offset_bsz = ice_build_ctob(ICE_TX_DESC_CMD_EOP,
+                                                     0, desc->len, 0);
+
+       *total_bytes += desc->len;
 }
 
 /**
- * ice_clean_tx_irq_zc - Completes AF_XDP entries, and cleans XDP entries
- * @xdp_ring: XDP Tx ring
- * @budget: NAPI budget
- *
- * Returns true if cleanup/tranmission is done.
+ * ice_xmit_pkt_batch - produce a batch of HW Tx descriptors out of AF_XDP descriptors
+ * @xdp_ring: XDP ring to produce the HW Tx descriptors on
+ * @descs: AF_XDP descriptors to pull the DMA addresses and lengths from
+ * @total_bytes: bytes accumulator that will be used for stats update
  */
-bool ice_clean_tx_irq_zc(struct ice_tx_ring *xdp_ring, int budget)
+static void ice_xmit_pkt_batch(struct ice_tx_ring *xdp_ring, struct xdp_desc *descs,
+                              unsigned int *total_bytes)
 {
-       int total_packets = 0, total_bytes = 0;
-       s16 ntc = xdp_ring->next_to_clean;
+       u16 tx_thresh = ICE_RING_QUARTER(xdp_ring);
+       u16 ntu = xdp_ring->next_to_use;
        struct ice_tx_desc *tx_desc;
-       struct ice_tx_buf *tx_buf;
-       u32 xsk_frames = 0;
-       bool xmit_done;
+       u32 i;
 
-       tx_desc = ICE_TX_DESC(xdp_ring, ntc);
-       tx_buf = &xdp_ring->tx_buf[ntc];
-       ntc -= xdp_ring->count;
+       loop_unrolled_for(i = 0; i < PKTS_PER_BATCH; i++) {
+               dma_addr_t dma;
 
-       do {
-               if (!(tx_desc->cmd_type_offset_bsz &
-                     cpu_to_le64(ICE_TX_DESC_DTYPE_DESC_DONE)))
-                       break;
+               dma = xsk_buff_raw_get_dma(xdp_ring->xsk_pool, descs[i].addr);
+               xsk_buff_raw_dma_sync_for_device(xdp_ring->xsk_pool, dma, descs[i].len);
 
-               total_bytes += tx_buf->bytecount;
-               total_packets++;
+               tx_desc = ICE_TX_DESC(xdp_ring, ntu++);
+               tx_desc->buf_addr = cpu_to_le64(dma);
+               tx_desc->cmd_type_offset_bsz = ice_build_ctob(ICE_TX_DESC_CMD_EOP,
+                                                             0, descs[i].len, 0);
 
-               if (tx_buf->raw_buf) {
-                       ice_clean_xdp_tx_buf(xdp_ring, tx_buf);
-                       tx_buf->raw_buf = NULL;
-               } else {
-                       xsk_frames++;
-               }
+               *total_bytes += descs[i].len;
+       }
 
-               tx_desc->cmd_type_offset_bsz = 0;
-               tx_buf++;
-               tx_desc++;
-               ntc++;
+       xdp_ring->next_to_use = ntu;
 
-               if (unlikely(!ntc)) {
-                       ntc -= xdp_ring->count;
-                       tx_buf = xdp_ring->tx_buf;
-                       tx_desc = ICE_TX_DESC(xdp_ring, 0);
-               }
+       if (xdp_ring->next_to_use > xdp_ring->next_rs) {
+               tx_desc = ICE_TX_DESC(xdp_ring, xdp_ring->next_rs);
+               tx_desc->cmd_type_offset_bsz |=
+                       cpu_to_le64(ICE_TX_DESC_CMD_RS << ICE_TXD_QW1_CMD_S);
+               xdp_ring->next_rs += tx_thresh;
+       }
+}
 
-               prefetch(tx_desc);
+/**
+ * ice_fill_tx_hw_ring - produce the number of Tx descriptors onto ring
+ * @xdp_ring: XDP ring to produce the HW Tx descriptors on
+ * @descs: AF_XDP descriptors to pull the DMA addresses and lengths from
+ * @nb_pkts: count of packets to be sent
+ * @total_bytes: bytes accumulator that will be used for stats update
+ */
+static void ice_fill_tx_hw_ring(struct ice_tx_ring *xdp_ring, struct xdp_desc *descs,
+                               u32 nb_pkts, unsigned int *total_bytes)
+{
+       u16 tx_thresh = ICE_RING_QUARTER(xdp_ring);
+       u32 batched, leftover, i;
+
+       batched = ALIGN_DOWN(nb_pkts, PKTS_PER_BATCH);
+       leftover = nb_pkts & (PKTS_PER_BATCH - 1);
+       for (i = 0; i < batched; i += PKTS_PER_BATCH)
+               ice_xmit_pkt_batch(xdp_ring, &descs[i], total_bytes);
+       for (; i < batched + leftover; i++)
+               ice_xmit_pkt(xdp_ring, &descs[i], total_bytes);
+
+       if (xdp_ring->next_to_use > xdp_ring->next_rs) {
+               struct ice_tx_desc *tx_desc;
+
+               tx_desc = ICE_TX_DESC(xdp_ring, xdp_ring->next_rs);
+               tx_desc->cmd_type_offset_bsz |=
+                       cpu_to_le64(ICE_TX_DESC_CMD_RS << ICE_TXD_QW1_CMD_S);
+               xdp_ring->next_rs += tx_thresh;
+       }
+}
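
ice_fill_tx_hw_ring() splits the work into unrolled batches of PKTS_PER_BATCH plus a scalar tail, arming one RS (report status) descriptor per quarter-ring so write-backs arrive at the cadence the cleaning side polls for. Worked numbers:

    /* nb_pkts = 21, PKTS_PER_BATCH = 8:
     *   batched  = ALIGN_DOWN(21, 8) = 16 -> two ice_xmit_pkt_batch() calls
     *   leftover = 21 & (8 - 1)      = 5  -> five ice_xmit_pkt() calls
     * Whenever next_to_use passes next_rs, the descriptor at next_rs gets
     * ICE_TX_DESC_CMD_RS and next_rs advances by tx_thresh, pairing with
     * the per-quarter DD polling in ice_clean_xdp_irq_zc().
     */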
 
-       } while (likely(--budget));
+/**
+ * ice_xmit_zc - take entries from XSK Tx ring and place them onto HW Tx ring
+ * @xdp_ring: XDP ring to produce the HW Tx descriptors on
+ * @budget: number of free descriptors on HW Tx ring that can be used
+ * @napi_budget: number of descriptors that NAPI allows us to clean
+ *
+ * Returns true if there is no more work that needs to be done, false otherwise
+ */
+bool ice_xmit_zc(struct ice_tx_ring *xdp_ring, u32 budget, int napi_budget)
+{
+       struct xdp_desc *descs = xdp_ring->xsk_pool->tx_descs;
+       u16 tx_thresh = ICE_RING_QUARTER(xdp_ring);
+       u32 nb_pkts, nb_processed = 0;
+       unsigned int total_bytes = 0;
+
+       if (budget < tx_thresh)
+               budget += ice_clean_xdp_irq_zc(xdp_ring, napi_budget);
+
+       nb_pkts = xsk_tx_peek_release_desc_batch(xdp_ring->xsk_pool, budget);
+       if (!nb_pkts)
+               return true;
+
+       if (xdp_ring->next_to_use + nb_pkts >= xdp_ring->count) {
+               struct ice_tx_desc *tx_desc;
+
+               nb_processed = xdp_ring->count - xdp_ring->next_to_use;
+               ice_fill_tx_hw_ring(xdp_ring, descs, nb_processed, &total_bytes);
+               tx_desc = ICE_TX_DESC(xdp_ring, xdp_ring->next_rs);
+               tx_desc->cmd_type_offset_bsz |=
+                       cpu_to_le64(ICE_TX_DESC_CMD_RS << ICE_TXD_QW1_CMD_S);
+               xdp_ring->next_rs = tx_thresh - 1;
+               xdp_ring->next_to_use = 0;
+       }
 
-       ntc += xdp_ring->count;
-       xdp_ring->next_to_clean = ntc;
+       ice_fill_tx_hw_ring(xdp_ring, &descs[nb_processed], nb_pkts - nb_processed,
+                           &total_bytes);
 
-       if (xsk_frames)
-               xsk_tx_completed(xdp_ring->xsk_pool, xsk_frames);
+       ice_xdp_ring_update_tail(xdp_ring);
+       ice_update_tx_ring_stats(xdp_ring, nb_pkts, total_bytes);
 
        if (xsk_uses_need_wakeup(xdp_ring->xsk_pool))
                xsk_set_tx_need_wakeup(xdp_ring->xsk_pool);
 
-       ice_update_tx_ring_stats(xdp_ring, total_packets, total_bytes);
-       xmit_done = ice_xmit_zc(xdp_ring, ICE_DFLT_IRQ_WORK);
-
-       return budget > 0 && xmit_done;
+       return nb_pkts < budget;
 }
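
With ice_clean_tx_irq_zc() removed (see the header changes below), the NAPI poll path is expected to drive both cleaning and transmission through ice_xmit_zc() alone. A hypothetical caller sketch, assuming the free-descriptor count is passed as the budget (the actual ice_napi_poll() change is not part of this hunk):

    static bool example_poll_tx_zc(struct ice_tx_ring *xdp_ring, int napi_budget)
    {
            /* ICE_DESC_UNUSED() is the existing free-slot macro referenced
             * by the code this patch deletes.
             */
            return ice_xmit_zc(xdp_ring, ICE_DESC_UNUSED(xdp_ring), napi_budget);
    }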
 
 /**
index 4c7bd8e..0cbb579 100644 (file)
@@ -6,19 +6,37 @@
 #include "ice_txrx.h"
 #include "ice.h"
 
+#define PKTS_PER_BATCH 8
+
+#ifdef __clang__
+#define loop_unrolled_for _Pragma("clang loop unroll_count(8)") for
+#elif __GNUC__ >= 4
+#define loop_unrolled_for _Pragma("GCC unroll 8") for
+#else
+#define loop_unrolled_for for
+#endif
+
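The loop_unrolled_for helper simply prefixes an ordinary for statement with a compiler-specific unroll pragma, so the batch loop in ice_xmit_pkt_batch() effectively compiles (under GCC) as:

    /* loop_unrolled_for (i = 0; i < PKTS_PER_BATCH; i++) { ... }
     * expands to:
     */
    _Pragma("GCC unroll 8") for (i = 0; i < PKTS_PER_BATCH; i++) {
            /* per-packet descriptor fill */
    }
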
 struct ice_vsi;
 
 #ifdef CONFIG_XDP_SOCKETS
 int ice_xsk_pool_setup(struct ice_vsi *vsi, struct xsk_buff_pool *pool,
                       u16 qid);
 int ice_clean_rx_irq_zc(struct ice_rx_ring *rx_ring, int budget);
-bool ice_clean_tx_irq_zc(struct ice_tx_ring *xdp_ring, int budget);
 int ice_xsk_wakeup(struct net_device *netdev, u32 queue_id, u32 flags);
 bool ice_alloc_rx_bufs_zc(struct ice_rx_ring *rx_ring, u16 count);
 bool ice_xsk_any_rx_ring_ena(struct ice_vsi *vsi);
 void ice_xsk_clean_rx_ring(struct ice_rx_ring *rx_ring);
 void ice_xsk_clean_xdp_ring(struct ice_tx_ring *xdp_ring);
+bool ice_xmit_zc(struct ice_tx_ring *xdp_ring, u32 budget, int napi_budget);
 #else
+static inline bool
+ice_xmit_zc(struct ice_tx_ring __always_unused *xdp_ring,
+           u32 __always_unused budget,
+           int __always_unused napi_budget)
+{
+       return false;
+}
+
 static inline int
 ice_xsk_pool_setup(struct ice_vsi __always_unused *vsi,
                   struct xsk_buff_pool __always_unused *pool,
@@ -35,13 +53,6 @@ ice_clean_rx_irq_zc(struct ice_rx_ring __always_unused *rx_ring,
 }
 
 static inline bool
-ice_clean_tx_irq_zc(struct ice_tx_ring __always_unused *xdp_ring,
-                   int __always_unused budget)
-{
-       return false;
-}
-
-static inline bool
 ice_alloc_rx_bufs_zc(struct ice_rx_ring __always_unused *rx_ring,
                     u16 __always_unused count)
 {
index 51a2dca..2a57820 100644 (file)
@@ -965,10 +965,6 @@ static int igb_set_ringparam(struct net_device *netdev,
                        memcpy(&temp_ring[i], adapter->rx_ring[i],
                               sizeof(struct igb_ring));
 
-                       /* Clear copied XDP RX-queue info */
-                       memset(&temp_ring[i].xdp_rxq, 0,
-                              sizeof(temp_ring[i].xdp_rxq));
-
                        temp_ring[i].count = new_rx_count;
                        err = igb_setup_rx_resources(&temp_ring[i]);
                        if (err) {
index 38ba920..34b33b2 100644 (file)
@@ -3164,8 +3164,8 @@ static int igb_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
        s32 ret_val;
        static int global_quad_port_a; /* global quad port a indication */
        const struct e1000_info *ei = igb_info_tbl[ent->driver_data];
-       int err, pci_using_dac;
        u8 part_str[E1000_PBANUM_LENGTH];
+       int err;
 
        /* Catch broken hardware that put the wrong VF device ID in
         * the PCIe SR-IOV capability.
@@ -3180,17 +3180,11 @@ static int igb_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
        if (err)
                return err;
 
-       pci_using_dac = 0;
        err = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(64));
-       if (!err) {
-               pci_using_dac = 1;
-       } else {
-               err = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(32));
-               if (err) {
-                       dev_err(&pdev->dev,
-                               "No usable DMA configuration, aborting\n");
-                       goto err_dma;
-               }
+       if (err) {
+               dev_err(&pdev->dev,
+                       "No usable DMA configuration, aborting\n");
+               goto err_dma;
        }
 
        err = pci_request_mem_regions(pdev, igb_driver_name);
@@ -3306,8 +3300,7 @@ static int igb_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
        if (hw->mac.type >= e1000_i350)
                netdev->hw_features |= NETIF_F_NTUPLE;
 
-       if (pci_using_dac)
-               netdev->features |= NETIF_F_HIGHDMA;
+       netdev->features |= NETIF_F_HIGHDMA;
 
        netdev->vlan_features |= netdev->features | NETIF_F_TSO_MANGLEID;
        netdev->mpls_features |= NETIF_F_HW_CSUM;
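
This probe simplification repeats below for igbvf, igc, ixgb, ixgbe and ixgbevf: the 32-bit retry and the pci_using_dac bookkeeping are dropped on the premise that a 64-bit dma_set_mask_and_coherent() call succeeds wherever the old fallback would have, with the DMA core handling narrower addressing internally, so NETIF_F_HIGHDMA can be set unconditionally. The resulting shared pattern:

    err = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(64));
    if (err) {
            dev_err(&pdev->dev, "No usable DMA configuration, aborting\n");
            goto err_dma;
    }
    ...
    netdev->features |= NETIF_F_HIGHDMA;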
@@ -4352,7 +4345,18 @@ int igb_setup_rx_resources(struct igb_ring *rx_ring)
 {
        struct igb_adapter *adapter = netdev_priv(rx_ring->netdev);
        struct device *dev = rx_ring->dev;
-       int size;
+       int size, res;
+
+       /* XDP RX-queue info */
+       if (xdp_rxq_info_is_reg(&rx_ring->xdp_rxq))
+               xdp_rxq_info_unreg(&rx_ring->xdp_rxq);
+       res = xdp_rxq_info_reg(&rx_ring->xdp_rxq, rx_ring->netdev,
+                              rx_ring->queue_index, 0);
+       if (res < 0) {
+               dev_err(dev, "Failed to register xdp_rxq index %u\n",
+                       rx_ring->queue_index);
+               return res;
+       }
 
        size = sizeof(struct igb_rx_buffer) * rx_ring->count;
 
@@ -4375,14 +4379,10 @@ int igb_setup_rx_resources(struct igb_ring *rx_ring)
 
        rx_ring->xdp_prog = adapter->xdp_prog;
 
-       /* XDP RX-queue info */
-       if (xdp_rxq_info_reg(&rx_ring->xdp_rxq, rx_ring->netdev,
-                            rx_ring->queue_index, 0) < 0)
-               goto err;
-
        return 0;
 
 err:
+       xdp_rxq_info_unreg(&rx_ring->xdp_rxq);
        vfree(rx_ring->rx_buffer_info);
        rx_ring->rx_buffer_info = NULL;
        dev_err(dev, "Unable to allocate memory for the Rx descriptor ring\n");
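
The xdp_rxq_info registration moves to the top of igb_setup_rx_resources(), with an unregister first because the ethtool set_ringparam path calls this on a queue whose xdp_rxq is already registered; that is also why the memset() in igb_set_ringparam() above could go. The same register-early shape, as applied here and in igc below:

    /* Re-register before any allocation so the error path needs no unwind: */
    if (xdp_rxq_info_is_reg(&rx_ring->xdp_rxq))
            xdp_rxq_info_unreg(&rx_ring->xdp_rxq);
    res = xdp_rxq_info_reg(&rx_ring->xdp_rxq, rx_ring->netdev,
                           rx_ring->queue_index, 0);
    if (res < 0)
            return res;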
index b784072..43ced78 100644 (file)
@@ -2684,25 +2684,18 @@ static int igbvf_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
        struct igbvf_adapter *adapter;
        struct e1000_hw *hw;
        const struct igbvf_info *ei = igbvf_info_tbl[ent->driver_data];
-
        static int cards_found;
-       int err, pci_using_dac;
+       int err;
 
        err = pci_enable_device_mem(pdev);
        if (err)
                return err;
 
-       pci_using_dac = 0;
        err = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(64));
-       if (!err) {
-               pci_using_dac = 1;
-       } else {
-               err = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(32));
-               if (err) {
-                       dev_err(&pdev->dev,
-                               "No usable DMA configuration, aborting\n");
-                       goto err_dma;
-               }
+       if (err) {
+               dev_err(&pdev->dev,
+                       "No usable DMA configuration, aborting\n");
+               goto err_dma;
        }
 
        err = pci_request_regions(pdev, igbvf_driver_name);
@@ -2783,10 +2776,7 @@ static int igbvf_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
        netdev->hw_features |= NETIF_F_GSO_PARTIAL |
                               IGBVF_GSO_PARTIAL_FEATURES;
 
-       netdev->features = netdev->hw_features;
-
-       if (pci_using_dac)
-               netdev->features |= NETIF_F_HIGHDMA;
+       netdev->features = netdev->hw_features | NETIF_F_HIGHDMA;
 
        netdev->vlan_features |= netdev->features | NETIF_F_TSO_MANGLEID;
        netdev->mpls_features |= NETIF_F_HW_CSUM;
index 2f17f36..74b2c59 100644 (file)
@@ -505,6 +505,9 @@ int igc_setup_rx_resources(struct igc_ring *rx_ring)
        u8 index = rx_ring->queue_index;
        int size, desc_len, res;
 
+       /* XDP RX-queue info */
+       if (xdp_rxq_info_is_reg(&rx_ring->xdp_rxq))
+               xdp_rxq_info_unreg(&rx_ring->xdp_rxq);
        res = xdp_rxq_info_reg(&rx_ring->xdp_rxq, ndev, index,
                               rx_ring->q_vector->napi.napi_id);
        if (res < 0) {
@@ -2446,19 +2449,20 @@ static int igc_clean_rx_irq(struct igc_q_vector *q_vector, const int budget)
 static struct sk_buff *igc_construct_skb_zc(struct igc_ring *ring,
                                            struct xdp_buff *xdp)
 {
+       unsigned int totalsize = xdp->data_end - xdp->data_meta;
        unsigned int metasize = xdp->data - xdp->data_meta;
-       unsigned int datasize = xdp->data_end - xdp->data;
-       unsigned int totalsize = metasize + datasize;
        struct sk_buff *skb;
 
-       skb = __napi_alloc_skb(&ring->q_vector->napi,
-                              xdp->data_end - xdp->data_hard_start,
+       net_prefetch(xdp->data_meta);
+
+       skb = __napi_alloc_skb(&ring->q_vector->napi, totalsize,
                               GFP_ATOMIC | __GFP_NOWARN);
        if (unlikely(!skb))
                return NULL;
 
-       skb_reserve(skb, xdp->data_meta - xdp->data_hard_start);
-       memcpy(__skb_put(skb, totalsize), xdp->data_meta, totalsize);
+       memcpy(__skb_put(skb, totalsize), xdp->data_meta,
+              ALIGN(totalsize, sizeof(long)));
+
        if (metasize) {
                skb_metadata_set(skb, metasize);
                __skb_pull(skb, metasize);
@@ -6251,23 +6255,17 @@ static int igc_probe(struct pci_dev *pdev,
        struct net_device *netdev;
        struct igc_hw *hw;
        const struct igc_info *ei = igc_info_tbl[ent->driver_data];
-       int err, pci_using_dac;
+       int err;
 
        err = pci_enable_device_mem(pdev);
        if (err)
                return err;
 
-       pci_using_dac = 0;
        err = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(64));
-       if (!err) {
-               pci_using_dac = 1;
-       } else {
-               err = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(32));
-               if (err) {
-                       dev_err(&pdev->dev,
-                               "No usable DMA configuration, aborting\n");
-                       goto err_dma;
-               }
+       if (err) {
+               dev_err(&pdev->dev,
+                       "No usable DMA configuration, aborting\n");
+               goto err_dma;
        }
 
        err = pci_request_mem_regions(pdev, igc_driver_name);
@@ -6367,8 +6365,7 @@ static int igc_probe(struct pci_dev *pdev,
        netdev->hw_features |= NETIF_F_HW_VLAN_CTAG_RX;
        netdev->hw_features |= netdev->features;
 
-       if (pci_using_dac)
-               netdev->features |= NETIF_F_HIGHDMA;
+       netdev->features |= NETIF_F_HIGHDMA;
 
        netdev->vlan_features |= netdev->features | NETIF_F_TSO_MANGLEID;
        netdev->mpls_features |= NETIF_F_HW_CSUM;
index 99d4819..affdefc 100644 (file)
@@ -361,7 +361,6 @@ ixgb_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
        struct net_device *netdev = NULL;
        struct ixgb_adapter *adapter;
        static int cards_found = 0;
-       int pci_using_dac;
        u8 addr[ETH_ALEN];
        int i;
        int err;
@@ -370,16 +369,10 @@ ixgb_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
        if (err)
                return err;
 
-       pci_using_dac = 0;
        err = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(64));
-       if (!err) {
-               pci_using_dac = 1;
-       } else {
-               err = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(32));
-               if (err) {
-                       pr_err("No usable DMA configuration, aborting\n");
-                       goto err_dma_mask;
-               }
+       if (err) {
+               pr_err("No usable DMA configuration, aborting\n");
+               goto err_dma_mask;
        }
 
        err = pci_request_regions(pdev, ixgb_driver_name);
@@ -444,10 +437,8 @@ ixgb_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
                           NETIF_F_HW_VLAN_CTAG_FILTER;
        netdev->hw_features |= NETIF_F_RXCSUM;
 
-       if (pci_using_dac) {
-               netdev->features |= NETIF_F_HIGHDMA;
-               netdev->vlan_features |= NETIF_F_HIGHDMA;
-       }
+       netdev->features |= NETIF_F_HIGHDMA;
+       netdev->vlan_features |= NETIF_F_HIGHDMA;
 
        /* MTU range: 68 - 16114 */
        netdev->min_mtu = ETH_MIN_MTU;
index 89b4670..2c8a4a0 100644 (file)
@@ -10632,9 +10632,9 @@ static int ixgbe_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
        struct ixgbe_adapter *adapter = NULL;
        struct ixgbe_hw *hw;
        const struct ixgbe_info *ii = ixgbe_info_tbl[ent->driver_data];
-       int i, err, pci_using_dac, expected_gts;
        unsigned int indices = MAX_TX_QUEUES;
        u8 part_str[IXGBE_PBANUM_LENGTH];
+       int i, err, expected_gts;
        bool disable_dev = false;
 #ifdef IXGBE_FCOE
        u16 device_caps;
@@ -10654,16 +10654,11 @@ static int ixgbe_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
        if (err)
                return err;
 
-       if (!dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(64))) {
-               pci_using_dac = 1;
-       } else {
-               err = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(32));
-               if (err) {
-                       dev_err(&pdev->dev,
-                               "No usable DMA configuration, aborting\n");
-                       goto err_dma;
-               }
-               pci_using_dac = 0;
+       err = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(64));
+       if (err) {
+               dev_err(&pdev->dev,
+                       "No usable DMA configuration, aborting\n");
+               goto err_dma;
        }
 
        err = pci_request_mem_regions(pdev, ixgbe_driver_name);
@@ -10861,8 +10856,7 @@ skip_sriov:
                netdev->hw_features |= NETIF_F_NTUPLE |
                                       NETIF_F_HW_TC;
 
-       if (pci_using_dac)
-               netdev->features |= NETIF_F_HIGHDMA;
+       netdev->features |= NETIF_F_HIGHDMA;
 
        netdev->vlan_features |= netdev->features | NETIF_F_TSO_MANGLEID;
        netdev->hw_enc_features |= netdev->vlan_features;
index b3fd8e5..ee28929 100644 (file)
@@ -207,26 +207,28 @@ bool ixgbe_alloc_rx_buffers_zc(struct ixgbe_ring *rx_ring, u16 count)
 }
 
 static struct sk_buff *ixgbe_construct_skb_zc(struct ixgbe_ring *rx_ring,
-                                             struct ixgbe_rx_buffer *bi)
+                                             const struct xdp_buff *xdp)
 {
-       unsigned int metasize = bi->xdp->data - bi->xdp->data_meta;
-       unsigned int datasize = bi->xdp->data_end - bi->xdp->data;
+       unsigned int totalsize = xdp->data_end - xdp->data_meta;
+       unsigned int metasize = xdp->data - xdp->data_meta;
        struct sk_buff *skb;
 
+       net_prefetch(xdp->data_meta);
+
        /* allocate a skb to store the frags */
-       skb = __napi_alloc_skb(&rx_ring->q_vector->napi,
-                              bi->xdp->data_end - bi->xdp->data_hard_start,
+       skb = __napi_alloc_skb(&rx_ring->q_vector->napi, totalsize,
                               GFP_ATOMIC | __GFP_NOWARN);
        if (unlikely(!skb))
                return NULL;
 
-       skb_reserve(skb, bi->xdp->data - bi->xdp->data_hard_start);
-       memcpy(__skb_put(skb, datasize), bi->xdp->data, datasize);
-       if (metasize)
+       memcpy(__skb_put(skb, totalsize), xdp->data_meta,
+              ALIGN(totalsize, sizeof(long)));
+
+       if (metasize) {
                skb_metadata_set(skb, metasize);
+               __skb_pull(skb, metasize);
+       }
 
-       xsk_buff_free(bi->xdp);
-       bi->xdp = NULL;
        return skb;
 }
 
@@ -317,12 +319,15 @@ int ixgbe_clean_rx_irq_zc(struct ixgbe_q_vector *q_vector,
                }
 
                /* XDP_PASS path */
-               skb = ixgbe_construct_skb_zc(rx_ring, bi);
+               skb = ixgbe_construct_skb_zc(rx_ring, bi->xdp);
                if (!skb) {
                        rx_ring->rx_stats.alloc_rx_buff_failed++;
                        break;
                }
 
+               xsk_buff_free(bi->xdp);
+               bi->xdp = NULL;
+
                cleaned_count++;
                ixgbe_inc_ntc(rx_ring);
 
index 0f293ac..17fbc45 100644 (file)
@@ -4512,22 +4512,17 @@ static int ixgbevf_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
        struct ixgbevf_adapter *adapter = NULL;
        struct ixgbe_hw *hw = NULL;
        const struct ixgbevf_info *ii = ixgbevf_info_tbl[ent->driver_data];
-       int err, pci_using_dac;
        bool disable_dev = false;
+       int err;
 
        err = pci_enable_device(pdev);
        if (err)
                return err;
 
-       if (!dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(64))) {
-               pci_using_dac = 1;
-       } else {
-               err = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(32));
-               if (err) {
-                       dev_err(&pdev->dev, "No usable DMA configuration, aborting\n");
-                       goto err_dma;
-               }
-               pci_using_dac = 0;
+       err = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(64));
+       if (err) {
+               dev_err(&pdev->dev, "No usable DMA configuration, aborting\n");
+               goto err_dma;
        }
 
        err = pci_request_regions(pdev, ixgbevf_driver_name);
@@ -4607,10 +4602,7 @@ static int ixgbevf_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
        netdev->hw_features |= NETIF_F_GSO_PARTIAL |
                               IXGBEVF_GSO_PARTIAL_FEATURES;
 
-       netdev->features = netdev->hw_features;
-
-       if (pci_using_dac)
-               netdev->features |= NETIF_F_HIGHDMA;
+       netdev->features = netdev->hw_features | NETIF_F_HIGHDMA;
 
        netdev->vlan_features |= netdev->features | NETIF_F_TSO_MANGLEID;
        netdev->mpls_features |= NETIF_F_SG |
index 83c8908..f1335a1 100644 (file)
@@ -1884,8 +1884,8 @@ static void mvneta_txq_bufs_free(struct mvneta_port *pp,
                        bytes_compl += buf->skb->len;
                        pkts_compl++;
                        dev_kfree_skb_any(buf->skb);
-               } else if (buf->type == MVNETA_TYPE_XDP_TX ||
-                          buf->type == MVNETA_TYPE_XDP_NDO) {
+               } else if ((buf->type == MVNETA_TYPE_XDP_TX ||
+                           buf->type == MVNETA_TYPE_XDP_NDO) && buf->xdpf) {
                        if (napi && buf->type == MVNETA_TYPE_XDP_TX)
                                xdp_return_frame_rx_napi(buf->xdpf);
                        else
@@ -2060,61 +2060,104 @@ int mvneta_rx_refill_queue(struct mvneta_port *pp, struct mvneta_rx_queue *rxq)
 
 static void
 mvneta_xdp_put_buff(struct mvneta_port *pp, struct mvneta_rx_queue *rxq,
-                   struct xdp_buff *xdp, struct skb_shared_info *sinfo,
-                   int sync_len)
+                   struct xdp_buff *xdp, int sync_len)
 {
+       struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(xdp);
        int i;
 
+       if (likely(!xdp_buff_has_frags(xdp)))
+               goto out;
+
        for (i = 0; i < sinfo->nr_frags; i++)
                page_pool_put_full_page(rxq->page_pool,
                                        skb_frag_page(&sinfo->frags[i]), true);
+
+out:
        page_pool_put_page(rxq->page_pool, virt_to_head_page(xdp->data),
                           sync_len, true);
 }
 
 static int
 mvneta_xdp_submit_frame(struct mvneta_port *pp, struct mvneta_tx_queue *txq,
-                       struct xdp_frame *xdpf, bool dma_map)
+                       struct xdp_frame *xdpf, int *nxmit_byte, bool dma_map)
 {
+       struct skb_shared_info *sinfo = xdp_get_shared_info_from_frame(xdpf);
+       struct device *dev = pp->dev->dev.parent;
        struct mvneta_tx_desc *tx_desc;
-       struct mvneta_tx_buf *buf;
-       dma_addr_t dma_addr;
+       int i, num_frames = 1;
+       struct page *page;
 
-       if (txq->count >= txq->tx_stop_threshold)
+       if (unlikely(xdp_frame_has_frags(xdpf)))
+               num_frames += sinfo->nr_frags;
+
+       if (txq->count + num_frames >= txq->size)
                return MVNETA_XDP_DROPPED;
 
-       tx_desc = mvneta_txq_next_desc_get(txq);
+       for (i = 0; i < num_frames; i++) {
+               struct mvneta_tx_buf *buf = &txq->buf[txq->txq_put_index];
+               skb_frag_t *frag = NULL;
+               int len = xdpf->len;
+               dma_addr_t dma_addr;
 
-       buf = &txq->buf[txq->txq_put_index];
-       if (dma_map) {
-               /* ndo_xdp_xmit */
-               dma_addr = dma_map_single(pp->dev->dev.parent, xdpf->data,
-                                         xdpf->len, DMA_TO_DEVICE);
-               if (dma_mapping_error(pp->dev->dev.parent, dma_addr)) {
-                       mvneta_txq_desc_put(txq);
-                       return MVNETA_XDP_DROPPED;
+               if (unlikely(i)) { /* paged area */
+                       frag = &sinfo->frags[i - 1];
+                       len = skb_frag_size(frag);
                }
-               buf->type = MVNETA_TYPE_XDP_NDO;
-       } else {
-               struct page *page = virt_to_page(xdpf->data);
 
-               dma_addr = page_pool_get_dma_addr(page) +
-                          sizeof(*xdpf) + xdpf->headroom;
-               dma_sync_single_for_device(pp->dev->dev.parent, dma_addr,
-                                          xdpf->len, DMA_BIDIRECTIONAL);
-               buf->type = MVNETA_TYPE_XDP_TX;
-       }
-       buf->xdpf = xdpf;
+               tx_desc = mvneta_txq_next_desc_get(txq);
+               if (dma_map) {
+                       /* ndo_xdp_xmit */
+                       void *data;
+
+                       data = unlikely(frag) ? skb_frag_address(frag)
+                                             : xdpf->data;
+                       dma_addr = dma_map_single(dev, data, len,
+                                                 DMA_TO_DEVICE);
+                       if (dma_mapping_error(dev, dma_addr)) {
+                               mvneta_txq_desc_put(txq);
+                               goto unmap;
+                       }
+
+                       buf->type = MVNETA_TYPE_XDP_NDO;
+               } else {
+                       page = unlikely(frag) ? skb_frag_page(frag)
+                                             : virt_to_page(xdpf->data);
+                       dma_addr = page_pool_get_dma_addr(page);
+                       if (unlikely(frag))
+                               dma_addr += skb_frag_off(frag);
+                       else
+                               dma_addr += sizeof(*xdpf) + xdpf->headroom;
+                       dma_sync_single_for_device(dev, dma_addr, len,
+                                                  DMA_BIDIRECTIONAL);
+                       buf->type = MVNETA_TYPE_XDP_TX;
+               }
+               buf->xdpf = unlikely(i) ? NULL : xdpf;
 
-       tx_desc->command = MVNETA_TXD_FLZ_DESC;
-       tx_desc->buf_phys_addr = dma_addr;
-       tx_desc->data_size = xdpf->len;
+               tx_desc->command = unlikely(i) ? 0 : MVNETA_TXD_F_DESC;
+               tx_desc->buf_phys_addr = dma_addr;
+               tx_desc->data_size = len;
+               *nxmit_byte += len;
 
-       mvneta_txq_inc_put(txq);
-       txq->pending++;
-       txq->count++;
+               mvneta_txq_inc_put(txq);
+       }
+       /* last descriptor */
+       tx_desc->command |= MVNETA_TXD_L_DESC | MVNETA_TXD_Z_PAD;
+
+       txq->pending += num_frames;
+       txq->count += num_frames;
 
        return MVNETA_XDP_TX;
+
+unmap:
+       for (i--; i >= 0; i--) {
+               mvneta_txq_desc_put(txq);
+               tx_desc = txq->descs + txq->next_desc_to_proc;
+               dma_unmap_single(dev, tx_desc->buf_phys_addr,
+                                tx_desc->data_size,
+                                DMA_TO_DEVICE);
+       }
+
+       return MVNETA_XDP_DROPPED;
 }
 
 static int
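
mvneta_xdp_submit_frame() now emits one Tx descriptor per fragment: descriptor 0 carries the xdp_frame's linear data with the F (first) flag, descriptors 1..n carry sinfo->frags[i - 1], and only the final descriptor gets the L (last) and Z-pad flags. Worked example for an xdp_frame with two frags:

    /* num_frames = 1 + sinfo->nr_frags = 3
     *   desc 0: xdpf->data, len = xdpf->len, command = MVNETA_TXD_F_DESC
     *   desc 1: frags[0],   len = frag size, command = 0
     *   desc 2: frags[1],   len = frag size, command = 0, then
     *           |= MVNETA_TXD_L_DESC | MVNETA_TXD_Z_PAD after the loop
     * buf->xdpf is stored only on desc 0, which is why the completion path
     * in mvneta_txq_bufs_free() now also checks buf->xdpf before returning
     * the frame.
     */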
@@ -2123,8 +2166,8 @@ mvneta_xdp_xmit_back(struct mvneta_port *pp, struct xdp_buff *xdp)
        struct mvneta_pcpu_stats *stats = this_cpu_ptr(pp->stats);
        struct mvneta_tx_queue *txq;
        struct netdev_queue *nq;
+       int cpu, nxmit_byte = 0;
        struct xdp_frame *xdpf;
-       int cpu;
        u32 ret;
 
        xdpf = xdp_convert_buff_to_frame(xdp);
@@ -2136,10 +2179,10 @@ mvneta_xdp_xmit_back(struct mvneta_port *pp, struct xdp_buff *xdp)
        nq = netdev_get_tx_queue(pp->dev, txq->id);
 
        __netif_tx_lock(nq, cpu);
-       ret = mvneta_xdp_submit_frame(pp, txq, xdpf, false);
+       ret = mvneta_xdp_submit_frame(pp, txq, xdpf, &nxmit_byte, false);
        if (ret == MVNETA_XDP_TX) {
                u64_stats_update_begin(&stats->syncp);
-               stats->es.ps.tx_bytes += xdpf->len;
+               stats->es.ps.tx_bytes += nxmit_byte;
                stats->es.ps.tx_packets++;
                stats->es.ps.xdp_tx++;
                u64_stats_update_end(&stats->syncp);
@@ -2178,11 +2221,11 @@ mvneta_xdp_xmit(struct net_device *dev, int num_frame,
 
        __netif_tx_lock(nq, cpu);
        for (i = 0; i < num_frame; i++) {
-               ret = mvneta_xdp_submit_frame(pp, txq, frames[i], true);
+               ret = mvneta_xdp_submit_frame(pp, txq, frames[i], &nxmit_byte,
+                                             true);
                if (ret != MVNETA_XDP_TX)
                        break;
 
-               nxmit_byte += frames[i]->len;
                nxmit++;
        }
 
@@ -2205,7 +2248,6 @@ mvneta_run_xdp(struct mvneta_port *pp, struct mvneta_rx_queue *rxq,
               struct bpf_prog *prog, struct xdp_buff *xdp,
               u32 frame_sz, struct mvneta_stats *stats)
 {
-       struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(xdp);
        unsigned int len, data_len, sync;
        u32 ret, act;
 
@@ -2226,7 +2268,7 @@ mvneta_run_xdp(struct mvneta_port *pp, struct mvneta_rx_queue *rxq,
 
                err = xdp_do_redirect(pp->dev, xdp, prog);
                if (unlikely(err)) {
-                       mvneta_xdp_put_buff(pp, rxq, xdp, sinfo, sync);
+                       mvneta_xdp_put_buff(pp, rxq, xdp, sync);
                        ret = MVNETA_XDP_DROPPED;
                } else {
                        ret = MVNETA_XDP_REDIR;
@@ -2237,7 +2279,7 @@ mvneta_run_xdp(struct mvneta_port *pp, struct mvneta_rx_queue *rxq,
        case XDP_TX:
                ret = mvneta_xdp_xmit_back(pp, xdp);
                if (ret != MVNETA_XDP_TX)
-                       mvneta_xdp_put_buff(pp, rxq, xdp, sinfo, sync);
+                       mvneta_xdp_put_buff(pp, rxq, xdp, sync);
                break;
        default:
                bpf_warn_invalid_xdp_action(pp->dev, prog, act);
@@ -2246,7 +2288,7 @@ mvneta_run_xdp(struct mvneta_port *pp, struct mvneta_rx_queue *rxq,
                trace_xdp_exception(pp->dev, prog, act);
                fallthrough;
        case XDP_DROP:
-               mvneta_xdp_put_buff(pp, rxq, xdp, sinfo, sync);
+               mvneta_xdp_put_buff(pp, rxq, xdp, sync);
                ret = MVNETA_XDP_DROPPED;
                stats->xdp_drop++;
                break;
@@ -2269,7 +2311,6 @@ mvneta_swbm_rx_frame(struct mvneta_port *pp,
        int data_len = -MVNETA_MH_SIZE, len;
        struct net_device *dev = pp->dev;
        enum dma_data_direction dma_dir;
-       struct skb_shared_info *sinfo;
 
        if (*size > MVNETA_MAX_RX_BUF_SIZE) {
                len = MVNETA_MAX_RX_BUF_SIZE;
@@ -2289,11 +2330,9 @@ mvneta_swbm_rx_frame(struct mvneta_port *pp,
 
        /* Prefetch header */
        prefetch(data);
+       xdp_buff_clear_frags_flag(xdp);
        xdp_prepare_buff(xdp, data, pp->rx_offset_correction + MVNETA_MH_SIZE,
                         data_len, false);
-
-       sinfo = xdp_get_shared_info_from_buff(xdp);
-       sinfo->nr_frags = 0;
 }
 
 static void
@@ -2301,9 +2340,9 @@ mvneta_swbm_add_rx_fragment(struct mvneta_port *pp,
                            struct mvneta_rx_desc *rx_desc,
                            struct mvneta_rx_queue *rxq,
                            struct xdp_buff *xdp, int *size,
-                           struct skb_shared_info *xdp_sinfo,
                            struct page *page)
 {
+       struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(xdp);
        struct net_device *dev = pp->dev;
        enum dma_data_direction dma_dir;
        int data_len, len;
@@ -2321,25 +2360,25 @@ mvneta_swbm_add_rx_fragment(struct mvneta_port *pp,
                                len, dma_dir);
        rx_desc->buf_phys_addr = 0;
 
-       if (data_len > 0 && xdp_sinfo->nr_frags < MAX_SKB_FRAGS) {
-               skb_frag_t *frag = &xdp_sinfo->frags[xdp_sinfo->nr_frags++];
+       if (!xdp_buff_has_frags(xdp))
+               sinfo->nr_frags = 0;
+
+       if (data_len > 0 && sinfo->nr_frags < MAX_SKB_FRAGS) {
+               skb_frag_t *frag = &sinfo->frags[sinfo->nr_frags++];
 
                skb_frag_off_set(frag, pp->rx_offset_correction);
                skb_frag_size_set(frag, data_len);
                __skb_frag_set_page(frag, page);
+
+               if (!xdp_buff_has_frags(xdp)) {
+                       sinfo->xdp_frags_size = *size;
+                       xdp_buff_set_frags_flag(xdp);
+               }
+               if (page_is_pfmemalloc(page))
+                       xdp_buff_set_frag_pfmemalloc(xdp);
        } else {
                page_pool_put_full_page(rxq->page_pool, page, true);
        }
-
-       /* last fragment */
-       if (len == *size) {
-               struct skb_shared_info *sinfo;
-
-               sinfo = xdp_get_shared_info_from_buff(xdp);
-               sinfo->nr_frags = xdp_sinfo->nr_frags;
-               memcpy(sinfo->frags, xdp_sinfo->frags,
-                      sinfo->nr_frags * sizeof(skb_frag_t));
-       }
        *size -= len;
 }
 
@@ -2348,8 +2387,11 @@ mvneta_swbm_build_skb(struct mvneta_port *pp, struct page_pool *pool,
                      struct xdp_buff *xdp, u32 desc_status)
 {
        struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(xdp);
-       int i, num_frags = sinfo->nr_frags;
        struct sk_buff *skb;
+       u8 num_frags;
+
+       if (unlikely(xdp_buff_has_frags(xdp)))
+               num_frags = sinfo->nr_frags;
 
        skb = build_skb(xdp->data_hard_start, PAGE_SIZE);
        if (!skb)
@@ -2361,13 +2403,11 @@ mvneta_swbm_build_skb(struct mvneta_port *pp, struct page_pool *pool,
        skb_put(skb, xdp->data_end - xdp->data);
        skb->ip_summed = mvneta_rx_csum(pp, desc_status);
 
-       for (i = 0; i < num_frags; i++) {
-               skb_frag_t *frag = &sinfo->frags[i];
-
-               skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags,
-                               skb_frag_page(frag), skb_frag_off(frag),
-                               skb_frag_size(frag), PAGE_SIZE);
-       }
+       if (unlikely(xdp_buff_has_frags(xdp)))
+               xdp_update_skb_shared_info(skb, num_frags,
+                                          sinfo->xdp_frags_size,
+                                          num_frags * xdp->frame_sz,
+                                          xdp_buff_is_frag_pfmemalloc(xdp));
 
        return skb;
 }
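
Rather than copying each fragment into the skb with skb_add_rx_frag(), the frags already live in the xdp_buff's shared_info, which build_skb() reuses since it sits in the same page as the data; one xdp_update_skb_shared_info() call then fixes up the skb accounting:

    /* xdp_update_skb_shared_info(skb,
     *         num_frags,                     -> skb_shinfo(skb)->nr_frags
     *         sinfo->xdp_frags_size,         -> added to skb->len/data_len
     *         num_frags * xdp->frame_sz,     -> added to skb->truesize
     *         xdp_buff_is_frag_pfmemalloc(xdp));
     * No per-frag copy or page reference juggling is needed because
     * build_skb() placed skb_shinfo(skb) over the xdp_buff's own
     * shared_info area.
     */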
@@ -2379,7 +2419,6 @@ static int mvneta_rx_swbm(struct napi_struct *napi,
 {
        int rx_proc = 0, rx_todo, refill, size = 0;
        struct net_device *dev = pp->dev;
-       struct skb_shared_info sinfo;
        struct mvneta_stats ps = {};
        struct bpf_prog *xdp_prog;
        u32 desc_status, frame_sz;
@@ -2388,8 +2427,6 @@ static int mvneta_rx_swbm(struct napi_struct *napi,
        xdp_init_buff(&xdp_buf, PAGE_SIZE, &rxq->xdp_rxq);
        xdp_buf.data_hard_start = NULL;
 
-       sinfo.nr_frags = 0;
-
        /* Get number of received packets */
        rx_todo = mvneta_rxq_busy_desc_num_get(pp, rxq);
 
@@ -2431,7 +2468,7 @@ static int mvneta_rx_swbm(struct napi_struct *napi,
                        }
 
                        mvneta_swbm_add_rx_fragment(pp, rx_desc, rxq, &xdp_buf,
-                                                   &size, &sinfo, page);
+                                                   &size, page);
                } /* Middle or Last descriptor */
 
                if (!(rx_status & MVNETA_RXD_LAST_DESC))
@@ -2439,7 +2476,7 @@ static int mvneta_rx_swbm(struct napi_struct *napi,
                        continue;
 
                if (size) {
-                       mvneta_xdp_put_buff(pp, rxq, &xdp_buf, &sinfo, -1);
+                       mvneta_xdp_put_buff(pp, rxq, &xdp_buf, -1);
                        goto next;
                }
 
@@ -2451,7 +2488,7 @@ static int mvneta_rx_swbm(struct napi_struct *napi,
                if (IS_ERR(skb)) {
                        struct mvneta_pcpu_stats *stats = this_cpu_ptr(pp->stats);
 
-                       mvneta_xdp_put_buff(pp, rxq, &xdp_buf, &sinfo, -1);
+                       mvneta_xdp_put_buff(pp, rxq, &xdp_buf, -1);
 
                        u64_stats_update_begin(&stats->syncp);
                        stats->es.skb_alloc_error++;
@@ -2468,11 +2505,10 @@ static int mvneta_rx_swbm(struct napi_struct *napi,
                napi_gro_receive(napi, skb);
 next:
                xdp_buf.data_hard_start = NULL;
-               sinfo.nr_frags = 0;
        }
 
        if (xdp_buf.data_hard_start)
-               mvneta_xdp_put_buff(pp, rxq, &xdp_buf, &sinfo, -1);
+               mvneta_xdp_put_buff(pp, rxq, &xdp_buf, -1);
 
        if (ps.xdp_redirect)
                xdp_do_flush_map();
@@ -3260,7 +3296,8 @@ static int mvneta_create_page_pool(struct mvneta_port *pp,
                return err;
        }
 
-       err = xdp_rxq_info_reg(&rxq->xdp_rxq, pp->dev, rxq->id, 0);
+       err = __xdp_rxq_info_reg(&rxq->xdp_rxq, pp->dev, rxq->id, 0,
+                                PAGE_SIZE);
        if (err < 0)
                goto err_free_pp;
 
@@ -3740,6 +3777,7 @@ static void mvneta_percpu_disable(void *arg)
 static int mvneta_change_mtu(struct net_device *dev, int mtu)
 {
        struct mvneta_port *pp = netdev_priv(dev);
+       struct bpf_prog *prog = pp->xdp_prog;
        int ret;
 
        if (!IS_ALIGNED(MVNETA_RX_PKT_SIZE(mtu), 8)) {
@@ -3748,8 +3786,11 @@ static int mvneta_change_mtu(struct net_device *dev, int mtu)
                mtu = ALIGN(MVNETA_RX_PKT_SIZE(mtu), 8);
        }
 
-       if (pp->xdp_prog && mtu > MVNETA_MAX_RX_BUF_SIZE) {
-               netdev_info(dev, "Illegal MTU value %d for XDP mode\n", mtu);
+       if (prog && !prog->aux->xdp_has_frags &&
+           mtu > MVNETA_MAX_RX_BUF_SIZE) {
+               netdev_info(dev, "Illegal MTU %d for XDP prog without frags\n",
+                           mtu);
+
                return -EINVAL;
        }
 
@@ -3969,6 +4010,15 @@ static const struct phylink_pcs_ops mvneta_phylink_pcs_ops = {
        .pcs_an_restart = mvneta_pcs_an_restart,
 };
 
+static struct phylink_pcs *mvneta_mac_select_pcs(struct phylink_config *config,
+                                                phy_interface_t interface)
+{
+       struct net_device *ndev = to_net_dev(config->dev);
+       struct mvneta_port *pp = netdev_priv(ndev);
+
+       return &pp->phylink_pcs;
+}
+
 static int mvneta_mac_prepare(struct phylink_config *config, unsigned int mode,
                              phy_interface_t interface)
 {
@@ -4169,13 +4219,14 @@ static void mvneta_mac_link_up(struct phylink_config *config,
        mvneta_port_up(pp);
 
        if (phy && pp->eee_enabled) {
-               pp->eee_active = phy_init_eee(phy, 0) >= 0;
+               pp->eee_active = phy_init_eee(phy, false) >= 0;
                mvneta_set_eee(pp, pp->eee_active && pp->tx_lpi_enabled);
        }
 }
 
 static const struct phylink_mac_ops mvneta_phylink_ops = {
        .validate = phylink_generic_validate,
+       .mac_select_pcs = mvneta_mac_select_pcs,
        .mac_prepare = mvneta_mac_prepare,
        .mac_config = mvneta_mac_config,
        .mac_finish = mvneta_mac_finish,
@@ -4490,8 +4541,9 @@ static int mvneta_xdp_setup(struct net_device *dev, struct bpf_prog *prog,
        struct mvneta_port *pp = netdev_priv(dev);
        struct bpf_prog *old_prog;
 
-       if (prog && dev->mtu > MVNETA_MAX_RX_BUF_SIZE) {
-               NL_SET_ERR_MSG_MOD(extack, "MTU too large for XDP");
+       if (prog && !prog->aux->xdp_has_frags &&
+           dev->mtu > MVNETA_MAX_RX_BUF_SIZE) {
+               NL_SET_ERR_MSG_MOD(extack, "MTU too large for XDP prog without frags support");
                return -EOPNOTSUPP;
        }
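The xdp_has_frags bit tested above is set at program load time; a hedged user-space sketch, assuming the BPF_F_XDP_HAS_FRAGS load flag added elsewhere in this series and libbpf's bpf_prog_load():

	struct bpf_insn *insns;	/* filled in by the loader */
	size_t insn_cnt;
	int prog_fd;

	LIBBPF_OPTS(bpf_prog_load_opts, opts,
		    .prog_flags = BPF_F_XDP_HAS_FRAGS);

	/* a program loaded this way may stay attached with an MTU
	 * above MVNETA_MAX_RX_BUF_SIZE
	 */
	prog_fd = bpf_prog_load(BPF_PROG_TYPE_XDP, "xdp_mb", "GPL",
				insns, insn_cnt, &opts);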
 
@@ -5321,26 +5373,62 @@ static int mvneta_probe(struct platform_device *pdev)
        if (!dev)
                return -ENOMEM;
 
-       dev->irq = irq_of_parse_and_map(dn, 0);
-       if (dev->irq == 0)
-               return -EINVAL;
+       dev->tx_queue_len = MVNETA_MAX_TXD;
+       dev->watchdog_timeo = 5 * HZ;
+       dev->netdev_ops = &mvneta_netdev_ops;
+       dev->ethtool_ops = &mvneta_eth_tool_ops;
+
+       pp = netdev_priv(dev);
+       spin_lock_init(&pp->lock);
+       pp->dn = dn;
+
+       pp->rxq_def = rxq_def;
+       pp->indir[0] = rxq_def;
 
        err = of_get_phy_mode(dn, &phy_mode);
        if (err) {
                dev_err(&pdev->dev, "incorrect phy-mode\n");
-               goto err_free_irq;
+               return err;
        }
 
+       pp->phy_interface = phy_mode;
+
        comphy = devm_of_phy_get(&pdev->dev, dn, NULL);
-       if (comphy == ERR_PTR(-EPROBE_DEFER)) {
-               err = -EPROBE_DEFER;
-               goto err_free_irq;
-       } else if (IS_ERR(comphy)) {
+       if (comphy == ERR_PTR(-EPROBE_DEFER))
+               return -EPROBE_DEFER;
+
+       if (IS_ERR(comphy))
                comphy = NULL;
+
+       pp->comphy = comphy;
+
+       pp->base = devm_platform_ioremap_resource(pdev, 0);
+       if (IS_ERR(pp->base))
+               return PTR_ERR(pp->base);
+
+       /* Get special SoC configurations */
+       if (of_device_is_compatible(dn, "marvell,armada-3700-neta"))
+               pp->neta_armada3700 = true;
+
+       dev->irq = irq_of_parse_and_map(dn, 0);
+       if (dev->irq == 0)
+               return -EINVAL;
+
+       pp->clk = devm_clk_get(&pdev->dev, "core");
+       if (IS_ERR(pp->clk))
+               pp->clk = devm_clk_get(&pdev->dev, NULL);
+       if (IS_ERR(pp->clk)) {
+               err = PTR_ERR(pp->clk);
+               goto err_free_irq;
        }
 
-       pp = netdev_priv(dev);
-       spin_lock_init(&pp->lock);
+       clk_prepare_enable(pp->clk);
+
+       pp->clk_bus = devm_clk_get(&pdev->dev, "bus");
+       if (!IS_ERR(pp->clk_bus))
+               clk_prepare_enable(pp->clk_bus);
+
+       pp->phylink_pcs.ops = &mvneta_phylink_pcs_ops;
 
        pp->phylink_config.dev = &dev->dev;
        pp->phylink_config.type = PHYLINK_NETDEV;
@@ -5377,55 +5465,16 @@ static int mvneta_probe(struct platform_device *pdev)
                                 phy_mode, &mvneta_phylink_ops);
        if (IS_ERR(phylink)) {
                err = PTR_ERR(phylink);
-               goto err_free_irq;
-       }
-
-       dev->tx_queue_len = MVNETA_MAX_TXD;
-       dev->watchdog_timeo = 5 * HZ;
-       dev->netdev_ops = &mvneta_netdev_ops;
-
-       dev->ethtool_ops = &mvneta_eth_tool_ops;
-
-       pp->phylink = phylink;
-       pp->comphy = comphy;
-       pp->phy_interface = phy_mode;
-       pp->dn = dn;
-
-       pp->rxq_def = rxq_def;
-       pp->indir[0] = rxq_def;
-
-       /* Get special SoC configurations */
-       if (of_device_is_compatible(dn, "marvell,armada-3700-neta"))
-               pp->neta_armada3700 = true;
-
-       pp->clk = devm_clk_get(&pdev->dev, "core");
-       if (IS_ERR(pp->clk))
-               pp->clk = devm_clk_get(&pdev->dev, NULL);
-       if (IS_ERR(pp->clk)) {
-               err = PTR_ERR(pp->clk);
-               goto err_free_phylink;
-       }
-
-       clk_prepare_enable(pp->clk);
-
-       pp->clk_bus = devm_clk_get(&pdev->dev, "bus");
-       if (!IS_ERR(pp->clk_bus))
-               clk_prepare_enable(pp->clk_bus);
-
-       pp->base = devm_platform_ioremap_resource(pdev, 0);
-       if (IS_ERR(pp->base)) {
-               err = PTR_ERR(pp->base);
                goto err_clk;
        }
 
-       pp->phylink_pcs.ops = &mvneta_phylink_pcs_ops;
-       phylink_set_pcs(phylink, &pp->phylink_pcs);
+       pp->phylink = phylink;
 
        /* Alloc per-cpu port structure */
        pp->ports = alloc_percpu(struct mvneta_pcpu_port);
        if (!pp->ports) {
                err = -ENOMEM;
-               goto err_clk;
+               goto err_free_phylink;
        }
 
        /* Alloc per-cpu stats */
@@ -5569,12 +5618,12 @@ err_netdev:
        free_percpu(pp->stats);
 err_free_ports:
        free_percpu(pp->ports);
-err_clk:
-       clk_disable_unprepare(pp->clk_bus);
-       clk_disable_unprepare(pp->clk);
 err_free_phylink:
        if (pp->phylink)
                phylink_destroy(pp->phylink);
+err_clk:
+       clk_disable_unprepare(pp->clk_bus);
+       clk_disable_unprepare(pp->clk);
 err_free_irq:
        irq_dispose_mapping(dev->irq);
        return err;
index 3631d61..25491ed 100644 (file)
@@ -578,31 +578,78 @@ void cgx_lmac_promisc_config(int cgx_id, int lmac_id, bool enable)
        }
 }
 
+static int cgx_lmac_get_pause_frm_status(void *cgxd, int lmac_id,
+                                        u8 *tx_pause, u8 *rx_pause)
+{
+       struct cgx *cgx = cgxd;
+       u64 cfg;
+
+       if (is_dev_rpm(cgx))
+               return 0;
+
+       if (!is_lmac_valid(cgx, lmac_id))
+               return -ENODEV;
+
+       cfg = cgx_read(cgx, lmac_id, CGXX_SMUX_RX_FRM_CTL);
+       *rx_pause = !!(cfg & CGX_SMUX_RX_FRM_CTL_CTL_BCK);
+
+       cfg = cgx_read(cgx, lmac_id, CGXX_SMUX_TX_CTL);
+       *tx_pause = !!(cfg & CGX_SMUX_TX_CTL_L2P_BP_CONV);
+       return 0;
+}
+
 /* Enable or disable forwarding received pause frames to Tx block */
 void cgx_lmac_enadis_rx_pause_fwding(void *cgxd, int lmac_id, bool enable)
 {
        struct cgx *cgx = cgxd;
+       u8 rx_pause, tx_pause;
+       bool is_pfc_enabled;
+       struct lmac *lmac;
        u64 cfg;
 
        if (!cgx)
                return;
 
-       if (enable) {
-               cfg = cgx_read(cgx, lmac_id, CGXX_GMP_GMI_RXX_FRM_CTL);
-               cfg |= CGX_GMP_GMI_RXX_FRM_CTL_CTL_BCK;
-               cgx_write(cgx, lmac_id, CGXX_GMP_GMI_RXX_FRM_CTL, cfg);
+       lmac = lmac_pdata(lmac_id, cgx);
+       if (!lmac)
+               return;
 
-               cfg = cgx_read(cgx, lmac_id, CGXX_SMUX_RX_FRM_CTL);
-               cfg |= CGX_SMUX_RX_FRM_CTL_CTL_BCK;
-               cgx_write(cgx, lmac_id, CGXX_SMUX_RX_FRM_CTL, cfg);
+       /* Pause frames are not enabled, just return */
+       if (!bitmap_weight(lmac->rx_fc_pfvf_bmap.bmap, lmac->rx_fc_pfvf_bmap.max))
+               return;
+
+       cgx_lmac_get_pause_frm_status(cgx, lmac_id, &rx_pause, &tx_pause);
+       is_pfc_enabled = !rx_pause;
+
+       if (enable) {
+               if (!is_pfc_enabled) {
+                       cfg = cgx_read(cgx, lmac_id, CGXX_GMP_GMI_RXX_FRM_CTL);
+                       cfg |= CGX_GMP_GMI_RXX_FRM_CTL_CTL_BCK;
+                       cgx_write(cgx, lmac_id, CGXX_GMP_GMI_RXX_FRM_CTL, cfg);
+
+                       cfg = cgx_read(cgx, lmac_id, CGXX_SMUX_RX_FRM_CTL);
+                       cfg |= CGX_SMUX_RX_FRM_CTL_CTL_BCK;
+                       cgx_write(cgx, lmac_id, CGXX_SMUX_RX_FRM_CTL, cfg);
+               } else {
+                       cfg = cgx_read(cgx, lmac_id, CGXX_SMUX_CBFC_CTL);
+                       cfg |= CGXX_SMUX_CBFC_CTL_BCK_EN;
+                       cgx_write(cgx, lmac_id, CGXX_SMUX_CBFC_CTL, cfg);
+               }
        } else {
-               cfg = cgx_read(cgx, lmac_id, CGXX_GMP_GMI_RXX_FRM_CTL);
-               cfg &= ~CGX_GMP_GMI_RXX_FRM_CTL_CTL_BCK;
-               cgx_write(cgx, lmac_id, CGXX_GMP_GMI_RXX_FRM_CTL, cfg);
 
-               cfg = cgx_read(cgx, lmac_id, CGXX_SMUX_RX_FRM_CTL);
-               cfg &= ~CGX_SMUX_RX_FRM_CTL_CTL_BCK;
-               cgx_write(cgx, lmac_id, CGXX_SMUX_RX_FRM_CTL, cfg);
+               if (!is_pfc_enabled) {
+                       cfg = cgx_read(cgx, lmac_id, CGXX_GMP_GMI_RXX_FRM_CTL);
+                       cfg &= ~CGX_GMP_GMI_RXX_FRM_CTL_CTL_BCK;
+                       cgx_write(cgx, lmac_id, CGXX_GMP_GMI_RXX_FRM_CTL, cfg);
+
+                       cfg = cgx_read(cgx, lmac_id, CGXX_SMUX_RX_FRM_CTL);
+                       cfg &= ~CGX_SMUX_RX_FRM_CTL_CTL_BCK;
+                       cgx_write(cgx, lmac_id, CGXX_SMUX_RX_FRM_CTL, cfg);
+               } else {
+                       cfg = cgx_read(cgx, lmac_id, CGXX_SMUX_CBFC_CTL);
+                       cfg &= ~CGXX_SMUX_CBFC_CTL_BCK_EN;
+                       cgx_write(cgx, lmac_id, CGXX_SMUX_CBFC_CTL, cfg);
+               }
        }
 }
 
@@ -722,26 +769,6 @@ int cgx_lmac_tx_enable(void *cgxd, int lmac_id, bool enable)
        return !!(last & DATA_PKT_TX_EN);
 }
 
-static int cgx_lmac_get_pause_frm_status(void *cgxd, int lmac_id,
-                                        u8 *tx_pause, u8 *rx_pause)
-{
-       struct cgx *cgx = cgxd;
-       u64 cfg;
-
-       if (is_dev_rpm(cgx))
-               return 0;
-
-       if (!is_lmac_valid(cgx, lmac_id))
-               return -ENODEV;
-
-       cfg = cgx_read(cgx, lmac_id, CGXX_SMUX_RX_FRM_CTL);
-       *rx_pause = !!(cfg & CGX_SMUX_RX_FRM_CTL_CTL_BCK);
-
-       cfg = cgx_read(cgx, lmac_id, CGXX_SMUX_TX_CTL);
-       *tx_pause = !!(cfg & CGX_SMUX_TX_CTL_L2P_BP_CONV);
-       return 0;
-}
-
 static int cgx_lmac_enadis_pause_frm(void *cgxd, int lmac_id,
                                     u8 tx_pause, u8 rx_pause)
 {
@@ -782,21 +809,8 @@ static void cgx_lmac_pause_frm_config(void *cgxd, int lmac_id, bool enable)
 
        if (!is_lmac_valid(cgx, lmac_id))
                return;
-       if (enable) {
-               /* Enable receive pause frames */
-               cfg = cgx_read(cgx, lmac_id, CGXX_SMUX_RX_FRM_CTL);
-               cfg |= CGX_SMUX_RX_FRM_CTL_CTL_BCK;
-               cgx_write(cgx, lmac_id, CGXX_SMUX_RX_FRM_CTL, cfg);
-
-               cfg = cgx_read(cgx, lmac_id, CGXX_GMP_GMI_RXX_FRM_CTL);
-               cfg |= CGX_GMP_GMI_RXX_FRM_CTL_CTL_BCK;
-               cgx_write(cgx, lmac_id, CGXX_GMP_GMI_RXX_FRM_CTL, cfg);
-
-               /* Enable pause frames transmission */
-               cfg = cgx_read(cgx, lmac_id, CGXX_SMUX_TX_CTL);
-               cfg |= CGX_SMUX_TX_CTL_L2P_BP_CONV;
-               cgx_write(cgx, lmac_id, CGXX_SMUX_TX_CTL, cfg);
 
+       if (enable) {
                /* Set pause time and interval */
                cgx_write(cgx, lmac_id, CGXX_SMUX_TX_PAUSE_PKT_TIME,
                          DEFAULT_PAUSE_TIME);
@@ -813,21 +827,120 @@ static void cgx_lmac_pause_frm_config(void *cgxd, int lmac_id, bool enable)
                cfg &= ~0xFFFFULL;
                cgx_write(cgx, lmac_id, CGXX_GMP_GMI_TX_PAUSE_PKT_INTERVAL,
                          cfg | (DEFAULT_PAUSE_TIME / 2));
-       } else {
-               /* ALL pause frames received are completely ignored */
-               cfg = cgx_read(cgx, lmac_id, CGXX_SMUX_RX_FRM_CTL);
-               cfg &= ~CGX_SMUX_RX_FRM_CTL_CTL_BCK;
-               cgx_write(cgx, lmac_id, CGXX_SMUX_RX_FRM_CTL, cfg);
+       }
 
-               cfg = cgx_read(cgx, lmac_id, CGXX_GMP_GMI_RXX_FRM_CTL);
-               cfg &= ~CGX_GMP_GMI_RXX_FRM_CTL_CTL_BCK;
-               cgx_write(cgx, lmac_id, CGXX_GMP_GMI_RXX_FRM_CTL, cfg);
+       /* ALL pause frames received are completely ignored */
+       cfg = cgx_read(cgx, lmac_id, CGXX_SMUX_RX_FRM_CTL);
+       cfg &= ~CGX_SMUX_RX_FRM_CTL_CTL_BCK;
+       cgx_write(cgx, lmac_id, CGXX_SMUX_RX_FRM_CTL, cfg);
+
+       cfg = cgx_read(cgx, lmac_id, CGXX_GMP_GMI_RXX_FRM_CTL);
+       cfg &= ~CGX_GMP_GMI_RXX_FRM_CTL_CTL_BCK;
+       cgx_write(cgx, lmac_id, CGXX_GMP_GMI_RXX_FRM_CTL, cfg);
+
+       /* Disable pause frames transmission */
+       cfg = cgx_read(cgx, lmac_id, CGXX_SMUX_TX_CTL);
+       cfg &= ~CGX_SMUX_TX_CTL_L2P_BP_CONV;
+       cgx_write(cgx, lmac_id, CGXX_SMUX_TX_CTL, cfg);
+
+       cfg = cgx_read(cgx, 0, CGXX_CMR_RX_OVR_BP);
+       cfg |= CGX_CMR_RX_OVR_BP_EN(lmac_id);
+       cfg &= ~CGX_CMR_RX_OVR_BP_BP(lmac_id);
+       cgx_write(cgx, 0, CGXX_CMR_RX_OVR_BP, cfg);
+}
+
+int verify_lmac_fc_cfg(void *cgxd, int lmac_id, u8 tx_pause, u8 rx_pause,
+                      int pfvf_idx)
+{
+       struct cgx *cgx = cgxd;
+       struct lmac *lmac;
+
+       lmac = lmac_pdata(lmac_id, cgx);
+       if (!lmac)
+               return -ENODEV;
+
+       if (!rx_pause)
+               clear_bit(pfvf_idx, lmac->rx_fc_pfvf_bmap.bmap);
+       else
+               set_bit(pfvf_idx, lmac->rx_fc_pfvf_bmap.bmap);
+
+       if (!tx_pause)
+               clear_bit(pfvf_idx, lmac->tx_fc_pfvf_bmap.bmap);
+       else
+               set_bit(pfvf_idx, lmac->tx_fc_pfvf_bmap.bmap);
+
+       /* check if other pfvfs are using flow control */
+       if (!rx_pause && bitmap_weight(lmac->rx_fc_pfvf_bmap.bmap, lmac->rx_fc_pfvf_bmap.max)) {
+               dev_warn(&cgx->pdev->dev,
+                        "Receive Flow control disable not permitted as its used by other PFVFs\n");
+               return -EPERM;
+       }
+
+       if (!tx_pause && bitmap_weight(lmac->tx_fc_pfvf_bmap.bmap, lmac->tx_fc_pfvf_bmap.max)) {
+               dev_warn(&cgx->pdev->dev,
+                        "Transmit Flow control disable not permitted as its used by other PFVFs\n");
+               return -EPERM;
+       }
+
+       return 0;
+}
+
+int cgx_lmac_pfc_config(void *cgxd, int lmac_id, u8 tx_pause,
+                       u8 rx_pause, u16 pfc_en)
+{
+       struct cgx *cgx = cgxd;
+       u64 cfg;
+
+       if (!is_lmac_valid(cgx, lmac_id))
+               return -ENODEV;
+
+       /* Tx pause requested with no traffic classes: nothing to do */
+       if (tx_pause && !pfc_en)
+               return 0;
+
+       cfg = cgx_read(cgx, lmac_id, CGXX_SMUX_CBFC_CTL);
 
-               /* Disable pause frames transmission */
-               cfg = cgx_read(cgx, lmac_id, CGXX_SMUX_TX_CTL);
-               cfg &= ~CGX_SMUX_TX_CTL_L2P_BP_CONV;
-               cgx_write(cgx, lmac_id, CGXX_SMUX_TX_CTL, cfg);
+       if (rx_pause) {
+               cfg |= (CGXX_SMUX_CBFC_CTL_RX_EN |
+                       CGXX_SMUX_CBFC_CTL_BCK_EN |
+                       CGXX_SMUX_CBFC_CTL_DRP_EN);
+       } else {
+               cfg &= ~(CGXX_SMUX_CBFC_CTL_RX_EN |
+                       CGXX_SMUX_CBFC_CTL_BCK_EN |
+                       CGXX_SMUX_CBFC_CTL_DRP_EN);
        }
+
+       if (tx_pause)
+               cfg |= CGXX_SMUX_CBFC_CTL_TX_EN;
+       else
+               cfg &= ~CGXX_SMUX_CBFC_CTL_TX_EN;
+
+       cfg = FIELD_SET(CGX_PFC_CLASS_MASK, pfc_en, cfg);
+
+       cgx_write(cgx, lmac_id, CGXX_SMUX_CBFC_CTL, cfg);
+
+       /* Write source MAC address which will be filled into PFC packet */
+       cfg = cgx_lmac_addr_get(cgx->cgx_id, lmac_id);
+       cgx_write(cgx, lmac_id, CGXX_SMUX_SMAC, cfg);
+
+       return 0;
+}
+
+int cgx_lmac_get_pfc_frm_cfg(void *cgxd, int lmac_id, u8 *tx_pause,
+                            u8 *rx_pause)
+{
+       struct cgx *cgx = cgxd;
+       u64 cfg;
+
+       if (!is_lmac_valid(cgx, lmac_id))
+               return -ENODEV;
+
+       cfg = cgx_read(cgx, lmac_id, CGXX_SMUX_CBFC_CTL);
+
+       *rx_pause = !!(cfg & CGXX_SMUX_CBFC_CTL_RX_EN);
+       *tx_pause = !!(cfg & CGXX_SMUX_CBFC_CTL_TX_EN);
+
+       return 0;
 }
 
 void cgx_lmac_ptp_config(void *cgxd, int lmac_id, bool enable)
@@ -1489,6 +1602,16 @@ static int cgx_lmac_init(struct cgx *cgx)
                /* Reserve first entry for default MAC address */
                set_bit(0, lmac->mac_to_index_bmap.bmap);
 
+               lmac->rx_fc_pfvf_bmap.max = 128;
+               err = rvu_alloc_bitmap(&lmac->rx_fc_pfvf_bmap);
+               if (err)
+                       goto err_dmac_bmap_free;
+
+               lmac->tx_fc_pfvf_bmap.max = 128;
+               err = rvu_alloc_bitmap(&lmac->tx_fc_pfvf_bmap);
+               if (err)
+                       goto err_rx_fc_bmap_free;
+
                init_waitqueue_head(&lmac->wq_cmd_cmplt);
                mutex_init(&lmac->cmd_lock);
                spin_lock_init(&lmac->event_cb_lock);
@@ -1505,6 +1628,10 @@ static int cgx_lmac_init(struct cgx *cgx)
        return cgx_lmac_verify_fwi_version(cgx);
 
 err_bitmap_free:
+       rvu_free_bitmap(&lmac->tx_fc_pfvf_bmap);
+err_rx_fc_bmap_free:
+       rvu_free_bitmap(&lmac->rx_fc_pfvf_bmap);
+err_dmac_bmap_free:
        rvu_free_bitmap(&lmac->mac_to_index_bmap);
 err_name_free:
        kfree(lmac->name);
@@ -1572,6 +1699,8 @@ static struct mac_ops     cgx_mac_ops    = {
        .mac_enadis_ptp_config =        cgx_lmac_ptp_config,
        .mac_rx_tx_enable =             cgx_lmac_rx_tx_enable,
        .mac_tx_enable =                cgx_lmac_tx_enable,
+       .pfc_config =                   cgx_lmac_pfc_config,
+       .mac_get_pfc_frm_cfg   =        cgx_lmac_get_pfc_frm_cfg,
 };
 
 static int cgx_probe(struct pci_dev *pdev, const struct pci_device_id *id)
index ab1e4ab..bd2f33a 100644 (file)
 #define CGXX_SMUX_TX_CTL               0x20178
 #define CGXX_SMUX_TX_PAUSE_PKT_TIME    0x20110
 #define CGXX_SMUX_TX_PAUSE_PKT_INTERVAL        0x20120
+#define CGXX_SMUX_SMAC                        0x20108
+#define CGXX_SMUX_CBFC_CTL                    0x20218
+#define CGXX_SMUX_CBFC_CTL_RX_EN             BIT_ULL(0)
+#define CGXX_SMUX_CBFC_CTL_TX_EN             BIT_ULL(1)
+#define CGXX_SMUX_CBFC_CTL_DRP_EN            BIT_ULL(2)
+#define CGXX_SMUX_CBFC_CTL_BCK_EN            BIT_ULL(3)
+#define CGX_PFC_CLASS_MASK                  GENMASK_ULL(47, 32)
 #define CGXX_GMP_GMI_TX_PAUSE_PKT_TIME 0x38230
 #define CGXX_GMP_GMI_TX_PAUSE_PKT_INTERVAL     0x38248
 #define CGX_SMUX_TX_CTL_L2P_BP_CONV    BIT_ULL(7)
@@ -172,4 +179,10 @@ u64 cgx_lmac_read(int cgx_id, int lmac_id, u64 offset);
 int cgx_lmac_addr_update(u8 cgx_id, u8 lmac_id, u8 *mac_addr, u8 index);
 u64 cgx_read_dmac_ctrl(void *cgxd, int lmac_id);
 u64 cgx_read_dmac_entry(void *cgxd, int index);
+int cgx_lmac_pfc_config(void *cgxd, int lmac_id, u8 tx_pause, u8 rx_pause,
+                       u16 pfc_en);
+int cgx_lmac_get_pfc_frm_cfg(void *cgxd, int lmac_id, u8 *tx_pause,
+                            u8 *rx_pause);
+int verify_lmac_fc_cfg(void *cgxd, int lmac_id, u8 tx_pause, u8 rx_pause,
+                      int pfvf_idx);
 #endif /* CGX_H */
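A hedged sketch of driving the three new prototypes above from AF code; example_enable_pfc() is a hypothetical helper, and cgxd/lmac_id would come from rvu_cgx_pdata()/rvu_get_cgx_lmac_id() as in the rvu_cgx.c hunks below:

/* Hypothetical helper: enable PFC for traffic classes 0 and 3 only */
static int example_enable_pfc(void *cgxd, int lmac_id, int pfvf_idx)
{
	u16 pfc_en = BIT(0) | BIT(3);	/* bit N maps to traffic class N */
	u8 tx_pause, rx_pause;
	int err;

	err = verify_lmac_fc_cfg(cgxd, lmac_id, 1, 1, pfvf_idx);
	if (err)
		return err;

	err = cgx_lmac_pfc_config(cgxd, lmac_id, 1, 1, pfc_en);
	if (err)
		return err;

	/* read back what the hardware actually latched */
	return cgx_lmac_get_pfc_frm_cfg(cgxd, lmac_id, &tx_pause, &rx_pause);
}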
index b33e7d1..f30581b 100644 (file)
@@ -17,6 +17,8 @@
  * @resp:              command response
  * @link_info:         link related information
  * @mac_to_index_bmap: Mac address to CGX table index mapping
+ * @rx_fc_pfvf_bmap:   Bitmap of PF/VFs with receive flow control enabled
+ * @tx_fc_pfvf_bmap:   Bitmap of PF/VFs with transmit flow control enabled
  * @event_cb:          callback for linkchange events
  * @event_cb_lock:     lock for serializing callback with unregister
  * @cgx:               parent cgx port
@@ -33,6 +35,8 @@ struct lmac {
        u64 resp;
        struct cgx_link_user_info link_info;
        struct rsrc_bmap mac_to_index_bmap;
+       struct rsrc_bmap rx_fc_pfvf_bmap;
+       struct rsrc_bmap tx_fc_pfvf_bmap;
        struct cgx_event_cb event_cb;
        /* lock for serializing callback with unregister */
        spinlock_t event_cb_lock;
@@ -110,6 +114,12 @@ struct mac_ops {
 
        int                     (*mac_rx_tx_enable)(void *cgxd, int lmac_id, bool enable);
        int                     (*mac_tx_enable)(void *cgxd, int lmac_id, bool enable);
+       int                     (*pfc_config)(void *cgxd, int lmac_id,
+                                             u8 tx_pause, u8 rx_pause, u16 pfc_en);
+
+       int                     (*mac_get_pfc_frm_cfg)(void *cgxd, int lmac_id,
+                                                      u8 *tx_pause, u8 *rx_pause);
+
 };
 
 struct cgx {
index 58e2aee..550cb11 100644 (file)
@@ -172,6 +172,8 @@ M(RPM_STATS,                0x21C, rpm_stats, msg_req, rpm_stats_rsp)       \
 M(CGX_MAC_ADDR_RESET,  0x21D, cgx_mac_addr_reset, msg_req, msg_rsp)    \
 M(CGX_MAC_ADDR_UPDATE, 0x21E, cgx_mac_addr_update, cgx_mac_addr_update_req, \
                               msg_rsp)                                 \
+M(CGX_PRIO_FLOW_CTRL_CFG, 0x21F, cgx_prio_flow_ctrl_cfg, cgx_pfc_cfg,  \
+                                cgx_pfc_rsp)                               \
 /* NPA mbox IDs (range 0x400 - 0x5FF) */                               \
 M(NPA_LF_ALLOC,                0x400, npa_lf_alloc,                            \
                                npa_lf_alloc_req, npa_lf_alloc_rsp)     \
@@ -609,6 +611,21 @@ struct rpm_stats_rsp {
        u64 tx_stats[RPM_TX_STATS_COUNT];
 };
 
+struct cgx_pfc_cfg {
+       struct mbox_msghdr hdr;
+       u8 rx_pause;
+       u8 tx_pause;
+       u16 pfc_en; /* bitmap of PFC enabled traffic classes */
+};
+
+struct cgx_pfc_rsp {
+       struct mbox_msghdr hdr;
+       u8 rx_pause;
+       u8 tx_pause;
+};
+
 struct npc_set_pkind {
        struct mbox_msghdr hdr;
 #define OTX2_PRIV_FLAGS_DEFAULT  BIT_ULL(0)
@@ -1603,6 +1620,8 @@ enum cgx_af_status {
        LMAC_AF_ERR_INVALID_PARAM       = -1101,
        LMAC_AF_ERR_PF_NOT_MAPPED       = -1102,
        LMAC_AF_ERR_PERM_DENIED         = -1103,
+       LMAC_AF_ERR_PFC_ENADIS_PERM_DENIED       = -1104,
+       LMAC_AF_ERR_8023PAUSE_ENADIS_PERM_DENIED = -1105,
 };
 
 #endif /* MBOX_H */
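The new M() entry ties mailbox ID 0x21F to a handler/allocator pair generated by the mbox macro machinery; conceptually (simplified, not a literal macro expansion):

/* AF side, implemented in rvu_cgx.c below:
 *   int rvu_mbox_handler_cgx_prio_flow_ctrl_cfg(struct rvu *rvu,
 *                                               struct cgx_pfc_cfg *req,
 *                                               struct cgx_pfc_rsp *rsp);
 *
 * PF side, used from otx2_dcbnl.c below:
 *   struct cgx_pfc_cfg *
 *   otx2_mbox_alloc_msg_cgx_prio_flow_ctrl_cfg(struct mbox *mbox);
 */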
index 9ea2f6a..d7a8aad 100644 (file)
@@ -32,6 +32,8 @@ static struct mac_ops rpm_mac_ops   = {
        .mac_enadis_ptp_config =        rpm_lmac_ptp_config,
        .mac_rx_tx_enable =             rpm_lmac_rx_tx_enable,
        .mac_tx_enable =                rpm_lmac_tx_enable,
+       .pfc_config =                   rpm_lmac_pfc_config,
+       .mac_get_pfc_frm_cfg   =        rpm_lmac_get_pfc_frm_cfg,
 };
 
 struct mac_ops *rpm_get_mac_ops(void)
@@ -96,11 +98,20 @@ int rpm_lmac_rx_tx_enable(void *rpmd, int lmac_id, bool enable)
 void rpm_lmac_enadis_rx_pause_fwding(void *rpmd, int lmac_id, bool enable)
 {
        rpm_t *rpm = rpmd;
+       struct lmac *lmac;
        u64 cfg;
 
        if (!rpm)
                return;
 
+       lmac = lmac_pdata(lmac_id, rpm);
+       if (!lmac)
+               return;
+
+       /* Pause frames are not enabled, just return */
+       if (!bitmap_weight(lmac->rx_fc_pfvf_bmap.bmap, lmac->rx_fc_pfvf_bmap.max))
+               return;
+
        if (enable) {
                cfg = rpm_read(rpm, lmac_id, RPMX_MTI_MAC100X_COMMAND_CONFIG);
                cfg &= ~RPMX_MTI_MAC100X_COMMAND_CONFIG_PAUSE_IGNORE;
@@ -122,13 +133,93 @@ int rpm_lmac_get_pause_frm_status(void *rpmd, int lmac_id,
                return -ENODEV;
 
        cfg = rpm_read(rpm, lmac_id, RPMX_MTI_MAC100X_COMMAND_CONFIG);
-       *rx_pause = !(cfg & RPMX_MTI_MAC100X_COMMAND_CONFIG_RX_P_DISABLE);
+       if (!(cfg & RPMX_MTI_MAC100X_COMMAND_CONFIG_PFC_MODE)) {
+               *rx_pause = !(cfg & RPMX_MTI_MAC100X_COMMAND_CONFIG_RX_P_DISABLE);
+               *tx_pause = !(cfg & RPMX_MTI_MAC100X_COMMAND_CONFIG_TX_P_DISABLE);
+       }
 
-       cfg = rpm_read(rpm, lmac_id, RPMX_MTI_MAC100X_COMMAND_CONFIG);
-       *tx_pause = !(cfg & RPMX_MTI_MAC100X_COMMAND_CONFIG_TX_P_DISABLE);
        return 0;
 }
 
+static void rpm_cfg_pfc_quanta_thresh(rpm_t *rpm, int lmac_id, u16 pfc_en,
+                                     bool enable)
+{
+       u64 quanta_offset = 0, quanta_thresh = 0, cfg;
+       int i, shift;
+
+       /* Set pause time and interval */
+       for_each_set_bit(i, (unsigned long *)&pfc_en, 16) {
+               switch (i) {
+               case 0:
+               case 1:
+                       quanta_offset = RPMX_MTI_MAC100X_CL01_PAUSE_QUANTA;
+                       quanta_thresh = RPMX_MTI_MAC100X_CL01_QUANTA_THRESH;
+                       break;
+               case 2:
+               case 3:
+                       quanta_offset = RPMX_MTI_MAC100X_CL23_PAUSE_QUANTA;
+                       quanta_thresh = RPMX_MTI_MAC100X_CL23_QUANTA_THRESH;
+                       break;
+               case 4:
+               case 5:
+                       quanta_offset = RPMX_MTI_MAC100X_CL45_PAUSE_QUANTA;
+                       quanta_thresh = RPMX_MTI_MAC100X_CL45_QUANTA_THRESH;
+                       break;
+               case 6:
+               case 7:
+                       quanta_offset = RPMX_MTI_MAC100X_CL67_PAUSE_QUANTA;
+                       quanta_thresh = RPMX_MTI_MAC100X_CL67_QUANTA_THRESH;
+                       break;
+               case 8:
+               case 9:
+                       quanta_offset = RPMX_MTI_MAC100X_CL89_PAUSE_QUANTA;
+                       quanta_thresh = RPMX_MTI_MAC100X_CL89_QUANTA_THRESH;
+                       break;
+               case 10:
+               case 11:
+                       quanta_offset = RPMX_MTI_MAC100X_CL1011_PAUSE_QUANTA;
+                       quanta_thresh = RPMX_MTI_MAC100X_CL1011_QUANTA_THRESH;
+                       break;
+               case 12:
+               case 13:
+                       quanta_offset = RPMX_MTI_MAC100X_CL1213_PAUSE_QUANTA;
+                       quanta_thresh = RPMX_MTI_MAC100X_CL1213_QUANTA_THRESH;
+                       break;
+               case 14:
+               case 15:
+                       quanta_offset = RPMX_MTI_MAC100X_CL1415_PAUSE_QUANTA;
+                       quanta_thresh = RPMX_MTI_MAC100X_CL1415_QUANTA_THRESH;
+                       break;
+               }
+
+               if (!quanta_offset || !quanta_thresh)
+                       continue;
+
+               shift = i % 2;
+               cfg = rpm_read(rpm, lmac_id, quanta_offset);
+               if (enable) {
+                       cfg |= (u64)RPM_DEFAULT_PAUSE_TIME << (shift * 16);
+               } else {
+                       if (!shift)
+                               cfg &= ~GENMASK_ULL(15, 0);
+                       else
+                               cfg &= ~GENMASK_ULL(31, 16);
+               }
+               rpm_write(rpm, lmac_id, quanta_offset, cfg);
+
+               cfg = rpm_read(rpm, lmac_id, quanta_thresh);
+               if (enable) {
+                       cfg |= (u64)(RPM_DEFAULT_PAUSE_TIME / 2) << (shift * 16);
+               } else {
+                       if (!shift)
+                               cfg &= ~GENMASK_ULL(15, 0);
+                       else
+                               cfg &= ~GENMASK_ULL(31, 16);
+               }
+               rpm_write(rpm, lmac_id, quanta_thresh, cfg);
+       }
+}
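A worked example of the layout handled above: each CLxy register packs two classes at 16 bits apiece, odd classes in bits 31:16. With pfc_en bit 5 set and RPM_DEFAULT_PAUSE_TIME of 0xFFFF:

	/* i = 5 selects the CL45 pair and shift = 1, so enable does:
	 *   quanta |= (u64)0xFFFF << 16;          -> bits 31:16
	 *   thresh |= (u64)(0xFFFF / 2) << 16;    -> bits 31:16
	 * leaving class 4 in bits 15:0 untouched.
	 */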
+
 int rpm_lmac_enadis_pause_frm(void *rpmd, int lmac_id, u8 tx_pause,
                              u8 rx_pause)
 {
@@ -152,8 +243,12 @@ int rpm_lmac_enadis_pause_frm(void *rpmd, int lmac_id, u8 tx_pause,
 
        cfg = rpm_read(rpm, 0, RPMX_CMR_RX_OVR_BP);
        if (tx_pause) {
+               /* Configure CL0 Pause Quanta & threshold for 802.3X frames */
+               rpm_cfg_pfc_quanta_thresh(rpm, lmac_id, 1, true);
                cfg &= ~RPMX_CMR_RX_OVR_BP_EN(lmac_id);
        } else {
+               /* Disable all Pause Quanta & threshold values */
+               rpm_cfg_pfc_quanta_thresh(rpm, lmac_id, 0xffff, false);
                cfg |= RPMX_CMR_RX_OVR_BP_EN(lmac_id);
                cfg &= ~RPMX_CMR_RX_OVR_BP_BP(lmac_id);
        }
@@ -166,56 +261,20 @@ void rpm_lmac_pause_frm_config(void *rpmd, int lmac_id, bool enable)
        rpm_t *rpm = rpmd;
        u64 cfg;
 
-       if (enable) {
-               /* Enable 802.3 pause frame mode */
-               cfg = rpm_read(rpm, lmac_id, RPMX_MTI_MAC100X_COMMAND_CONFIG);
-               cfg &= ~RPMX_MTI_MAC100X_COMMAND_CONFIG_PFC_MODE;
-               rpm_write(rpm, lmac_id, RPMX_MTI_MAC100X_COMMAND_CONFIG, cfg);
-
-               /* Enable receive pause frames */
-               cfg = rpm_read(rpm, lmac_id, RPMX_MTI_MAC100X_COMMAND_CONFIG);
-               cfg &= ~RPMX_MTI_MAC100X_COMMAND_CONFIG_RX_P_DISABLE;
-               rpm_write(rpm, lmac_id, RPMX_MTI_MAC100X_COMMAND_CONFIG, cfg);
-
-               /* Enable forward pause to TX block */
-               cfg = rpm_read(rpm, lmac_id, RPMX_MTI_MAC100X_COMMAND_CONFIG);
-               cfg &= ~RPMX_MTI_MAC100X_COMMAND_CONFIG_PAUSE_IGNORE;
-               rpm_write(rpm, lmac_id, RPMX_MTI_MAC100X_COMMAND_CONFIG, cfg);
-
-               /* Enable pause frames transmission */
-               cfg = rpm_read(rpm, lmac_id, RPMX_MTI_MAC100X_COMMAND_CONFIG);
-               cfg &= ~RPMX_MTI_MAC100X_COMMAND_CONFIG_TX_P_DISABLE;
-               rpm_write(rpm, lmac_id, RPMX_MTI_MAC100X_COMMAND_CONFIG, cfg);
-
-               /* Set pause time and interval */
-               cfg = rpm_read(rpm, lmac_id,
-                              RPMX_MTI_MAC100X_CL01_PAUSE_QUANTA);
-               cfg &= ~0xFFFFULL;
-               rpm_write(rpm, lmac_id, RPMX_MTI_MAC100X_CL01_PAUSE_QUANTA,
-                         cfg | RPM_DEFAULT_PAUSE_TIME);
-               /* Set pause interval as the hardware default is too short */
-               cfg = rpm_read(rpm, lmac_id,
-                              RPMX_MTI_MAC100X_CL01_QUANTA_THRESH);
-               cfg &= ~0xFFFFULL;
-               rpm_write(rpm, lmac_id, RPMX_MTI_MAC100X_CL01_QUANTA_THRESH,
-                         cfg | (RPM_DEFAULT_PAUSE_TIME / 2));
-
-       } else {
-               /* ALL pause frames received are completely ignored */
-               cfg = rpm_read(rpm, lmac_id, RPMX_MTI_MAC100X_COMMAND_CONFIG);
-               cfg |= RPMX_MTI_MAC100X_COMMAND_CONFIG_RX_P_DISABLE;
-               rpm_write(rpm, lmac_id, RPMX_MTI_MAC100X_COMMAND_CONFIG, cfg);
+       /* ALL pause frames received are completely ignored */
+       cfg = rpm_read(rpm, lmac_id, RPMX_MTI_MAC100X_COMMAND_CONFIG);
+       cfg |= RPMX_MTI_MAC100X_COMMAND_CONFIG_RX_P_DISABLE;
+       rpm_write(rpm, lmac_id, RPMX_MTI_MAC100X_COMMAND_CONFIG, cfg);
 
-               /* Disable forward pause to TX block */
-               cfg = rpm_read(rpm, lmac_id, RPMX_MTI_MAC100X_COMMAND_CONFIG);
-               cfg |= RPMX_MTI_MAC100X_COMMAND_CONFIG_PAUSE_IGNORE;
-               rpm_write(rpm, lmac_id, RPMX_MTI_MAC100X_COMMAND_CONFIG, cfg);
+       /* Disable forward pause to TX block */
+       cfg = rpm_read(rpm, lmac_id, RPMX_MTI_MAC100X_COMMAND_CONFIG);
+       cfg |= RPMX_MTI_MAC100X_COMMAND_CONFIG_PAUSE_IGNORE;
+       rpm_write(rpm, lmac_id, RPMX_MTI_MAC100X_COMMAND_CONFIG, cfg);
 
-               /* Disable pause frames transmission */
-               cfg = rpm_read(rpm, lmac_id, RPMX_MTI_MAC100X_COMMAND_CONFIG);
-               cfg |= RPMX_MTI_MAC100X_COMMAND_CONFIG_TX_P_DISABLE;
-               rpm_write(rpm, lmac_id, RPMX_MTI_MAC100X_COMMAND_CONFIG, cfg);
-       }
+       /* Disable pause frames transmission */
+       cfg = rpm_read(rpm, lmac_id, RPMX_MTI_MAC100X_COMMAND_CONFIG);
+       cfg |= RPMX_MTI_MAC100X_COMMAND_CONFIG_TX_P_DISABLE;
+       rpm_write(rpm, lmac_id, RPMX_MTI_MAC100X_COMMAND_CONFIG, cfg);
 }
 
 int rpm_get_rx_stats(void *rpmd, int lmac_id, int idx, u64 *rx_stat)
@@ -323,3 +382,65 @@ void rpm_lmac_ptp_config(void *rpmd, int lmac_id, bool enable)
                cfg &= ~RPMX_RX_TS_PREPEND;
        rpm_write(rpm, lmac_id, RPMX_CMRX_CFG, cfg);
 }
+
+int rpm_lmac_pfc_config(void *rpmd, int lmac_id, u8 tx_pause, u8 rx_pause, u16 pfc_en)
+{
+       rpm_t *rpm = rpmd;
+       u64 cfg;
+
+       if (!is_lmac_valid(rpm, lmac_id))
+               return -ENODEV;
+
+       /* reset PFC class quanta and threshold */
+       rpm_cfg_pfc_quanta_thresh(rpm, lmac_id, 0xffff, false);
+
+       cfg = rpm_read(rpm, lmac_id, RPMX_MTI_MAC100X_COMMAND_CONFIG);
+
+       if (rx_pause) {
+               cfg &= ~(RPMX_MTI_MAC100X_COMMAND_CONFIG_RX_P_DISABLE |
+                               RPMX_MTI_MAC100X_COMMAND_CONFIG_PAUSE_IGNORE |
+                               RPMX_MTI_MAC100X_COMMAND_CONFIG_PAUSE_FWD);
+       } else {
+               cfg |= (RPMX_MTI_MAC100X_COMMAND_CONFIG_RX_P_DISABLE |
+                               RPMX_MTI_MAC100X_COMMAND_CONFIG_PAUSE_IGNORE |
+                               RPMX_MTI_MAC100X_COMMAND_CONFIG_PAUSE_FWD);
+       }
+
+       if (tx_pause) {
+               rpm_cfg_pfc_quanta_thresh(rpm, lmac_id, pfc_en, true);
+               cfg &= ~RPMX_MTI_MAC100X_COMMAND_CONFIG_TX_P_DISABLE;
+       } else {
+               rpm_cfg_pfc_quanta_thresh(rpm, lmac_id, 0xffff, false);
+               cfg |= RPMX_MTI_MAC100X_COMMAND_CONFIG_TX_P_DISABLE;
+       }
+
+       if (!rx_pause && !tx_pause)
+               cfg &= ~RPMX_MTI_MAC100X_COMMAND_CONFIG_PFC_MODE;
+       else
+               cfg |= RPMX_MTI_MAC100X_COMMAND_CONFIG_PFC_MODE;
+
+       rpm_write(rpm, lmac_id, RPMX_MTI_MAC100X_COMMAND_CONFIG, cfg);
+
+       cfg = rpm_read(rpm, lmac_id, RPMX_CMRX_PRT_CBFC_CTL);
+       cfg = FIELD_SET(RPM_PFC_CLASS_MASK, pfc_en, cfg);
+       rpm_write(rpm, lmac_id, RPMX_CMRX_PRT_CBFC_CTL, cfg);
+
+       return 0;
+}
+
+int rpm_lmac_get_pfc_frm_cfg(void *rpmd, int lmac_id, u8 *tx_pause, u8 *rx_pause)
+{
+       rpm_t *rpm = rpmd;
+       u64 cfg;
+
+       if (!is_lmac_valid(rpm, lmac_id))
+               return -ENODEV;
+
+       cfg = rpm_read(rpm, lmac_id, RPMX_MTI_MAC100X_COMMAND_CONFIG);
+       if (cfg & RPMX_MTI_MAC100X_COMMAND_CONFIG_PFC_MODE) {
+               *rx_pause = !(cfg & RPMX_MTI_MAC100X_COMMAND_CONFIG_RX_P_DISABLE);
+               *tx_pause = !(cfg & RPMX_MTI_MAC100X_COMMAND_CONFIG_TX_P_DISABLE);
+       }
+
+       return 0;
+}
index ff58031..9ab8d49 100644 (file)
 #define RPMX_MTI_MAC100X_COMMAND_CONFIG_PAUSE_IGNORE   BIT_ULL(8)
 #define RPMX_MTI_MAC100X_COMMAND_CONFIG_PFC_MODE       BIT_ULL(19)
 #define RPMX_MTI_MAC100X_CL01_PAUSE_QUANTA             0x80A8
+#define RPMX_MTI_MAC100X_CL23_PAUSE_QUANTA             0x80B0
+#define RPMX_MTI_MAC100X_CL45_PAUSE_QUANTA             0x80B8
+#define RPMX_MTI_MAC100X_CL67_PAUSE_QUANTA             0x80C0
 #define RPMX_MTI_MAC100X_CL01_QUANTA_THRESH            0x80C8
+#define RPMX_MTI_MAC100X_CL23_QUANTA_THRESH            0x80D0
+#define RPMX_MTI_MAC100X_CL45_QUANTA_THRESH            0x80D8
+#define RPMX_MTI_MAC100X_CL67_QUANTA_THRESH            0x80E0
+#define RPMX_MTI_MAC100X_CL89_PAUSE_QUANTA             0x8108
+#define RPMX_MTI_MAC100X_CL1011_PAUSE_QUANTA           0x8110
+#define RPMX_MTI_MAC100X_CL1213_PAUSE_QUANTA           0x8118
+#define RPMX_MTI_MAC100X_CL1415_PAUSE_QUANTA           0x8120
+#define RPMX_MTI_MAC100X_CL89_QUANTA_THRESH            0x8128
+#define RPMX_MTI_MAC100X_CL1011_QUANTA_THRESH          0x8130
+#define RPMX_MTI_MAC100X_CL1213_QUANTA_THRESH          0x8138
+#define RPMX_MTI_MAC100X_CL1415_QUANTA_THRESH          0x8140
 #define RPM_DEFAULT_PAUSE_TIME                 0xFFFF
 #define RPMX_CMR_RX_OVR_BP             0x4120
 #define RPMX_CMR_RX_OVR_BP_EN(x)       BIT_ULL((x) + 8)
 #define RPM_LMAC_FWI                   0xa
 #define RPM_TX_EN                      BIT_ULL(0)
 #define RPM_RX_EN                      BIT_ULL(1)
+#define RPMX_CMRX_PRT_CBFC_CTL                         0x5B08
+#define RPMX_CMRX_PRT_CBFC_CTL_LOGL_EN_RX_SHIFT        33
+#define RPMX_CMRX_PRT_CBFC_CTL_PHYS_BP_SHIFT           16
+#define RPMX_CMRX_PRT_CBFC_CTL_LOGL_EN_TX_SHIFT        0
+#define RPM_PFC_CLASS_MASK                            GENMASK_ULL(48, 33)
+#define RPMX_MTI_MAC100X_COMMAND_CONFIG_TX_PAD_EN              BIT_ULL(11)
+#define RPMX_MTI_MAC100X_COMMAND_CONFIG_PAUSE_FWD              BIT_ULL(7)
 
 /* Function Declarations */
 int rpm_get_nr_lmacs(void *rpmd);
@@ -61,4 +87,8 @@ int rpm_get_rx_stats(void *rpmd, int lmac_id, int idx, u64 *rx_stat);
 void rpm_lmac_ptp_config(void *rpmd, int lmac_id, bool enable);
 int rpm_lmac_rx_tx_enable(void *rpmd, int lmac_id, bool enable);
 int rpm_lmac_tx_enable(void *rpmd, int lmac_id, bool enable);
+int rpm_lmac_pfc_config(void *rpmd, int lmac_id, u8 tx_pause, u8 rx_pause,
+                       u16 pfc_en);
+int rpm_lmac_get_pfc_frm_cfg(void *rpmd, int lmac_id, u8 *tx_pause,
+                            u8 *rx_pause);
 #endif /* RPM_H */
index 5ed94cf..513b43e 100644 (file)
@@ -807,6 +807,9 @@ u32  rvu_cgx_get_fifolen(struct rvu *rvu);
 void *rvu_first_cgx_pdata(struct rvu *rvu);
 int cgxlmac_to_pf(struct rvu *rvu, int cgx_id, int lmac_id);
 int rvu_cgx_config_tx(void *cgxd, int lmac_id, bool enable);
+int rvu_cgx_prio_flow_ctrl_cfg(struct rvu *rvu, u16 pcifunc, u8 tx_pause, u8 rx_pause,
+                              u16 pfc_en);
+int rvu_cgx_cfg_pause_frm(struct rvu *rvu, u16 pcifunc, u8 tx_pause, u8 rx_pause);
 
 int npc_get_nixlf_mcam_index(struct npc_mcam *mcam, u16 pcifunc, int nixlf,
                             int type);
index 8a7ac5a..9ffe998 100644 (file)
@@ -863,6 +863,45 @@ int rvu_mbox_handler_cgx_intlbk_disable(struct rvu *rvu, struct msg_req *req,
        return 0;
 }
 
+int rvu_cgx_cfg_pause_frm(struct rvu *rvu, u16 pcifunc, u8 tx_pause, u8 rx_pause)
+{
+       int pf = rvu_get_pf(pcifunc);
+       u8 rx_pfc = 0, tx_pfc = 0;
+       struct mac_ops *mac_ops;
+       u8 cgx_id, lmac_id;
+       void *cgxd;
+
+       if (!is_mac_feature_supported(rvu, pf, RVU_LMAC_FEAT_FC))
+               return 0;
+
+       /* This msg is expected only from PF/VFs that are mapped to CGX LMACs,
+        * if received from other PF/VF simply ACK, nothing to do.
+        */
+       if (!is_pf_cgxmapped(rvu, pf))
+               return LMAC_AF_ERR_PF_NOT_MAPPED;
+
+       rvu_get_cgx_lmac_id(rvu->pf2cgxlmac_map[pf], &cgx_id, &lmac_id);
+       cgxd = rvu_cgx_pdata(cgx_id, rvu);
+       mac_ops = get_mac_ops(cgxd);
+
+       mac_ops->mac_get_pfc_frm_cfg(cgxd, lmac_id, &tx_pfc, &rx_pfc);
+       if (tx_pfc || rx_pfc) {
+               dev_warn(rvu->dev,
+                        "Can not configure 802.3X flow control as PFC frames are enabled");
+               return LMAC_AF_ERR_8023PAUSE_ENADIS_PERM_DENIED;
+       }
+
+       mutex_lock(&rvu->rsrc_lock);
+       if (verify_lmac_fc_cfg(cgxd, lmac_id, tx_pause, rx_pause,
+                              pcifunc & RVU_PFVF_FUNC_MASK)) {
+               mutex_unlock(&rvu->rsrc_lock);
+               return LMAC_AF_ERR_PERM_DENIED;
+       }
+       mutex_unlock(&rvu->rsrc_lock);
+
+       return mac_ops->mac_enadis_pause_frm(cgxd, lmac_id, tx_pause, rx_pause);
+}
+
 int rvu_mbox_handler_cgx_cfg_pause_frm(struct rvu *rvu,
                                       struct cgx_pause_frm_cfg *req,
                                       struct cgx_pause_frm_cfg *rsp)
@@ -870,11 +909,9 @@ int rvu_mbox_handler_cgx_cfg_pause_frm(struct rvu *rvu,
        int pf = rvu_get_pf(req->hdr.pcifunc);
        struct mac_ops *mac_ops;
        u8 cgx_id, lmac_id;
+       int err = 0;
        void *cgxd;
 
-       if (!is_mac_feature_supported(rvu, pf, RVU_LMAC_FEAT_FC))
-               return 0;
-
        /* This msg is expected only from PF/VFs that are mapped to CGX LMACs,
         * if received from other PF/VF simply ACK, nothing to do.
         */
@@ -886,13 +923,11 @@ int rvu_mbox_handler_cgx_cfg_pause_frm(struct rvu *rvu,
        mac_ops = get_mac_ops(cgxd);
 
        if (req->set)
-               mac_ops->mac_enadis_pause_frm(cgxd, lmac_id,
-                                             req->tx_pause, req->rx_pause);
+               err = rvu_cgx_cfg_pause_frm(rvu, req->hdr.pcifunc, req->tx_pause, req->rx_pause);
        else
-               mac_ops->mac_get_pause_frm_status(cgxd, lmac_id,
-                                                 &rsp->tx_pause,
-                                                 &rsp->rx_pause);
-       return 0;
+               mac_ops->mac_get_pause_frm_status(cgxd, lmac_id, &rsp->tx_pause, &rsp->rx_pause);
+
+       return err;
 }
 
 int rvu_mbox_handler_cgx_get_phy_fec_stats(struct rvu *rvu, struct msg_req *req,
@@ -1079,3 +1114,67 @@ int rvu_mbox_handler_cgx_mac_addr_update(struct rvu *rvu,
        rvu_get_cgx_lmac_id(rvu->pf2cgxlmac_map[pf], &cgx_id, &lmac_id);
        return cgx_lmac_addr_update(cgx_id, lmac_id, req->mac_addr, req->index);
 }
+
+int rvu_cgx_prio_flow_ctrl_cfg(struct rvu *rvu, u16 pcifunc, u8 tx_pause,
+                              u8 rx_pause, u16 pfc_en)
+{
+       int pf = rvu_get_pf(pcifunc);
+       u8 rx_8023 = 0, tx_8023 = 0;
+       struct mac_ops *mac_ops;
+       u8 cgx_id, lmac_id;
+       void *cgxd;
+
+       /* This msg is expected only from PF/VFs that are mapped to CGX LMACs,
+        * if received from other PF/VF simply ACK, nothing to do.
+        */
+       if (!is_pf_cgxmapped(rvu, pf))
+               return -ENODEV;
+
+       rvu_get_cgx_lmac_id(rvu->pf2cgxlmac_map[pf], &cgx_id, &lmac_id);
+       cgxd = rvu_cgx_pdata(cgx_id, rvu);
+       mac_ops = get_mac_ops(cgxd);
+
+       mac_ops->mac_get_pause_frm_status(cgxd, lmac_id, &tx_8023, &rx_8023);
+       if (tx_8023 || rx_8023) {
+               dev_warn(rvu->dev,
+                        "Can not configure PFC as 802.3X pause frames are enabled");
+               return LMAC_AF_ERR_PFC_ENADIS_PERM_DENIED;
+       }
+
+       mutex_lock(&rvu->rsrc_lock);
+       if (verify_lmac_fc_cfg(cgxd, lmac_id, tx_pause, rx_pause,
+                              pcifunc & RVU_PFVF_FUNC_MASK)) {
+               mutex_unlock(&rvu->rsrc_lock);
+               return LMAC_AF_ERR_PERM_DENIED;
+       }
+       mutex_unlock(&rvu->rsrc_lock);
+
+       return mac_ops->pfc_config(cgxd, lmac_id, tx_pause, rx_pause, pfc_en);
+}
+
+int rvu_mbox_handler_cgx_prio_flow_ctrl_cfg(struct rvu *rvu,
+                                           struct cgx_pfc_cfg *req,
+                                           struct cgx_pfc_rsp *rsp)
+{
+       int pf = rvu_get_pf(req->hdr.pcifunc);
+       struct mac_ops *mac_ops;
+       u8 cgx_id, lmac_id;
+       void *cgxd;
+       int err;
+
+       /* This msg is expected only from PF/VFs that are mapped to CGX LMACs,
+        * if received from other PF/VF simply ACK, nothing to do.
+        */
+       if (!is_pf_cgxmapped(rvu, pf))
+               return -ENODEV;
+
+       rvu_get_cgx_lmac_id(rvu->pf2cgxlmac_map[pf], &cgx_id, &lmac_id);
+       cgxd = rvu_cgx_pdata(cgx_id, rvu);
+       mac_ops = get_mac_ops(cgxd);
+
+       err = rvu_cgx_prio_flow_ctrl_cfg(rvu, req->hdr.pcifunc, req->tx_pause,
+                                        req->rx_pause, req->pfc_en);
+
+       mac_ops->mac_get_pfc_frm_cfg(cgxd, lmac_id, &rsp->tx_pause, &rsp->rx_pause);
+       return err;
+}
index 97fb619..0fa625e 100644 (file)
@@ -296,7 +296,6 @@ static int nix_interface_init(struct rvu *rvu, u16 pcifunc, int type, int nixlf,
        struct rvu_hwinfo *hw = rvu->hw;
        struct sdp_node_info *sdp_info;
        int pkind, pf, vf, lbkid, vfid;
-       struct mac_ops *mac_ops;
        u8 cgx_id, lmac_id;
        bool from_vf;
        int err;
@@ -326,13 +325,6 @@ static int nix_interface_init(struct rvu *rvu, u16 pcifunc, int type, int nixlf,
                cgx_set_pkind(rvu_cgx_pdata(cgx_id, rvu), lmac_id, pkind);
                rvu_npc_set_pkind(rvu, pkind, pfvf);
 
-               mac_ops = get_mac_ops(rvu_cgx_pdata(cgx_id, rvu));
-
-               /* By default we enable pause frames */
-               if ((pcifunc & RVU_PFVF_FUNC_MASK) == 0)
-                       mac_ops->mac_enadis_pause_frm(rvu_cgx_pdata(cgx_id,
-                                                                   rvu),
-                                                     lmac_id, true, true);
                break;
        case NIX_INTF_TYPE_LBK:
                vf = (pcifunc & RVU_PFVF_FUNC_MASK) - 1;
@@ -533,7 +525,7 @@ static int rvu_nix_get_bpid(struct rvu *rvu, struct nix_bp_cfg_req *req,
         */
        switch (type) {
        case NIX_INTF_TYPE_CGX:
-               if ((req->chan_base + req->chan_cnt) > 15)
+               if ((req->chan_base + req->chan_cnt) > 16)
                        return -EINVAL;
                rvu_get_cgx_lmac_id(pfvf->cgx_lmac, &cgx_id, &lmac_id);
                /* Assign bpid based on cgx, lmac and chan id */
@@ -4578,6 +4570,12 @@ void rvu_nix_lf_teardown(struct rvu *rvu, u16 pcifunc, int blkaddr, int nixlf)
                pfvf->hw_rx_tstamp_en = false;
        }
 
+       /* reset priority flow control config */
+       rvu_cgx_prio_flow_ctrl_cfg(rvu, pcifunc, 0, 0, 0);
+
+       /* reset 802.3x flow control config */
+       rvu_cgx_cfg_pause_frm(rvu, pcifunc, 0, 0);
+
        nix_ctx_free(rvu, pfvf);
 
        nix_free_all_bandprof(rvu, pcifunc);
@@ -5314,6 +5312,7 @@ int rvu_nix_setup_ratelimit_aggr(struct rvu *rvu, u16 pcifunc,
        aq_req.ctype = NIX_AQ_CTYPE_BANDPROF;
        aq_req.op = NIX_AQ_INSTOP_WRITE;
        memcpy(&aq_req.prof, &aq_rsp.prof, sizeof(struct nix_bandprof_s));
+       memset((char *)&aq_req.prof_mask, 0xff, sizeof(struct nix_bandprof_s));
        /* Clear higher layer enable bit in the mid profile, just in case */
        aq_req.prof.hl_en = 0;
        aq_req.prof_mask.hl_en = 1;
index 0048b59..d463dc7 100644 (file)
@@ -11,4 +11,7 @@ rvu_nicpf-y := otx2_pf.o otx2_common.o otx2_txrx.o otx2_ethtool.o \
                otx2_devlink.o
 rvu_nicvf-y := otx2_vf.o otx2_devlink.o
 
+rvu_nicpf-$(CONFIG_DCB) += otx2_dcbnl.o
+rvu_nicvf-$(CONFIG_DCB) += otx2_dcbnl.o
+
 ccflags-y += -I$(srctree)/drivers/net/ethernet/marvell/octeontx2/af
index 66da31f..2c97608 100644 (file)
@@ -222,8 +222,11 @@ EXPORT_SYMBOL(otx2_set_mac_address);
 int otx2_hw_set_mtu(struct otx2_nic *pfvf, int mtu)
 {
        struct nix_frs_cfg *req;
+       u16 maxlen;
        int err;
 
+       maxlen = otx2_get_max_mtu(pfvf) + OTX2_ETH_HLEN + OTX2_HW_TIMESTAMP_LEN;
+
        mutex_lock(&pfvf->mbox.lock);
        req = otx2_mbox_alloc_msg_nix_set_hw_frs(&pfvf->mbox);
        if (!req) {
@@ -233,6 +236,10 @@ int otx2_hw_set_mtu(struct otx2_nic *pfvf, int mtu)
 
        req->maxlen = pfvf->netdev->mtu + OTX2_ETH_HLEN + OTX2_HW_TIMESTAMP_LEN;
 
+       /* Use max receive length supported by hardware for loopback devices */
+       if (is_otx2_lbkvf(pfvf->pdev))
+               req->maxlen = maxlen;
+
        err = otx2_sync_mbox_msg(&pfvf->mbox);
        mutex_unlock(&pfvf->mbox.lock);
        return err;
@@ -262,6 +269,7 @@ unlock:
        mutex_unlock(&pfvf->mbox.lock);
        return err;
 }
+EXPORT_SYMBOL(otx2_config_pause_frm);
 
 int otx2_set_flowkey_cfg(struct otx2_nic *pfvf)
 {
@@ -931,7 +939,11 @@ static int otx2_cq_init(struct otx2_nic *pfvf, u16 qidx)
                if (!is_otx2_lbkvf(pfvf->pdev)) {
                        /* Enable receive CQ backpressure */
                        aq->cq.bp_ena = 1;
+#ifdef CONFIG_DCB
+                       aq->cq.bpid = pfvf->bpid[pfvf->queue_to_pfc_map[qidx]];
+#else
                        aq->cq.bpid = pfvf->bpid[0];
+#endif
 
                        /* Set backpressure level is same as cq pass level */
                        aq->cq.bp = RQ_PASS_LVL_CQ(pfvf->hw.rq_skid, qset->rqe_cnt);
@@ -1211,7 +1223,11 @@ static int otx2_aura_init(struct otx2_nic *pfvf, int aura_id,
                 */
                if (pfvf->nix_blkaddr == BLKADDR_NIX1)
                        aq->aura.bp_ena = 1;
+#ifdef CONFIG_DCB
+               aq->aura.nix0_bpid = pfvf->bpid[pfvf->queue_to_pfc_map[aura_id]];
+#else
                aq->aura.nix0_bpid = pfvf->bpid[0];
+#endif
 
                /* Set backpressure level for RQ's Aura */
                aq->aura.bp = RQ_BP_LVL_AURA;
@@ -1538,11 +1554,18 @@ int otx2_nix_config_bp(struct otx2_nic *pfvf, bool enable)
                return -ENOMEM;
 
        req->chan_base = 0;
-       req->chan_cnt = 1;
+#ifdef CONFIG_DCB
+       req->chan_cnt = pfvf->pfc_en ? IEEE_8021QAZ_MAX_TCS : 1;
+       req->bpid_per_chan = pfvf->pfc_en ? 1 : 0;
+#else
+       req->chan_cnt =  1;
        req->bpid_per_chan = 0;
+#endif
+
 
        return otx2_sync_mbox_msg(&pfvf->mbox);
 }
+EXPORT_SYMBOL(otx2_nix_config_bp);
 
 /* Mbox message handlers */
 void mbox_handler_cgx_stats(struct otx2_nic *pfvf,
@@ -1704,6 +1727,56 @@ out:
 }
 EXPORT_SYMBOL(otx2_get_max_mtu);
 
+int otx2_handle_ntuple_tc_features(struct net_device *netdev, netdev_features_t features)
+{
+       netdev_features_t changed = features ^ netdev->features;
+       struct otx2_nic *pfvf = netdev_priv(netdev);
+       bool ntuple = !!(features & NETIF_F_NTUPLE);
+       bool tc = !!(features & NETIF_F_HW_TC);
+
+       if ((changed & NETIF_F_NTUPLE) && !ntuple)
+               otx2_destroy_ntuple_flows(pfvf);
+
+       if ((changed & NETIF_F_NTUPLE) && ntuple) {
+               if (!pfvf->flow_cfg->max_flows) {
+                       netdev_err(netdev,
+                                  "Can't enable NTUPLE, MCAM entries not allocated\n");
+                       return -EINVAL;
+               }
+       }
+
+       if ((changed & NETIF_F_HW_TC) && tc) {
+               if (!pfvf->flow_cfg->max_flows) {
+                       netdev_err(netdev,
+                                  "Can't enable TC, MCAM entries not allocated\n");
+                       return -EINVAL;
+               }
+       }
+
+       if ((changed & NETIF_F_HW_TC) && !tc &&
+           pfvf->flow_cfg && pfvf->flow_cfg->nr_flows) {
+               netdev_err(netdev, "Can't disable TC hardware offload while flows are active\n");
+               return -EBUSY;
+       }
+
+       if ((changed & NETIF_F_NTUPLE) && ntuple &&
+           (netdev->features & NETIF_F_HW_TC) && !(changed & NETIF_F_HW_TC)) {
+               netdev_err(netdev,
+                          "Can't enable NTUPLE when TC is active, disable TC and retry\n");
+               return -EINVAL;
+       }
+
+       if ((changed & NETIF_F_HW_TC) && tc &&
+           (netdev->features & NETIF_F_NTUPLE) && !(changed & NETIF_F_NTUPLE)) {
+               netdev_err(netdev,
+                          "Can't enable TC when NTUPLE is active, disable NTUPLE and retry\n");
+               return -EINVAL;
+       }
+
+       return 0;
+}
+EXPORT_SYMBOL(otx2_handle_ntuple_tc_features);
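otx2_handle_ntuple_tc_features() centralizes the NTUPLE/TC mutual-exclusion rules; a hedged sketch of the expected call site (the actual ndo_set_features wiring lives outside this hunk):

static int example_set_features(struct net_device *netdev,
				netdev_features_t features)
{
	/* assumed hookup: validate feature flips before accepting them */
	return otx2_handle_ntuple_tc_features(netdev, features);
}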
+
 #define M(_name, _id, _fn_name, _req_type, _rsp_type)                  \
 int __weak                                                             \
 otx2_mbox_up_handler_ ## _fn_name(struct otx2_nic *pfvf,               \
index 14509fc..7724f17 100644 (file)
@@ -178,6 +178,9 @@ struct otx2_hw {
        u16                     rqpool_cnt;
        u16                     sqpool_cnt;
 
+#define OTX2_DEFAULT_RBUF_LEN  2048
+       u16                     rbuf_len;
+
        /* NPA */
        u32                     stack_pg_ptrs;  /* No of ptrs per stack page */
        u32                     stack_pg_bytes; /* Size of stack page */
@@ -396,6 +399,11 @@ struct otx2_nic {
 
        /* Devlink */
        struct otx2_devlink     *dl;
+#ifdef CONFIG_DCB
+       /* PFC */
+       u8                      pfc_en;
+       u8                      *queue_to_pfc_map;
+#endif
 };
 
 static inline bool is_otx2_lbkvf(struct pci_dev *pdev)
@@ -863,6 +871,8 @@ int otx2_enable_rxvlan(struct otx2_nic *pf, bool enable);
 int otx2_install_rxvlan_offload_flow(struct otx2_nic *pfvf);
 bool otx2_xdp_sq_append_pkt(struct otx2_nic *pfvf, u64 iova, int len, u16 qidx);
 u16 otx2_get_max_mtu(struct otx2_nic *pfvf);
+int otx2_handle_ntuple_tc_features(struct net_device *netdev,
+                                  netdev_features_t features);
 /* tc support */
 int otx2_init_tc(struct otx2_nic *nic);
 void otx2_shutdown_tc(struct otx2_nic *nic);
@@ -876,4 +886,11 @@ int otx2_dmacflt_remove(struct otx2_nic *pf, const u8 *mac, u8 bit_pos);
 int otx2_dmacflt_update(struct otx2_nic *pf, u8 *mac, u8 bit_pos);
 void otx2_dmacflt_reinstall_flows(struct otx2_nic *pf);
 void otx2_dmacflt_update_pfmac_flow(struct otx2_nic *pfvf);
+
+#ifdef CONFIG_DCB
+/* DCB support*/
+void otx2_update_bpid_in_rqctx(struct otx2_nic *pfvf, int vlan_prio, int qidx, bool pfc_enable);
+int otx2_config_priority_flow_ctrl(struct otx2_nic *pfvf);
+int otx2_dcbnl_set_ops(struct net_device *dev);
+#endif
 #endif /* OTX2_COMMON_H */
diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_dcbnl.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_dcbnl.c
new file mode 100644 (file)
index 0000000..723d250
--- /dev/null
@@ -0,0 +1,170 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Marvell RVU Ethernet driver
+ *
+ * Copyright (C) 2021 Marvell.
+ *
+ */
+
+#include "otx2_common.h"
+
+int otx2_config_priority_flow_ctrl(struct otx2_nic *pfvf)
+{
+       struct cgx_pfc_cfg *req;
+       struct cgx_pfc_rsp *rsp;
+       int err = 0;
+
+       if (is_otx2_lbkvf(pfvf->pdev))
+               return 0;
+
+       mutex_lock(&pfvf->mbox.lock);
+       req = otx2_mbox_alloc_msg_cgx_prio_flow_ctrl_cfg(&pfvf->mbox);
+       if (!req) {
+               err = -ENOMEM;
+               goto unlock;
+       }
+
+       if (pfvf->pfc_en) {
+               req->rx_pause = true;
+               req->tx_pause = true;
+       } else {
+               req->rx_pause = false;
+               req->tx_pause = false;
+       }
+       req->pfc_en = pfvf->pfc_en;
+
+       if (!otx2_sync_mbox_msg(&pfvf->mbox)) {
+               rsp = (struct cgx_pfc_rsp *)
+                      otx2_mbox_get_rsp(&pfvf->mbox.mbox, 0, &req->hdr);
+               if (req->rx_pause != rsp->rx_pause || req->tx_pause != rsp->tx_pause) {
+                       dev_warn(pfvf->dev,
+                                "Failed to config PFC\n");
+                       err = -EPERM;
+               }
+       }
+unlock:
+       mutex_unlock(&pfvf->mbox.lock);
+       return err;
+}
+
+void otx2_update_bpid_in_rqctx(struct otx2_nic *pfvf, int vlan_prio, int qidx,
+                              bool pfc_enable)
+{
+       bool if_up = netif_running(pfvf->netdev);
+       struct npa_aq_enq_req *npa_aq;
+       struct nix_aq_enq_req *aq;
+       int err = 0;
+
+       if (pfvf->queue_to_pfc_map[qidx] && pfc_enable) {
+               dev_warn(pfvf->dev,
+                        "PFC enable not permitted as Priority %d already mapped to Queue %d\n",
+                        pfvf->queue_to_pfc_map[qidx], qidx);
+               return;
+       }
+
+       if (if_up) {
+               netif_tx_stop_all_queues(pfvf->netdev);
+               netif_carrier_off(pfvf->netdev);
+       }
+
+       pfvf->queue_to_pfc_map[qidx] = vlan_prio;
+
+       aq = otx2_mbox_alloc_msg_nix_aq_enq(&pfvf->mbox);
+       if (!aq) {
+               err = -ENOMEM;
+               goto out;
+       }
+
+       aq->cq.bpid = pfvf->bpid[vlan_prio];
+       aq->cq_mask.bpid = GENMASK(8, 0);
+
+       /* Fill AQ info */
+       aq->qidx = qidx;
+       aq->ctype = NIX_AQ_CTYPE_CQ;
+       aq->op = NIX_AQ_INSTOP_WRITE;
+
+       otx2_sync_mbox_msg(&pfvf->mbox);
+
+       npa_aq = otx2_mbox_alloc_msg_npa_aq_enq(&pfvf->mbox);
+       if (!npa_aq) {
+               err = -ENOMEM;
+               goto out;
+       }
+       npa_aq->aura.nix0_bpid = pfvf->bpid[vlan_prio];
+       npa_aq->aura_mask.nix0_bpid = GENMASK(8, 0);
+
+       /* Fill NPA AQ info */
+       npa_aq->aura_id = qidx;
+       npa_aq->ctype = NPA_AQ_CTYPE_AURA;
+       npa_aq->op = NPA_AQ_INSTOP_WRITE;
+       otx2_sync_mbox_msg(&pfvf->mbox);
+
+out:
+       if (if_up) {
+               netif_carrier_on(pfvf->netdev);
+               netif_tx_start_all_queues(pfvf->netdev);
+       }
+
+       if (err)
+               dev_warn(pfvf->dev,
+                        "Updating BPIDs in CQ and Aura contexts of RQ%d failed with err %d\n",
+                        qidx, err);
+}
+
+static int otx2_dcbnl_ieee_getpfc(struct net_device *dev, struct ieee_pfc *pfc)
+{
+       struct otx2_nic *pfvf = netdev_priv(dev);
+
+       pfc->pfc_cap = IEEE_8021QAZ_MAX_TCS;
+       pfc->pfc_en = pfvf->pfc_en;
+
+       return 0;
+}
+
+static int otx2_dcbnl_ieee_setpfc(struct net_device *dev, struct ieee_pfc *pfc)
+{
+       struct otx2_nic *pfvf = netdev_priv(dev);
+       int err;
+
+       /* Save PFC configuration to interface */
+       pfvf->pfc_en = pfc->pfc_en;
+
+       err = otx2_config_priority_flow_ctrl(pfvf);
+       if (err)
+               return err;
+
+       /* Request per-channel BPIDs */
+       if (pfc->pfc_en)
+               otx2_nix_config_bp(pfvf, true);
+
+       return 0;
+}
+
+static u8 otx2_dcbnl_getdcbx(struct net_device __always_unused *dev)
+{
+       return DCB_CAP_DCBX_HOST | DCB_CAP_DCBX_VER_IEEE;
+}
+
+static u8 otx2_dcbnl_setdcbx(struct net_device __always_unused *dev, u8 mode)
+{
+       return (mode != (DCB_CAP_DCBX_HOST | DCB_CAP_DCBX_VER_IEEE)) ? 1 : 0;
+}
+
+static const struct dcbnl_rtnl_ops otx2_dcbnl_ops = {
+       .ieee_getpfc    = otx2_dcbnl_ieee_getpfc,
+       .ieee_setpfc    = otx2_dcbnl_ieee_setpfc,
+       .getdcbx        = otx2_dcbnl_getdcbx,
+       .setdcbx        = otx2_dcbnl_setdcbx,
+};
+
+int otx2_dcbnl_set_ops(struct net_device *dev)
+{
+       struct otx2_nic *pfvf = netdev_priv(dev);
+
+       pfvf->queue_to_pfc_map = devm_kzalloc(pfvf->dev, pfvf->hw.rx_queues,
+                                             GFP_KERNEL);
+       if (!pfvf->queue_to_pfc_map)
+               return -ENOMEM;
+       dev->dcbnl_ops = &otx2_dcbnl_ops;
+
+       return 0;
+}
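
Once the dcbnl ops are installed, PFC on this interface is configured through the standard DCB netlink interface rather than a driver-private knob. For example, with the iproute2 dcb tool (device name and tool availability are assumptions here):

    # Enable PFC for priority 3 and read the configuration back
    dcb pfc set dev eth0 prio-pfc 3:on
    dcb pfc show dev eth0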
index d85db90..abe5267 100644 (file)
@@ -371,6 +371,7 @@ static void otx2_get_ringparam(struct net_device *netdev,
        ring->rx_pending = qs->rqe_cnt ? qs->rqe_cnt : Q_COUNT(Q_SIZE_256);
        ring->tx_max_pending = Q_COUNT(Q_SIZE_MAX);
        ring->tx_pending = qs->sqe_cnt ? qs->sqe_cnt : Q_COUNT(Q_SIZE_4K);
+       kernel_ring->rx_buf_len = pfvf->hw.rbuf_len;
 }
 
 static int otx2_set_ringparam(struct net_device *netdev,
@@ -379,6 +380,8 @@ static int otx2_set_ringparam(struct net_device *netdev,
                              struct netlink_ext_ack *extack)
 {
        struct otx2_nic *pfvf = netdev_priv(netdev);
+       u32 rx_buf_len = kernel_ring->rx_buf_len;
+       u32 old_rx_buf_len = pfvf->hw.rbuf_len;
        bool if_up = netif_running(netdev);
        struct otx2_qset *qs = &pfvf->qset;
        u32 rx_count, tx_count;
@@ -386,6 +389,15 @@ static int otx2_set_ringparam(struct net_device *netdev,
        if (ring->rx_mini_pending || ring->rx_jumbo_pending)
                return -EINVAL;
 
+       /* Hardware supports a maximum receive buffer size of 32K;
+        * 1536 is a typical Ethernet frame size.
+        */
+       if (rx_buf_len && (rx_buf_len < 1536 || rx_buf_len > 32768)) {
+               netdev_err(netdev,
+                          "Receive buffer length must be in the range 1536 - 32768\n");
+               return -EINVAL;
+       }
+
        /* Permitted lengths are 16, 64, 256, 1K, 4K, 16K, 64K, 256K, 1M */
        rx_count = ring->rx_pending;
        /* On some silicon variants a skid or reserved CQEs are
@@ -403,7 +415,8 @@ static int otx2_set_ringparam(struct net_device *netdev,
                           Q_COUNT(Q_SIZE_4K), Q_COUNT(Q_SIZE_MAX));
        tx_count = Q_COUNT(Q_SIZE(tx_count, 3));
 
-       if (tx_count == qs->sqe_cnt && rx_count == qs->rqe_cnt)
+       if (tx_count == qs->sqe_cnt && rx_count == qs->rqe_cnt &&
+           rx_buf_len == old_rx_buf_len)
                return 0;
 
        if (if_up)
@@ -413,6 +426,8 @@ static int otx2_set_ringparam(struct net_device *netdev,
        qs->sqe_cnt = tx_count;
        qs->rqe_cnt = rx_count;
 
+       pfvf->hw.rbuf_len = rx_buf_len;
+
        if (if_up)
                return netdev->netdev_ops->ndo_open(netdev);
 
@@ -1207,6 +1222,7 @@ end:
 static const struct ethtool_ops otx2_ethtool_ops = {
        .supported_coalesce_params = ETHTOOL_COALESCE_USECS |
                                     ETHTOOL_COALESCE_MAX_FRAMES,
+       .supported_ring_params  = ETHTOOL_RING_USE_RX_BUF_LEN,
        .get_link               = otx2_get_link,
        .get_drvinfo            = otx2_get_drvinfo,
        .get_strings            = otx2_get_strings,
@@ -1326,6 +1342,7 @@ static int otx2vf_get_link_ksettings(struct net_device *netdev,
 static const struct ethtool_ops otx2vf_ethtool_ops = {
        .supported_coalesce_params = ETHTOOL_COALESCE_USECS |
                                     ETHTOOL_COALESCE_MAX_FRAMES,
+       .supported_ring_params  = ETHTOOL_RING_USE_RX_BUF_LEN,
        .get_link               = otx2_get_link,
        .get_drvinfo            = otx2vf_get_drvinfo,
        .get_strings            = otx2vf_get_strings,
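
Advertising ETHTOOL_RING_USE_RX_BUF_LEN in both the PF and VF ops exposes the new rbuf_len field through the standard ring-parameter interface. With an ethtool built against the netlink ring attributes (device name assumed):

    # Request a 2048-byte receive buffer; the driver enforces 1536 - 32768
    ethtool -G eth0 rx-buf-len 2048
    ethtool -g eth0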
index 77a13fb..54f235c 100644 (file)
@@ -21,8 +21,10 @@ struct otx2_flow {
        u16 entry;
        bool is_vf;
        u8 rss_ctx_id;
+#define DMAC_FILTER_RULE               BIT(0)
+#define PFC_FLOWCTRL_RULE              BIT(1)
+       u16 rule_type;
        int vf;
-       bool dmac_filter;
 };
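
Replacing the dmac_filter bool with a rule_type bitmask lets a single flow carry more than one classification. A standalone sketch of the pattern, with the flag values copied from the hunk above:

#include <stdio.h>
#include <stdint.h>

#define DMAC_FILTER_RULE	(1u << 0)
#define PFC_FLOWCTRL_RULE	(1u << 1)

int main(void)
{
	uint16_t rule_type = 0;

	rule_type |= DMAC_FILTER_RULE;	/* tagged as a DMAC filter */
	rule_type |= PFC_FLOWCTRL_RULE;	/* ...and also as a PFC rule */

	if (rule_type & DMAC_FILTER_RULE)
		printf("DMAC filter handling\n");
	if (rule_type & PFC_FLOWCTRL_RULE)
		printf("PFC flow-control handling\n");
	return 0;
}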
 
 enum dmac_req {
@@ -899,6 +901,9 @@ static int otx2_is_flow_rule_dmacfilter(struct otx2_nic *pfvf,
 static int otx2_add_flow_msg(struct otx2_nic *pfvf, struct otx2_flow *flow)
 {
        u64 ring_cookie = flow->flow_spec.ring_cookie;
+#ifdef CONFIG_DCB
+       int vlan_prio, qidx, pfc_rule = 0;
+#endif
        struct npc_install_flow_req *req;
        int err, vf = 0;
 
@@ -940,6 +945,24 @@ static int otx2_add_flow_msg(struct otx2_nic *pfvf, struct otx2_flow *flow)
                        mutex_unlock(&pfvf->mbox.lock);
                        return -EINVAL;
                }
+
+#ifdef CONFIG_DCB
+               /* Identify a PFC rule: PFC is enabled and the ntuple rule matches on VLAN */
+               if (!vf && (req->features & BIT_ULL(NPC_OUTER_VID)) &&
+                   pfvf->pfc_en && req->op != NIX_RX_ACTIONOP_RSS) {
+                       vlan_prio = ntohs(req->packet.vlan_tci) &
+                                   ntohs(req->mask.vlan_tci);
+
+                       /* Get the priority */
+                       vlan_prio >>= 13;
+                       flow->rule_type |= PFC_FLOWCTRL_RULE;
+                       /* Check if PFC is enabled for this priority */
+                       if (pfvf->pfc_en & BIT(vlan_prio)) {
+                               pfc_rule = true;
+                               qidx = req->index;
+                       }
+               }
+#endif
        }
 
        /* ethtool ring_cookie has (VF + 1) for VF */
@@ -951,6 +974,12 @@ static int otx2_add_flow_msg(struct otx2_nic *pfvf, struct otx2_flow *flow)
 
        /* Send message to AF */
        err = otx2_sync_mbox_msg(&pfvf->mbox);
+
+#ifdef CONFIG_DCB
+       if (!err && pfc_rule)
+               otx2_update_bpid_in_rqctx(pfvf, vlan_prio, qidx, true);
+#endif
+
        mutex_unlock(&pfvf->mbox.lock);
        return err;
 }
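
The PFC detection above reads the 802.1Q priority (PCP) from the top three bits of the VLAN TCI, hence the shift by 13 after masking. A standalone illustration with a made-up TCI value:

#include <stdio.h>
#include <stdint.h>
#include <arpa/inet.h>

int main(void)
{
	/* TCI 0xA005: PCP = 5 (bits 15..13), DEI = 0, VID = 5 */
	uint16_t tci = htons(0xA005), mask = htons(0xE000);
	int vlan_prio = (ntohs(tci) & ntohs(mask)) >> 13;

	printf("priority = %d\n", vlan_prio);	/* prints 5 */
	return 0;
}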
@@ -966,7 +995,7 @@ static int otx2_add_flow_with_pfmac(struct otx2_nic *pfvf,
                return -ENOMEM;
 
        pf_mac->entry = 0;
-       pf_mac->dmac_filter = true;
+       pf_mac->rule_type |= DMAC_FILTER_RULE;
        pf_mac->location = pfvf->flow_cfg->max_flows;
        memcpy(&pf_mac->flow_spec, &flow->flow_spec,
               sizeof(struct ethtool_rx_flow_spec));
@@ -1031,7 +1060,7 @@ int otx2_add_flow(struct otx2_nic *pfvf, struct ethtool_rxnfc *nfc)
                eth_hdr = &flow->flow_spec.h_u.ether_spec;
 
                /* Sync dmac filter table with updated fields */
-               if (flow->dmac_filter)
+               if (flow->rule_type & DMAC_FILTER_RULE)
                        return otx2_dmacflt_update(pfvf, eth_hdr->h_dest,
                                                   flow->entry);
 
@@ -1052,7 +1081,7 @@ int otx2_add_flow(struct otx2_nic *pfvf, struct ethtool_rxnfc *nfc)
                if (!test_bit(0, &flow_cfg->dmacflt_bmap))
                        otx2_add_flow_with_pfmac(pfvf, flow);
 
-               flow->dmac_filter = true;
+               flow->rule_type |= DMAC_FILTER_RULE;
                flow->entry = find_first_zero_bit(&flow_cfg->dmacflt_bmap,
                                                  flow_cfg->dmacflt_max_flows);
                fsp->location = flow_cfg->max_flows + flow->entry;
@@ -1120,7 +1149,7 @@ static void otx2_update_rem_pfmac(struct otx2_nic *pfvf, int req)
        bool found = false;
 
        list_for_each_entry(iter, &pfvf->flow_cfg->flow_list, list) {
-               if (iter->dmac_filter && iter->entry == 0) {
+               if ((iter->rule_type & DMAC_FILTER_RULE) && iter->entry == 0) {
                        eth_hdr = &iter->flow_spec.h_u.ether_spec;
                        if (req == DMAC_ADDR_DEL) {
                                otx2_dmacflt_remove(pfvf, eth_hdr->h_dest,
@@ -1156,7 +1185,7 @@ int otx2_remove_flow(struct otx2_nic *pfvf, u32 location)
        if (!flow)
                return -ENOENT;
 
-       if (flow->dmac_filter) {
+       if (flow->rule_type & DMAC_FILTER_RULE) {
                struct ethhdr *eth_hdr = &flow->flow_spec.h_u.ether_spec;
 
                /* user is not allowed to remove the dmac filter for the interface MAC */
@@ -1174,6 +1203,13 @@ int otx2_remove_flow(struct otx2_nic *pfvf, u32 location)
                                  flow_cfg->dmacflt_max_flows) == 1)
                        otx2_update_rem_pfmac(pfvf, DMAC_ADDR_DEL);
        } else {
+#ifdef CONFIG_DCB
+               if (flow->rule_type & PFC_FLOWCTRL_RULE)
+                       otx2_update_bpid_in_rqctx(pfvf, 0,
+                                                 flow->flow_spec.ring_cookie,
+                                                 false);
+#endif
+
                err = otx2_remove_flow_msg(pfvf, flow->entry, false);
        }
 
@@ -1383,7 +1419,7 @@ void otx2_dmacflt_reinstall_flows(struct otx2_nic *pf)
        struct ethhdr *eth_hdr;
 
        list_for_each_entry(iter, &pf->flow_cfg->flow_list, list) {
-               if (iter->dmac_filter) {
+               if (iter->rule_type & DMAC_FILTER_RULE) {
                        eth_hdr = &iter->flow_spec.h_u.ether_spec;
                        otx2_dmacflt_add(pf, eth_hdr->h_dest,
                                         iter->entry);
index d39341e..a536916 100644 (file)
@@ -1311,6 +1311,9 @@ static int otx2_get_rbuf_size(struct otx2_nic *pf, int mtu)
        int total_size;
        int rbuf_size;
 
+       if (pf->hw.rbuf_len)
+               return ALIGN(pf->hw.rbuf_len, OTX2_ALIGN) + OTX2_HEAD_ROOM;
+
        /* The data transferred by NIX to memory consists of the actual packet
         * plus additional data, which carries timestamp and/or EDSA/HIGIG2
         * headers if the interface is configured in the corresponding modes.
@@ -1694,9 +1697,6 @@ int otx2_open(struct net_device *netdev)
        if (pf->linfo.link_up && !(pf->pcifunc & RVU_PFVF_FUNC_MASK))
                otx2_handle_link_event(pf);
 
-       /* Restore pause frame settings */
-       otx2_config_pause_frm(pf);
-
        /* Install DMAC Filters */
        if (pf->flags & OTX2_FLAG_DMACFLTR_SUPPORT)
                otx2_dmacflt_reinstall_flows(pf);
@@ -1863,9 +1863,7 @@ static int otx2_set_features(struct net_device *netdev,
                             netdev_features_t features)
 {
        netdev_features_t changed = features ^ netdev->features;
-       bool ntuple = !!(features & NETIF_F_NTUPLE);
        struct otx2_nic *pf = netdev_priv(netdev);
-       bool tc = !!(features & NETIF_F_HW_TC);
 
        if ((changed & NETIF_F_LOOPBACK) && netif_running(netdev))
                return otx2_cgx_config_loopback(pf,
@@ -1875,46 +1873,7 @@ static int otx2_set_features(struct net_device *netdev,
                return otx2_enable_rxvlan(pf,
                                          features & NETIF_F_HW_VLAN_CTAG_RX);
 
-       if ((changed & NETIF_F_NTUPLE) && !ntuple)
-               otx2_destroy_ntuple_flows(pf);
-
-       if ((changed & NETIF_F_NTUPLE) && ntuple) {
-               if (!pf->flow_cfg->max_flows) {
-                       netdev_err(netdev,
-                                  "Can't enable NTUPLE, MCAM entries not allocated\n");
-                       return -EINVAL;
-               }
-       }
-
-       if ((changed & NETIF_F_HW_TC) && tc) {
-               if (!pf->flow_cfg->max_flows) {
-                       netdev_err(netdev,
-                                  "Can't enable TC, MCAM entries not allocated\n");
-                       return -EINVAL;
-               }
-       }
-
-       if ((changed & NETIF_F_HW_TC) && !tc &&
-           pf->flow_cfg && pf->flow_cfg->nr_flows) {
-               netdev_err(netdev, "Can't disable TC hardware offload while flows are active\n");
-               return -EBUSY;
-       }
-
-       if ((changed & NETIF_F_NTUPLE) && ntuple &&
-           (netdev->features & NETIF_F_HW_TC) && !(changed & NETIF_F_HW_TC)) {
-               netdev_err(netdev,
-                          "Can't enable NTUPLE when TC is active, disable TC and retry\n");
-               return -EINVAL;
-       }
-
-       if ((changed & NETIF_F_HW_TC) && tc &&
-           (netdev->features & NETIF_F_NTUPLE) && !(changed & NETIF_F_NTUPLE)) {
-               netdev_err(netdev,
-                          "Can't enable TC when NTUPLE is active, disable NTUPLE and retry\n");
-               return -EINVAL;
-       }
-
-       return 0;
+       return otx2_handle_ntuple_tc_features(netdev, features);
 }
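
The checks deleted here are consolidated into otx2_handle_ntuple_tc_features(), shared with the VF path further down. A standalone condensation of the removed logic, with feature bits stubbed and error codes inlined (a sketch of what the helper centralizes, not its actual body; the -EBUSY check for active flows is omitted for brevity):

#include <stdio.h>

#define NETIF_F_NTUPLE	(1ULL << 0)	/* stub values for illustration */
#define NETIF_F_HW_TC	(1ULL << 1)

static int handle_ntuple_tc(unsigned long long changed,
			    unsigned long long wanted,
			    unsigned long long current_features,
			    int max_flows)
{
	int ntuple = !!(wanted & NETIF_F_NTUPLE);
	int tc = !!(wanted & NETIF_F_HW_TC);

	/* Neither feature may be enabled without MCAM entries */
	if (((changed & NETIF_F_NTUPLE) && ntuple && !max_flows) ||
	    ((changed & NETIF_F_HW_TC) && tc && !max_flows))
		return -22;	/* -EINVAL */

	/* NTUPLE and HW_TC are mutually exclusive */
	if ((changed & NETIF_F_NTUPLE) && ntuple &&
	    (current_features & NETIF_F_HW_TC) && !(changed & NETIF_F_HW_TC))
		return -22;
	if ((changed & NETIF_F_HW_TC) && tc &&
	    (current_features & NETIF_F_NTUPLE) && !(changed & NETIF_F_NTUPLE))
		return -22;
	return 0;
}

int main(void)
{
	/* Enabling NTUPLE while HW_TC is already active: rejected */
	printf("%d\n", handle_ntuple_tc(NETIF_F_NTUPLE,
					NETIF_F_NTUPLE | NETIF_F_HW_TC,
					NETIF_F_HW_TC, 8));
	return 0;
}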
 
 static void otx2_reset_task(struct work_struct *work)
@@ -2625,6 +2584,7 @@ static int otx2_probe(struct pci_dev *pdev, const struct pci_device_id *id)
        hw->tx_queues = qcount;
        hw->tot_tx_queues = qcount;
        hw->max_queues = qcount;
+       hw->rbuf_len = OTX2_DEFAULT_RBUF_LEN;
 
        num_vec = pci_msix_vec_count(pdev);
        hw->irq_name = devm_kmalloc_array(&hw->pdev->dev, num_vec, NAME_SIZE,
@@ -2778,9 +2738,11 @@ static int otx2_probe(struct pci_dev *pdev, const struct pci_device_id *id)
        /* Enable link notifications */
        otx2_cgx_config_linkevents(pf, true);
 
-       /* Enable pause frames by default */
-       pf->flags |= OTX2_FLAG_RX_PAUSE_ENABLED;
-       pf->flags |= OTX2_FLAG_TX_PAUSE_ENABLED;
+#ifdef CONFIG_DCB
+       err = otx2_dcbnl_set_ops(netdev);
+       if (err)
+               goto err_pf_sriov_init;
+#endif
 
        return 0;
 
@@ -2925,6 +2887,21 @@ static void otx2_remove(struct pci_dev *pdev)
        if (pf->flags & OTX2_FLAG_RX_TSTAMP_ENABLED)
                otx2_config_hw_rx_tstamp(pf, false);
 
+       /* Disable 802.3x pause frames */
+       if (pf->flags & (OTX2_FLAG_RX_PAUSE_ENABLED |
+                        OTX2_FLAG_TX_PAUSE_ENABLED)) {
+               pf->flags &= ~OTX2_FLAG_RX_PAUSE_ENABLED;
+               pf->flags &= ~OTX2_FLAG_TX_PAUSE_ENABLED;
+               otx2_config_pause_frm(pf);
+       }
+
+#ifdef CONFIG_DCB
+       /* Disable PFC config */
+       if (pf->pfc_en) {
+               pf->pfc_en = 0;
+               otx2_config_priority_flow_ctrl(pf);
+       }
+#endif
        cancel_work_sync(&pf->reset_task);
        /* Disable link notifications */
        otx2_cgx_config_linkevents(pf, false);
index 626961a..0593106 100644 (file)
@@ -58,7 +58,7 @@ int otx2_tc_alloc_ent_bitmap(struct otx2_nic *nic)
 {
        struct otx2_tc_info *tc = &nic->tc_info;
 
-       if (!nic->flow_cfg->max_flows || is_otx2_vf(nic->pcifunc))
+       if (!nic->flow_cfg->max_flows)
                return 0;
 
        /* Max flows changed, free the existing bitmap */
@@ -1023,6 +1023,7 @@ int otx2_setup_tc(struct net_device *netdev, enum tc_setup_type type,
                return -EOPNOTSUPP;
        }
 }
+EXPORT_SYMBOL(otx2_setup_tc);
 
 static const struct rhashtable_params tc_flow_ht_params = {
        .head_offset = offsetof(struct otx2_tc_flow, node),
@@ -1052,6 +1053,7 @@ int otx2_init_tc(struct otx2_nic *nic)
        tc->flow_ht_params = tc_flow_ht_params;
        return rhashtable_init(&tc->flow_table, &tc->flow_ht_params);
 }
+EXPORT_SYMBOL(otx2_init_tc);
 
 void otx2_shutdown_tc(struct otx2_nic *nic)
 {
@@ -1060,3 +1062,4 @@ void otx2_shutdown_tc(struct otx2_nic *nic)
        kfree(tc->tc_entries_bitmap);
        rhashtable_destroy(&tc->flow_table);
 }
+EXPORT_SYMBOL(otx2_shutdown_tc);
index 925b74e..a232e20 100644 (file)
@@ -472,23 +472,7 @@ static void otx2vf_reset_task(struct work_struct *work)
 static int otx2vf_set_features(struct net_device *netdev,
                               netdev_features_t features)
 {
-       netdev_features_t changed = features ^ netdev->features;
-       bool ntuple_enabled = !!(features & NETIF_F_NTUPLE);
-       struct otx2_nic *vf = netdev_priv(netdev);
-
-       if (changed & NETIF_F_NTUPLE) {
-               if (!ntuple_enabled) {
-                       otx2_mcam_flow_del(vf);
-                       return 0;
-               }
-
-               if (!otx2_get_maxflows(vf->flow_cfg)) {
-                       netdev_err(netdev,
-                                  "Can't enable NTUPLE, MCAM entries not allocated\n");
-                       return -EINVAL;
-               }
-       }
-       return 0;
+       return otx2_handle_ntuple_tc_features(netdev, features);
 }
 
 static const struct net_device_ops otx2vf_netdev_ops = {
@@ -502,6 +486,7 @@ static const struct net_device_ops otx2vf_netdev_ops = {
        .ndo_get_stats64 = otx2_get_stats64,
        .ndo_tx_timeout = otx2_tx_timeout,
        .ndo_eth_ioctl  = otx2_ioctl,
+       .ndo_setup_tc = otx2_setup_tc,
 };
 
 static int otx2_wq_init(struct otx2_nic *vf)
@@ -586,6 +571,7 @@ static int otx2vf_probe(struct pci_dev *pdev, const struct pci_device_id *id)
        hw->tx_queues = qcount;
        hw->max_queues = qcount;
        hw->tot_tx_queues = qcount;
+       hw->rbuf_len = OTX2_DEFAULT_RBUF_LEN;
 
        hw->irq_name = devm_kmalloc_array(&hw->pdev->dev, num_vec, NAME_SIZE,
                                          GFP_KERNEL);
@@ -662,6 +648,7 @@ static int otx2vf_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 
        netdev->hw_features |= NETIF_F_NTUPLE;
        netdev->hw_features |= NETIF_F_RXALL;
+       netdev->hw_features |= NETIF_F_HW_TC;
 
        netif_set_gso_max_segs(netdev, OTX2_MAX_GSO_SEGS);
        netdev->watchdog_timeo = OTX2_TX_TIMEOUT;
@@ -697,16 +684,24 @@ static int otx2vf_probe(struct pci_dev *pdev, const struct pci_device_id *id)
        if (err)
                goto err_unreg_netdev;
 
-       err = otx2_register_dl(vf);
+       err = otx2_init_tc(vf);
        if (err)
                goto err_unreg_netdev;
 
-       /* Enable pause frames by default */
-       vf->flags |= OTX2_FLAG_RX_PAUSE_ENABLED;
-       vf->flags |= OTX2_FLAG_TX_PAUSE_ENABLED;
+       err = otx2_register_dl(vf);
+       if (err)
+               goto err_shutdown_tc;
+
+#ifdef CONFIG_DCB
+       err = otx2_dcbnl_set_ops(netdev);
+       if (err)
+               goto err_shutdown_tc;
+#endif
 
        return 0;
 
+err_shutdown_tc:
+       otx2_shutdown_tc(vf);
 err_unreg_netdev:
        unregister_netdev(netdev);
 err_ptp_destroy:
@@ -739,6 +734,22 @@ static void otx2vf_remove(struct pci_dev *pdev)
 
        vf = netdev_priv(netdev);
 
+       /* Disable 802.3x pause frames */
+       if (vf->flags & (OTX2_FLAG_RX_PAUSE_ENABLED |
+                        OTX2_FLAG_TX_PAUSE_ENABLED)) {
+               vf->flags &= ~OTX2_FLAG_RX_PAUSE_ENABLED;
+               vf->flags &= ~OTX2_FLAG_TX_PAUSE_ENABLED;
+               otx2_config_pause_frm(vf);
+       }
+
+#ifdef CONFIG_DCB
+       /* Disable PFC config */
+       if (vf->pfc_en) {
+               vf->pfc_en = 0;
+               otx2_config_priority_flow_ctrl(vf);
+       }
+#endif
+
        cancel_work_sync(&vf->reset_task);
        otx2_unregister_dl(vf);
        unregister_netdev(netdev);
index 89ca796..4cd0747 100644 (file)
@@ -1556,6 +1556,7 @@ static int mtk_star_probe(struct platform_device *pdev)
        return devm_register_netdev(dev, ndev);
 }
 
+#ifdef CONFIG_OF
 static const struct of_device_id mtk_star_of_match[] = {
        { .compatible = "mediatek,mt8516-eth", },
        { .compatible = "mediatek,mt8518-eth", },
@@ -1563,6 +1564,7 @@ static const struct of_device_id mtk_star_of_match[] = {
        { }
 };
 MODULE_DEVICE_TABLE(of, mtk_star_of_match);
+#endif
 
 static SIMPLE_DEV_PM_OPS(mtk_star_pm_ops,
                         mtk_star_suspend, mtk_star_resume);
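
Guarding the OF match table with CONFIG_OF silences the unused-variable warning on !OF builds; the usual companion is referencing the table through of_match_ptr(), which collapses to NULL when OF is disabled (whether mtk_star references it that way is an assumption here). A userspace mock of the pattern:

#include <stdio.h>

#ifdef CONFIG_OF
static const char *const mtk_star_matches[] = { "mediatek,mt8516-eth", NULL };
#define of_match_ptr(tbl) (tbl)
#else
#define of_match_ptr(tbl) NULL	/* table compiled out with its reference */
#endif

int main(void)
{
	/* Build with -DCONFIG_OF for a real table, without it for NULL */
	printf("match table: %p\n", (void *)of_match_ptr(mtk_star_matches));
	return 0;
}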
index b0de6b9..2b53738 100644 (file)
@@ -7,7 +7,8 @@
 static bool
 tc_act_can_offload_accept(struct mlx5e_tc_act_parse_state *parse_state,
                          const struct flow_action_entry *act,
-                         int act_index)
+                         int act_index,
+                         struct mlx5_flow_attr *attr)
 {
        return true;
 }
@@ -20,7 +21,7 @@ tc_act_parse_accept(struct mlx5e_tc_act_parse_state *parse_state,
 {
        attr->action |= MLX5_FLOW_CONTEXT_ACTION_FWD_DEST |
                        MLX5_FLOW_CONTEXT_ACTION_COUNT;
-       attr->flags |= MLX5_ESW_ATTR_FLAG_ACCEPT;
+       attr->flags |= MLX5_ATTR_FLAG_ACCEPT;
 
        return 0;
 }
index 26efa33..bfbc91c 100644 (file)
@@ -16,12 +16,12 @@ struct mlx5e_tc_act_parse_state {
        unsigned int num_actions;
        struct mlx5e_tc_flow *flow;
        struct netlink_ext_ack *extack;
+       bool ct;
        bool encap;
        bool decap;
        bool mpls_push;
        bool ptype_host;
        const struct ip_tunnel_info *tun_info;
-       struct pedit_headers_action hdrs[__PEDIT_CMD_MAX];
        int ifindexes[MLX5_MAX_FLOW_FWD_VPORTS];
        int if_count;
        struct mlx5_tc_ct_priv *ct_priv;
@@ -30,7 +30,8 @@ struct mlx5e_tc_act_parse_state {
 struct mlx5e_tc_act {
        bool (*can_offload)(struct mlx5e_tc_act_parse_state *parse_state,
                            const struct flow_action_entry *act,
-                           int act_index);
+                           int act_index,
+                           struct mlx5_flow_attr *attr);
 
        int (*parse_action)(struct mlx5e_tc_act_parse_state *parse_state,
                            const struct flow_action_entry *act,
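
Every can_offload callback now receives the mlx5_flow_attr under construction instead of reaching through parse_state->flow->attr, so the check no longer assumes the flow's single attr is the one being parsed. A standalone sketch of the new callback shape, with all driver types stubbed for illustration:

#include <stdbool.h>
#include <stdio.h>

/* Opaque stand-ins for the real mlx5 driver types */
struct mlx5e_tc_act_parse_state { int unused; };
struct flow_action_entry { int unused; };
struct mlx5_flow_attr { unsigned int action; };

struct mlx5e_tc_act {
	bool (*can_offload)(struct mlx5e_tc_act_parse_state *parse_state,
			    const struct flow_action_entry *act,
			    int act_index,
			    struct mlx5_flow_attr *attr);
};

static bool can_offload_accept(struct mlx5e_tc_act_parse_state *parse_state,
			       const struct flow_action_entry *act,
			       int act_index,
			       struct mlx5_flow_attr *attr)
{
	(void)parse_state; (void)act; (void)act_index; (void)attr;
	return true;	/* accept is unconditionally offloadable */
}

int main(void)
{
	struct mlx5e_tc_act accept = { .can_offload = can_offload_accept };
	struct mlx5_flow_attr attr = { 0 };

	printf("offloadable: %d\n", accept.can_offload(NULL, NULL, 0, &attr));
	return 0;
}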
index 29920ef..c0f08ae 100644 (file)
@@ -38,11 +38,12 @@ csum_offload_supported(struct mlx5e_priv *priv,
 static bool
 tc_act_can_offload_csum(struct mlx5e_tc_act_parse_state *parse_state,
                        const struct flow_action_entry *act,
-                       int act_index)
+                       int act_index,
+                       struct mlx5_flow_attr *attr)
 {
        struct mlx5e_tc_flow *flow = parse_state->flow;
 
-       return csum_offload_supported(flow->priv, flow->attr->action,
+       return csum_offload_supported(flow->priv, attr->action,
                                      act->csum_flags, parse_state->extack);
 }
 
index 06ec30c..85f0cb8 100644 (file)
@@ -8,8 +8,10 @@
 static bool
 tc_act_can_offload_ct(struct mlx5e_tc_act_parse_state *parse_state,
                      const struct flow_action_entry *act,
-                     int act_index)
+                     int act_index,
+                     struct mlx5_flow_attr *attr)
 {
+       bool clear_action = act->ct.action & TCA_CT_ACT_CLEAR;
        struct netlink_ext_ack *extack = parse_state->extack;
 
        if (flow_flag_test(parse_state->flow, SAMPLE)) {
@@ -18,6 +20,11 @@ tc_act_can_offload_ct(struct mlx5e_tc_act_parse_state *parse_state,
                return false;
        }
 
+       if (parse_state->ct && !clear_action) {
+               NL_SET_ERR_MSG_MOD(extack, "Multiple CT actions are not supported");
+               return false;
+       }
+
        return true;
 }
 
@@ -27,6 +34,7 @@ tc_act_parse_ct(struct mlx5e_tc_act_parse_state *parse_state,
                struct mlx5e_priv *priv,
                struct mlx5_flow_attr *attr)
 {
+       bool clear_action = act->ct.action & TCA_CT_ACT_CLEAR;
        int err;
 
        err = mlx5_tc_ct_parse_action(parse_state->ct_priv, attr,
@@ -35,11 +43,16 @@ tc_act_parse_ct(struct mlx5e_tc_act_parse_state *parse_state,
        if (err)
                return err;
 
-       flow_flag_set(parse_state->flow, CT);
-
        if (mlx5e_is_eswitch_flow(parse_state->flow))
                attr->esw_attr->split_count = attr->esw_attr->out_count;
 
+       if (!clear_action) {
+               attr->flags |= MLX5_ATTR_FLAG_CT;
+               flow_flag_set(parse_state->flow, CT);
+               parse_state->ct = true;
+       }
+
        return 0;
 }
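
Only a non-clear CT action marks the flow as CT and sets parse_state->ct, so a second conntrack action is rejected while ct clear remains stackable. A condensed standalone illustration merging the can_offload gate and the parse-side state update (TCA_CT_ACT_CLEAR value taken from the uapi header):

#include <stdbool.h>
#include <stdio.h>

#define TCA_CT_ACT_CLEAR (1 << 2)	/* include/uapi/linux/tc_act/tc_ct.h */

struct parse_state { bool ct; };

static bool ct_action_allowed(struct parse_state *ps, unsigned int ct_action)
{
	bool clear_action = ct_action & TCA_CT_ACT_CLEAR;

	if (ps->ct && !clear_action)
		return false;	/* multiple CT actions are not supported */
	if (!clear_action)
		ps->ct = true;	/* first real CT action claims the flow */
	return true;
}

int main(void)
{
	struct parse_state ps = { .ct = false };

	printf("%d\n", ct_action_allowed(&ps, 0));		   /* 1 */
	printf("%d\n", ct_action_allowed(&ps, TCA_CT_ACT_CLEAR)); /* 1 */
	printf("%d\n", ct_action_allowed(&ps, 0));		   /* 0 */
	return 0;
}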
 
index 2e29a23..3d5f236 100644 (file)
@@ -7,7 +7,8 @@
 static bool
 tc_act_can_offload_drop(struct mlx5e_tc_act_parse_state *parse_state,
                        const struct flow_action_entry *act,
-                       int act_index)
+                       int act_index,
+                       struct mlx5_flow_attr *attr)
 {
        return true;
 }
index f445150..fb1be82 100644 (file)
@@ -8,6 +8,7 @@
 static int
 validate_goto_chain(struct mlx5e_priv *priv,
                    struct mlx5e_tc_flow *flow,
+                   struct mlx5_flow_attr *attr,
                    const struct flow_action_entry *act,
                    struct netlink_ext_ack *extack)
 {
@@ -32,7 +33,7 @@ validate_goto_chain(struct mlx5e_priv *priv,
        }
 
        if (!mlx5_chains_backwards_supported(chains) &&
-           dest_chain <= flow->attr->chain) {
+           dest_chain <= attr->chain) {
                NL_SET_ERR_MSG_MOD(extack, "Goto lower numbered chain isn't supported");
                return -EOPNOTSUPP;
        }
@@ -43,8 +44,8 @@ validate_goto_chain(struct mlx5e_priv *priv,
                return -EOPNOTSUPP;
        }
 
-       if (flow->attr->action & (MLX5_FLOW_CONTEXT_ACTION_PACKET_REFORMAT |
-                                 MLX5_FLOW_CONTEXT_ACTION_DECAP) &&
+       if (attr->action & (MLX5_FLOW_CONTEXT_ACTION_PACKET_REFORMAT |
+                           MLX5_FLOW_CONTEXT_ACTION_DECAP) &&
            !reformat_and_fwd) {
                NL_SET_ERR_MSG_MOD(extack,
                                   "Goto chain is not allowed if action has reformat or decap");
@@ -57,12 +58,13 @@ validate_goto_chain(struct mlx5e_priv *priv,
 static bool
 tc_act_can_offload_goto(struct mlx5e_tc_act_parse_state *parse_state,
                        const struct flow_action_entry *act,
-                       int act_index)
+                       int act_index,
+                       struct mlx5_flow_attr *attr)
 {
        struct netlink_ext_ack *extack = parse_state->extack;
        struct mlx5e_tc_flow *flow = parse_state->flow;
 
-       if (validate_goto_chain(flow->priv, flow, act, extack))
+       if (validate_goto_chain(flow->priv, flow, attr, act, extack))
                return false;
 
        return true;
index d775c3d..e8d2275 100644 (file)
@@ -7,7 +7,8 @@
 static bool
 tc_act_can_offload_mark(struct mlx5e_tc_act_parse_state *parse_state,
                        const struct flow_action_entry *act,
-                       int act_index)
+                       int act_index,
+                       struct mlx5_flow_attr *attr)
 {
        if (act->mark & ~MLX5E_TC_FLOW_ID_MASK) {
                NL_SET_ERR_MSG_MOD(parse_state->extack, "Bad flow mark, only 16 bit supported");
index c614fc7..99fb98b 100644 (file)
@@ -99,7 +99,8 @@ get_fdb_out_dev(struct net_device *uplink_dev, struct net_device *out_dev)
 static bool
 tc_act_can_offload_mirred(struct mlx5e_tc_act_parse_state *parse_state,
                          const struct flow_action_entry *act,
-                         int act_index)
+                         int act_index,
+                         struct mlx5_flow_attr *attr)
 {
        struct netlink_ext_ack *extack = parse_state->extack;
        struct mlx5e_tc_flow *flow = parse_state->flow;
@@ -108,8 +109,8 @@ tc_act_can_offload_mirred(struct mlx5e_tc_act_parse_state *parse_state,
        struct mlx5e_priv *priv = flow->priv;
        struct mlx5_esw_flow_attr *esw_attr;
 
-       parse_attr = flow->attr->parse_attr;
-       esw_attr = flow->attr->esw_attr;
+       parse_attr = attr->parse_attr;
+       esw_attr = attr->esw_attr;
 
        if (!out_dev) {
                /* out_dev is NULL when filters with
index 2c74567..16681cf 100644 (file)
@@ -7,7 +7,8 @@
 static bool
 tc_act_can_offload_mirred_nic(struct mlx5e_tc_act_parse_state *parse_state,
                              const struct flow_action_entry *act,
-                             int act_index)
+                             int act_index,
+                             struct mlx5_flow_attr *attr)
 {
        struct netlink_ext_ack *extack = parse_state->extack;
        struct mlx5e_tc_flow *flow = parse_state->flow;
index 784fc4f..4033294 100644 (file)
@@ -8,7 +8,8 @@
 static bool
 tc_act_can_offload_mpls_push(struct mlx5e_tc_act_parse_state *parse_state,
                             const struct flow_action_entry *act,
-                            int act_index)
+                            int act_index,
+                            struct mlx5_flow_attr *attr)
 {
        struct netlink_ext_ack *extack = parse_state->extack;
        struct mlx5e_priv *priv = parse_state->flow->priv;
@@ -36,13 +37,13 @@ tc_act_parse_mpls_push(struct mlx5e_tc_act_parse_state *parse_state,
 static bool
 tc_act_can_offload_mpls_pop(struct mlx5e_tc_act_parse_state *parse_state,
                            const struct flow_action_entry *act,
-                           int act_index)
+                           int act_index,
+                           struct mlx5_flow_attr *attr)
 {
        struct netlink_ext_ack *extack = parse_state->extack;
-       struct mlx5e_tc_flow *flow = parse_state->flow;
        struct net_device *filter_dev;
 
-       filter_dev = flow->attr->parse_attr->filter_dev;
+       filter_dev = attr->parse_attr->filter_dev;
 
        /* we only support mpls pop if it is the first action
         * and the filter net device is bareudp. Subsequent
index 79addbb..39f8f71 100644 (file)
@@ -46,9 +46,9 @@ static int
 parse_pedit_to_modify_hdr(struct mlx5e_priv *priv,
                          const struct flow_action_entry *act, int namespace,
                          struct mlx5e_tc_flow_parse_attr *parse_attr,
-                         struct pedit_headers_action *hdrs,
                          struct netlink_ext_ack *extack)
 {
+       struct pedit_headers_action *hdrs = parse_attr->hdrs;
        u8 cmd = (act->id == FLOW_ACTION_MANGLE) ? 0 : 1;
        u8 htype = act->mangle.htype;
        int err = -EOPNOTSUPP;
@@ -110,20 +110,20 @@ int
 mlx5e_tc_act_pedit_parse_action(struct mlx5e_priv *priv,
                                const struct flow_action_entry *act, int namespace,
                                struct mlx5e_tc_flow_parse_attr *parse_attr,
-                               struct pedit_headers_action *hdrs,
                                struct mlx5e_tc_flow *flow,
                                struct netlink_ext_ack *extack)
 {
        if (flow && flow_flag_test(flow, L3_TO_L2_DECAP))
                return parse_pedit_to_reformat(act, parse_attr, extack);
 
-       return parse_pedit_to_modify_hdr(priv, act, namespace, parse_attr, hdrs, extack);
+       return parse_pedit_to_modify_hdr(priv, act, namespace, parse_attr, extack);
 }
 
 static bool
 tc_act_can_offload_pedit(struct mlx5e_tc_act_parse_state *parse_state,
                         const struct flow_action_entry *act,
-                        int act_index)
+                        int act_index,
+                        struct mlx5_flow_attr *attr)
 {
        return true;
 }
@@ -141,8 +141,7 @@ tc_act_parse_pedit(struct mlx5e_tc_act_parse_state *parse_state,
 
        ns_type = mlx5e_get_flow_namespace(flow);
 
-       err = mlx5e_tc_act_pedit_parse_action(flow->priv, act, ns_type,
-                                             attr->parse_attr, parse_state->hdrs,
+       err = mlx5e_tc_act_pedit_parse_action(flow->priv, act, ns_type, attr->parse_attr,
                                              flow, parse_state->extack);
        if (err)
                return err;
index da8ab03..258f030 100644 (file)
@@ -25,7 +25,6 @@ int
 mlx5e_tc_act_pedit_parse_action(struct mlx5e_priv *priv,
                                const struct flow_action_entry *act, int namespace,
                                struct mlx5e_tc_flow_parse_attr *parse_attr,
-                               struct pedit_headers_action *hdrs,
                                struct mlx5e_tc_flow *flow,
                                struct netlink_ext_ack *extack);
 
index 0819110..6454b03 100644 (file)
@@ -7,7 +7,8 @@
 static bool
 tc_act_can_offload_ptype(struct mlx5e_tc_act_parse_state *parse_state,
                         const struct flow_action_entry *act,
-                        int act_index)
+                        int act_index,
+                        struct mlx5_flow_attr *attr)
 {
        return true;
 }
index 1c32e24..9dd2441 100644 (file)
@@ -7,16 +7,16 @@
 static bool
 tc_act_can_offload_redirect_ingress(struct mlx5e_tc_act_parse_state *parse_state,
                                    const struct flow_action_entry *act,
-                                   int act_index)
+                                   int act_index,
+                                   struct mlx5_flow_attr *attr)
 {
        struct netlink_ext_ack *extack = parse_state->extack;
-       struct mlx5e_tc_flow *flow = parse_state->flow;
        struct mlx5e_tc_flow_parse_attr *parse_attr;
        struct net_device *out_dev = act->dev;
        struct mlx5_esw_flow_attr *esw_attr;
 
-       parse_attr = flow->attr->parse_attr;
-       esw_attr = flow->attr->esw_attr;
+       parse_attr = attr->parse_attr;
+       esw_attr = attr->esw_attr;
 
        if (!out_dev)
                return false;
index 6699bdf..539fea1 100644 (file)
@@ -8,7 +8,8 @@
 static bool
 tc_act_can_offload_sample(struct mlx5e_tc_act_parse_state *parse_state,
                          const struct flow_action_entry *act,
-                         int act_index)
+                         int act_index,
+                         struct mlx5_flow_attr *attr)
 {
        struct netlink_ext_ack *extack = parse_state->extack;
 
@@ -27,11 +28,7 @@ tc_act_parse_sample(struct mlx5e_tc_act_parse_state *parse_state,
                    struct mlx5e_priv *priv,
                    struct mlx5_flow_attr *attr)
 {
-       struct mlx5e_sample_attr *sample_attr;
-
-       sample_attr = kzalloc(sizeof(*attr->sample_attr), GFP_KERNEL);
-       if (!sample_attr)
-               return -ENOMEM;
+       struct mlx5e_sample_attr *sample_attr = &attr->sample_attr;
 
        sample_attr->rate = act->sample.rate;
        sample_attr->group_num = act->sample.psample_group->group_num;
@@ -39,7 +36,7 @@ tc_act_parse_sample(struct mlx5e_tc_act_parse_state *parse_state,
        if (act->sample.truncate)
                sample_attr->trunc_size = act->sample.trunc_size;
 
-       attr->sample_attr = sample_attr;
+       attr->flags |= MLX5_ATTR_FLAG_SAMPLE;
        flow_flag_set(parse_state->flow, SAMPLE);
 
        return 0;
index 046b64c..9ea293f 100644 (file)
@@ -7,7 +7,8 @@
 static bool
 tc_act_can_offload_trap(struct mlx5e_tc_act_parse_state *parse_state,
                        const struct flow_action_entry *act,
-                       int act_index)
+                       int act_index,
+                       struct mlx5_flow_attr *attr)
 {
        struct netlink_ext_ack *extack = parse_state->extack;
 
@@ -27,7 +28,7 @@ tc_act_parse_trap(struct mlx5e_tc_act_parse_state *parse_state,
 {
        attr->action |= MLX5_FLOW_CONTEXT_ACTION_FWD_DEST |
                        MLX5_FLOW_CONTEXT_ACTION_COUNT;
-       attr->flags |= MLX5_ESW_ATTR_FLAG_SLOW_PATH;
+       attr->flags |= MLX5_ATTR_FLAG_SLOW_PATH;
 
        return 0;
 }
index 6f4a2cf..b4fa2de 100644 (file)
@@ -8,7 +8,8 @@
 static bool
 tc_act_can_offload_tun_encap(struct mlx5e_tc_act_parse_state *parse_state,
                             const struct flow_action_entry *act,
-                            int act_index)
+                            int act_index,
+                            struct mlx5_flow_attr *attr)
 {
        if (!act->tunnel) {
                NL_SET_ERR_MSG_MOD(parse_state->extack,
@@ -34,7 +35,8 @@ tc_act_parse_tun_encap(struct mlx5e_tc_act_parse_state *parse_state,
 static bool
 tc_act_can_offload_tun_decap(struct mlx5e_tc_act_parse_state *parse_state,
                             const struct flow_action_entry *act,
-                            int act_index)
+                            int act_index,
+                            struct mlx5_flow_attr *attr)
 {
        return true;
 }
index 70fc0c2..6378b75 100644 (file)
@@ -9,7 +9,6 @@
 static int
 add_vlan_prio_tag_rewrite_action(struct mlx5e_priv *priv,
                                 struct mlx5e_tc_flow_parse_attr *parse_attr,
-                                struct pedit_headers_action *hdrs,
                                 u32 *action, struct netlink_ext_ack *extack)
 {
        const struct flow_action_entry prio_tag_act = {
@@ -26,7 +25,7 @@ add_vlan_prio_tag_rewrite_action(struct mlx5e_priv *priv,
        };
 
        return mlx5e_tc_act_vlan_add_rewrite_action(priv, MLX5_FLOW_NAMESPACE_FDB,
-                                                   &prio_tag_act, parse_attr, hdrs, action,
+                                                   &prio_tag_act, parse_attr, action,
                                                    extack);
 }
 
@@ -151,7 +150,8 @@ mlx5e_tc_act_vlan_add_pop_action(struct mlx5e_priv *priv,
 static bool
 tc_act_can_offload_vlan(struct mlx5e_tc_act_parse_state *parse_state,
                        const struct flow_action_entry *act,
-                       int act_index)
+                       int act_index,
+                       struct mlx5_flow_attr *attr)
 {
        return true;
 }
@@ -170,8 +170,8 @@ tc_act_parse_vlan(struct mlx5e_tc_act_parse_state *parse_state,
                /* Replace vlan pop+push with vlan modify */
                attr->action &= ~MLX5_FLOW_CONTEXT_ACTION_VLAN_POP;
                err = mlx5e_tc_act_vlan_add_rewrite_action(priv, MLX5_FLOW_NAMESPACE_FDB, act,
-                                                          attr->parse_attr, parse_state->hdrs,
-                                                          &attr->action, parse_state->extack);
+                                                          attr->parse_attr, &attr->action,
+                                                          parse_state->extack);
        } else {
                err = parse_tc_vlan_action(priv, act, esw_attr, &attr->action,
                                           parse_state->extack);
@@ -191,7 +191,6 @@ tc_act_post_parse_vlan(struct mlx5e_tc_act_parse_state *parse_state,
                       struct mlx5_flow_attr *attr)
 {
        struct mlx5e_tc_flow_parse_attr *parse_attr = attr->parse_attr;
-       struct pedit_headers_action *hdrs = parse_state->hdrs;
        struct netlink_ext_ack *extack = parse_state->extack;
        struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
        int err;
@@ -202,7 +201,7 @@ tc_act_post_parse_vlan(struct mlx5e_tc_act_parse_state *parse_state,
                 * tag rewrite.
                 */
                attr->action &= ~MLX5_FLOW_CONTEXT_ACTION_VLAN_POP;
-               err = add_vlan_prio_tag_rewrite_action(priv, parse_attr, hdrs,
+               err = add_vlan_prio_tag_rewrite_action(priv, parse_attr,
                                                       &attr->action, extack);
                if (err)
                        return err;
index 3d62f13..2fa58c6 100644 (file)
@@ -24,7 +24,6 @@ int
 mlx5e_tc_act_vlan_add_rewrite_action(struct mlx5e_priv *priv, int namespace,
                                     const struct flow_action_entry *act,
                                     struct mlx5e_tc_flow_parse_attr *parse_attr,
-                                    struct pedit_headers_action *hdrs,
                                     u32 *action, struct netlink_ext_ack *extack);
 
 #endif /* __MLX5_EN_TC_ACT_VLAN_H__ */
index 63e36e7..28444d4 100644 (file)
@@ -12,7 +12,6 @@ int
 mlx5e_tc_act_vlan_add_rewrite_action(struct mlx5e_priv *priv, int namespace,
                                     const struct flow_action_entry *act,
                                     struct mlx5e_tc_flow_parse_attr *parse_attr,
-                                    struct pedit_headers_action *hdrs,
                                     u32 *action, struct netlink_ext_ack *extack)
 {
        u16 mask16 = VLAN_VID_MASK;
@@ -44,7 +43,7 @@ mlx5e_tc_act_vlan_add_rewrite_action(struct mlx5e_priv *priv, int namespace,
                return -EOPNOTSUPP;
        }
 
-       err = mlx5e_tc_act_pedit_parse_action(priv, &pedit_act, namespace, parse_attr, hdrs,
+       err = mlx5e_tc_act_pedit_parse_action(priv, &pedit_act, namespace, parse_attr,
                                              NULL, extack);
        *action |= MLX5_FLOW_CONTEXT_ACTION_MOD_HDR;
 
@@ -54,7 +53,8 @@ mlx5e_tc_act_vlan_add_rewrite_action(struct mlx5e_priv *priv, int namespace,
 static bool
 tc_act_can_offload_vlan_mangle(struct mlx5e_tc_act_parse_state *parse_state,
                               const struct flow_action_entry *act,
-                              int act_index)
+                              int act_index,
+                              struct mlx5_flow_attr *attr)
 {
        return true;
 }
@@ -69,8 +69,7 @@ tc_act_parse_vlan_mangle(struct mlx5e_tc_act_parse_state *parse_state,
        int err;
 
        ns_type = mlx5e_get_flow_namespace(parse_state->flow);
-       err = mlx5e_tc_act_vlan_add_rewrite_action(priv, ns_type, act,
-                                                  attr->parse_attr, parse_state->hdrs,
+       err = mlx5e_tc_act_vlan_add_rewrite_action(priv, ns_type, act, attr->parse_attr,
                                                   &attr->action, parse_state->extack);
        if (err)
                return err;
index 31b4e39..9e0e229 100644 (file)
@@ -101,6 +101,7 @@ mlx5e_tc_post_act_add(struct mlx5e_post_act *post_act, struct mlx5_flow_attr *at
        post_attr->inner_match_level = MLX5_MATCH_NONE;
        post_attr->outer_match_level = MLX5_MATCH_NONE;
        post_attr->action &= ~(MLX5_FLOW_CONTEXT_ACTION_DECAP);
+       post_attr->flags &= ~MLX5_ATTR_FLAG_SAMPLE;
 
        handle->ns_type = post_act->ns_type;
        /* Splits were handled before post action */
index ff4b4f8..32230e6 100644 (file)
@@ -403,7 +403,7 @@ add_post_rule(struct mlx5_eswitch *esw, struct mlx5e_sample_flow *sample_flow,
        post_attr->chain = 0;
        post_attr->prio = 0;
        post_attr->ft = default_tbl;
-       post_attr->flags = MLX5_ESW_ATTR_FLAG_NO_IN_PORT;
+       post_attr->flags = MLX5_ATTR_FLAG_NO_IN_PORT;
 
        /* When offloading sample and encap action, if there is no valid
         * neigh data struct, a slow path rule is offloaded first. Source
@@ -492,8 +492,7 @@ del_post_rule(struct mlx5_eswitch *esw, struct mlx5e_sample_flow *sample_flow,
 struct mlx5_flow_handle *
 mlx5e_tc_sample_offload(struct mlx5e_tc_psample *tc_psample,
                        struct mlx5_flow_spec *spec,
-                       struct mlx5_flow_attr *attr,
-                       u32 tunnel_id)
+                       struct mlx5_flow_attr *attr)
 {
        struct mlx5e_post_act_handle *post_act_handle = NULL;
        struct mlx5_esw_flow_attr *esw_attr = attr->esw_attr;
@@ -502,6 +501,7 @@ mlx5e_tc_sample_offload(struct mlx5e_tc_psample *tc_psample,
        struct mlx5e_sample_flow *sample_flow;
        struct mlx5e_sample_attr *sample_attr;
        struct mlx5_flow_attr *pre_attr;
+       u32 tunnel_id = attr->tunnel_id;
        struct mlx5_eswitch *esw;
        u32 default_tbl_id;
        u32 obj_id;
@@ -513,7 +513,7 @@ mlx5e_tc_sample_offload(struct mlx5e_tc_psample *tc_psample,
        sample_flow = kzalloc(sizeof(*sample_flow), GFP_KERNEL);
        if (!sample_flow)
                return ERR_PTR(-ENOMEM);
-       sample_attr = attr->sample_attr;
+       sample_attr = &attr->sample_attr;
        sample_attr->sample_flow = sample_flow;
 
        /* For NICs with reg_c_preserve support or decap action, use
@@ -546,6 +546,7 @@ mlx5e_tc_sample_offload(struct mlx5e_tc_psample *tc_psample,
                err = PTR_ERR(sample_flow->sampler);
                goto err_sampler;
        }
+       sample_attr->sampler_id = sample_flow->sampler->sampler_id;
 
        /* Create an id mapping reg_c0 value to sample object. */
        restore_obj.type = MLX5_MAPPED_OBJ_SAMPLE;
@@ -580,13 +581,12 @@ mlx5e_tc_sample_offload(struct mlx5e_tc_psample *tc_psample,
        if (tunnel_id)
                pre_attr->action |= MLX5_FLOW_CONTEXT_ACTION_DECAP;
        pre_attr->modify_hdr = sample_flow->restore->modify_hdr;
-       pre_attr->flags = MLX5_ESW_ATTR_FLAG_SAMPLE;
+       pre_attr->flags = MLX5_ATTR_FLAG_SAMPLE;
        pre_attr->inner_match_level = attr->inner_match_level;
        pre_attr->outer_match_level = attr->outer_match_level;
        pre_attr->chain = attr->chain;
        pre_attr->prio = attr->prio;
-       pre_attr->sample_attr = attr->sample_attr;
-       sample_attr->sampler_id = sample_flow->sampler->sampler_id;
+       pre_attr->sample_attr = *sample_attr;
        pre_esw_attr = pre_attr->esw_attr;
        pre_esw_attr->in_mdev = esw_attr->in_mdev;
        pre_esw_attr->in_rep = esw_attr->in_rep;
@@ -633,11 +633,11 @@ mlx5e_tc_sample_unoffload(struct mlx5e_tc_psample *tc_psample,
         * will hit fw syndromes.
         */
        esw = tc_psample->esw;
-       sample_flow = attr->sample_attr->sample_flow;
+       sample_flow = attr->sample_attr.sample_flow;
        mlx5_eswitch_del_offloaded_rule(esw, sample_flow->pre_rule, sample_flow->pre_attr);
 
        sample_restore_put(tc_psample, sample_flow->restore);
-       mapping_remove(esw->offloads.reg_c0_obj_pool, attr->sample_attr->restore_obj_id);
+       mapping_remove(esw->offloads.reg_c0_obj_pool, attr->sample_attr.restore_obj_id);
        sampler_put(tc_psample, sample_flow->sampler);
        if (sample_flow->post_act_handle)
                mlx5e_tc_post_act_del(tc_psample->post_act, sample_flow->post_act_handle);
index 9ef8a49..a569367 100644 (file)
@@ -26,8 +26,7 @@ void mlx5e_tc_sample_skb(struct sk_buff *skb, struct mlx5_mapped_obj *mapped_obj
 struct mlx5_flow_handle *
 mlx5e_tc_sample_offload(struct mlx5e_tc_psample *sample_priv,
                        struct mlx5_flow_spec *spec,
-                       struct mlx5_flow_attr *attr,
-                       u32 tunnel_id);
+                       struct mlx5_flow_attr *attr);
 
 void
 mlx5e_tc_sample_unoffload(struct mlx5e_tc_psample *sample_priv,
@@ -45,8 +44,7 @@ mlx5e_tc_sample_cleanup(struct mlx5e_tc_psample *tc_psample);
 static inline struct mlx5_flow_handle *
 mlx5e_tc_sample_offload(struct mlx5e_tc_psample *tc_psample,
                        struct mlx5_flow_spec *spec,
-                       struct mlx5_flow_attr *attr,
-                       u32 tunnel_id)
+                       struct mlx5_flow_attr *attr)
 { return ERR_PTR(-EOPNOTSUPP); }
 
 static inline void
index 4a0d38d..0f4d3b9 100644 (file)
@@ -809,7 +809,7 @@ mlx5_tc_ct_entry_add_rule(struct mlx5_tc_ct_priv *ct_priv,
        attr->ft = nat ? ct_priv->ct_nat : ct_priv->ct;
        attr->outer_match_level = MLX5_MATCH_L4;
        attr->counter = entry->counter->counter;
-       attr->flags |= MLX5_ESW_ATTR_FLAG_NO_IN_PORT;
+       attr->flags |= MLX5_ATTR_FLAG_NO_IN_PORT;
        if (ct_priv->ns_type == MLX5_FLOW_NAMESPACE_FDB)
                attr->esw_attr->in_mdev = priv->mdev;
 
@@ -1787,7 +1787,6 @@ mlx5_tc_ct_del_ft_cb(struct mlx5_tc_ct_priv *ct_priv, struct mlx5_ct_ft *ft)
  */
 static struct mlx5_flow_handle *
 __mlx5_tc_ct_flow_offload(struct mlx5_tc_ct_priv *ct_priv,
-                         struct mlx5e_tc_flow *flow,
                          struct mlx5_flow_spec *orig_spec,
                          struct mlx5_flow_attr *attr)
 {
@@ -1871,12 +1870,10 @@ __mlx5_tc_ct_flow_offload(struct mlx5_tc_ct_priv *ct_priv,
         */
        if ((pre_ct_attr->action & MLX5_FLOW_CONTEXT_ACTION_DECAP) &&
            attr->chain == 0) {
-               u32 tun_id = mlx5e_tc_get_flow_tun_id(flow);
-
                err = mlx5e_tc_match_to_reg_set(priv->mdev, &pre_mod_acts,
                                                ct_priv->ns_type,
                                                TUNNEL_TO_REG,
-                                               tun_id);
+                                               attr->tunnel_id);
                if (err) {
                        ct_dbg("Failed to set tunnel register mapping");
                        goto err_mapping;
@@ -1926,87 +1923,19 @@ err_ft:
        return ERR_PTR(err);
 }
 
-static struct mlx5_flow_handle *
-__mlx5_tc_ct_flow_offload_clear(struct mlx5_tc_ct_priv *ct_priv,
-                               struct mlx5_flow_spec *orig_spec,
-                               struct mlx5_flow_attr *attr,
-                               struct mlx5e_tc_mod_hdr_acts *mod_acts)
-{
-       struct mlx5e_priv *priv = netdev_priv(ct_priv->netdev);
-       u32 attr_sz = ns_to_attr_sz(ct_priv->ns_type);
-       struct mlx5_flow_attr *pre_ct_attr;
-       struct mlx5_modify_hdr *mod_hdr;
-       struct mlx5_flow_handle *rule;
-       struct mlx5_ct_flow *ct_flow;
-       int err;
-
-       ct_flow = kzalloc(sizeof(*ct_flow), GFP_KERNEL);
-       if (!ct_flow)
-               return ERR_PTR(-ENOMEM);
-
-       /* Base esw attributes on original rule attribute */
-       pre_ct_attr = mlx5_alloc_flow_attr(ct_priv->ns_type);
-       if (!pre_ct_attr) {
-               err = -ENOMEM;
-               goto err_attr;
-       }
-
-       memcpy(pre_ct_attr, attr, attr_sz);
-
-       mod_hdr = mlx5_modify_header_alloc(priv->mdev, ct_priv->ns_type,
-                                          mod_acts->num_actions,
-                                          mod_acts->actions);
-       if (IS_ERR(mod_hdr)) {
-               err = PTR_ERR(mod_hdr);
-               ct_dbg("Failed to add create ct clear mod hdr");
-               goto err_mod_hdr;
-       }
-
-       pre_ct_attr->modify_hdr = mod_hdr;
-
-       rule = mlx5_tc_rule_insert(priv, orig_spec, pre_ct_attr);
-       if (IS_ERR(rule)) {
-               err = PTR_ERR(rule);
-               ct_dbg("Failed to add ct clear rule");
-               goto err_insert;
-       }
-
-       attr->ct_attr.ct_flow = ct_flow;
-       ct_flow->pre_ct_attr = pre_ct_attr;
-       ct_flow->pre_ct_rule = rule;
-       return rule;
-
-err_insert:
-       mlx5_modify_header_dealloc(priv->mdev, mod_hdr);
-err_mod_hdr:
-       netdev_warn(priv->netdev,
-                   "Failed to offload ct clear flow, err %d\n", err);
-       kfree(pre_ct_attr);
-err_attr:
-       kfree(ct_flow);
-
-       return ERR_PTR(err);
-}
-
 struct mlx5_flow_handle *
 mlx5_tc_ct_flow_offload(struct mlx5_tc_ct_priv *priv,
-                       struct mlx5e_tc_flow *flow,
                        struct mlx5_flow_spec *spec,
                        struct mlx5_flow_attr *attr,
                        struct mlx5e_tc_mod_hdr_acts *mod_hdr_acts)
 {
-       bool clear_action = attr->ct_attr.ct_action & TCA_CT_ACT_CLEAR;
        struct mlx5_flow_handle *rule;
 
        if (!priv)
                return ERR_PTR(-EOPNOTSUPP);
 
        mutex_lock(&priv->control_lock);
-
-       if (clear_action)
-               rule = __mlx5_tc_ct_flow_offload_clear(priv, spec, attr, mod_hdr_acts);
-       else
-               rule = __mlx5_tc_ct_flow_offload(priv, flow, spec, attr);
+       rule = __mlx5_tc_ct_flow_offload(priv, spec, attr);
        mutex_unlock(&priv->control_lock);
 
        return rule;
@@ -2014,14 +1943,13 @@ mlx5_tc_ct_flow_offload(struct mlx5_tc_ct_priv *priv,
 
 static void
 __mlx5_tc_ct_delete_flow(struct mlx5_tc_ct_priv *ct_priv,
-                        struct mlx5e_tc_flow *flow,
-                        struct mlx5_ct_flow *ct_flow)
+                        struct mlx5_ct_flow *ct_flow,
+                        struct mlx5_flow_attr *attr)
 {
        struct mlx5_flow_attr *pre_ct_attr = ct_flow->pre_ct_attr;
        struct mlx5e_priv *priv = netdev_priv(ct_priv->netdev);
 
-       mlx5_tc_rule_delete(priv, ct_flow->pre_ct_rule,
-                           pre_ct_attr);
+       mlx5_tc_rule_delete(priv, ct_flow->pre_ct_rule, pre_ct_attr);
        mlx5_modify_header_dealloc(priv->mdev, pre_ct_attr->modify_hdr);
 
        if (ct_flow->post_act_handle) {
@@ -2036,7 +1964,6 @@ __mlx5_tc_ct_delete_flow(struct mlx5_tc_ct_priv *ct_priv,
 
 void
 mlx5_tc_ct_delete_flow(struct mlx5_tc_ct_priv *priv,
-                      struct mlx5e_tc_flow *flow,
                       struct mlx5_flow_attr *attr)
 {
        struct mlx5_ct_flow *ct_flow = attr->ct_attr.ct_flow;
@@ -2048,7 +1975,7 @@ mlx5_tc_ct_delete_flow(struct mlx5_tc_ct_priv *priv,
                return;
 
        mutex_lock(&priv->control_lock);
-       __mlx5_tc_ct_delete_flow(priv, flow, ct_flow);
+       __mlx5_tc_ct_delete_flow(priv, ct_flow, attr);
        mutex_unlock(&priv->control_lock);
 }
 
index 99662af..2b21c7b 100644 (file)
@@ -116,13 +116,11 @@ mlx5_tc_ct_parse_action(struct mlx5_tc_ct_priv *priv,
 
 struct mlx5_flow_handle *
 mlx5_tc_ct_flow_offload(struct mlx5_tc_ct_priv *priv,
-                       struct mlx5e_tc_flow *flow,
                        struct mlx5_flow_spec *spec,
                        struct mlx5_flow_attr *attr,
                        struct mlx5e_tc_mod_hdr_acts *mod_hdr_acts);
 void
 mlx5_tc_ct_delete_flow(struct mlx5_tc_ct_priv *priv,
-                      struct mlx5e_tc_flow *flow,
                       struct mlx5_flow_attr *attr);
 
 bool
@@ -183,7 +181,6 @@ mlx5_tc_ct_parse_action(struct mlx5_tc_ct_priv *priv,
 
 static inline struct mlx5_flow_handle *
 mlx5_tc_ct_flow_offload(struct mlx5_tc_ct_priv *priv,
-                       struct mlx5e_tc_flow *flow,
                        struct mlx5_flow_spec *spec,
                        struct mlx5_flow_attr *attr,
                        struct mlx5e_tc_mod_hdr_acts *mod_hdr_acts)
@@ -193,7 +190,6 @@ mlx5_tc_ct_flow_offload(struct mlx5_tc_ct_priv *priv,
 
 static inline void
 mlx5_tc_ct_delete_flow(struct mlx5_tc_ct_priv *priv,
-                      struct mlx5e_tc_flow *flow,
                       struct mlx5_flow_attr *attr)
 {
 }
index f832c26..9ffba58 100644 (file)
@@ -37,6 +37,7 @@ struct mlx5e_tc_flow_parse_attr {
        const struct ip_tunnel_info *tun_info[MLX5_MAX_FLOW_FWD_VPORTS];
        struct net_device *filter_dev;
        struct mlx5_flow_spec spec;
+       struct pedit_headers_action hdrs[__PEDIT_CMD_MAX];
        struct mlx5e_tc_mod_hdr_acts mod_hdr_acts;
        int mirred_ifindex[MLX5_MAX_FLOW_FWD_VPORTS];
        struct ethhdr eth;
@@ -107,10 +108,19 @@ struct mlx5e_tc_flow {
        struct rcu_head rcu_head;
        struct completion init_done;
        struct completion del_hw_done;
-       int tunnel_id; /* the mapped tunnel id of this flow */
        struct mlx5_flow_attr *attr;
 };
 
+struct mlx5_flow_handle *
+mlx5e_tc_rule_offload(struct mlx5e_priv *priv,
+                     struct mlx5_flow_spec *spec,
+                     struct mlx5_flow_attr *attr);
+
+void
+mlx5e_tc_rule_unoffload(struct mlx5e_priv *priv,
+                       struct mlx5_flow_handle *rule,
+                       struct mlx5_flow_attr *attr);
+
 u8 mlx5e_tc_get_ip_version(struct mlx5_flow_spec *spec, bool outer);
 
 struct mlx5_flow_handle *
@@ -173,6 +183,7 @@ struct mlx5_flow_handle *
 mlx5e_tc_offload_to_slow_path(struct mlx5_eswitch *esw,
                              struct mlx5e_tc_flow *flow,
                              struct mlx5_flow_spec *spec);
+
 void mlx5e_tc_unoffload_fdb_rules(struct mlx5_eswitch *esw,
                                  struct mlx5e_tc_flow *flow,
                                  struct mlx5_flow_attr *attr);
index 9918ed8..1f8d339 100644 (file)
@@ -488,12 +488,14 @@ static void mlx5e_detach_encap_route(struct mlx5e_priv *priv,
                                     int out_index);
 
 void mlx5e_detach_encap(struct mlx5e_priv *priv,
-                       struct mlx5e_tc_flow *flow, int out_index)
+                       struct mlx5e_tc_flow *flow,
+                       struct mlx5_flow_attr *attr,
+                       int out_index)
 {
        struct mlx5e_encap_entry *e = flow->encaps[out_index].e;
        struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
 
-       if (flow->attr->esw_attr->dests[out_index].flags &
+       if (attr->esw_attr->dests[out_index].flags &
            MLX5_ESW_DEST_CHAIN_WITH_SRC_PORT_CHANGE)
                mlx5e_detach_encap_route(priv, flow, out_index);
 
@@ -733,6 +735,7 @@ static unsigned int mlx5e_route_tbl_get_last_update(struct mlx5e_priv *priv)
 
 static int mlx5e_attach_encap_route(struct mlx5e_priv *priv,
                                    struct mlx5e_tc_flow *flow,
+                                   struct mlx5_flow_attr *attr,
                                    struct mlx5e_encap_entry *e,
                                    bool new_encap_entry,
                                    unsigned long tbl_time_before,
@@ -740,6 +743,7 @@ static int mlx5e_attach_encap_route(struct mlx5e_priv *priv,
 
 int mlx5e_attach_encap(struct mlx5e_priv *priv,
                       struct mlx5e_tc_flow *flow,
+                      struct mlx5_flow_attr *attr,
                       struct net_device *mirred_dev,
                       int out_index,
                       struct netlink_ext_ack *extack,
@@ -748,7 +752,6 @@ int mlx5e_attach_encap(struct mlx5e_priv *priv,
 {
        struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
        struct mlx5e_tc_flow_parse_attr *parse_attr;
-       struct mlx5_flow_attr *attr = flow->attr;
        const struct ip_tunnel_info *tun_info;
        unsigned long tbl_time_before = 0;
        struct mlx5e_encap_entry *e;
@@ -834,8 +837,8 @@ int mlx5e_attach_encap(struct mlx5e_priv *priv,
        e->compl_result = 1;
 
 attach_flow:
-       err = mlx5e_attach_encap_route(priv, flow, e, entry_created, tbl_time_before,
-                                      out_index);
+       err = mlx5e_attach_encap_route(priv, flow, attr, e, entry_created,
+                                      tbl_time_before, out_index);
        if (err)
                goto out_err;
 
@@ -1198,6 +1201,7 @@ out:
 
 static int mlx5e_attach_encap_route(struct mlx5e_priv *priv,
                                    struct mlx5e_tc_flow *flow,
+                                   struct mlx5_flow_attr *attr,
                                    struct mlx5e_encap_entry *e,
                                    bool new_encap_entry,
                                    unsigned long tbl_time_before,
@@ -1206,7 +1210,6 @@ static int mlx5e_attach_encap_route(struct mlx5e_priv *priv,
        struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
        unsigned long tbl_time_after = tbl_time_before;
        struct mlx5e_tc_flow_parse_attr *parse_attr;
-       struct mlx5_flow_attr *attr = flow->attr;
        const struct ip_tunnel_info *tun_info;
        struct mlx5_esw_flow_attr *esw_attr;
        struct mlx5e_route_entry *r;
@@ -1377,7 +1380,7 @@ static void mlx5e_reoffload_encap(struct mlx5e_priv *priv,
                        continue;
                }
 
-               err = mlx5e_tc_add_flow_mod_hdr(priv, parse_attr, flow);
+               err = mlx5e_tc_add_flow_mod_hdr(priv, flow, attr);
                if (err) {
                        mlx5_core_warn(priv->mdev, "Failed to update flow mod_hdr err=%d",
                                       err);
index 3391504..d542b84 100644 (file)
@@ -7,15 +7,19 @@
 #include "tc_priv.h"
 
 void mlx5e_detach_encap(struct mlx5e_priv *priv,
-                       struct mlx5e_tc_flow *flow, int out_index);
+                       struct mlx5e_tc_flow *flow,
+                       struct mlx5_flow_attr *attr,
+                       int out_index);
 
 int mlx5e_attach_encap(struct mlx5e_priv *priv,
                       struct mlx5e_tc_flow *flow,
+                      struct mlx5_flow_attr *attr,
                       struct net_device *mirred_dev,
                       int out_index,
                       struct netlink_ext_ack *extack,
                       struct net_device **encap_dev,
                       bool *encap_valid);
+
 int mlx5e_attach_decap(struct mlx5e_priv *priv,
                       struct mlx5e_tc_flow *flow,
                       struct netlink_ext_ack *extack);
index 2022fa4..099d4ce 100644 (file)
@@ -295,13 +295,62 @@ mlx5_tc_rule_delete(struct mlx5e_priv *priv,
 
        if (is_mdev_switchdev_mode(priv->mdev)) {
                mlx5_eswitch_del_offloaded_rule(esw, rule, attr);
-
                return;
        }
 
        mlx5e_del_offloaded_nic_rule(priv, rule, attr);
 }
 
+struct mlx5_flow_handle *
+mlx5e_tc_rule_offload(struct mlx5e_priv *priv,
+                     struct mlx5_flow_spec *spec,
+                     struct mlx5_flow_attr *attr)
+{
+       struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
+
+       if (attr->flags & MLX5_ATTR_FLAG_CT) {
+               struct mlx5e_tc_mod_hdr_acts *mod_hdr_acts =
+                       &attr->parse_attr->mod_hdr_acts;
+
+               return mlx5_tc_ct_flow_offload(get_ct_priv(priv),
+                                              spec, attr,
+                                              mod_hdr_acts);
+       }
+
+       if (!is_mdev_switchdev_mode(priv->mdev))
+               return mlx5e_add_offloaded_nic_rule(priv, spec, attr);
+
+       if (attr->flags & MLX5_ATTR_FLAG_SAMPLE)
+               return mlx5e_tc_sample_offload(get_sample_priv(priv), spec, attr);
+
+       return mlx5_eswitch_add_offloaded_rule(esw, spec, attr);
+}
+
+void
+mlx5e_tc_rule_unoffload(struct mlx5e_priv *priv,
+                       struct mlx5_flow_handle *rule,
+                       struct mlx5_flow_attr *attr)
+{
+       struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
+
+       if (attr->flags & MLX5_ATTR_FLAG_CT) {
+               mlx5_tc_ct_delete_flow(get_ct_priv(priv), attr);
+               return;
+       }
+
+       if (!is_mdev_switchdev_mode(priv->mdev)) {
+               mlx5e_del_offloaded_nic_rule(priv, rule, attr);
+               return;
+       }
+
+       if (attr->flags & MLX5_ATTR_FLAG_SAMPLE) {
+               mlx5e_tc_sample_unoffload(get_sample_priv(priv), rule, attr);
+               return;
+       }
+
+       mlx5_eswitch_del_offloaded_rule(esw, rule, attr);
+}
+
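For context, a minimal sketch of how the new helper pair is meant to be used: the CT, sample, NIC or eswitch branch is now selected purely by flags on the attr (set during parsing), not by per-flow state. Here `priv', `spec' and `attr' stand for a hypothetical already-parsed flow:

        struct mlx5_flow_handle *rule;

        attr->flags |= MLX5_ATTR_FLAG_CT;       /* e.g. flow carries a ct action */
        rule = mlx5e_tc_rule_offload(priv, spec, attr);
        if (IS_ERR(rule))
                return PTR_ERR(rule);
        /* ... */
        /* the same attr routes the delete to the matching backend */
        mlx5e_tc_rule_unoffload(priv, rule, attr);

This is the pattern mlx5e_tc_offload_fdb_rules() switches to further down in this diff.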
 int
 mlx5e_tc_match_to_reg_set(struct mlx5_core_dev *mdev,
                          struct mlx5e_tc_mod_hdr_acts *mod_hdr_acts,
@@ -1039,6 +1088,21 @@ err_ft_get:
 }
 
 static int
+alloc_flow_attr_counter(struct mlx5_core_dev *counter_dev,
+                       struct mlx5_flow_attr *attr)
+{
+       struct mlx5_fc *counter;
+
+       counter = mlx5_fc_create(counter_dev, true);
+       if (IS_ERR(counter))
+               return PTR_ERR(counter);
+
+       attr->counter = counter;
+       return 0;
+}
+
+static int
 mlx5e_tc_add_nic_flow(struct mlx5e_priv *priv,
                      struct mlx5e_tc_flow *flow,
                      struct netlink_ext_ack *extack)
@@ -1046,7 +1110,6 @@ mlx5e_tc_add_nic_flow(struct mlx5e_priv *priv,
        struct mlx5e_tc_flow_parse_attr *parse_attr;
        struct mlx5_flow_attr *attr = flow->attr;
        struct mlx5_core_dev *dev = priv->mdev;
-       struct mlx5_fc *counter;
        int err;
 
        parse_attr = attr->parse_attr;
@@ -1058,11 +1121,9 @@ mlx5e_tc_add_nic_flow(struct mlx5e_priv *priv,
        }
 
        if (attr->action & MLX5_FLOW_CONTEXT_ACTION_COUNT) {
-               counter = mlx5_fc_create(dev, true);
-               if (IS_ERR(counter))
-                       return PTR_ERR(counter);
-
-               attr->counter = counter;
+               err = alloc_flow_attr_counter(dev, attr);
+               if (err)
+                       return err;
        }
 
        if (attr->action & MLX5_FLOW_CONTEXT_ACTION_MOD_HDR) {
@@ -1072,8 +1133,8 @@ mlx5e_tc_add_nic_flow(struct mlx5e_priv *priv,
                        return err;
        }
 
-       if (flow_flag_test(flow, CT))
-               flow->rule[0] = mlx5_tc_ct_flow_offload(get_ct_priv(priv), flow, &parse_attr->spec,
+       if (attr->flags & MLX5_ATTR_FLAG_CT)
+               flow->rule[0] = mlx5_tc_ct_flow_offload(get_ct_priv(priv), &parse_attr->spec,
                                                        attr, &parse_attr->mod_hdr_acts);
        else
                flow->rule[0] = mlx5e_add_offloaded_nic_rule(priv, &parse_attr->spec,
@@ -1107,8 +1168,8 @@ static void mlx5e_tc_del_nic_flow(struct mlx5e_priv *priv,
 
        flow_flag_clear(flow, OFFLOADED);
 
-       if (flow_flag_test(flow, CT))
-               mlx5_tc_ct_delete_flow(get_ct_priv(flow->priv), flow, attr);
+       if (attr->flags & MLX5_ATTR_FLAG_CT)
+               mlx5_tc_ct_delete_flow(get_ct_priv(flow->priv), attr);
        else if (!IS_ERR_OR_NULL(flow->rule[0]))
                mlx5e_del_offloaded_nic_rule(priv, flow->rule[0], attr);
 
@@ -1142,40 +1203,27 @@ mlx5e_tc_offload_fdb_rules(struct mlx5_eswitch *esw,
                           struct mlx5_flow_spec *spec,
                           struct mlx5_flow_attr *attr)
 {
-       struct mlx5e_tc_mod_hdr_acts *mod_hdr_acts;
        struct mlx5_flow_handle *rule;
 
-       if (attr->flags & MLX5_ESW_ATTR_FLAG_SLOW_PATH)
+       if (attr->flags & MLX5_ATTR_FLAG_SLOW_PATH)
                return mlx5_eswitch_add_offloaded_rule(esw, spec, attr);
 
-       if (flow_flag_test(flow, CT)) {
-               mod_hdr_acts = &attr->parse_attr->mod_hdr_acts;
-
-               rule = mlx5_tc_ct_flow_offload(get_ct_priv(flow->priv),
-                                              flow, spec, attr,
-                                              mod_hdr_acts);
-       } else if (flow_flag_test(flow, SAMPLE)) {
-               rule = mlx5e_tc_sample_offload(get_sample_priv(flow->priv), spec, attr,
-                                              mlx5e_tc_get_flow_tun_id(flow));
-       } else {
-               rule = mlx5_eswitch_add_offloaded_rule(esw, spec, attr);
-       }
+       rule = mlx5e_tc_rule_offload(flow->priv, spec, attr);
 
        if (IS_ERR(rule))
                return rule;
 
        if (attr->esw_attr->split_count) {
                flow->rule[1] = mlx5_eswitch_add_fwd_rule(esw, spec, attr);
-               if (IS_ERR(flow->rule[1])) {
-                       if (flow_flag_test(flow, CT))
-                               mlx5_tc_ct_delete_flow(get_ct_priv(flow->priv), flow, attr);
-                       else
-                               mlx5_eswitch_del_offloaded_rule(esw, rule, attr);
-                       return flow->rule[1];
-               }
+               if (IS_ERR(flow->rule[1]))
+                       goto err_rule1;
        }
 
        return rule;
+
+err_rule1:
+       mlx5e_tc_rule_unoffload(flow->priv, rule, attr);
+       return flow->rule[1];
 }
 
 void mlx5e_tc_unoffload_fdb_rules(struct mlx5_eswitch *esw,
@@ -1184,19 +1232,13 @@ void mlx5e_tc_unoffload_fdb_rules(struct mlx5_eswitch *esw,
 {
        flow_flag_clear(flow, OFFLOADED);
 
-       if (attr->flags & MLX5_ESW_ATTR_FLAG_SLOW_PATH)
-               goto offload_rule_0;
+       if (attr->flags & MLX5_ATTR_FLAG_SLOW_PATH)
+               return mlx5_eswitch_del_offloaded_rule(esw, flow->rule[0], attr);
 
        if (attr->esw_attr->split_count)
                mlx5_eswitch_del_fwd_rule(esw, flow->rule[1], attr);
 
-       if (flow_flag_test(flow, CT))
-               mlx5_tc_ct_delete_flow(get_ct_priv(flow->priv), flow, attr);
-       else if (flow_flag_test(flow, SAMPLE))
-               mlx5e_tc_sample_unoffload(get_sample_priv(flow->priv), flow->rule[0], attr);
-       else
-offload_rule_0:
-               mlx5_eswitch_del_offloaded_rule(esw, flow->rule[0], attr);
+       mlx5e_tc_rule_unoffload(flow->priv, flow->rule[0], attr);
 }
 
 struct mlx5_flow_handle *
@@ -1214,7 +1256,7 @@ mlx5e_tc_offload_to_slow_path(struct mlx5_eswitch *esw,
        memcpy(slow_attr, flow->attr, ESW_FLOW_ATTR_SZ);
        slow_attr->action = MLX5_FLOW_CONTEXT_ACTION_FWD_DEST;
        slow_attr->esw_attr->split_count = 0;
-       slow_attr->flags |= MLX5_ESW_ATTR_FLAG_SLOW_PATH;
+       slow_attr->flags |= MLX5_ATTR_FLAG_SLOW_PATH;
 
        rule = mlx5e_tc_offload_fdb_rules(esw, flow, spec, slow_attr);
        if (!IS_ERR(rule))
@@ -1239,7 +1281,7 @@ void mlx5e_tc_unoffload_from_slow_path(struct mlx5_eswitch *esw,
        memcpy(slow_attr, flow->attr, ESW_FLOW_ATTR_SZ);
        slow_attr->action = MLX5_FLOW_CONTEXT_ACTION_FWD_DEST;
        slow_attr->esw_attr->split_count = 0;
-       slow_attr->flags |= MLX5_ESW_ATTR_FLAG_SLOW_PATH;
+       slow_attr->flags |= MLX5_ATTR_FLAG_SLOW_PATH;
        mlx5e_tc_unoffload_fdb_rules(esw, flow, slow_attr);
        flow_flag_clear(flow, SLOW);
        kfree(slow_attr);
@@ -1348,10 +1390,10 @@ int mlx5e_tc_query_route_vport(struct net_device *out_dev, struct net_device *ro
 }
 
 int mlx5e_tc_add_flow_mod_hdr(struct mlx5e_priv *priv,
-                             struct mlx5e_tc_flow_parse_attr *parse_attr,
-                             struct mlx5e_tc_flow *flow)
+                             struct mlx5e_tc_flow *flow,
+                             struct mlx5_flow_attr *attr)
 {
-       struct mlx5e_tc_mod_hdr_acts *mod_hdr_acts = &parse_attr->mod_hdr_acts;
+       struct mlx5e_tc_mod_hdr_acts *mod_hdr_acts = &attr->parse_attr->mod_hdr_acts;
        struct mlx5_modify_hdr *mod_hdr;
 
        mod_hdr = mlx5_modify_header_alloc(priv->mdev,
@@ -1361,13 +1403,101 @@ int mlx5e_tc_add_flow_mod_hdr(struct mlx5e_priv *priv,
        if (IS_ERR(mod_hdr))
                return PTR_ERR(mod_hdr);
 
-       WARN_ON(flow->attr->modify_hdr);
-       flow->attr->modify_hdr = mod_hdr;
+       WARN_ON(attr->modify_hdr);
+       attr->modify_hdr = mod_hdr;
 
        return 0;
 }
 
 static int
+set_encap_dests(struct mlx5e_priv *priv,
+               struct mlx5e_tc_flow *flow,
+               struct mlx5_flow_attr *attr,
+               struct netlink_ext_ack *extack,
+               bool *encap_valid,
+               bool *vf_tun)
+{
+       struct mlx5e_tc_flow_parse_attr *parse_attr;
+       struct mlx5_esw_flow_attr *esw_attr;
+       struct net_device *encap_dev = NULL;
+       struct mlx5e_rep_priv *rpriv;
+       struct mlx5e_priv *out_priv;
+       int out_index;
+       int err = 0;
+
+       parse_attr = attr->parse_attr;
+       esw_attr = attr->esw_attr;
+       *vf_tun = false;
+       *encap_valid = true;
+
+       for (out_index = 0; out_index < MLX5_MAX_FLOW_FWD_VPORTS; out_index++) {
+               struct net_device *out_dev;
+               int mirred_ifindex;
+
+               if (!(esw_attr->dests[out_index].flags & MLX5_ESW_DEST_ENCAP))
+                       continue;
+
+               mirred_ifindex = parse_attr->mirred_ifindex[out_index];
+               out_dev = dev_get_by_index(dev_net(priv->netdev), mirred_ifindex);
+               if (!out_dev) {
+                       NL_SET_ERR_MSG_MOD(extack, "Requested mirred device not found");
+                       err = -ENODEV;
+                       goto out;
+               }
+               err = mlx5e_attach_encap(priv, flow, attr, out_dev, out_index,
+                                        extack, &encap_dev, encap_valid);
+               dev_put(out_dev);
+               if (err)
+                       goto out;
+
+               if (esw_attr->dests[out_index].flags &
+                   MLX5_ESW_DEST_CHAIN_WITH_SRC_PORT_CHANGE &&
+                   !esw_attr->dest_int_port)
+                       *vf_tun = true;
+
+               out_priv = netdev_priv(encap_dev);
+               rpriv = out_priv->ppriv;
+               esw_attr->dests[out_index].rep = rpriv->rep;
+               esw_attr->dests[out_index].mdev = out_priv->mdev;
+       }
+
+       if (*vf_tun && esw_attr->out_count > 1) {
+               NL_SET_ERR_MSG_MOD(extack, "VF tunnel encap with mirroring is not supported");
+               err = -EOPNOTSUPP;
+               goto out;
+       }
+
+out:
+       return err;
+}
+
+static void
+clean_encap_dests(struct mlx5e_priv *priv,
+                 struct mlx5e_tc_flow *flow,
+                 struct mlx5_flow_attr *attr,
+                 bool *vf_tun)
+{
+       struct mlx5_esw_flow_attr *esw_attr;
+       int out_index;
+
+       esw_attr = attr->esw_attr;
+       *vf_tun = false;
+
+       for (out_index = 0; out_index < MLX5_MAX_FLOW_FWD_VPORTS; out_index++) {
+               if (!(esw_attr->dests[out_index].flags & MLX5_ESW_DEST_ENCAP))
+                       continue;
+
+               if (esw_attr->dests[out_index].flags &
+                   MLX5_ESW_DEST_CHAIN_WITH_SRC_PORT_CHANGE &&
+                   !esw_attr->dest_int_port)
+                       *vf_tun = true;
+
+               mlx5e_detach_encap(priv, flow, attr, out_index);
+               kfree(attr->parse_attr->tun_info[out_index]);
+       }
+}
+
+static int
 mlx5e_tc_add_fdb_flow(struct mlx5e_priv *priv,
                      struct mlx5e_tc_flow *flow,
                      struct netlink_ext_ack *extack)
@@ -1375,15 +1505,10 @@ mlx5e_tc_add_fdb_flow(struct mlx5e_priv *priv,
        struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
        struct mlx5e_tc_flow_parse_attr *parse_attr;
        struct mlx5_flow_attr *attr = flow->attr;
-       bool vf_tun = false, encap_valid = true;
-       struct net_device *encap_dev = NULL;
        struct mlx5_esw_flow_attr *esw_attr;
-       struct mlx5e_rep_priv *rpriv;
-       struct mlx5e_priv *out_priv;
-       struct mlx5_fc *counter;
+       bool vf_tun, encap_valid;
        u32 max_prio, max_chain;
        int err = 0;
-       int out_index;
 
        parse_attr = attr->parse_attr;
        esw_attr = attr->esw_attr;
@@ -1472,50 +1597,17 @@ mlx5e_tc_add_fdb_flow(struct mlx5e_priv *priv,
                esw_attr->int_port = int_port;
        }
 
-       for (out_index = 0; out_index < MLX5_MAX_FLOW_FWD_VPORTS; out_index++) {
-               struct net_device *out_dev;
-               int mirred_ifindex;
-
-               if (!(esw_attr->dests[out_index].flags & MLX5_ESW_DEST_ENCAP))
-                       continue;
-
-               mirred_ifindex = parse_attr->mirred_ifindex[out_index];
-               out_dev = dev_get_by_index(dev_net(priv->netdev), mirred_ifindex);
-               if (!out_dev) {
-                       NL_SET_ERR_MSG_MOD(extack, "Requested mirred device not found");
-                       err = -ENODEV;
-                       goto err_out;
-               }
-               err = mlx5e_attach_encap(priv, flow, out_dev, out_index,
-                                        extack, &encap_dev, &encap_valid);
-               dev_put(out_dev);
-               if (err)
-                       goto err_out;
-
-               if (esw_attr->dests[out_index].flags &
-                   MLX5_ESW_DEST_CHAIN_WITH_SRC_PORT_CHANGE &&
-                   !esw_attr->dest_int_port)
-                       vf_tun = true;
-               out_priv = netdev_priv(encap_dev);
-               rpriv = out_priv->ppriv;
-               esw_attr->dests[out_index].rep = rpriv->rep;
-               esw_attr->dests[out_index].mdev = out_priv->mdev;
-       }
-
-       if (vf_tun && esw_attr->out_count > 1) {
-               NL_SET_ERR_MSG_MOD(extack, "VF tunnel encap with mirroring is not supported");
-               err = -EOPNOTSUPP;
+       err = set_encap_dests(priv, flow, attr, extack, &encap_valid, &vf_tun);
+       if (err)
                goto err_out;
-       }
 
        err = mlx5_eswitch_add_vlan_action(esw, attr);
        if (err)
                goto err_out;
 
-       if (attr->action & MLX5_FLOW_CONTEXT_ACTION_MOD_HDR &&
-           !(attr->ct_attr.ct_action & TCA_CT_ACT_CLEAR)) {
+       if (attr->action & MLX5_FLOW_CONTEXT_ACTION_MOD_HDR) {
                if (vf_tun) {
-                       err = mlx5e_tc_add_flow_mod_hdr(priv, parse_attr, flow);
+                       err = mlx5e_tc_add_flow_mod_hdr(priv, flow, attr);
                        if (err)
                                goto err_out;
                } else {
@@ -1526,13 +1618,9 @@ mlx5e_tc_add_fdb_flow(struct mlx5e_priv *priv,
        }
 
        if (attr->action & MLX5_FLOW_CONTEXT_ACTION_COUNT) {
-               counter = mlx5_fc_create(esw_attr->counter_dev, true);
-               if (IS_ERR(counter)) {
-                       err = PTR_ERR(counter);
+               err = alloc_flow_attr_counter(esw_attr->counter_dev, attr);
+               if (err)
                        goto err_out;
-               }
-
-               attr->counter = counter;
        }
 
        /* we get here if one of the following takes place:
@@ -1576,8 +1664,7 @@ static void mlx5e_tc_del_fdb_flow(struct mlx5e_priv *priv,
        struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
        struct mlx5_flow_attr *attr = flow->attr;
        struct mlx5_esw_flow_attr *esw_attr;
-       bool vf_tun = false;
-       int out_index;
+       bool vf_tun;
 
        esw_attr = attr->esw_attr;
        mlx5e_put_flow_tunnel_id(flow);
@@ -1601,16 +1688,7 @@ static void mlx5e_tc_del_fdb_flow(struct mlx5e_priv *priv,
        if (flow->decap_route)
                mlx5e_detach_decap_route(priv, flow);
 
-       for (out_index = 0; out_index < MLX5_MAX_FLOW_FWD_VPORTS; out_index++) {
-               if (esw_attr->dests[out_index].flags &
-                   MLX5_ESW_DEST_CHAIN_WITH_SRC_PORT_CHANGE &&
-                   !esw_attr->dest_int_port)
-                       vf_tun = true;
-               if (esw_attr->dests[out_index].flags & MLX5_ESW_DEST_ENCAP) {
-                       mlx5e_detach_encap(priv, flow, out_index);
-                       kfree(attr->parse_attr->tun_info[out_index]);
-               }
-       }
+       clean_encap_dests(priv, flow, attr, &vf_tun);
 
        mlx5_tc_ct_match_del(get_ct_priv(priv), &flow->attr->ct_attr);
 
@@ -1634,7 +1712,6 @@ static void mlx5e_tc_del_fdb_flow(struct mlx5e_priv *priv,
        if (flow_flag_test(flow, L3_TO_L2_DECAP))
                mlx5e_detach_decap(priv, flow);
 
-       kfree(attr->sample_attr);
        kvfree(attr->esw_attr->rx_tun_attr);
        kvfree(attr->parse_attr);
        kfree(flow->attr);
@@ -1854,7 +1931,7 @@ static int mlx5e_get_flow_tunnel_id(struct mlx5e_priv *priv,
                attr->action |= MLX5_FLOW_CONTEXT_ACTION_MOD_HDR;
        }
 
-       flow->tunnel_id = value;
+       flow->attr->tunnel_id = value;
        return 0;
 
 err_set:
@@ -1868,8 +1945,8 @@ err_enc_opts:
 
 static void mlx5e_put_flow_tunnel_id(struct mlx5e_tc_flow *flow)
 {
-       u32 enc_opts_id = flow->tunnel_id & ENC_OPTS_BITS_MASK;
-       u32 tun_id = flow->tunnel_id >> ENC_OPTS_BITS;
+       u32 enc_opts_id = flow->attr->tunnel_id & ENC_OPTS_BITS_MASK;
+       u32 tun_id = flow->attr->tunnel_id >> ENC_OPTS_BITS;
        struct mlx5_rep_uplink_priv *uplink_priv;
        struct mlx5e_rep_priv *uplink_rpriv;
        struct mlx5_eswitch *esw;
@@ -1885,11 +1962,6 @@ static void mlx5e_put_flow_tunnel_id(struct mlx5e_tc_flow *flow)
                               enc_opts_id);
 }
 
-u32 mlx5e_tc_get_flow_tun_id(struct mlx5e_tc_flow *flow)
-{
-       return flow->tunnel_id;
-}
-
 void mlx5e_tc_set_ethertype(struct mlx5_core_dev *mdev,
                            struct flow_match_basic *match, bool outer,
                            void *headers_c, void *headers_v)
@@ -2811,14 +2883,15 @@ static unsigned long mask_to_le(unsigned long mask, int size)
 
        return mask;
 }
+
 static int offload_pedit_fields(struct mlx5e_priv *priv,
                                int namespace,
-                               struct pedit_headers_action *hdrs,
                                struct mlx5e_tc_flow_parse_attr *parse_attr,
                                u32 *action_flags,
                                struct netlink_ext_ack *extack)
 {
        struct pedit_headers *set_masks, *add_masks, *set_vals, *add_vals;
+       struct pedit_headers_action *hdrs = parse_attr->hdrs;
        void *headers_c, *headers_v, *action, *vals_p;
        u32 *s_masks_p, *a_masks_p, s_mask, a_mask;
        struct mlx5e_tc_mod_hdr_acts *mod_acts;
@@ -2944,35 +3017,43 @@ static int offload_pedit_fields(struct mlx5e_priv *priv,
 
 static const struct pedit_headers zero_masks = {};
 
-static int alloc_tc_pedit_action(struct mlx5e_priv *priv, int namespace,
-                                struct mlx5e_tc_flow_parse_attr *parse_attr,
-                                struct pedit_headers_action *hdrs,
-                                u32 *action_flags,
-                                struct netlink_ext_ack *extack)
+static int verify_offload_pedit_fields(struct mlx5e_priv *priv,
+                                      struct mlx5e_tc_flow_parse_attr *parse_attr,
+                                      struct netlink_ext_ack *extack)
 {
        struct pedit_headers *cmd_masks;
-       int err;
        u8 cmd;
 
-       err = offload_pedit_fields(priv, namespace, hdrs, parse_attr,
-                                  action_flags, extack);
-       if (err < 0)
-               goto out_dealloc_parsed_actions;
-
        for (cmd = 0; cmd < __PEDIT_CMD_MAX; cmd++) {
-               cmd_masks = &hdrs[cmd].masks;
+               cmd_masks = &parse_attr->hdrs[cmd].masks;
                if (memcmp(cmd_masks, &zero_masks, sizeof(zero_masks))) {
-                       NL_SET_ERR_MSG_MOD(extack,
-                                          "attempt to offload an unsupported field");
+                       NL_SET_ERR_MSG_MOD(extack, "attempt to offload an unsupported field");
                        netdev_warn(priv->netdev, "attempt to offload an unsupported field (cmd %d)\n", cmd);
                        print_hex_dump(KERN_WARNING, "mask: ", DUMP_PREFIX_ADDRESS,
                                       16, 1, cmd_masks, sizeof(zero_masks), true);
-                       err = -EOPNOTSUPP;
-                       goto out_dealloc_parsed_actions;
+                       return -EOPNOTSUPP;
                }
        }
 
        return 0;
+}
+
+static int alloc_tc_pedit_action(struct mlx5e_priv *priv, int namespace,
+                                struct mlx5e_tc_flow_parse_attr *parse_attr,
+                                u32 *action_flags,
+                                struct netlink_ext_ack *extack)
+{
+       int err;
+
+       err = offload_pedit_fields(priv, namespace, parse_attr, action_flags, extack);
+       if (err)
+               goto out_dealloc_parsed_actions;
+
+       err = verify_offload_pedit_fields(priv, parse_attr, extack);
+       if (err)
+               goto out_dealloc_parsed_actions;
+
+       return 0;
 
 out_dealloc_parsed_actions:
        mlx5e_mod_hdr_dealloc(&parse_attr->mod_hdr_acts);
@@ -3257,7 +3338,7 @@ parse_tc_actions(struct mlx5e_tc_act_parse_state *parse_state,
                        return -EOPNOTSUPP;
                }
 
-               if (!tc_act->can_offload(parse_state, act, i))
+               if (!tc_act->can_offload(parse_state, act, i, attr))
                        return -EOPNOTSUPP;
 
                err = tc_act->parse_action(parse_state, act, priv, attr);
@@ -3268,7 +3349,7 @@ parse_tc_actions(struct mlx5e_tc_act_parse_state *parse_state,
        flow_action_for_each(i, act, flow_action) {
                tc_act = mlx5e_tc_act_get(act->id, ns_type);
                if (!tc_act || !tc_act->post_parse ||
-                   !tc_act->can_offload(parse_state, act, i))
+                   !tc_act->can_offload(parse_state, act, i, attr))
                        continue;
 
                err = tc_act->post_parse(parse_state, priv, attr);
@@ -3283,10 +3364,10 @@ static int
 actions_prepare_mod_hdr_actions(struct mlx5e_priv *priv,
                                struct mlx5e_tc_flow *flow,
                                struct mlx5_flow_attr *attr,
-                               struct pedit_headers_action *hdrs,
                                struct netlink_ext_ack *extack)
 {
        struct mlx5e_tc_flow_parse_attr *parse_attr = attr->parse_attr;
+       struct pedit_headers_action *hdrs = parse_attr->hdrs;
        enum mlx5_flow_namespace_type ns_type;
        int err;
 
@@ -3296,8 +3377,7 @@ actions_prepare_mod_hdr_actions(struct mlx5e_priv *priv,
 
        ns_type = mlx5e_get_flow_namespace(flow);
 
-       err = alloc_tc_pedit_action(priv, ns_type, parse_attr, hdrs,
-                                   &attr->action, extack);
+       err = alloc_tc_pedit_action(priv, ns_type, parse_attr, &attr->action, extack);
        if (err)
                return err;
 
@@ -3345,7 +3425,6 @@ parse_tc_nic_actions(struct mlx5e_priv *priv,
        struct mlx5e_tc_act_parse_state *parse_state;
        struct mlx5e_tc_flow_parse_attr *parse_attr;
        struct mlx5_flow_attr *attr = flow->attr;
-       struct pedit_headers_action *hdrs;
        int err;
 
        err = flow_action_supported(flow_action, extack);
@@ -3357,13 +3436,12 @@ parse_tc_nic_actions(struct mlx5e_priv *priv,
        parse_state = &parse_attr->parse_state;
        mlx5e_tc_act_init_parse_state(parse_state, flow, flow_action, extack);
        parse_state->ct_priv = get_ct_priv(priv);
-       hdrs = parse_state->hdrs;
 
        err = parse_tc_actions(parse_state, flow_action);
        if (err)
                return err;
 
-       err = actions_prepare_mod_hdr_actions(priv, flow, attr, hdrs, extack);
+       err = actions_prepare_mod_hdr_actions(priv, flow, attr, extack);
        if (err)
                return err;
 
@@ -3468,7 +3546,6 @@ parse_tc_fdb_actions(struct mlx5e_priv *priv,
        struct mlx5e_tc_flow_parse_attr *parse_attr;
        struct mlx5_flow_attr *attr = flow->attr;
        struct mlx5_esw_flow_attr *esw_attr;
-       struct pedit_headers_action *hdrs;
        int err;
 
        err = flow_action_supported(flow_action, extack);
@@ -3480,7 +3557,6 @@ parse_tc_fdb_actions(struct mlx5e_priv *priv,
        parse_state = &parse_attr->parse_state;
        mlx5e_tc_act_init_parse_state(parse_state, flow, flow_action, extack);
        parse_state->ct_priv = get_ct_priv(priv);
-       hdrs = parse_state->hdrs;
 
        err = parse_tc_actions(parse_state, flow_action);
        if (err)
@@ -3494,7 +3570,7 @@ parse_tc_fdb_actions(struct mlx5e_priv *priv,
                return -EOPNOTSUPP;
        }
 
-       err = actions_prepare_mod_hdr_actions(priv, flow, attr, hdrs, extack);
+       err = actions_prepare_mod_hdr_actions(priv, flow, attr, extack);
        if (err)
                return err;
 
index 5ffae9b..c622172 100644 (file)
@@ -71,7 +71,7 @@ struct mlx5_flow_attr {
        struct mlx5_fc *counter;
        struct mlx5_modify_hdr *modify_hdr;
        struct mlx5_ct_attr ct_attr;
-       struct mlx5e_sample_attr *sample_attr;
+       struct mlx5e_sample_attr sample_attr;
        struct mlx5e_tc_flow_parse_attr *parse_attr;
        u32 chain;
        u16 prio;
@@ -82,6 +82,7 @@ struct mlx5_flow_attr {
        u8 outer_match_level;
        u8 ip_version;
        u8 tun_ip_version;
+       int tunnel_id; /* mapped tunnel id */
        u32 flags;
        union {
                struct mlx5_esw_flow_attr esw_attr[0];
@@ -89,6 +90,23 @@ struct mlx5_flow_attr {
        };
 };
 
+enum {
+       MLX5_ATTR_FLAG_VLAN_HANDLED  = BIT(0),
+       MLX5_ATTR_FLAG_SLOW_PATH     = BIT(1),
+       MLX5_ATTR_FLAG_NO_IN_PORT    = BIT(2),
+       MLX5_ATTR_FLAG_SRC_REWRITE   = BIT(3),
+       MLX5_ATTR_FLAG_SAMPLE        = BIT(4),
+       MLX5_ATTR_FLAG_ACCEPT        = BIT(5),
+       MLX5_ATTR_FLAG_CT            = BIT(6),
+};
+
+/* Returns true if any of the flags that require skipping further TC/NF processing are set. */
+static inline bool
+mlx5e_tc_attr_flags_skip(u32 attr_flags)
+{
+       return attr_flags & (MLX5_ATTR_FLAG_SLOW_PATH | MLX5_ATTR_FLAG_ACCEPT);
+}
+
 struct mlx5_rx_tun_attr {
        u16 decap_vport;
        union {
@@ -243,11 +261,8 @@ int mlx5e_tc_match_to_reg_set_and_get_id(struct mlx5_core_dev *mdev,
                                         u32 data);
 
 int mlx5e_tc_add_flow_mod_hdr(struct mlx5e_priv *priv,
-                             struct mlx5e_tc_flow_parse_attr *parse_attr,
-                             struct mlx5e_tc_flow *flow);
-
-struct mlx5e_tc_flow;
-u32 mlx5e_tc_get_flow_tun_id(struct mlx5e_tc_flow *flow);
+                             struct mlx5e_tc_flow *flow,
+                             struct mlx5_flow_attr *attr);
 
 void mlx5e_tc_set_ethertype(struct mlx5_core_dev *mdev,
                            struct flow_match_basic *match, bool outer,
index c275fe0..0abef71 100644 (file)
@@ -86,7 +86,7 @@ mlx5_esw_indir_table_needed(struct mlx5_eswitch *esw,
                mlx5_eswitch_is_vf_vport(esw, vport_num) &&
                esw->dev == dest_mdev &&
                attr->ip_version &&
-               attr->flags & MLX5_ESW_ATTR_FLAG_SRC_REWRITE;
+               attr->flags & MLX5_ATTR_FLAG_SRC_REWRITE;
 }
 
 u16
index ead5e8a..44321cd 100644 (file)
@@ -448,22 +448,6 @@ enum {
        MLX5_ESW_DEST_CHAIN_WITH_SRC_PORT_CHANGE  = BIT(2),
 };
 
-enum {
-       MLX5_ESW_ATTR_FLAG_VLAN_HANDLED  = BIT(0),
-       MLX5_ESW_ATTR_FLAG_SLOW_PATH     = BIT(1),
-       MLX5_ESW_ATTR_FLAG_NO_IN_PORT    = BIT(2),
-       MLX5_ESW_ATTR_FLAG_SRC_REWRITE   = BIT(3),
-       MLX5_ESW_ATTR_FLAG_SAMPLE        = BIT(4),
-       MLX5_ESW_ATTR_FLAG_ACCEPT        = BIT(5),
-};
-
-/* Returns true if any of the flags that require skipping further TC/NF processing are set. */
-static inline bool
-mlx5_esw_attr_flags_skip(u32 attr_flags)
-{
-       return attr_flags & (MLX5_ESW_ATTR_FLAG_SLOW_PATH | MLX5_ESW_ATTR_FLAG_ACCEPT);
-}
-
 struct mlx5_esw_flow_attr {
        struct mlx5_eswitch_rep *in_rep;
        struct mlx5_core_dev    *in_mdev;
index 9a7b256..2b31d8b 100644 (file)
@@ -180,7 +180,7 @@ esw_setup_decap_indir(struct mlx5_eswitch *esw,
 {
        struct mlx5_flow_table *ft;
 
-       if (!(attr->flags & MLX5_ESW_ATTR_FLAG_SRC_REWRITE))
+       if (!(attr->flags & MLX5_ATTR_FLAG_SRC_REWRITE))
                return -EOPNOTSUPP;
 
        ft = mlx5_esw_indir_table_get(esw, attr, spec,
@@ -201,12 +201,12 @@ esw_cleanup_decap_indir(struct mlx5_eswitch *esw,
 static int
 esw_setup_sampler_dest(struct mlx5_flow_destination *dest,
                       struct mlx5_flow_act *flow_act,
-                      struct mlx5_flow_attr *attr,
+                      u32 sampler_id,
                       int i)
 {
        flow_act->flags |= FLOW_ACT_IGNORE_FLOW_LEVEL;
        dest[i].type = MLX5_FLOW_DESTINATION_TYPE_FLOW_SAMPLER;
-       dest[i].sampler_id = attr->sample_attr->sampler_id;
+       dest[i].sampler_id = sampler_id;
 
        return 0;
 }
@@ -297,7 +297,7 @@ esw_setup_chain_src_port_rewrite(struct mlx5_flow_destination *dest,
        struct mlx5_esw_flow_attr *esw_attr = attr->esw_attr;
        int err;
 
-       if (!(attr->flags & MLX5_ESW_ATTR_FLAG_SRC_REWRITE))
+       if (!(attr->flags & MLX5_ATTR_FLAG_SRC_REWRITE))
                return -EOPNOTSUPP;
 
        /* flow steering cannot handle more than one dest with the same ft
@@ -364,7 +364,7 @@ esw_setup_indir_table(struct mlx5_flow_destination *dest,
        struct mlx5_esw_flow_attr *esw_attr = attr->esw_attr;
        int j, err;
 
-       if (!(attr->flags & MLX5_ESW_ATTR_FLAG_SRC_REWRITE))
+       if (!(attr->flags & MLX5_ATTR_FLAG_SRC_REWRITE))
                return -EOPNOTSUPP;
 
        for (j = esw_attr->split_count; j < esw_attr->out_count; j++, (*i)++) {
@@ -463,15 +463,16 @@ esw_setup_dests(struct mlx5_flow_destination *dest,
 
        if (!mlx5_eswitch_termtbl_required(esw, attr, flow_act, spec) &&
            esw_src_port_rewrite_supported(esw))
-               attr->flags |= MLX5_ESW_ATTR_FLAG_SRC_REWRITE;
+               attr->flags |= MLX5_ATTR_FLAG_SRC_REWRITE;
 
-       if (attr->flags & MLX5_ESW_ATTR_FLAG_SAMPLE) {
-               esw_setup_sampler_dest(dest, flow_act, attr, *i);
+       if (attr->flags & MLX5_ATTR_FLAG_SAMPLE &&
+           !(attr->flags & MLX5_ATTR_FLAG_SLOW_PATH)) {
+               esw_setup_sampler_dest(dest, flow_act, attr->sample_attr.sampler_id, *i);
                (*i)++;
        } else if (attr->dest_ft) {
                esw_setup_ft_dest(dest, flow_act, esw, attr, spec, *i);
                (*i)++;
-       } else if (mlx5_esw_attr_flags_skip(attr->flags)) {
+       } else if (mlx5e_tc_attr_flags_skip(attr->flags)) {
                esw_setup_slow_path_dest(dest, flow_act, chains, *i);
                (*i)++;
        } else if (attr->dest_chain) {
@@ -498,7 +499,7 @@ esw_cleanup_dests(struct mlx5_eswitch *esw,
 
        if (attr->dest_ft) {
                esw_cleanup_decap_indir(esw, attr);
-       } else if (!mlx5_esw_attr_flags_skip(attr->flags)) {
+       } else if (!mlx5e_tc_attr_flags_skip(attr->flags)) {
                if (attr->dest_chain)
                        esw_cleanup_chain_dest(chains, attr->dest_chain, 1, 0);
                else if (esw_is_indir_table(esw, attr))
@@ -589,7 +590,7 @@ mlx5_eswitch_add_offloaded_rule(struct mlx5_eswitch *esw,
                else
                        fdb = attr->ft;
 
-               if (!(attr->flags & MLX5_ESW_ATTR_FLAG_NO_IN_PORT))
+               if (!(attr->flags & MLX5_ATTR_FLAG_NO_IN_PORT))
                        mlx5_eswitch_set_rule_source_port(esw, spec, attr,
                                                          esw_attr->in_mdev->priv.eswitch,
                                                          esw_attr->in_rep->vport);
@@ -721,7 +722,7 @@ __mlx5_eswitch_del_rule(struct mlx5_eswitch *esw,
 
        mlx5_del_flow_rules(rule);
 
-       if (!mlx5_esw_attr_flags_skip(attr->flags)) {
+       if (!mlx5e_tc_attr_flags_skip(attr->flags)) {
                /* unref the term table */
                for (i = 0; i < MLX5_MAX_FLOW_FWD_VPORTS; i++) {
                        if (esw_attr->dests[i].termtbl)
@@ -863,7 +864,7 @@ int mlx5_eswitch_add_vlan_action(struct mlx5_eswitch *esw,
        if (err)
                goto unlock;
 
-       attr->flags &= ~MLX5_ESW_ATTR_FLAG_VLAN_HANDLED;
+       attr->flags &= ~MLX5_ATTR_FLAG_VLAN_HANDLED;
 
        vport = esw_vlan_action_get_vport(esw_attr, push, pop);
 
@@ -871,7 +872,7 @@ int mlx5_eswitch_add_vlan_action(struct mlx5_eswitch *esw,
                /* tracks VF --> wire rules without vlan push action */
                if (esw_attr->dests[0].rep->vport == MLX5_VPORT_UPLINK) {
                        vport->vlan_refcount++;
-                       attr->flags |= MLX5_ESW_ATTR_FLAG_VLAN_HANDLED;
+                       attr->flags |= MLX5_ATTR_FLAG_VLAN_HANDLED;
                }
 
                goto unlock;
@@ -902,7 +903,7 @@ skip_set_push:
        }
 out:
        if (!err)
-               attr->flags |= MLX5_ESW_ATTR_FLAG_VLAN_HANDLED;
+               attr->flags |= MLX5_ATTR_FLAG_VLAN_HANDLED;
 unlock:
        mutex_unlock(&esw->state_lock);
        return err;
@@ -921,7 +922,7 @@ int mlx5_eswitch_del_vlan_action(struct mlx5_eswitch *esw,
        if (mlx5_eswitch_vlan_actions_supported(esw->dev, 1))
                return 0;
 
-       if (!(attr->flags & MLX5_ESW_ATTR_FLAG_VLAN_HANDLED))
+       if (!(attr->flags & MLX5_ATTR_FLAG_VLAN_HANDLED))
                return 0;
 
        push = !!(attr->action & MLX5_FLOW_CONTEXT_ACTION_VLAN_PUSH);
index 182306b..ee568bf 100644 (file)
@@ -219,12 +219,14 @@ mlx5_eswitch_termtbl_required(struct mlx5_eswitch *esw,
 
        if (!MLX5_CAP_ESW_FLOWTABLE_FDB(esw->dev, termination_table) ||
            !MLX5_CAP_ESW_FLOWTABLE_FDB(esw->dev, ignore_flow_level) ||
-           mlx5_esw_attr_flags_skip(attr->flags) ||
+           mlx5e_tc_attr_flags_skip(attr->flags) ||
            (!mlx5_eswitch_offload_is_uplink_port(esw, spec) && !esw_attr->int_port))
                return false;
 
        /* push vlan on RX */
-       if (flow_act->action & MLX5_FLOW_CONTEXT_ACTION_VLAN_PUSH)
+       if (flow_act->action & MLX5_FLOW_CONTEXT_ACTION_VLAN_PUSH &&
+           !(mlx5_fs_get_capabilities(esw->dev, MLX5_FLOW_NAMESPACE_FDB) &
+             MLX5_FLOW_STEERING_CAP_VLAN_PUSH_ON_RX))
                return true;
 
        /* hairpin */
index dafe341..a0ac17c 100644 (file)
@@ -152,6 +152,12 @@ static int mlx5_cmd_stub_destroy_ns(struct mlx5_flow_root_namespace *ns)
        return 0;
 }
 
+static u32 mlx5_cmd_stub_get_capabilities(struct mlx5_flow_root_namespace *ns,
+                                         enum fs_flow_table_type ft_type)
+{
+       return 0;
+}
+
 static int mlx5_cmd_set_slave_root_fdb(struct mlx5_core_dev *master,
                                       struct mlx5_core_dev *slave,
                                       bool ft_id_valid,
@@ -971,6 +977,12 @@ static int mlx5_cmd_create_match_definer(struct mlx5_flow_root_namespace *ns,
        return err ? err : MLX5_GET(general_obj_out_cmd_hdr, out, obj_id);
 }
 
+static u32 mlx5_cmd_get_capabilities(struct mlx5_flow_root_namespace *ns,
+                                    enum fs_flow_table_type ft_type)
+{
+       return 0;
+}
+
 static const struct mlx5_flow_cmds mlx5_flow_cmds = {
        .create_flow_table = mlx5_cmd_create_flow_table,
        .destroy_flow_table = mlx5_cmd_destroy_flow_table,
@@ -990,6 +1002,7 @@ static const struct mlx5_flow_cmds mlx5_flow_cmds = {
        .set_peer = mlx5_cmd_stub_set_peer,
        .create_ns = mlx5_cmd_stub_create_ns,
        .destroy_ns = mlx5_cmd_stub_destroy_ns,
+       .get_capabilities = mlx5_cmd_get_capabilities,
 };
 
 static const struct mlx5_flow_cmds mlx5_flow_cmd_stubs = {
@@ -1011,6 +1024,7 @@ static const struct mlx5_flow_cmds mlx5_flow_cmd_stubs = {
        .set_peer = mlx5_cmd_stub_set_peer,
        .create_ns = mlx5_cmd_stub_create_ns,
        .destroy_ns = mlx5_cmd_stub_destroy_ns,
+       .get_capabilities = mlx5_cmd_stub_get_capabilities,
 };
 
 const struct mlx5_flow_cmds *mlx5_fs_cmd_get_fw_cmds(void)
index 220ec63..274004e 100644 (file)
@@ -101,6 +101,9 @@ struct mlx5_flow_cmds {
                                    u16 format_id, u32 *match_mask);
        int (*destroy_match_definer)(struct mlx5_flow_root_namespace *ns,
                                     int definer_id);
+
+       u32 (*get_capabilities)(struct mlx5_flow_root_namespace *ns,
+                               enum fs_flow_table_type ft_type);
 };
 
 int mlx5_cmd_fc_alloc(struct mlx5_core_dev *dev, u32 *id);
index b628917..42f878e 100644 (file)
@@ -3040,6 +3040,22 @@ void mlx5_fs_ingress_acls_cleanup(struct mlx5_core_dev *dev)
        steering->esw_ingress_root_ns = NULL;
 }
 
+u32 mlx5_fs_get_capabilities(struct mlx5_core_dev *dev, enum mlx5_flow_namespace_type type)
+{
+       struct mlx5_flow_root_namespace *root;
+       struct mlx5_flow_namespace *ns;
+
+       ns = mlx5_get_flow_namespace(dev, type);
+       if (!ns)
+               return 0;
+
+       root = find_root(&ns->node);
+       if (!root)
+               return 0;
+
+       return root->cmds->get_capabilities(root, root->table_type);
+}
+
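A short usage sketch, assuming a probed `dev'; this mirrors the termination-table check added later in this series, where the FDB namespace is queried for the VLAN-push-on-RX capability:

        u32 caps;

        caps = mlx5_fs_get_capabilities(dev, MLX5_FLOW_NAMESPACE_FDB);
        if (caps & MLX5_FLOW_STEERING_CAP_VLAN_PUSH_ON_RX) {
                /* backend can push VLAN on RX directly; no termination
                 * table is required for that action
                 */
        }

A root namespace whose command set does not implement the query (the stubs above) simply reports 0, i.e. no capabilities.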
 static int init_egress_root_ns(struct mlx5_flow_steering *steering)
 {
        int err;
index 5469b08..c488a7c 100644 (file)
@@ -120,6 +120,11 @@ enum mlx5_flow_steering_mode {
        MLX5_FLOW_STEERING_MODE_SMFS
 };
 
+enum mlx5_flow_steering_capabilty {
+       MLX5_FLOW_STEERING_CAP_VLAN_PUSH_ON_RX = 1UL << 0,
+       MLX5_FLOW_STEERING_CAP_VLAN_POP_ON_TX = 1UL << 1,
+};
+
 struct mlx5_flow_steering {
        struct mlx5_core_dev *dev;
        enum   mlx5_flow_steering_mode  mode;
@@ -301,6 +306,8 @@ void mlx5_fs_egress_acls_cleanup(struct mlx5_core_dev *dev);
 int mlx5_fs_ingress_acls_init(struct mlx5_core_dev *dev, int total_vports);
 void mlx5_fs_ingress_acls_cleanup(struct mlx5_core_dev *dev);
 
+u32 mlx5_fs_get_capabilities(struct mlx5_core_dev *dev, enum mlx5_flow_namespace_type type);
+
 struct mlx5_flow_root_namespace *find_root(struct fs_node *node);
 
 #define fs_get_obj(v, _node)  {v = container_of((_node), typeof(*v), node); }
index a476da2..033757b 100644 (file)
@@ -735,6 +735,16 @@ static int mlx5_cmd_dr_destroy_ns(struct mlx5_flow_root_namespace *ns)
        return mlx5dr_domain_destroy(ns->fs_dr_domain.dr_domain);
 }
 
+static u32 mlx5_cmd_dr_get_capabilities(struct mlx5_flow_root_namespace *ns,
+                                       enum fs_flow_table_type ft_type)
+{
+       if (ft_type != FS_FT_FDB ||
+           MLX5_CAP_GEN(ns->dev, steering_format_version) != MLX5_STEERING_FORMAT_CONNECTX_6DX)
+               return 0;
+
+       return MLX5_FLOW_STEERING_CAP_VLAN_PUSH_ON_RX | MLX5_FLOW_STEERING_CAP_VLAN_POP_ON_TX;
+}
+
 bool mlx5_fs_dr_is_supported(struct mlx5_core_dev *dev)
 {
        return mlx5dr_is_supported(dev);
@@ -759,6 +769,7 @@ static const struct mlx5_flow_cmds mlx5_flow_cmds_dr = {
        .set_peer = mlx5_cmd_dr_set_peer,
        .create_ns = mlx5_cmd_dr_create_ns,
        .destroy_ns = mlx5_cmd_dr_destroy_ns,
+       .get_capabilities = mlx5_cmd_dr_get_capabilities,
 };
 
 const struct mlx5_flow_cmds *mlx5_fs_cmd_get_dr_cmds(void)
index 866b935..f45df5f 100644 (file)
@@ -212,6 +212,29 @@ struct mlxsw_event_listener_item {
        void *priv;
 };
 
+static const u8 mlxsw_core_trap_groups[] = {
+       MLXSW_REG_HTGT_TRAP_GROUP_EMAD,
+       MLXSW_REG_HTGT_TRAP_GROUP_CORE_EVENT,
+};
+
+static int mlxsw_core_trap_groups_set(struct mlxsw_core *mlxsw_core)
+{
+       char htgt_pl[MLXSW_REG_HTGT_LEN];
+       int err;
+       int i;
+
+       for (i = 0; i < ARRAY_SIZE(mlxsw_core_trap_groups); i++) {
+               mlxsw_reg_htgt_pack(htgt_pl, mlxsw_core_trap_groups[i],
+                                   MLXSW_REG_HTGT_INVALID_POLICER,
+                                   MLXSW_REG_HTGT_DEFAULT_PRIORITY,
+                                   MLXSW_REG_HTGT_DEFAULT_TC);
+               err = mlxsw_reg_write(mlxsw_core, MLXSW_REG(htgt), htgt_pl);
+               if (err)
+                       return err;
+       }
+       return 0;
+}
+
 /******************
  * EMAD processing
  ******************/
@@ -777,16 +800,10 @@ static int mlxsw_emad_init(struct mlxsw_core *mlxsw_core)
        if (err)
                goto err_trap_register;
 
-       err = mlxsw_core->driver->basic_trap_groups_set(mlxsw_core);
-       if (err)
-               goto err_emad_trap_set;
        mlxsw_core->emad.use_emad = true;
 
        return 0;
 
-err_emad_trap_set:
-       mlxsw_core_trap_unregister(mlxsw_core, &mlxsw_emad_rx_listener,
-                                  mlxsw_core);
 err_trap_register:
        destroy_workqueue(mlxsw_core->emad_wq);
        return err;
@@ -1706,7 +1723,7 @@ static void mlxsw_core_health_listener_func(const struct mlxsw_reg_info *reg,
 }
 
 static const struct mlxsw_listener mlxsw_core_health_listener =
-       MLXSW_EVENTL(mlxsw_core_health_listener_func, MFDE, MFDE);
+       MLXSW_CORE_EVENTL(mlxsw_core_health_listener_func, MFDE);
 
 static int
 mlxsw_core_health_fw_fatal_dump_fatal_cause(const char *mfde_pl,
@@ -2122,6 +2139,10 @@ __mlxsw_core_bus_device_register(const struct mlxsw_bus_info *mlxsw_bus_info,
                }
        }
 
+       err = mlxsw_core_trap_groups_set(mlxsw_core);
+       if (err)
+               goto err_trap_groups_set;
+
        err = mlxsw_emad_init(mlxsw_core);
        if (err)
                goto err_emad_init;
@@ -2181,6 +2202,7 @@ err_fw_rev_validate:
 err_register_params:
        mlxsw_emad_fini(mlxsw_core);
 err_emad_init:
+err_trap_groups_set:
        kfree(mlxsw_core->lag.mapping);
 err_alloc_lag_mapping:
        mlxsw_ports_fini(mlxsw_core, reload);
@@ -2540,6 +2562,45 @@ void mlxsw_core_trap_unregister(struct mlxsw_core *mlxsw_core,
 }
 EXPORT_SYMBOL(mlxsw_core_trap_unregister);
 
+int mlxsw_core_traps_register(struct mlxsw_core *mlxsw_core,
+                             const struct mlxsw_listener *listeners,
+                             size_t listeners_count, void *priv)
+{
+       int i, err;
+
+       for (i = 0; i < listeners_count; i++) {
+               err = mlxsw_core_trap_register(mlxsw_core,
+                                              &listeners[i],
+                                              priv);
+               if (err)
+                       goto err_listener_register;
+       }
+       return 0;
+
+err_listener_register:
+       for (i--; i >= 0; i--) {
+               mlxsw_core_trap_unregister(mlxsw_core,
+                                          &listeners[i],
+                                          priv);
+       }
+       return err;
+}
+EXPORT_SYMBOL(mlxsw_core_traps_register);
+
+void mlxsw_core_traps_unregister(struct mlxsw_core *mlxsw_core,
+                                const struct mlxsw_listener *listeners,
+                                size_t listeners_count, void *priv)
+{
+       int i;
+
+       for (i = 0; i < listeners_count; i++) {
+               mlxsw_core_trap_unregister(mlxsw_core,
+                                          &listeners[i],
+                                          priv);
+       }
+}
+EXPORT_SYMBOL(mlxsw_core_traps_unregister);
+
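The intended caller pattern for the bulk helpers, with a hypothetical listeners array; on a mid-array failure mlxsw_core_traps_register() unwinds the entries it already registered, so the caller only propagates the error:

        static const struct mlxsw_listener my_listeners[] = {  /* hypothetical */
                MLXSW_CORE_EVENTL(my_mfde_func, MFDE),
                MLXSW_CORE_EVENTL(my_mtwe_func, MTWE),
        };

        err = mlxsw_core_traps_register(mlxsw_core, my_listeners,
                                        ARRAY_SIZE(my_listeners), priv);
        if (err)
                return err;
        /* ... */
        mlxsw_core_traps_unregister(mlxsw_core, my_listeners,
                                    ARRAY_SIZE(my_listeners), priv);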
 int mlxsw_core_trap_state_set(struct mlxsw_core *mlxsw_core,
                              const struct mlxsw_listener *listener,
                              bool enabled)
index f30bb86..6d30409 100644 (file)
@@ -163,6 +163,9 @@ struct mlxsw_listener {
                .enabled_on_register = true,                                    \
        }
 
+#define MLXSW_CORE_EVENTL(_func, _trap_id)             \
+       MLXSW_EVENTL(_func, _trap_id, CORE_EVENT)
+
 int mlxsw_core_rx_listener_register(struct mlxsw_core *mlxsw_core,
                                    const struct mlxsw_rx_listener *rxl,
                                    void *priv, bool enabled);
@@ -181,6 +184,12 @@ int mlxsw_core_trap_register(struct mlxsw_core *mlxsw_core,
 void mlxsw_core_trap_unregister(struct mlxsw_core *mlxsw_core,
                                const struct mlxsw_listener *listener,
                                void *priv);
+int mlxsw_core_traps_register(struct mlxsw_core *mlxsw_core,
+                             const struct mlxsw_listener *listeners,
+                             size_t listeners_count, void *priv);
+void mlxsw_core_traps_unregister(struct mlxsw_core *mlxsw_core,
+                                const struct mlxsw_listener *listeners,
+                                size_t listeners_count, void *priv);
 int mlxsw_core_trap_state_set(struct mlxsw_core *mlxsw_core,
                              const struct mlxsw_listener *listener,
                              bool enabled);
@@ -315,7 +324,6 @@ struct mlxsw_driver {
                    const struct mlxsw_bus_info *mlxsw_bus_info,
                    struct netlink_ext_ack *extack);
        void (*fini)(struct mlxsw_core *mlxsw_core);
-       int (*basic_trap_groups_set)(struct mlxsw_core *mlxsw_core);
        int (*port_type_set)(struct mlxsw_core *mlxsw_core, u16 local_port,
                             enum devlink_port_type new_type);
        int (*port_split)(struct mlxsw_core *mlxsw_core, u16 local_port,
index 77e82e6..fa33cae 100644 (file)
@@ -1957,6 +1957,83 @@ int mlxsw_afa_block_append_mcrouter(struct mlxsw_afa_block *block,
 }
 EXPORT_SYMBOL(mlxsw_afa_block_append_mcrouter);
 
+/* SIP DIP Action
+ * --------------
+ * The SIP_DIP_ACTION modifies the SIP and DIP fields of the packet, e.g. for
+ * NAT. The L3 checksum is updated. The L4 checksum is also updated if the L4
+ * is TCP, or if the L4 is UDP and its checksum field is not zero.
+ */
+
+#define MLXSW_AFA_IP_CODE 0x11
+#define MLXSW_AFA_IP_SIZE 2
+
+enum mlxsw_afa_ip_s_d {
+       /* ip refers to dip */
+       MLXSW_AFA_IP_S_D_DIP,
+       /* ip refers to sip */
+       MLXSW_AFA_IP_S_D_SIP,
+};
+
+/* afa_ip_s_d
+ * Source or destination.
+ */
+MLXSW_ITEM32(afa, ip, s_d, 0x00, 31, 1);
+
+enum mlxsw_afa_ip_m_l {
+       /* LSB: ip[63:0] refers to ip[63:0] */
+       MLXSW_AFA_IP_M_L_LSB,
+       /* MSB: ip[63:0] refers to ip[127:64] */
+       MLXSW_AFA_IP_M_L_MSB,
+};
+
+/* afa_ip_m_l
+ * MSB or LSB.
+ */
+MLXSW_ITEM32(afa, ip, m_l, 0x00, 30, 1);
+
+/* afa_ip_ip_63_32
+ * Bits [63:32] in the IP address to change to.
+ */
+MLXSW_ITEM32(afa, ip, ip_63_32, 0x08, 0, 32);
+
+/* afa_ip_ip_31_0
+ * Bits [31:0] in the IP address to change to.
+ */
+MLXSW_ITEM32(afa, ip, ip_31_0, 0x0C, 0, 32);
+
+static void mlxsw_afa_ip_pack(char *payload, enum mlxsw_afa_ip_s_d s_d,
+                             enum mlxsw_afa_ip_m_l m_l, u32 ip_31_0,
+                             u32 ip_63_32)
+{
+       mlxsw_afa_ip_s_d_set(payload, s_d);
+       mlxsw_afa_ip_m_l_set(payload, m_l);
+       mlxsw_afa_ip_ip_31_0_set(payload, ip_31_0);
+       mlxsw_afa_ip_ip_63_32_set(payload, ip_63_32);
+}
+
+int mlxsw_afa_block_append_ip(struct mlxsw_afa_block *block, bool is_dip,
+                             bool is_lsb, u32 val_31_0, u32 val_63_32,
+                             struct netlink_ext_ack *extack)
+{
+       enum mlxsw_afa_ip_s_d s_d = is_dip ? MLXSW_AFA_IP_S_D_DIP :
+                                            MLXSW_AFA_IP_S_D_SIP;
+       enum mlxsw_afa_ip_m_l m_l = is_lsb ? MLXSW_AFA_IP_M_L_LSB :
+                                            MLXSW_AFA_IP_M_L_MSB;
+       char *act = mlxsw_afa_block_append_action(block,
+                                                 MLXSW_AFA_IP_CODE,
+                                                 MLXSW_AFA_IP_SIZE);
+
+       if (IS_ERR(act)) {
+               NL_SET_ERR_MSG_MOD(extack, "Cannot append IP action");
+               return PTR_ERR(act);
+       }
+
+       mlxsw_afa_ip_pack(act, s_d, m_l, val_31_0, val_63_32);
+       return 0;
+}
+EXPORT_SYMBOL(mlxsw_afa_block_append_ip);
+
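A sketch of appending a NAT-style DIP rewrite with the new action, assuming an IPv4 address is carried in the low 32 bits of the new-address operand (is_lsb set) and that `block' and `extack' come from the surrounding ACL rule context:

        /* rewrite the destination IP to 192.0.2.1 (0xc0000201) */
        err = mlxsw_afa_block_append_ip(block, true /* is_dip */,
                                        true /* is_lsb */,
                                        0xc0000201, 0, extack);
        if (err)
                return err;

Per the comment above, the device then recomputes the L3 checksum, and the L4 checksum for TCP, or for UDP when its checksum field is non-zero.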
 /* L4 Port Action
  * --------------
  * The L4_PORT_ACTION is used for modifying the sport and dport fields of the packet, e.g. for NAT.
index 16cbd6a..db58037 100644 (file)
@@ -92,6 +92,9 @@ int mlxsw_afa_block_append_fid_set(struct mlxsw_afa_block *block, u16 fid,
 int mlxsw_afa_block_append_mcrouter(struct mlxsw_afa_block *block,
                                    u16 expected_irif, u16 min_mtu,
                                    bool rmid_valid, u32 kvdl_index);
+int mlxsw_afa_block_append_ip(struct mlxsw_afa_block *block, bool is_dip,
+                             bool is_lsb, u32 val_31_0, u32 val_63_32,
+                             struct netlink_ext_ack *extack);
 int mlxsw_afa_block_append_l4port(struct mlxsw_afa_block *block, bool is_dport, u16 l4_port,
                                  struct netlink_ext_ack *extack);
 int mlxsw_afa_block_append_police(struct mlxsw_afa_block *block,
index 6dd4ae2..6ea4bf8 100644 (file)
@@ -18,6 +18,7 @@ struct mlxsw_env_module_info {
        int num_ports_mapped;
        int num_ports_up;
        enum ethtool_module_power_mode_policy power_mode_policy;
+       enum mlxsw_reg_pmtm_module_type type;
 };
 
 struct mlxsw_env {
@@ -27,14 +28,47 @@ struct mlxsw_env {
        struct mlxsw_env_module_info module_info[];
 };
 
-static int mlxsw_env_validate_cable_ident(struct mlxsw_core *core, int id,
-                                         bool *qsfp, bool *cmis)
+static int __mlxsw_env_validate_module_type(struct mlxsw_core *core, u8 module)
+{
+       struct mlxsw_env *mlxsw_env = mlxsw_core_env(core);
+       int err;
+
+       switch (mlxsw_env->module_info[module].type) {
+       case MLXSW_REG_PMTM_MODULE_TYPE_TWISTED_PAIR:
+               err = -EINVAL;
+               break;
+       default:
+               err = 0;
+       }
+
+       return err;
+}
+
+static int mlxsw_env_validate_module_type(struct mlxsw_core *core, u8 module)
+{
+       struct mlxsw_env *mlxsw_env = mlxsw_core_env(core);
+       int err;
+
+       mutex_lock(&mlxsw_env->module_info_lock);
+       err = __mlxsw_env_validate_module_type(core, module);
+       mutex_unlock(&mlxsw_env->module_info_lock);
+
+       return err;
+}
+
+static int
+mlxsw_env_validate_cable_ident(struct mlxsw_core *core, int id, bool *qsfp,
+                              bool *cmis)
 {
        char mcia_pl[MLXSW_REG_MCIA_LEN];
        char *eeprom_tmp;
        u8 ident;
        int err;
 
+       err = mlxsw_env_validate_module_type(core, id);
+       if (err)
+               return err;
+
        mlxsw_reg_mcia_pack(mcia_pl, id, 0, MLXSW_REG_MCIA_PAGE0_LO_OFF, 0, 1,
                            MLXSW_REG_MCIA_I2C_ADDR_LOW);
        err = mlxsw_reg_query(core, MLXSW_REG(mcia), mcia_pl);
@@ -206,7 +240,8 @@ int mlxsw_env_module_temp_thresholds_get(struct mlxsw_core *core, int module,
        return 0;
 }
 
-int mlxsw_env_get_module_info(struct mlxsw_core *mlxsw_core, int module,
+int mlxsw_env_get_module_info(struct net_device *netdev,
+                             struct mlxsw_core *mlxsw_core, int module,
                              struct ethtool_modinfo *modinfo)
 {
        u8 module_info[MLXSW_REG_MCIA_EEPROM_MODULE_INFO_SIZE];
@@ -215,6 +250,13 @@ int mlxsw_env_get_module_info(struct mlxsw_core *mlxsw_core, int module,
        unsigned int read_size;
        int err;
 
+       err = mlxsw_env_validate_module_type(mlxsw_core, module);
+       if (err) {
+               netdev_err(netdev,
+                          "EEPROM is not equipped on port module type\n");
+               return err;
+       }
+
        err = mlxsw_env_query_module_eeprom(mlxsw_core, module, 0, offset,
                                            module_info, false, &read_size);
        if (err)
@@ -356,6 +398,13 @@ mlxsw_env_get_module_eeprom_by_page(struct mlxsw_core *mlxsw_core, u8 module,
 {
        u32 bytes_read = 0;
        u16 device_addr;
+       int err;
+
+       err = mlxsw_env_validate_module_type(mlxsw_core, module);
+       if (err) {
+               NL_SET_ERR_MSG_MOD(extack, "EEPROM is not equipped on port module type");
+               return err;
+       }
 
        /* Offset cannot be larger than 2 * ETH_MODULE_EEPROM_PAGE_LEN */
        device_addr = page->offset;
@@ -364,7 +413,6 @@ mlxsw_env_get_module_eeprom_by_page(struct mlxsw_core *mlxsw_core, u8 module,
                char mcia_pl[MLXSW_REG_MCIA_LEN];
                char *eeprom_tmp;
                u8 size;
-               int err;
 
                size = min_t(u8, page->length - bytes_read,
                             MLXSW_REG_MCIA_EEPROM_SIZE);
@@ -419,6 +467,12 @@ int mlxsw_env_reset_module(struct net_device *netdev,
 
        mutex_lock(&mlxsw_env->module_info_lock);
 
+       err = __mlxsw_env_validate_module_type(mlxsw_core, module);
+       if (err) {
+               netdev_err(netdev, "Reset module is not supported on port module type\n");
+               goto out;
+       }
+
        if (mlxsw_env->module_info[module].num_ports_up) {
                netdev_err(netdev, "Cannot reset module when ports using it are administratively up\n");
                err = -EINVAL;
@@ -461,6 +515,12 @@ mlxsw_env_get_module_power_mode(struct mlxsw_core *mlxsw_core, u8 module,
 
        mutex_lock(&mlxsw_env->module_info_lock);
 
+       err = __mlxsw_env_validate_module_type(mlxsw_core, module);
+       if (err) {
+               NL_SET_ERR_MSG_MOD(extack, "Power mode is not supported on port module type");
+               goto out;
+       }
+
        params->policy = mlxsw_env->module_info[module].power_mode_policy;
 
        mlxsw_reg_mcion_pack(mcion_pl, module);
@@ -571,6 +631,13 @@ mlxsw_env_set_module_power_mode(struct mlxsw_core *mlxsw_core, u8 module,
 
        mutex_lock(&mlxsw_env->module_info_lock);
 
+       err = __mlxsw_env_validate_module_type(mlxsw_core, module);
+       if (err) {
+               NL_SET_ERR_MSG_MOD(extack,
+                                  "Power mode set is not supported on port module type");
+               goto out;
+       }
+
        if (mlxsw_env->module_info[module].power_mode_policy == policy)
                goto out;
 
@@ -661,13 +728,12 @@ static int mlxsw_env_temp_event_set(struct mlxsw_core *mlxsw_core,
        return mlxsw_reg_write(mlxsw_core, MLXSW_REG(mtmp), mtmp_pl);
 }
 
-static int mlxsw_env_module_temp_event_enable(struct mlxsw_core *mlxsw_core,
-                                             u8 module_count)
+static int mlxsw_env_module_temp_event_enable(struct mlxsw_core *mlxsw_core)
 {
        int i, err, sensor_index;
        bool has_temp_sensor;
 
-       for (i = 0; i < module_count; i++) {
+       for (i = 0; i < mlxsw_core_env(mlxsw_core)->module_count; i++) {
                err = mlxsw_env_module_has_temp_sensor(mlxsw_core, i,
                                                       &has_temp_sensor);
                if (err)
@@ -759,7 +825,7 @@ mlxsw_env_mtwe_listener_func(const struct mlxsw_reg_info *reg, char *mtwe_pl,
 }
 
 static const struct mlxsw_listener mlxsw_env_temp_warn_listener =
-       MLXSW_EVENTL(mlxsw_env_mtwe_listener_func, MTWE, MTWE);
+       MLXSW_CORE_EVENTL(mlxsw_env_mtwe_listener_func, MTWE);
 
 static int mlxsw_env_temp_warn_event_register(struct mlxsw_core *mlxsw_core)
 {
@@ -849,7 +915,7 @@ mlxsw_env_pmpe_listener_func(const struct mlxsw_reg_info *reg, char *pmpe_pl,
 }
 
 static const struct mlxsw_listener mlxsw_env_module_plug_listener =
-       MLXSW_EVENTL(mlxsw_env_pmpe_listener_func, PMPE, PMPE);
+       MLXSW_CORE_EVENTL(mlxsw_env_pmpe_listener_func, PMPE);
 
 static int
 mlxsw_env_module_plug_event_register(struct mlxsw_core *mlxsw_core)
@@ -876,12 +942,11 @@ mlxsw_env_module_plug_event_unregister(struct mlxsw_env *mlxsw_env)
 }
 
 static int
-mlxsw_env_module_oper_state_event_enable(struct mlxsw_core *mlxsw_core,
-                                        u8 module_count)
+mlxsw_env_module_oper_state_event_enable(struct mlxsw_core *mlxsw_core)
 {
        int i, err;
 
-       for (i = 0; i < module_count; i++) {
+       for (i = 0; i < mlxsw_core_env(mlxsw_core)->module_count; i++) {
                char pmaos_pl[MLXSW_REG_PMAOS_LEN];
 
                mlxsw_reg_pmaos_pack(pmaos_pl, i);
@@ -999,6 +1064,28 @@ out_unlock:
 }
 EXPORT_SYMBOL(mlxsw_env_module_port_down);
 
+static int
+mlxsw_env_module_type_set(struct mlxsw_core *mlxsw_core)
+{
+       struct mlxsw_env *mlxsw_env = mlxsw_core_env(mlxsw_core);
+       int i;
+
+       for (i = 0; i < mlxsw_env->module_count; i++) {
+               char pmtm_pl[MLXSW_REG_PMTM_LEN];
+               int err;
+
+               mlxsw_reg_pmtm_pack(pmtm_pl, 0, i);
+               err = mlxsw_reg_query(mlxsw_core, MLXSW_REG(pmtm), pmtm_pl);
+               if (err)
+                       return err;
+
+               mlxsw_env->module_info[i].type =
+                       mlxsw_reg_pmtm_module_type_get(pmtm_pl);
+       }
+
+       return 0;
+}
+
 int mlxsw_env_init(struct mlxsw_core *mlxsw_core, struct mlxsw_env **p_env)
 {
        char mgpir_pl[MLXSW_REG_MGPIR_LEN];
@@ -1037,17 +1124,21 @@ int mlxsw_env_init(struct mlxsw_core *mlxsw_core, struct mlxsw_env **p_env)
        if (err)
                goto err_module_plug_event_register;
 
-       err = mlxsw_env_module_oper_state_event_enable(mlxsw_core,
-                                                      env->module_count);
+       err = mlxsw_env_module_oper_state_event_enable(mlxsw_core);
        if (err)
                goto err_oper_state_event_enable;
 
-       err = mlxsw_env_module_temp_event_enable(mlxsw_core, env->module_count);
+       err = mlxsw_env_module_temp_event_enable(mlxsw_core);
        if (err)
                goto err_temp_event_enable;
 
+       err = mlxsw_env_module_type_set(mlxsw_core);
+       if (err)
+               goto err_type_set;
+
        return 0;
 
+err_type_set:
 err_temp_event_enable:
 err_oper_state_event_enable:
        mlxsw_env_module_plug_event_unregister(env);
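The new __mlxsw_env_validate_module_type()/mlxsw_env_validate_module_type() pair follows the usual kernel lock-split convention: the double-underscore variant expects module_info_lock to already be held (as in the reset and power-mode paths above, which take the lock for other reasons), while the plain variant takes and drops it around the check. A minimal sketch of the convention, with hypothetical names:

    #include <linux/mutex.h>

    struct foo {
            struct mutex lock;
            struct { bool usable; } items[16];
    };

    /* caller must hold foo->lock */
    static int __foo_validate(struct foo *foo, u8 idx)
    {
            lockdep_assert_held(&foo->lock);
            return foo->items[idx].usable ? 0 : -EINVAL;
    }

    static int foo_validate(struct foo *foo, u8 idx)
    {
            int err;

            mutex_lock(&foo->lock);
            err = __foo_validate(foo, idx);
            mutex_unlock(&foo->lock);
            return err;
    }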
index da121b1..ec6564e 100644 (file)
@@ -12,7 +12,8 @@ struct ethtool_eeprom;
 int mlxsw_env_module_temp_thresholds_get(struct mlxsw_core *core, int module,
                                         int off, int *temp);
 
-int mlxsw_env_get_module_info(struct mlxsw_core *mlxsw_core, int module,
+int mlxsw_env_get_module_info(struct net_device *netdev,
+                             struct mlxsw_core *mlxsw_core, int module,
                              struct ethtool_modinfo *modinfo);
 
 int mlxsw_env_get_module_eeprom(struct net_device *netdev,
index 10d13f5..9ac8ce0 100644 (file)
@@ -110,7 +110,8 @@ static int mlxsw_m_get_module_info(struct net_device *netdev,
        struct mlxsw_m_port *mlxsw_m_port = netdev_priv(netdev);
        struct mlxsw_core *core = mlxsw_m_port->mlxsw_m->core;
 
-       return mlxsw_env_get_module_info(core, mlxsw_m_port->module, modinfo);
+       return mlxsw_env_get_module_info(netdev, core, mlxsw_m_port->module,
+                                        modinfo);
 }
 
 static int
index 24cc650..eebd047 100644 (file)
@@ -4482,6 +4482,8 @@ MLXSW_ITEM32(reg, ptys, ext_eth_proto_cap, 0x08, 0, 32);
 #define MLXSW_REG_PTYS_ETH_SPEED_100GBASE_SR4          BIT(21)
 #define MLXSW_REG_PTYS_ETH_SPEED_100GBASE_KR4          BIT(22)
 #define MLXSW_REG_PTYS_ETH_SPEED_100GBASE_LR4_ER4      BIT(23)
+#define MLXSW_REG_PTYS_ETH_SPEED_100BASE_T             BIT(24)
+#define MLXSW_REG_PTYS_ETH_SPEED_1000BASE_T            BIT(25)
 #define MLXSW_REG_PTYS_ETH_SPEED_25GBASE_CR            BIT(27)
 #define MLXSW_REG_PTYS_ETH_SPEED_25GBASE_KR            BIT(28)
 #define MLXSW_REG_PTYS_ETH_SPEED_25GBASE_SR            BIT(29)
@@ -6062,6 +6064,58 @@ static inline void mlxsw_reg_pllp_unpack(char *payload, u8 *label_port,
        *slot_index = mlxsw_reg_pllp_slot_index_get(payload);
 }
 
+/* PMTM - Port Module Type Mapping Register
+ * ----------------------------------------
+ * The PMTM register allows query or configuration of module types.
+ * The register can only be set when the module is disabled by the PMAOS
+ * register.
+ */
+#define MLXSW_REG_PMTM_ID 0x5067
+#define MLXSW_REG_PMTM_LEN 0x10
+
+MLXSW_REG_DEFINE(pmtm, MLXSW_REG_PMTM_ID, MLXSW_REG_PMTM_LEN);
+
+/* reg_pmtm_slot_index
+ * Slot index.
+ * Access: Index
+ */
+MLXSW_ITEM32(reg, pmtm, slot_index, 0x00, 24, 4);
+
+/* reg_pmtm_module
+ * Module number.
+ * Access: Index
+ */
+MLXSW_ITEM32(reg, pmtm, module, 0x00, 16, 8);
+
+enum mlxsw_reg_pmtm_module_type {
+       MLXSW_REG_PMTM_MODULE_TYPE_BACKPLANE_4_LANES = 0,
+       MLXSW_REG_PMTM_MODULE_TYPE_QSFP = 1,
+       MLXSW_REG_PMTM_MODULE_TYPE_SFP = 2,
+       MLXSW_REG_PMTM_MODULE_TYPE_BACKPLANE_SINGLE_LANE = 4,
+       MLXSW_REG_PMTM_MODULE_TYPE_BACKPLANE_2_LANES = 8,
+       MLXSW_REG_PMTM_MODULE_TYPE_CHIP2CHIP4X = 10,
+       MLXSW_REG_PMTM_MODULE_TYPE_CHIP2CHIP2X = 11,
+       MLXSW_REG_PMTM_MODULE_TYPE_CHIP2CHIP1X = 12,
+       MLXSW_REG_PMTM_MODULE_TYPE_QSFP_DD = 14,
+       MLXSW_REG_PMTM_MODULE_TYPE_OSFP = 15,
+       MLXSW_REG_PMTM_MODULE_TYPE_SFP_DD = 16,
+       MLXSW_REG_PMTM_MODULE_TYPE_DSFP = 17,
+       MLXSW_REG_PMTM_MODULE_TYPE_CHIP2CHIP8X = 18,
+       MLXSW_REG_PMTM_MODULE_TYPE_TWISTED_PAIR = 19,
+};
+
+/* reg_pmtm_module_type
+ * Module type.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, pmtm, module_type, 0x04, 0, 5);
+
+static inline void mlxsw_reg_pmtm_pack(char *payload, u8 slot_index, u8 module)
+{
+       MLXSW_REG_ZERO(pmtm, payload);
+       mlxsw_reg_pmtm_slot_index_set(payload, slot_index);
+       mlxsw_reg_pmtm_module_set(payload, module);
+}
+
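For reference, querying a module's type with these accessors follows the same shape as mlxsw_env_module_type_set() in the core_env.c hunk above; a minimal sketch using only helpers visible in this diff:

    char pmtm_pl[MLXSW_REG_PMTM_LEN];
    enum mlxsw_reg_pmtm_module_type type;
    int err;

    mlxsw_reg_pmtm_pack(pmtm_pl, 0 /* slot */, module);
    err = mlxsw_reg_query(mlxsw_core, MLXSW_REG(pmtm), pmtm_pl);
    if (err)
            return err;
    type = mlxsw_reg_pmtm_module_type_get(pmtm_pl);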
 /* HTGT - Host Trap Group Table
  * ----------------------------
  * Configures the properties for forwarding to CPU.
@@ -6087,9 +6141,7 @@ MLXSW_ITEM32(reg, htgt, type, 0x00, 8, 4);
 
 enum mlxsw_reg_htgt_trap_group {
        MLXSW_REG_HTGT_TRAP_GROUP_EMAD,
-       MLXSW_REG_HTGT_TRAP_GROUP_MFDE,
-       MLXSW_REG_HTGT_TRAP_GROUP_MTWE,
-       MLXSW_REG_HTGT_TRAP_GROUP_PMPE,
+       MLXSW_REG_HTGT_TRAP_GROUP_CORE_EVENT,
        MLXSW_REG_HTGT_TRAP_GROUP_SP_STP,
        MLXSW_REG_HTGT_TRAP_GROUP_SP_LACP,
        MLXSW_REG_HTGT_TRAP_GROUP_SP_LLDP,
@@ -12568,6 +12620,7 @@ static const struct mlxsw_reg_info *mlxsw_reg_infos[] = {
        MLXSW_REG(pddr),
        MLXSW_REG(pmmp),
        MLXSW_REG(pllp),
+       MLXSW_REG(pmtm),
        MLXSW_REG(htgt),
        MLXSW_REG(hpkt),
        MLXSW_REG(rgcr),
index c7fc650..daacf62 100644 (file)
@@ -33,6 +33,7 @@ enum mlxsw_res_id {
        MLXSW_RES_ID_ACL_MAX_REGIONS,
        MLXSW_RES_ID_ACL_MAX_GROUPS,
        MLXSW_RES_ID_ACL_MAX_GROUP_SIZE,
+       MLXSW_RES_ID_ACL_MAX_DEFAULT_ACTIONS,
        MLXSW_RES_ID_ACL_FLEX_KEYS,
        MLXSW_RES_ID_ACL_MAX_ACTION_PER_RULE,
        MLXSW_RES_ID_ACL_ACTIONS_PER_SET,
@@ -90,6 +91,7 @@ static u16 mlxsw_res_ids[] = {
        [MLXSW_RES_ID_ACL_MAX_REGIONS] = 0x2903,
        [MLXSW_RES_ID_ACL_MAX_GROUPS] = 0x2904,
        [MLXSW_RES_ID_ACL_MAX_GROUP_SIZE] = 0x2905,
+       [MLXSW_RES_ID_ACL_MAX_DEFAULT_ACTIONS] = 0x2908,
        [MLXSW_RES_ID_ACL_FLEX_KEYS] = 0x2910,
        [MLXSW_RES_ID_ACL_MAX_ACTION_PER_RULE] = 0x2911,
        [MLXSW_RES_ID_ACL_ACTIONS_PER_SET] = 0x2912,
index aa411de..a4b94ee 100644 (file)
@@ -2148,13 +2148,11 @@ static void mlxsw_sp_pude_event_func(const struct mlxsw_reg_info *reg,
        struct mlxsw_sp *mlxsw_sp = priv;
        struct mlxsw_sp_port *mlxsw_sp_port;
        enum mlxsw_reg_pude_oper_status status;
-       unsigned int max_ports;
        u16 local_port;
 
-       max_ports = mlxsw_core_max_ports(mlxsw_sp->core);
        local_port = mlxsw_reg_pude_local_port_get(pude_pl);
 
-       if (WARN_ON_ONCE(!local_port || local_port >= max_ports))
+       if (WARN_ON_ONCE(!mlxsw_sp_local_port_is_valid(mlxsw_sp, local_port)))
                return;
        mlxsw_sp_port = mlxsw_sp->ports[local_port];
        if (!mlxsw_sp_port)
@@ -2393,45 +2391,6 @@ static int mlxsw_sp_trap_groups_set(struct mlxsw_core *mlxsw_core)
        return 0;
 }
 
-static int mlxsw_sp_traps_register(struct mlxsw_sp *mlxsw_sp,
-                                  const struct mlxsw_listener listeners[],
-                                  size_t listeners_count)
-{
-       int i;
-       int err;
-
-       for (i = 0; i < listeners_count; i++) {
-               err = mlxsw_core_trap_register(mlxsw_sp->core,
-                                              &listeners[i],
-                                              mlxsw_sp);
-               if (err)
-                       goto err_listener_register;
-
-       }
-       return 0;
-
-err_listener_register:
-       for (i--; i >= 0; i--) {
-               mlxsw_core_trap_unregister(mlxsw_sp->core,
-                                          &listeners[i],
-                                          mlxsw_sp);
-       }
-       return err;
-}
-
-static void mlxsw_sp_traps_unregister(struct mlxsw_sp *mlxsw_sp,
-                                     const struct mlxsw_listener listeners[],
-                                     size_t listeners_count)
-{
-       int i;
-
-       for (i = 0; i < listeners_count; i++) {
-               mlxsw_core_trap_unregister(mlxsw_sp->core,
-                                          &listeners[i],
-                                          mlxsw_sp);
-       }
-}
-
 static int mlxsw_sp_traps_init(struct mlxsw_sp *mlxsw_sp)
 {
        struct mlxsw_sp_trap *trap;
@@ -2456,21 +2415,23 @@ static int mlxsw_sp_traps_init(struct mlxsw_sp *mlxsw_sp)
        if (err)
                goto err_trap_groups_set;
 
-       err = mlxsw_sp_traps_register(mlxsw_sp, mlxsw_sp_listener,
-                                     ARRAY_SIZE(mlxsw_sp_listener));
+       err = mlxsw_core_traps_register(mlxsw_sp->core, mlxsw_sp_listener,
+                                       ARRAY_SIZE(mlxsw_sp_listener),
+                                       mlxsw_sp);
        if (err)
                goto err_traps_register;
 
-       err = mlxsw_sp_traps_register(mlxsw_sp, mlxsw_sp->listeners,
-                                     mlxsw_sp->listeners_count);
+       err = mlxsw_core_traps_register(mlxsw_sp->core, mlxsw_sp->listeners,
+                                       mlxsw_sp->listeners_count, mlxsw_sp);
        if (err)
                goto err_extra_traps_init;
 
        return 0;
 
 err_extra_traps_init:
-       mlxsw_sp_traps_unregister(mlxsw_sp, mlxsw_sp_listener,
-                                 ARRAY_SIZE(mlxsw_sp_listener));
+       mlxsw_core_traps_unregister(mlxsw_sp->core, mlxsw_sp_listener,
+                                   ARRAY_SIZE(mlxsw_sp_listener),
+                                   mlxsw_sp);
 err_traps_register:
 err_trap_groups_set:
 err_cpu_policers_set:
@@ -2480,10 +2441,11 @@ err_cpu_policers_set:
 
 static void mlxsw_sp_traps_fini(struct mlxsw_sp *mlxsw_sp)
 {
-       mlxsw_sp_traps_unregister(mlxsw_sp, mlxsw_sp->listeners,
-                                 mlxsw_sp->listeners_count);
-       mlxsw_sp_traps_unregister(mlxsw_sp, mlxsw_sp_listener,
-                                 ARRAY_SIZE(mlxsw_sp_listener));
+       mlxsw_core_traps_unregister(mlxsw_sp->core, mlxsw_sp->listeners,
+                                   mlxsw_sp->listeners_count,
+                                   mlxsw_sp);
+       mlxsw_core_traps_unregister(mlxsw_sp->core, mlxsw_sp_listener,
+                                   ARRAY_SIZE(mlxsw_sp_listener), mlxsw_sp);
        kfree(mlxsw_sp->trap);
 }
 
@@ -2528,42 +2490,6 @@ static void mlxsw_sp_lag_fini(struct mlxsw_sp *mlxsw_sp)
        kfree(mlxsw_sp->lags);
 }
 
-static int mlxsw_sp_basic_trap_groups_set(struct mlxsw_core *mlxsw_core)
-{
-       char htgt_pl[MLXSW_REG_HTGT_LEN];
-       int err;
-
-       mlxsw_reg_htgt_pack(htgt_pl, MLXSW_REG_HTGT_TRAP_GROUP_EMAD,
-                           MLXSW_REG_HTGT_INVALID_POLICER,
-                           MLXSW_REG_HTGT_DEFAULT_PRIORITY,
-                           MLXSW_REG_HTGT_DEFAULT_TC);
-       err =  mlxsw_reg_write(mlxsw_core, MLXSW_REG(htgt), htgt_pl);
-       if (err)
-               return err;
-
-       mlxsw_reg_htgt_pack(htgt_pl, MLXSW_REG_HTGT_TRAP_GROUP_MFDE,
-                           MLXSW_REG_HTGT_INVALID_POLICER,
-                           MLXSW_REG_HTGT_DEFAULT_PRIORITY,
-                           MLXSW_REG_HTGT_DEFAULT_TC);
-       err = mlxsw_reg_write(mlxsw_core, MLXSW_REG(htgt), htgt_pl);
-       if (err)
-               return err;
-
-       mlxsw_reg_htgt_pack(htgt_pl, MLXSW_REG_HTGT_TRAP_GROUP_MTWE,
-                           MLXSW_REG_HTGT_INVALID_POLICER,
-                           MLXSW_REG_HTGT_DEFAULT_PRIORITY,
-                           MLXSW_REG_HTGT_DEFAULT_TC);
-       err = mlxsw_reg_write(mlxsw_core, MLXSW_REG(htgt), htgt_pl);
-       if (err)
-               return err;
-
-       mlxsw_reg_htgt_pack(htgt_pl, MLXSW_REG_HTGT_TRAP_GROUP_PMPE,
-                           MLXSW_REG_HTGT_INVALID_POLICER,
-                           MLXSW_REG_HTGT_DEFAULT_PRIORITY,
-                           MLXSW_REG_HTGT_DEFAULT_TC);
-       return mlxsw_reg_write(mlxsw_core, MLXSW_REG(htgt), htgt_pl);
-}
-
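With the basic_trap_groups_set() op dropped from all four Spectrum driver structs below, the MFDE/MTWE/PMPE groups collapse into the single MLXSW_REG_HTGT_TRAP_GROUP_CORE_EVENT entry, and programming it presumably moves into mlxsw core alongside the MLXSW_CORE_EVENTL listeners. The consolidated write would look along these lines (a sketch built only from register helpers used elsewhere in this diff):

    char htgt_pl[MLXSW_REG_HTGT_LEN];

    mlxsw_reg_htgt_pack(htgt_pl, MLXSW_REG_HTGT_TRAP_GROUP_CORE_EVENT,
                        MLXSW_REG_HTGT_INVALID_POLICER,
                        MLXSW_REG_HTGT_DEFAULT_PRIORITY,
                        MLXSW_REG_HTGT_DEFAULT_TC);
    err = mlxsw_reg_write(mlxsw_core, MLXSW_REG(htgt), htgt_pl);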
 static const struct mlxsw_sp_ptp_ops mlxsw_sp1_ptp_ops = {
        .clock_init     = mlxsw_sp1_ptp_clock_init,
        .clock_fini     = mlxsw_sp1_ptp_clock_fini,
@@ -3677,7 +3603,6 @@ static struct mlxsw_driver mlxsw_sp1_driver = {
        .fw_filename                    = MLXSW_SP1_FW_FILENAME,
        .init                           = mlxsw_sp1_init,
        .fini                           = mlxsw_sp_fini,
-       .basic_trap_groups_set          = mlxsw_sp_basic_trap_groups_set,
        .port_split                     = mlxsw_sp_port_split,
        .port_unsplit                   = mlxsw_sp_port_unsplit,
        .sb_pool_get                    = mlxsw_sp_sb_pool_get,
@@ -3717,7 +3642,6 @@ static struct mlxsw_driver mlxsw_sp2_driver = {
        .fw_filename                    = MLXSW_SP2_FW_FILENAME,
        .init                           = mlxsw_sp2_init,
        .fini                           = mlxsw_sp_fini,
-       .basic_trap_groups_set          = mlxsw_sp_basic_trap_groups_set,
        .port_split                     = mlxsw_sp_port_split,
        .port_unsplit                   = mlxsw_sp_port_unsplit,
        .sb_pool_get                    = mlxsw_sp_sb_pool_get,
@@ -3758,7 +3682,6 @@ static struct mlxsw_driver mlxsw_sp3_driver = {
        .fw_filename                    = MLXSW_SP3_FW_FILENAME,
        .init                           = mlxsw_sp3_init,
        .fini                           = mlxsw_sp_fini,
-       .basic_trap_groups_set          = mlxsw_sp_basic_trap_groups_set,
        .port_split                     = mlxsw_sp_port_split,
        .port_unsplit                   = mlxsw_sp_port_unsplit,
        .sb_pool_get                    = mlxsw_sp_sb_pool_get,
@@ -3797,7 +3720,6 @@ static struct mlxsw_driver mlxsw_sp4_driver = {
        .priv_size                      = sizeof(struct mlxsw_sp),
        .init                           = mlxsw_sp4_init,
        .fini                           = mlxsw_sp_fini,
-       .basic_trap_groups_set          = mlxsw_sp_basic_trap_groups_set,
        .port_split                     = mlxsw_sp_port_split,
        .port_unsplit                   = mlxsw_sp_port_unsplit,
        .sb_pool_get                    = mlxsw_sp_sb_pool_get,
index bb2442e..20588e6 100644 (file)
@@ -481,6 +481,13 @@ int
 mlxsw_sp_port_vlan_classification_set(struct mlxsw_sp_port *mlxsw_sp_port,
                                      bool is_8021ad_tagged,
                                      bool is_8021q_tagged);
+static inline bool
+mlxsw_sp_local_port_is_valid(struct mlxsw_sp *mlxsw_sp, u16 local_port)
+{
+       unsigned int max_ports = mlxsw_core_max_ports(mlxsw_sp->core);
+
+       return local_port < max_ports && local_port;
+}
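The helper folds the two bounds checks (non-zero, below max_ports) into one place; the converted call sites in this series all reduce to the same pattern:

    if (WARN_ON_ONCE(!mlxsw_sp_local_port_is_valid(mlxsw_sp, local_port)))
            return;
    mlxsw_sp_port = mlxsw_sp->ports[local_port];
    if (!mlxsw_sp_port)
            return;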
 
 /* spectrum_buffers.c */
 struct mlxsw_sp_hdroom_prio {
@@ -813,6 +820,24 @@ int mlxsw_sp1_kvdl_resources_register(struct mlxsw_core *mlxsw_core);
 /* spectrum2_kvdl.c */
 extern const struct mlxsw_sp_kvdl_ops mlxsw_sp2_kvdl_ops;
 
+enum mlxsw_sp_acl_mangle_field {
+       MLXSW_SP_ACL_MANGLE_FIELD_IP_DSFIELD,
+       MLXSW_SP_ACL_MANGLE_FIELD_IP_DSCP,
+       MLXSW_SP_ACL_MANGLE_FIELD_IP_ECN,
+       MLXSW_SP_ACL_MANGLE_FIELD_IP_SPORT,
+       MLXSW_SP_ACL_MANGLE_FIELD_IP_DPORT,
+       MLXSW_SP_ACL_MANGLE_FIELD_IP4_SIP,
+       MLXSW_SP_ACL_MANGLE_FIELD_IP4_DIP,
+       MLXSW_SP_ACL_MANGLE_FIELD_IP6_SIP_1,
+       MLXSW_SP_ACL_MANGLE_FIELD_IP6_SIP_2,
+       MLXSW_SP_ACL_MANGLE_FIELD_IP6_SIP_3,
+       MLXSW_SP_ACL_MANGLE_FIELD_IP6_SIP_4,
+       MLXSW_SP_ACL_MANGLE_FIELD_IP6_DIP_1,
+       MLXSW_SP_ACL_MANGLE_FIELD_IP6_DIP_2,
+       MLXSW_SP_ACL_MANGLE_FIELD_IP6_DIP_3,
+       MLXSW_SP_ACL_MANGLE_FIELD_IP6_DIP_4,
+};
+
 struct mlxsw_sp_acl_rule_info {
        unsigned int priority;
        struct mlxsw_afk_element_values values;
@@ -821,9 +846,14 @@ struct mlxsw_sp_acl_rule_info {
           ingress_bind_blocker:1,
           egress_bind_blocker:1,
           counter_valid:1,
-          policer_index_valid:1;
+          policer_index_valid:1,
+          ipv6_valid:1;
        unsigned int counter_index;
        u16 policer_index;
+       struct {
+               u32 prev_val;
+               enum mlxsw_sp_acl_mangle_field prev_field;
+       } ipv6;
 };
 
 /* spectrum_flow.c */
index a9fff8a..d20e794 100644 (file)
@@ -213,7 +213,6 @@ mlxsw_sp1_kvdl_part_init(struct mlxsw_sp *mlxsw_sp,
        struct mlxsw_sp1_kvdl_part *part;
        bool need_update = true;
        unsigned int nr_entries;
-       size_t usage_size;
        u64 resource_size;
        int err;
 
@@ -225,8 +224,8 @@ mlxsw_sp1_kvdl_part_init(struct mlxsw_sp *mlxsw_sp,
        }
 
        nr_entries = div_u64(resource_size, info->alloc_size);
-       usage_size = BITS_TO_LONGS(nr_entries) * sizeof(unsigned long);
-       part = kzalloc(sizeof(*part) + usage_size, GFP_KERNEL);
+       part = kzalloc(struct_size(part, usage, BITS_TO_LONGS(nr_entries)),
+                      GFP_KERNEL);
        if (!part)
                return ERR_PTR(-ENOMEM);
 
index ad69913..5b02108 100644 (file)
@@ -77,7 +77,14 @@ static int mlxsw_sp2_acl_tcam_init(struct mlxsw_sp *mlxsw_sp, void *priv,
        int i;
        int err;
 
+       /* Some TCAM regions are not exposed to the host and are used
+        * internally by the device. Allocate KVDL entries for the default
+        * actions of these regions to prevent the host from overwriting them.
+        */
        tcam->kvdl_count = _tcam->max_regions;
+       if (MLXSW_CORE_RES_VALID(mlxsw_sp->core, ACL_MAX_DEFAULT_ACTIONS))
+               tcam->kvdl_count = MLXSW_CORE_RES_GET(mlxsw_sp->core,
+                                                     ACL_MAX_DEFAULT_ACTIONS);
        err = mlxsw_sp_kvdl_alloc(mlxsw_sp, MLXSW_SP_KVDL_ENTRY_TYPE_ACTSET,
                                  tcam->kvdl_count, &tcam->kvdl_index);
        if (err)
@@ -97,7 +104,10 @@ static int mlxsw_sp2_acl_tcam_init(struct mlxsw_sp *mlxsw_sp, void *priv,
                goto err_afa_block_continue;
        enc_actions = mlxsw_afa_block_cur_set(afa_block);
 
-       for (i = 0; i < tcam->kvdl_count; i++) {
+       /* Only write to KVDL entries used by TCAM regions exposed to the
+        * host.
+        */
+       for (i = 0; i < _tcam->max_regions; i++) {
                mlxsw_reg_pefa_pack(pefa_pl, tcam->kvdl_index + i,
                                    true, enc_actions);
                err = mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(pefa), pefa_pl);
index 70c11bf..6c5af01 100644 (file)
@@ -505,14 +505,6 @@ int mlxsw_sp_acl_rulei_act_priority(struct mlxsw_sp *mlxsw_sp,
                                                      extack);
 }
 
-enum mlxsw_sp_acl_mangle_field {
-       MLXSW_SP_ACL_MANGLE_FIELD_IP_DSFIELD,
-       MLXSW_SP_ACL_MANGLE_FIELD_IP_DSCP,
-       MLXSW_SP_ACL_MANGLE_FIELD_IP_ECN,
-       MLXSW_SP_ACL_MANGLE_FIELD_IP_SPORT,
-       MLXSW_SP_ACL_MANGLE_FIELD_IP_DPORT,
-};
-
 struct mlxsw_sp_acl_mangle_action {
        enum flow_action_mangle_base htype;
        /* Offset is u32-aligned. */
@@ -561,6 +553,18 @@ static struct mlxsw_sp_acl_mangle_action mlxsw_sp_acl_mangle_actions[] = {
 
        MLXSW_SP_ACL_MANGLE_ACTION_UDP(0, 0x0000ffff, 16, IP_SPORT),
        MLXSW_SP_ACL_MANGLE_ACTION_UDP(0, 0xffff0000, 0,  IP_DPORT),
+
+       MLXSW_SP_ACL_MANGLE_ACTION_IP4(12, 0x00000000, 0, IP4_SIP),
+       MLXSW_SP_ACL_MANGLE_ACTION_IP4(16, 0x00000000, 0, IP4_DIP),
+
+       MLXSW_SP_ACL_MANGLE_ACTION_IP6(8, 0x00000000, 0, IP6_SIP_1),
+       MLXSW_SP_ACL_MANGLE_ACTION_IP6(12, 0x00000000, 0, IP6_SIP_2),
+       MLXSW_SP_ACL_MANGLE_ACTION_IP6(16, 0x00000000, 0, IP6_SIP_3),
+       MLXSW_SP_ACL_MANGLE_ACTION_IP6(20, 0x00000000, 0, IP6_SIP_4),
+       MLXSW_SP_ACL_MANGLE_ACTION_IP6(24, 0x00000000, 0, IP6_DIP_1),
+       MLXSW_SP_ACL_MANGLE_ACTION_IP6(28, 0x00000000, 0, IP6_DIP_2),
+       MLXSW_SP_ACL_MANGLE_ACTION_IP6(32, 0x00000000, 0, IP6_DIP_3),
+       MLXSW_SP_ACL_MANGLE_ACTION_IP6(36, 0x00000000, 0, IP6_DIP_4),
 };
 
 static int
@@ -599,6 +603,22 @@ static int mlxsw_sp1_acl_rulei_act_mangle_field(struct mlxsw_sp *mlxsw_sp,
        return err;
 }
 
+static int
+mlxsw_sp2_acl_rulei_act_mangle_field_ip_odd(struct mlxsw_sp_acl_rule_info *rulei,
+                                           enum mlxsw_sp_acl_mangle_field field,
+                                           u32 val, struct netlink_ext_ack *extack)
+{
+       if (!rulei->ipv6_valid) {
+               rulei->ipv6.prev_val = val;
+               rulei->ipv6_valid = true;
+               rulei->ipv6.prev_field = field;
+               return 0;
+       }
+
+       NL_SET_ERR_MSG_MOD(extack, "Unsupported mangle field order");
+       return -EOPNOTSUPP;
+}
+
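The Spectrum-2 action set rewrites IPv6 addresses 64 bits at a time, while pedit delivers one 32-bit word per mangle action, so the odd word of each pair is parked in rulei->ipv6 until its even sibling arrives. For a source-address rewrite the four words sit at IPv6 header offsets 8/12/16/20; the resulting flow, in sketch form (flag arguments mirror the IP6_SIP_2 case below, names illustrative):

    /* word at offset 8 (IP6_SIP_1): cached, nothing written yet */
    rulei->ipv6.prev_val = val0;
    rulei->ipv6.prev_field = MLXSW_SP_ACL_MANGLE_FIELD_IP6_SIP_1;
    rulei->ipv6_valid = true;

    /* word at offset 12 (IP6_SIP_2): both halves known, emit one
     * 64-bit write into the action block
     */
    rulei->ipv6_valid = false;
    err = mlxsw_afa_block_append_ip(rulei->act_block, false, false,
                                    val1, rulei->ipv6.prev_val, extack);

A mangle that leaves an odd word unpaired keeps rulei->ipv6_valid set and is rejected later by the new check in mlxsw_sp_flower_parse_actions().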
 static int mlxsw_sp2_acl_rulei_act_mangle_field(struct mlxsw_sp *mlxsw_sp,
                                                struct mlxsw_sp_acl_rule_info *rulei,
                                                struct mlxsw_sp_acl_mangle_action *mact,
@@ -615,6 +635,61 @@ static int mlxsw_sp2_acl_rulei_act_mangle_field(struct mlxsw_sp *mlxsw_sp,
                return mlxsw_afa_block_append_l4port(rulei->act_block, false, val, extack);
        case MLXSW_SP_ACL_MANGLE_FIELD_IP_DPORT:
                return mlxsw_afa_block_append_l4port(rulei->act_block, true, val, extack);
+       /* IPv4 fields */
+       case MLXSW_SP_ACL_MANGLE_FIELD_IP4_SIP:
+               return mlxsw_afa_block_append_ip(rulei->act_block, false,
+                                                true, val, 0, extack);
+       case MLXSW_SP_ACL_MANGLE_FIELD_IP4_DIP:
+               return mlxsw_afa_block_append_ip(rulei->act_block, true,
+                                                true, val, 0, extack);
+       /* IPv6 fields */
+       case MLXSW_SP_ACL_MANGLE_FIELD_IP6_SIP_1:
+       case MLXSW_SP_ACL_MANGLE_FIELD_IP6_SIP_3:
+       case MLXSW_SP_ACL_MANGLE_FIELD_IP6_DIP_1:
+       case MLXSW_SP_ACL_MANGLE_FIELD_IP6_DIP_3:
+               return mlxsw_sp2_acl_rulei_act_mangle_field_ip_odd(rulei,
+                                                                  mact->field,
+                                                                  val, extack);
+       case MLXSW_SP_ACL_MANGLE_FIELD_IP6_SIP_2:
+               if (rulei->ipv6_valid &&
+                   rulei->ipv6.prev_field == MLXSW_SP_ACL_MANGLE_FIELD_IP6_SIP_1) {
+                       rulei->ipv6_valid = false;
+                       return mlxsw_afa_block_append_ip(rulei->act_block,
+                                                        false, false, val,
+                                                        rulei->ipv6.prev_val,
+                                                        extack);
+               }
+               break;
+       case MLXSW_SP_ACL_MANGLE_FIELD_IP6_SIP_4:
+               if (rulei->ipv6_valid &&
+                   rulei->ipv6.prev_field == MLXSW_SP_ACL_MANGLE_FIELD_IP6_SIP_3) {
+                       rulei->ipv6_valid = false;
+                       return mlxsw_afa_block_append_ip(rulei->act_block,
+                                                        false, true, val,
+                                                        rulei->ipv6.prev_val,
+                                                        extack);
+               }
+               break;
+       case MLXSW_SP_ACL_MANGLE_FIELD_IP6_DIP_2:
+               if (rulei->ipv6_valid &&
+                   rulei->ipv6.prev_field == MLXSW_SP_ACL_MANGLE_FIELD_IP6_DIP_1) {
+                       rulei->ipv6_valid = false;
+                       return mlxsw_afa_block_append_ip(rulei->act_block,
+                                                        true, false, val,
+                                                        rulei->ipv6.prev_val,
+                                                        extack);
+               }
+               break;
+       case MLXSW_SP_ACL_MANGLE_FIELD_IP6_DIP_4:
+               if (rulei->ipv6_valid &&
+                   rulei->ipv6.prev_field == MLXSW_SP_ACL_MANGLE_FIELD_IP6_DIP_3) {
+                       rulei->ipv6_valid = false;
+                       return mlxsw_afa_block_append_ip(rulei->act_block,
+                                                        true, true, val,
+                                                        rulei->ipv6.prev_val,
+                                                        extack);
+               }
+               break;
        default:
                break;
        }
index 2053071..8b5d7f8 100644 (file)
@@ -1034,13 +1034,10 @@ static int mlxsw_sp_get_module_info(struct net_device *netdev,
 {
        struct mlxsw_sp_port *mlxsw_sp_port = netdev_priv(netdev);
        struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp;
-       int err;
-
-       err = mlxsw_env_get_module_info(mlxsw_sp->core,
-                                       mlxsw_sp_port->mapping.module,
-                                       modinfo);
 
-       return err;
+       return mlxsw_env_get_module_info(netdev, mlxsw_sp->core,
+                                        mlxsw_sp_port->mapping.module,
+                                        modinfo);
 }
 
 static int mlxsw_sp_get_module_eeprom(struct net_device *netdev,
@@ -1048,13 +1045,10 @@ static int mlxsw_sp_get_module_eeprom(struct net_device *netdev,
 {
        struct mlxsw_sp_port *mlxsw_sp_port = netdev_priv(netdev);
        struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp;
-       int err;
-
-       err = mlxsw_env_get_module_eeprom(netdev, mlxsw_sp->core,
-                                         mlxsw_sp_port->mapping.module, ee,
-                                         data);
 
-       return err;
+       return mlxsw_env_get_module_eeprom(netdev, mlxsw_sp->core,
+                                          mlxsw_sp_port->mapping.module, ee,
+                                          data);
 }
 
 static int
@@ -1273,12 +1267,22 @@ struct mlxsw_sp1_port_link_mode {
 
 static const struct mlxsw_sp1_port_link_mode mlxsw_sp1_port_link_mode[] = {
        {
+               .mask           = MLXSW_REG_PTYS_ETH_SPEED_100BASE_T,
+               .mask_ethtool   = ETHTOOL_LINK_MODE_100baseT_Full_BIT,
+               .speed          = SPEED_100,
+       },
+       {
                .mask           = MLXSW_REG_PTYS_ETH_SPEED_SGMII |
                                  MLXSW_REG_PTYS_ETH_SPEED_1000BASE_KX,
                .mask_ethtool   = ETHTOOL_LINK_MODE_1000baseKX_Full_BIT,
                .speed          = SPEED_1000,
        },
        {
+               .mask           = MLXSW_REG_PTYS_ETH_SPEED_1000BASE_T,
+               .mask_ethtool   = ETHTOOL_LINK_MODE_1000baseT_Full_BIT,
+               .speed          = SPEED_1000,
+       },
+       {
                .mask           = MLXSW_REG_PTYS_ETH_SPEED_10GBASE_CX4 |
                                  MLXSW_REG_PTYS_ETH_SPEED_10GBASE_KX4,
                .mask_ethtool   = ETHTOOL_LINK_MODE_10000baseKX4_Full_BIT,
index bb417db..f54af3d 100644 (file)
@@ -233,6 +233,12 @@ static int mlxsw_sp_flower_parse_actions(struct mlxsw_sp *mlxsw_sp,
                        return -EOPNOTSUPP;
                }
        }
+
+       if (rulei->ipv6_valid) {
+               NL_SET_ERR_MSG_MOD(extack, "Unsupported mangle field");
+               return -EOPNOTSUPP;
+       }
+
        return 0;
 }
 
index 0ff163f..35422e6 100644 (file)
@@ -568,12 +568,11 @@ void mlxsw_sp1_ptp_got_timestamp(struct mlxsw_sp *mlxsw_sp, bool ingress,
                                 u8 domain_number, u16 sequence_id,
                                 u64 timestamp)
 {
-       unsigned int max_ports = mlxsw_core_max_ports(mlxsw_sp->core);
        struct mlxsw_sp_port *mlxsw_sp_port;
        struct mlxsw_sp1_ptp_key key;
        u8 types;
 
-       if (WARN_ON_ONCE(local_port >= max_ports))
+       if (WARN_ON_ONCE(!mlxsw_sp_local_port_is_valid(mlxsw_sp, local_port)))
                return;
        mlxsw_sp_port = mlxsw_sp->ports[local_port];
        if (!mlxsw_sp_port)
index 65c1724..bffdb41 100644 (file)
@@ -2616,7 +2616,6 @@ static void mlxsw_sp_fdb_notify_mac_process(struct mlxsw_sp *mlxsw_sp,
                                            char *sfn_pl, int rec_index,
                                            bool adding)
 {
-       unsigned int max_ports = mlxsw_core_max_ports(mlxsw_sp->core);
        struct mlxsw_sp_port_vlan *mlxsw_sp_port_vlan;
        struct mlxsw_sp_bridge_device *bridge_device;
        struct mlxsw_sp_bridge_port *bridge_port;
@@ -2630,7 +2629,7 @@ static void mlxsw_sp_fdb_notify_mac_process(struct mlxsw_sp *mlxsw_sp,
 
        mlxsw_reg_sfn_mac_unpack(sfn_pl, rec_index, mac, &fid, &local_port);
 
-       if (WARN_ON_ONCE(local_port >= max_ports))
+       if (WARN_ON_ONCE(!mlxsw_sp_local_port_is_valid(mlxsw_sp, local_port)))
                return;
        mlxsw_sp_port = mlxsw_sp->ports[local_port];
        if (!mlxsw_sp_port) {
index 91a755e..5f1e7b8 100644 (file)
@@ -750,7 +750,7 @@ static int lan743x_ethtool_set_eee(struct net_device *netdev,
        }
 
        if (eee->eee_enabled) {
-               ret = phy_init_eee(phydev, 0);
+               ret = phy_init_eee(phydev, false);
                if (ret) {
                        netif_err(adapter, drv, adapter->netdev,
                                  "EEE initialization failed\n");
index 040cfff..a9ffc71 100644 (file)
@@ -7,4 +7,5 @@ obj-$(CONFIG_LAN966X_SWITCH) += lan966x-switch.o
 
 lan966x-switch-objs  := lan966x_main.o lan966x_phylink.o lan966x_port.o \
                        lan966x_mac.o lan966x_ethtool.o lan966x_switchdev.o \
-                       lan966x_vlan.o lan966x_fdb.o lan966x_mdb.o
+                       lan966x_vlan.o lan966x_fdb.o lan966x_mdb.o \
+                       lan966x_ptp.o
index 614f12c..e58a27f 100644 (file)
@@ -545,6 +545,39 @@ static int lan966x_set_pauseparam(struct net_device *dev,
        return phylink_ethtool_set_pauseparam(port->phylink, pause);
 }
 
+static int lan966x_get_ts_info(struct net_device *dev,
+                              struct ethtool_ts_info *info)
+{
+       struct lan966x_port *port = netdev_priv(dev);
+       struct lan966x *lan966x = port->lan966x;
+       struct lan966x_phc *phc;
+
+       if (!lan966x->ptp)
+               return ethtool_op_get_ts_info(dev, info);
+
+       phc = &lan966x->phc[LAN966X_PHC_PORT];
+
+       info->phc_index = phc->clock ? ptp_clock_index(phc->clock) : -1;
+       if (info->phc_index == -1) {
+               info->so_timestamping |= SOF_TIMESTAMPING_TX_SOFTWARE |
+                                        SOF_TIMESTAMPING_RX_SOFTWARE |
+                                        SOF_TIMESTAMPING_SOFTWARE;
+               return 0;
+       }
+       info->so_timestamping |= SOF_TIMESTAMPING_TX_SOFTWARE |
+                                SOF_TIMESTAMPING_RX_SOFTWARE |
+                                SOF_TIMESTAMPING_SOFTWARE |
+                                SOF_TIMESTAMPING_TX_HARDWARE |
+                                SOF_TIMESTAMPING_RX_HARDWARE |
+                                SOF_TIMESTAMPING_RAW_HARDWARE;
+       info->tx_types = BIT(HWTSTAMP_TX_OFF) | BIT(HWTSTAMP_TX_ON) |
+                        BIT(HWTSTAMP_TX_ONESTEP_SYNC);
+       info->rx_filters = BIT(HWTSTAMP_FILTER_NONE) |
+                          BIT(HWTSTAMP_FILTER_ALL);
+
+       return 0;
+}
+
 const struct ethtool_ops lan966x_ethtool_ops = {
        .get_link_ksettings     = lan966x_get_link_ksettings,
        .set_link_ksettings     = lan966x_set_link_ksettings,
@@ -556,6 +589,7 @@ const struct ethtool_ops lan966x_ethtool_ops = {
        .get_eth_mac_stats      = lan966x_get_eth_mac_stats,
        .get_rmon_stats         = lan966x_get_eth_rmon_stats,
        .get_link               = ethtool_op_get_link,
+       .get_ts_info            = lan966x_get_ts_info,
 };
 
 static void lan966x_check_stats_work(struct work_struct *work)
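Userspace discovers the capabilities advertised here via `ethtool -T` or the ETHTOOL_GET_TS_INFO ioctl; a minimal query might look like this (hypothetical userspace snippet, interface name assumed):

    #include <linux/ethtool.h>
    #include <linux/sockios.h>
    #include <net/if.h>
    #include <string.h>
    #include <sys/ioctl.h>

    int query_ts_info(int sock)
    {
            struct ethtool_ts_info info = { .cmd = ETHTOOL_GET_TS_INFO };
            struct ifreq ifr;

            memset(&ifr, 0, sizeof(ifr));
            strncpy(ifr.ifr_name, "eth0", IFNAMSIZ - 1);
            ifr.ifr_data = (void *)&info;

            /* on success info.phc_index names /dev/ptp<N>, or -1 if none */
            return ioctl(sock, SIOCETHTOOL, &ifr);
    }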
index 1f60fd1..d62484f 100644 (file)
@@ -4,11 +4,13 @@
 #include <linux/if_bridge.h>
 #include <linux/if_vlan.h>
 #include <linux/iopoll.h>
+#include <linux/ip.h>
 #include <linux/of_platform.h>
 #include <linux/of_net.h>
 #include <linux/packing.h>
 #include <linux/phy/phy.h>
 #include <linux/reset.h>
+#include <net/addrconf.h>
 
 #include "lan966x_main.h"
 
@@ -44,6 +46,7 @@ static const struct lan966x_main_io_resource lan966x_main_iomap[] =  {
        { TARGET_ORG,                         0, 1 }, /* 0xe2000000 */
        { TARGET_GCB,                    0x4000, 1 }, /* 0xe2004000 */
        { TARGET_QS,                     0x8000, 1 }, /* 0xe2008000 */
+       { TARGET_PTP,                    0xc000, 1 }, /* 0xe200c000 */
        { TARGET_CHIP_TOP,              0x10000, 1 }, /* 0xe2010000 */
        { TARGET_REW,                   0x14000, 1 }, /* 0xe2014000 */
        { TARGET_SYS,                   0x28000, 1 }, /* 0xe2028000 */
@@ -201,7 +204,7 @@ static int lan966x_port_ifh_xmit(struct sk_buff *skb,
        val = lan_rd(lan966x, QS_INJ_STATUS);
        if (!(QS_INJ_STATUS_FIFO_RDY_GET(val) & BIT(grp)) ||
            (QS_INJ_STATUS_WMARK_REACHED_GET(val) & BIT(grp)))
-               return NETDEV_TX_BUSY;
+               goto err;
 
        /* Write start of frame */
        lan_wr(QS_INJ_CTRL_GAP_SIZE_SET(1) |
@@ -213,7 +216,7 @@ static int lan966x_port_ifh_xmit(struct sk_buff *skb,
                /* Wait until the fifo is ready */
                err = lan966x_port_inj_ready(lan966x, grp);
                if (err)
-                       return NETDEV_TX_BUSY;
+                       goto err;
 
                lan_wr((__force u32)ifh[i], lan966x, QS_INJ_WR(grp));
        }
@@ -225,7 +228,7 @@ static int lan966x_port_ifh_xmit(struct sk_buff *skb,
                /* Wait until the fifo is ready */
                err = lan966x_port_inj_ready(lan966x, grp);
                if (err)
-                       return NETDEV_TX_BUSY;
+                       goto err;
 
                lan_wr(((u32 *)skb->data)[i], lan966x, QS_INJ_WR(grp));
        }
@@ -235,7 +238,7 @@ static int lan966x_port_ifh_xmit(struct sk_buff *skb,
                /* Wait until the fifo is ready */
                err = lan966x_port_inj_ready(lan966x, grp);
                if (err)
-                       return NETDEV_TX_BUSY;
+                       goto err;
 
                lan_wr(0, lan966x, QS_INJ_WR(grp));
                ++i;
@@ -255,8 +258,19 @@ static int lan966x_port_ifh_xmit(struct sk_buff *skb,
        dev->stats.tx_packets++;
        dev->stats.tx_bytes += skb->len;
 
+       if (skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP &&
+           LAN966X_SKB_CB(skb)->rew_op == IFH_REW_OP_TWO_STEP_PTP)
+               return NETDEV_TX_OK;
+
        dev_consume_skb_any(skb);
        return NETDEV_TX_OK;
+
+err:
+       if (skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP &&
+           LAN966X_SKB_CB(skb)->rew_op == IFH_REW_OP_TWO_STEP_PTP)
+               lan966x_ptp_txtstamp_release(port, skb);
+
+       return NETDEV_TX_BUSY;
 }
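The asymmetry around dev_consume_skb_any() is deliberate: a frame with a pending two-step TX timestamp was queued on port->tx_skbs by lan966x_ptp_txtstamp_request() and has to stay alive until the PTP interrupt reports its timestamp. The completion side (in lan966x_ptp.c, whose handler is not shown in full here) then hands the timestamp to the stack, roughly as follows (a sketch, variable names illustrative):

    struct skb_shared_hwtstamps shhwtstamps;

    memset(&shhwtstamps, 0, sizeof(shhwtstamps));
    shhwtstamps.hwtstamp = ns_to_ktime(timestamp_ns);
    skb_tstamp_tx(skb, &shhwtstamps);  /* clone delivered to the socket */
    dev_kfree_skb_any(skb);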
 
 static void lan966x_ifh_set_bypass(void *ifh, u64 bypass)
@@ -289,10 +303,23 @@ static void lan966x_ifh_set_vid(void *ifh, u64 vid)
                IFH_POS_TCI, IFH_LEN * 4, PACK, 0);
 }
 
+static void lan966x_ifh_set_rew_op(void *ifh, u64 rew_op)
+{
+       packing(ifh, &rew_op, IFH_POS_REW_CMD + IFH_WID_REW_CMD - 1,
+               IFH_POS_REW_CMD, IFH_LEN * 4, PACK, 0);
+}
+
+static void lan966x_ifh_set_timestamp(void *ifh, u64 timestamp)
+{
+       packing(ifh, &timestamp, IFH_POS_TIMESTAMP + IFH_WID_TIMESTAMP - 1,
+               IFH_POS_TIMESTAMP, IFH_LEN * 4, PACK, 0);
+}
+
 static int lan966x_port_xmit(struct sk_buff *skb, struct net_device *dev)
 {
        struct lan966x_port *port = netdev_priv(dev);
        __be32 ifh[IFH_LEN];
+       int err;
 
        memset(ifh, 0x0, sizeof(__be32) * IFH_LEN);
 
@@ -302,6 +329,15 @@ static int lan966x_port_xmit(struct sk_buff *skb, struct net_device *dev)
        lan966x_ifh_set_ipv(ifh, skb->priority >= 7 ? 0x7 : skb->priority);
        lan966x_ifh_set_vid(ifh, skb_vlan_tag_get(skb));
 
+       if (port->lan966x->ptp && skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP) {
+               err = lan966x_ptp_txtstamp_request(port, skb);
+               if (err)
+                       return err;
+
+               lan966x_ifh_set_rew_op(ifh, LAN966X_SKB_CB(skb)->rew_op);
+               lan966x_ifh_set_timestamp(ifh, LAN966X_SKB_CB(skb)->ts_id);
+       }
+
        return lan966x_port_ifh_xmit(skb, ifh, dev);
 }
 
@@ -350,6 +386,23 @@ static int lan966x_port_get_parent_id(struct net_device *dev,
        return 0;
 }
 
+static int lan966x_port_ioctl(struct net_device *dev, struct ifreq *ifr,
+                             int cmd)
+{
+       struct lan966x_port *port = netdev_priv(dev);
+
+       if (!phy_has_hwtstamp(dev->phydev) && port->lan966x->ptp) {
+               switch (cmd) {
+               case SIOCSHWTSTAMP:
+                       return lan966x_ptp_hwtstamp_set(port, ifr);
+               case SIOCGHWTSTAMP:
+                       return lan966x_ptp_hwtstamp_get(port, ifr);
+               }
+       }
+
+       return phy_mii_ioctl(dev->phydev, ifr, cmd);
+}
+
 static const struct net_device_ops lan966x_port_netdev_ops = {
        .ndo_open                       = lan966x_port_open,
        .ndo_stop                       = lan966x_port_stop,
@@ -360,6 +413,7 @@ static const struct net_device_ops lan966x_port_netdev_ops = {
        .ndo_get_stats64                = lan966x_stats_get,
        .ndo_set_mac_address            = lan966x_port_set_mac_address,
        .ndo_get_port_parent_id         = lan966x_port_get_parent_id,
+       .ndo_eth_ioctl                  = lan966x_port_ioctl,
 };
 
 bool lan966x_netdevice_check(const struct net_device *dev)
@@ -367,6 +421,32 @@ bool lan966x_netdevice_check(const struct net_device *dev)
        return dev->netdev_ops == &lan966x_port_netdev_ops;
 }
 
+static bool lan966x_hw_offload(struct lan966x *lan966x, u32 port,
+                              struct sk_buff *skb)
+{
+       u32 val;
+
+       /* IGMP and MLD frames are not forwarded by the HW if multicast
+        * snooping is enabled, therefore don't mark them as offloaded,
+        * so the SW can forward the frames accordingly.
+        */
+       val = lan_rd(lan966x, ANA_CPU_FWD_CFG(port));
+       if (!(val & (ANA_CPU_FWD_CFG_IGMP_REDIR_ENA |
+                    ANA_CPU_FWD_CFG_MLD_REDIR_ENA)))
+               return true;
+
+       if (skb->protocol == htons(ETH_P_IP) &&
+           ip_hdr(skb)->protocol == IPPROTO_IGMP)
+               return false;
+
+       if (skb->protocol == htons(ETH_P_IPV6) &&
+           ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr) &&
+           !ipv6_mc_check_mld(skb))
+               return false;
+
+       return true;
+}
+
 static int lan966x_port_xtr_status(struct lan966x *lan966x, u8 grp)
 {
        return lan_rd(lan966x, QS_XTR_RD(grp));
@@ -434,6 +514,12 @@ static void lan966x_ifh_get_len(void *ifh, u64 *len)
                IFH_POS_LEN, IFH_LEN * 4, UNPACK, 0);
 }
 
+static void lan966x_ifh_get_timestamp(void *ifh, u64 *timestamp)
+{
+       packing(ifh, timestamp, IFH_POS_TIMESTAMP + IFH_WID_TIMESTAMP - 1,
+               IFH_POS_TIMESTAMP, IFH_LEN * 4, UNPACK, 0);
+}
+
 static irqreturn_t lan966x_xtr_irq_handler(int irq, void *args)
 {
        struct lan966x *lan966x = args;
@@ -443,10 +529,10 @@ static irqreturn_t lan966x_xtr_irq_handler(int irq, void *args)
                return IRQ_NONE;
 
        do {
+               u64 src_port, len, timestamp;
                struct net_device *dev;
                struct sk_buff *skb;
                int sz = 0, buf_len;
-               u64 src_port, len;
                u32 ifh[IFH_LEN];
                u32 *buf;
                u32 val;
@@ -461,6 +547,7 @@ static irqreturn_t lan966x_xtr_irq_handler(int irq, void *args)
 
                lan966x_ifh_get_src_port(ifh, &src_port);
                lan966x_ifh_get_len(ifh, &len);
+               lan966x_ifh_get_timestamp(ifh, &timestamp);
 
                WARN_ON(src_port >= lan966x->num_phys_ports);
 
@@ -501,11 +588,17 @@ static irqreturn_t lan966x_xtr_irq_handler(int irq, void *args)
                        *buf = val;
                }
 
+               lan966x_ptp_rxtstamp(lan966x, skb, timestamp);
                skb->protocol = eth_type_trans(skb, dev);
 
-               if (lan966x->bridge_mask & BIT(src_port))
+               if (lan966x->bridge_mask & BIT(src_port)) {
                        skb->offload_fwd_mark = 1;
 
+                       skb_reset_network_header(skb);
+                       if (!lan966x_hw_offload(lan966x, src_port, skb))
+                               skb->offload_fwd_mark = 0;
+               }
+
                netif_rx_ni(skb);
                dev->stats.rx_bytes += len;
                dev->stats.rx_packets++;
@@ -628,7 +721,6 @@ static int lan966x_probe_port(struct lan966x *lan966x, u32 p,
        }
 
        port->phylink = phylink;
-       phylink_set_pcs(phylink, &port->phylink_pcs);
 
        err = register_netdev(dev);
        if (err) {
@@ -708,7 +800,7 @@ static void lan966x_init(struct lan966x *lan966x)
        /* Setup flooding PGIDs */
        lan_wr(ANA_FLOODING_IPMC_FLD_MC4_DATA_SET(PGID_MCIPV4) |
               ANA_FLOODING_IPMC_FLD_MC4_CTRL_SET(PGID_MC) |
-              ANA_FLOODING_IPMC_FLD_MC6_DATA_SET(PGID_MC) |
+              ANA_FLOODING_IPMC_FLD_MC6_DATA_SET(PGID_MCIPV6) |
               ANA_FLOODING_IPMC_FLD_MC6_CTRL_SET(PGID_MC),
               lan966x, ANA_FLOODING_IPMC);
 
@@ -770,6 +862,10 @@ static void lan966x_init(struct lan966x *lan966x)
                ANA_PGID_PGID,
                lan966x, ANA_PGID(PGID_MCIPV4));
 
+       lan_rmw(GENMASK(lan966x->num_phys_ports - 1, 0),
+               ANA_PGID_PGID,
+               lan966x, ANA_PGID(PGID_MCIPV6));
+
        /* Unicast to all other ports */
        lan_rmw(GENMASK(lan966x->num_phys_ports - 1, 0),
                ANA_PGID_PGID,
@@ -897,6 +993,17 @@ static int lan966x_probe(struct platform_device *pdev)
                        return dev_err_probe(&pdev->dev, err, "Unable to use ana irq");
        }
 
+       lan966x->ptp_irq = platform_get_irq_byname(pdev, "ptp");
+       if (lan966x->ptp_irq > 0) {
+               err = devm_request_threaded_irq(&pdev->dev, lan966x->ptp_irq, NULL,
+                                               lan966x_ptp_irq_handler, IRQF_ONESHOT,
+                                               "ptp irq", lan966x);
+               if (err)
+                       return dev_err_probe(&pdev->dev, err, "Unable to use ptp irq");
+
+               lan966x->ptp = 1;
+       }
+
        /* init switch */
        lan966x_init(lan966x);
        lan966x_stats_init(lan966x);
@@ -931,8 +1038,15 @@ static int lan966x_probe(struct platform_device *pdev)
        if (err)
                goto cleanup_ports;
 
+       err = lan966x_ptp_init(lan966x);
+       if (err)
+               goto cleanup_fdb;
+
        return 0;
 
+cleanup_fdb:
+       lan966x_fdb_deinit(lan966x);
+
 cleanup_ports:
        fwnode_handle_put(portnp);
 
@@ -958,6 +1072,7 @@ static int lan966x_remove(struct platform_device *pdev)
        lan966x_mac_purge_entries(lan966x);
        lan966x_mdb_deinit(lan966x);
        lan966x_fdb_deinit(lan966x);
+       lan966x_ptp_deinit(lan966x);
 
        return 0;
 }
index 99c6d0a..058e435 100644 (file)
@@ -8,6 +8,7 @@
 #include <linux/jiffies.h>
 #include <linux/phy.h>
 #include <linux/phylink.h>
+#include <linux/ptp_clock_kernel.h>
 #include <net/switchdev.h>
 
 #include "lan966x_regs.h"
 #define LAN966X_SPEED_100              2
 #define LAN966X_SPEED_10               3
 
+#define LAN966X_PHC_COUNT              3
+#define LAN966X_PHC_PORT               0
+
+#define IFH_REW_OP_NOOP                        0x0
+#define IFH_REW_OP_ONE_STEP_PTP                0x3
+#define IFH_REW_OP_TWO_STEP_PTP                0x4
+
 /* MAC table entry types.
  * ENTRYTYPE_NORMAL is subject to aging.
  * ENTRYTYPE_LOCKED is not subject to aging.
@@ -70,6 +78,24 @@ struct lan966x_stat_layout {
        char name[ETH_GSTRING_LEN];
 };
 
+struct lan966x_phc {
+       struct ptp_clock *clock;
+       struct ptp_clock_info info;
+       struct hwtstamp_config hwtstamp_config;
+       struct lan966x *lan966x;
+       u8 index;
+};
+
+struct lan966x_skb_cb {
+       u8 rew_op;
+       u16 ts_id;
+       unsigned long jiffies;
+};
+
+#define LAN966X_PTP_TIMEOUT            msecs_to_jiffies(10)
+#define LAN966X_SKB_CB(skb) \
+       ((struct lan966x_skb_cb *)((skb)->cb))
+
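lan966x_skb_cb lives in the 48-byte skb->cb scratch area, so it must never outgrow it; a compile-time guard along these lines (not present in the driver, shown as a sketch, e.g. placed in the probe path) would catch a future size regression:

    BUILD_BUG_ON(sizeof(struct lan966x_skb_cb) >
                 sizeof_field(struct sk_buff, cb));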
 struct lan966x {
        struct device *dev;
 
@@ -105,6 +131,7 @@ struct lan966x {
        /* interrupts */
        int xtr_irq;
        int ana_irq;
+       int ptp_irq;
 
        /* workqueue for fdb */
        struct workqueue_struct *fdb_work;
@@ -113,6 +140,14 @@ struct lan966x {
        /* mdb */
        struct list_head mdb_entries;
        struct list_head pgid_entries;
+
+       /* ptp */
+       bool ptp;
+       struct lan966x_phc phc[LAN966X_PHC_COUNT];
+       spinlock_t ptp_clock_lock; /* lock for phc */
+       spinlock_t ptp_ts_id_lock; /* lock for ts_id */
+       struct mutex ptp_lock; /* lock for ptp interface state */
+       u16 ptp_skbs;
 };
 
 struct lan966x_port_config {
@@ -135,6 +170,7 @@ struct lan966x_port {
        bool vlan_aware;
 
        bool learn_ena;
+       bool mcast_ena;
 
        struct phylink_config phylink_config;
        struct phylink_pcs phylink_pcs;
@@ -142,6 +178,10 @@ struct lan966x_port {
        struct phylink *phylink;
        struct phy *serdes;
        struct fwnode_handle *fwnode;
+
+       u8 ptp_cmd;
+       u16 ts_id;
+       struct sk_buff_head tx_skbs;
 };
 
 extern const struct phylink_mac_ops lan966x_phylink_mac_ops;
@@ -227,6 +267,20 @@ int lan966x_handle_port_mdb_del(struct lan966x_port *port,
                                const struct switchdev_obj *obj);
 void lan966x_mdb_erase_entries(struct lan966x *lan966x, u16 vid);
 void lan966x_mdb_write_entries(struct lan966x *lan966x, u16 vid);
+void lan966x_mdb_clear_entries(struct lan966x *lan966x);
+void lan966x_mdb_restore_entries(struct lan966x *lan966x);
+
+int lan966x_ptp_init(struct lan966x *lan966x);
+void lan966x_ptp_deinit(struct lan966x *lan966x);
+int lan966x_ptp_hwtstamp_set(struct lan966x_port *port, struct ifreq *ifr);
+int lan966x_ptp_hwtstamp_get(struct lan966x_port *port, struct ifreq *ifr);
+void lan966x_ptp_rxtstamp(struct lan966x *lan966x, struct sk_buff *skb,
+                         u64 timestamp);
+int lan966x_ptp_txtstamp_request(struct lan966x_port *port,
+                                struct sk_buff *skb);
+void lan966x_ptp_txtstamp_release(struct lan966x_port *port,
+                                 struct sk_buff *skb);
+irqreturn_t lan966x_ptp_irq_handler(int irq, void *args);
 
 static inline void __iomem *lan_addr(void __iomem *base[],
                                     int id, int tinst, int tcnt,
index c68d0a9..2af5526 100644 (file)
@@ -504,3 +504,48 @@ void lan966x_mdb_erase_entries(struct lan966x *lan966x, u16 vid)
                        lan966x_mdb_l2_cpu_remove(lan966x, mdb_entry, type);
        }
 }
+
+void lan966x_mdb_clear_entries(struct lan966x *lan966x)
+{
+       struct lan966x_mdb_entry *mdb_entry;
+       enum macaccess_entry_type type;
+       unsigned char mac[ETH_ALEN];
+
+       list_for_each_entry(mdb_entry, &lan966x->mdb_entries, list) {
+               type = lan966x_mdb_classify(mdb_entry->mac);
+
+               lan966x_mdb_encode_mac(mac, mdb_entry, type);
+               /* Remove just the MAC entry, but keep the PGID for L2
+                * entries so it can be restored at a later point
+                */
+               lan966x_mac_forget(lan966x, mac, mdb_entry->vid, type);
+       }
+}
+
+void lan966x_mdb_restore_entries(struct lan966x *lan966x)
+{
+       struct lan966x_mdb_entry *mdb_entry;
+       enum macaccess_entry_type type;
+       unsigned char mac[ETH_ALEN];
+       bool cpu_copy = false;
+
+       list_for_each_entry(mdb_entry, &lan966x->mdb_entries, list) {
+               type = lan966x_mdb_classify(mdb_entry->mac);
+
+               lan966x_mdb_encode_mac(mac, mdb_entry, type);
+               if (type == ENTRYTYPE_MACV4 || type == ENTRYTYPE_MACV6) {
+                       /* Copy the frame to the CPU only if the CPU is in the
+                        * VLAN; recompute per entry so one entry's flag does
+                        * not leak into the next
+                        */
+                       cpu_copy = lan966x_vlan_cpu_member_cpu_vlan_mask(lan966x,
+                                                                        mdb_entry->vid) &&
+                                  mdb_entry->cpu_copy;
+
+                       lan966x_mac_ip_learn(lan966x, cpu_copy, mac,
+                                            mdb_entry->vid, type);
+               } else {
+                       lan966x_mac_learn(lan966x, mdb_entry->pgid->index,
+                                         mdb_entry->mac,
+                                         mdb_entry->vid, type);
+               }
+       }
+}
index b66a9aa..38a7e95 100644 (file)
@@ -9,6 +9,14 @@
 
 #include "lan966x_main.h"
 
+static struct phylink_pcs *lan966x_phylink_mac_select(struct phylink_config *config,
+                                                     phy_interface_t interface)
+{
+       struct lan966x_port *port = netdev_priv(to_net_dev(config->dev));
+
+       return &port->phylink_pcs;
+}
+
 static void lan966x_phylink_mac_config(struct phylink_config *config,
                                       unsigned int mode,
                                       const struct phylink_link_state *state)
@@ -114,6 +122,7 @@ static void lan966x_pcs_aneg_restart(struct phylink_pcs *pcs)
 
 const struct phylink_mac_ops lan966x_phylink_mac_ops = {
        .validate = phylink_generic_validate,
+       .mac_select_pcs = lan966x_phylink_mac_select,
        .mac_config = lan966x_phylink_mac_config,
        .mac_prepare = lan966x_phylink_mac_prepare,
        .mac_link_down = lan966x_phylink_mac_link_down,
diff --git a/drivers/net/ethernet/microchip/lan966x/lan966x_ptp.c b/drivers/net/ethernet/microchip/lan966x/lan966x_ptp.c
new file mode 100644 (file)
index 0000000..ae78277
--- /dev/null
@@ -0,0 +1,618 @@
+// SPDX-License-Identifier: GPL-2.0+
+
+#include <linux/ptp_classify.h>
+
+#include "lan966x_main.h"
+
+#define LAN966X_MAX_PTP_ID     512
+
+/* Represents 1ppm adjustment in 2^59 format with 6.037735849ns as reference
+ * The value is calculated as follows: (1/1000000)/((2^-59)/6.037735849)
+ */
+#define LAN966X_1PPM_FORMAT            3480517749723LL
+
+/* Represents 1ppb adjustment in 2^59 format with 6.037735849ns as reference
+ * The value is calculated as follows: (1/1000000000)/((2^-59)/6.037735849)
+ */
+#define LAN966X_1PPB_FORMAT            3480517749LL
+
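Both constants fall out of the 6.037735849 ns reference period scaled by 2^59: 1 ppm of the period is 1e-6 * 6.037735849 * 2^59, and 1 ppb is a thousandth of that. A quick host-side check (plain C, illustrative; it reproduces the defines up to the extra digits of the clock period the driver authors used):

    #include <math.h>
    #include <stdio.h>

    int main(void)
    {
            double ref_ns = 6.037735849;

            /* 1 ppm / 1 ppb of the reference period in 2^-59 ns units */
            printf("1ppm: %.0f\n", 1e-6 * ref_ns * ldexp(1.0, 59)); /* ~3480517749723 */
            printf("1ppb: %.0f\n", 1e-9 * ref_ns * ldexp(1.0, 59)); /* ~3480517749 */
            return 0;
    }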
+#define TOD_ACC_PIN            0x5
+
+enum {
+       PTP_PIN_ACTION_IDLE = 0,
+       PTP_PIN_ACTION_LOAD,
+       PTP_PIN_ACTION_SAVE,
+       PTP_PIN_ACTION_CLOCK,
+       PTP_PIN_ACTION_DELTA,
+       PTP_PIN_ACTION_TOD
+};
+
+static u64 lan966x_ptp_get_nominal_value(void)
+{
+       u64 res = 0x304d2df1;
+
+       res <<= 32;
+       return res;
+}
+
+int lan966x_ptp_hwtstamp_set(struct lan966x_port *port, struct ifreq *ifr)
+{
+       struct lan966x *lan966x = port->lan966x;
+       struct hwtstamp_config cfg;
+       struct lan966x_phc *phc;
+
+       /* For now don't allow PTP to run on ports that are part of a bridge,
+        * because in the transparent clock case the HW would still forward
+        * the frames, resulting in duplicates
+        */
+       if (lan966x->bridge_mask & BIT(port->chip_port))
+               return -EINVAL;
+
+       if (copy_from_user(&cfg, ifr->ifr_data, sizeof(cfg)))
+               return -EFAULT;
+
+       switch (cfg.tx_type) {
+       case HWTSTAMP_TX_ON:
+               port->ptp_cmd = IFH_REW_OP_TWO_STEP_PTP;
+               break;
+       case HWTSTAMP_TX_ONESTEP_SYNC:
+               port->ptp_cmd = IFH_REW_OP_ONE_STEP_PTP;
+               break;
+       case HWTSTAMP_TX_OFF:
+               port->ptp_cmd = IFH_REW_OP_NOOP;
+               break;
+       default:
+               return -ERANGE;
+       }
+
+       switch (cfg.rx_filter) {
+       case HWTSTAMP_FILTER_NONE:
+               break;
+       case HWTSTAMP_FILTER_ALL:
+       case HWTSTAMP_FILTER_PTP_V1_L4_EVENT:
+       case HWTSTAMP_FILTER_PTP_V1_L4_SYNC:
+       case HWTSTAMP_FILTER_PTP_V1_L4_DELAY_REQ:
+       case HWTSTAMP_FILTER_PTP_V2_L4_EVENT:
+       case HWTSTAMP_FILTER_PTP_V2_L4_SYNC:
+       case HWTSTAMP_FILTER_PTP_V2_L4_DELAY_REQ:
+       case HWTSTAMP_FILTER_PTP_V2_L2_EVENT:
+       case HWTSTAMP_FILTER_PTP_V2_L2_SYNC:
+       case HWTSTAMP_FILTER_PTP_V2_L2_DELAY_REQ:
+       case HWTSTAMP_FILTER_PTP_V2_EVENT:
+       case HWTSTAMP_FILTER_PTP_V2_SYNC:
+       case HWTSTAMP_FILTER_PTP_V2_DELAY_REQ:
+       case HWTSTAMP_FILTER_NTP_ALL:
+               cfg.rx_filter = HWTSTAMP_FILTER_ALL;
+               break;
+       default:
+               return -ERANGE;
+       }
+
+       /* Commit back the result & save it */
+       mutex_lock(&lan966x->ptp_lock);
+       phc = &lan966x->phc[LAN966X_PHC_PORT];
+       memcpy(&phc->hwtstamp_config, &cfg, sizeof(cfg));
+       mutex_unlock(&lan966x->ptp_lock);
+
+       return copy_to_user(ifr->ifr_data, &cfg, sizeof(cfg)) ? -EFAULT : 0;
+}
+
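For context, a minimal userspace sketch of the standard SIOCSHWTSTAMP ioctl that reaches lan966x_ptp_hwtstamp_set() above; the interface name and error handling are illustrative.

#include <string.h>
#include <sys/ioctl.h>
#include <net/if.h>
#include <linux/net_tstamp.h>
#include <linux/sockios.h>

/* Enable one-step TX and PTP event RX timestamping on an interface. */
static int enable_hwtstamp(int sock, const char *ifname)
{
	struct hwtstamp_config cfg = {
		.tx_type   = HWTSTAMP_TX_ONESTEP_SYNC,
		.rx_filter = HWTSTAMP_FILTER_PTP_V2_EVENT,
	};
	struct ifreq ifr;

	memset(&ifr, 0, sizeof(ifr));
	strncpy(ifr.ifr_name, ifname, IFNAMSIZ - 1);
	ifr.ifr_data = (void *)&cfg;

	/* On return the driver reports the effective filter: here it is
	 * widened to HWTSTAMP_FILTER_ALL, as the switch above shows.
	 */
	return ioctl(sock, SIOCSHWTSTAMP, &ifr);
}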
+int lan966x_ptp_hwtstamp_get(struct lan966x_port *port, struct ifreq *ifr)
+{
+       struct lan966x *lan966x = port->lan966x;
+       struct lan966x_phc *phc;
+
+       phc = &lan966x->phc[LAN966X_PHC_PORT];
+       return copy_to_user(ifr->ifr_data, &phc->hwtstamp_config,
+                           sizeof(phc->hwtstamp_config)) ? -EFAULT : 0;
+}
+
+static int lan966x_ptp_classify(struct lan966x_port *port, struct sk_buff *skb)
+{
+       struct ptp_header *header;
+       u8 msgtype;
+       int type;
+
+       if (port->ptp_cmd == IFH_REW_OP_NOOP)
+               return IFH_REW_OP_NOOP;
+
+       type = ptp_classify_raw(skb);
+       if (type == PTP_CLASS_NONE)
+               return IFH_REW_OP_NOOP;
+
+       header = ptp_parse_header(skb, type);
+       if (!header)
+               return IFH_REW_OP_NOOP;
+
+       if (port->ptp_cmd == IFH_REW_OP_TWO_STEP_PTP)
+               return IFH_REW_OP_TWO_STEP_PTP;
+
+       /* If this is a sync message and the port is configured for one-step,
+        * use the one-step operation; otherwise fall back to two-step.
+        */
+       msgtype = ptp_get_msgtype(header, type);
+       if ((msgtype & 0xf) == 0)
+               return IFH_REW_OP_ONE_STEP_PTP;
+
+       return IFH_REW_OP_TWO_STEP_PTP;
+}
+
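The (msgtype & 0xf) == 0 test matches the Sync message type, carried in the low nibble of the PTP messageType field. An equivalent spelling using the named constant from <linux/ptp_classify.h>, shown as an illustrative sketch rather than the patch's code:

#include <linux/ptp_classify.h>

/* Only Sync messages are eligible for one-step timestamping here. */
static bool lan966x_hdr_is_sync(struct ptp_header *header, int type)
{
	return ptp_get_msgtype(header, type) == PTP_MSGTYPE_SYNC;
}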
+static void lan966x_ptp_txtstamp_old_release(struct lan966x_port *port)
+{
+       struct sk_buff *skb, *skb_tmp;
+       unsigned long flags;
+
+       spin_lock_irqsave(&port->tx_skbs.lock, flags);
+       skb_queue_walk_safe(&port->tx_skbs, skb, skb_tmp) {
+               if (time_after(LAN966X_SKB_CB(skb)->jiffies + LAN966X_PTP_TIMEOUT,
+                              jiffies))
+                       break;
+
+               __skb_unlink(skb, &port->tx_skbs);
+               dev_kfree_skb_any(skb);
+       }
+       spin_unlock_irqrestore(&port->tx_skbs.lock, flags);
+}
+
+int lan966x_ptp_txtstamp_request(struct lan966x_port *port,
+                                struct sk_buff *skb)
+{
+       struct lan966x *lan966x = port->lan966x;
+       unsigned long flags;
+       u8 rew_op;
+
+       rew_op = lan966x_ptp_classify(port, skb);
+       LAN966X_SKB_CB(skb)->rew_op = rew_op;
+
+       if (rew_op != IFH_REW_OP_TWO_STEP_PTP)
+               return 0;
+
+       lan966x_ptp_txtstamp_old_release(port);
+
+       spin_lock_irqsave(&lan966x->ptp_ts_id_lock, flags);
+       if (lan966x->ptp_skbs == LAN966X_MAX_PTP_ID) {
+               spin_unlock_irqrestore(&lan966x->ptp_ts_id_lock, flags);
+               return -EBUSY;
+       }
+
+       skb_shinfo(skb)->tx_flags |= SKBTX_IN_PROGRESS;
+
+       skb_queue_tail(&port->tx_skbs, skb);
+       LAN966X_SKB_CB(skb)->ts_id = port->ts_id;
+       LAN966X_SKB_CB(skb)->jiffies = jiffies;
+
+       lan966x->ptp_skbs++;
+       port->ts_id++;
+       if (port->ts_id == LAN966X_MAX_PTP_ID)
+               port->ts_id = 0;
+
+       spin_unlock_irqrestore(&lan966x->ptp_ts_id_lock, flags);
+
+       return 0;
+}
+
+void lan966x_ptp_txtstamp_release(struct lan966x_port *port,
+                                 struct sk_buff *skb)
+{
+       struct lan966x *lan966x = port->lan966x;
+       unsigned long flags;
+
+       spin_lock_irqsave(&lan966x->ptp_ts_id_lock, flags);
+       port->ts_id--;
+       lan966x->ptp_skbs--;
+       skb_unlink(skb, &port->tx_skbs);
+       spin_unlock_irqrestore(&lan966x->ptp_ts_id_lock, flags);
+}
+
+static void lan966x_get_hwtimestamp(struct lan966x *lan966x,
+                                   struct timespec64 *ts,
+                                   u32 nsec)
+{
+       /* Read current PTP time to get seconds */
+       unsigned long flags;
+       u32 curr_nsec;
+
+       spin_lock_irqsave(&lan966x->ptp_clock_lock, flags);
+
+       lan_rmw(PTP_PIN_CFG_PIN_ACTION_SET(PTP_PIN_ACTION_SAVE) |
+               PTP_PIN_CFG_PIN_DOM_SET(LAN966X_PHC_PORT) |
+               PTP_PIN_CFG_PIN_SYNC_SET(0),
+               PTP_PIN_CFG_PIN_ACTION |
+               PTP_PIN_CFG_PIN_DOM |
+               PTP_PIN_CFG_PIN_SYNC,
+               lan966x, PTP_PIN_CFG(TOD_ACC_PIN));
+
+       ts->tv_sec = lan_rd(lan966x, PTP_TOD_SEC_LSB(TOD_ACC_PIN));
+       curr_nsec = lan_rd(lan966x, PTP_TOD_NSEC(TOD_ACC_PIN));
+
+       ts->tv_nsec = nsec;
+
+       /* The seconds counter has incremented since the timestamp was captured */
+       if (curr_nsec < nsec)
+               ts->tv_sec--;
+
+       spin_unlock_irqrestore(&lan966x->ptp_clock_lock, flags);
+}
+
+irqreturn_t lan966x_ptp_irq_handler(int irq, void *args)
+{
+       int budget = LAN966X_MAX_PTP_ID;
+       struct lan966x *lan966x = args;
+
+       while (budget--) {
+               struct sk_buff *skb, *skb_tmp, *skb_match = NULL;
+               struct skb_shared_hwtstamps shhwtstamps;
+               struct lan966x_port *port;
+               struct timespec64 ts;
+               unsigned long flags;
+               u32 val, id, txport;
+               u32 delay;
+
+               val = lan_rd(lan966x, PTP_TWOSTEP_CTRL);
+
+               /* Check if a timestamp can be retrieved */
+               if (!(val & PTP_TWOSTEP_CTRL_VLD))
+                       break;
+
+               WARN_ON(val & PTP_TWOSTEP_CTRL_OVFL);
+
+               if (!(val & PTP_TWOSTEP_CTRL_STAMP_TX))
+                       continue;
+
+               /* Retrieve the ts Tx port */
+               txport = PTP_TWOSTEP_CTRL_STAMP_PORT_GET(val);
+
+               /* Retrieve its associated skb */
+               port = lan966x->ports[txport];
+
+               /* Retrieve the delay */
+               delay = lan_rd(lan966x, PTP_TWOSTEP_STAMP);
+               delay = PTP_TWOSTEP_STAMP_STAMP_NSEC_GET(delay);
+
+               /* Get the next timestamp from the FIFO; it must be the RX
+                * timestamp, which carries the ID of the frame.
+                */
+               lan_rmw(PTP_TWOSTEP_CTRL_NXT_SET(1),
+                       PTP_TWOSTEP_CTRL_NXT,
+                       lan966x, PTP_TWOSTEP_CTRL);
+
+               val = lan_rd(lan966x, PTP_TWOSTEP_CTRL);
+
+               /* Check if a timestamp can be retrieved */
+               if (!(val & PTP_TWOSTEP_CTRL_VLD))
+                       break;
+
+               /* Read the RX timestamp to get the ID */
+               id = lan_rd(lan966x, PTP_TWOSTEP_STAMP);
+
+               spin_lock_irqsave(&port->tx_skbs.lock, flags);
+               skb_queue_walk_safe(&port->tx_skbs, skb, skb_tmp) {
+                       if (LAN966X_SKB_CB(skb)->ts_id != id)
+                               continue;
+
+                       __skb_unlink(skb, &port->tx_skbs);
+                       skb_match = skb;
+                       break;
+               }
+               spin_unlock_irqrestore(&port->tx_skbs.lock, flags);
+
+               /* Next ts */
+               lan_rmw(PTP_TWOSTEP_CTRL_NXT_SET(1),
+                       PTP_TWOSTEP_CTRL_NXT,
+                       lan966x, PTP_TWOSTEP_CTRL);
+
+               if (WARN_ON(!skb_match))
+                       continue;
+
+               spin_lock(&lan966x->ptp_ts_id_lock);
+               lan966x->ptp_skbs--;
+               spin_unlock(&lan966x->ptp_ts_id_lock);
+
+               /* Get the h/w timestamp */
+               lan966x_get_hwtimestamp(lan966x, &ts, delay);
+
+               /* Set the timestamp into the skb */
+               shhwtstamps.hwtstamp = ktime_set(ts.tv_sec, ts.tv_nsec);
+               skb_tstamp_tx(skb_match, &shhwtstamps);
+
+               dev_kfree_skb_any(skb_match);
+       }
+
+       return IRQ_HANDLED;
+}
+
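The timestamps completed by this handler surface to applications through the standard SO_TIMESTAMPING error queue. A hedged userspace sketch of the receive side; buffer sizes and error handling are illustrative:

#include <sys/socket.h>
#include <linux/net_tstamp.h>
#include <linux/errqueue.h>

/* Read back a hardware TX timestamp for a previously sent frame. */
static int read_tx_tstamp(int sock, struct timespec *ts)
{
	char data[256], ctrl[512];
	struct iovec iov = { .iov_base = data, .iov_len = sizeof(data) };
	struct msghdr msg = {
		.msg_iov = &iov, .msg_iovlen = 1,
		.msg_control = ctrl, .msg_controllen = sizeof(ctrl),
	};
	struct cmsghdr *cm;

	if (recvmsg(sock, &msg, MSG_ERRQUEUE) < 0)
		return -1;

	for (cm = CMSG_FIRSTHDR(&msg); cm; cm = CMSG_NXTHDR(&msg, cm)) {
		if (cm->cmsg_level == SOL_SOCKET &&
		    cm->cmsg_type == SCM_TIMESTAMPING) {
			struct scm_timestamping *tss = (void *)CMSG_DATA(cm);

			*ts = tss->ts[2];	/* index 2: raw HW timestamp */
			return 0;
		}
	}
	return -1;
}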
+static int lan966x_ptp_adjfine(struct ptp_clock_info *ptp, long scaled_ppm)
+{
+       struct lan966x_phc *phc = container_of(ptp, struct lan966x_phc, info);
+       struct lan966x *lan966x = phc->lan966x;
+       unsigned long flags;
+       bool neg_adj = false;
+       u64 tod_inc;
+       u64 ref;
+
+       if (!scaled_ppm)
+               return 0;
+
+       if (scaled_ppm < 0) {
+               neg_adj = true;
+               scaled_ppm = -scaled_ppm;
+       }
+
+       tod_inc = lan966x_ptp_get_nominal_value();
+
+       /* The multiplication is split into two separate additions to avoid
+        * overflow: multiplying LAN966X_1PPM_FORMAT directly by a scaled_ppm
+        * (16-bit fractional part) larger than about 20ppm would overflow
+        * 64 bits.
+        */
+       ref = LAN966X_1PPM_FORMAT * (scaled_ppm >> 16);
+       ref += (LAN966X_1PPM_FORMAT * (0xffff & scaled_ppm)) >> 16;
+       tod_inc = neg_adj ? tod_inc - ref : tod_inc + ref;
+
+       spin_lock_irqsave(&lan966x->ptp_clock_lock, flags);
+
+       lan_rmw(PTP_DOM_CFG_CLKCFG_DIS_SET(BIT(phc->index)),
+               PTP_DOM_CFG_CLKCFG_DIS,
+               lan966x, PTP_DOM_CFG);
+
+       lan_wr((u32)tod_inc & 0xFFFFFFFF, lan966x,
+              PTP_CLK_PER_CFG(phc->index, 0));
+       lan_wr((u32)(tod_inc >> 32), lan966x,
+              PTP_CLK_PER_CFG(phc->index, 1));
+
+       lan_rmw(PTP_DOM_CFG_CLKCFG_DIS_SET(0),
+               PTP_DOM_CFG_CLKCFG_DIS,
+               lan966x, PTP_DOM_CFG);
+
+       spin_unlock_irqrestore(&lan966x->ptp_clock_lock, flags);
+
+       return 0;
+}
+
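A scaled_ppm such as 12.5 ppm arrives here as 12.5 * 2^16 via the standard clock_adjtime() path. Illustrative userspace sketch; the PHC device node and the FD_TO_CLOCKID encoding follow the kernel's testptp.c convention:

#include <fcntl.h>
#include <unistd.h>
#include <sys/timex.h>

#define CLOCKFD			3
#define FD_TO_CLOCKID(fd)	((~(clockid_t)(fd) << 3) | CLOCKFD)

/* Apply a frequency offset in ppm to a PHC, e.g. "/dev/ptp0". */
static int phc_adj_freq(const char *dev, double ppm)
{
	struct timex tx = {
		.modes = ADJ_FREQUENCY,
		.freq  = (long)(ppm * 65536),	/* 16.16 fixed-point ppm */
	};
	int fd = open(dev, O_RDWR);
	int err;

	if (fd < 0)
		return -1;
	err = clock_adjtime(FD_TO_CLOCKID(fd), &tx);
	close(fd);
	return err;
}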
+static int lan966x_ptp_settime64(struct ptp_clock_info *ptp,
+                                const struct timespec64 *ts)
+{
+       struct lan966x_phc *phc = container_of(ptp, struct lan966x_phc, info);
+       struct lan966x *lan966x = phc->lan966x;
+       unsigned long flags;
+
+       spin_lock_irqsave(&lan966x->ptp_clock_lock, flags);
+
+       /* Must be in IDLE mode before the time can be loaded */
+       lan_rmw(PTP_PIN_CFG_PIN_ACTION_SET(PTP_PIN_ACTION_IDLE) |
+               PTP_PIN_CFG_PIN_DOM_SET(phc->index) |
+               PTP_PIN_CFG_PIN_SYNC_SET(0),
+               PTP_PIN_CFG_PIN_ACTION |
+               PTP_PIN_CFG_PIN_DOM |
+               PTP_PIN_CFG_PIN_SYNC,
+               lan966x, PTP_PIN_CFG(TOD_ACC_PIN));
+
+       /* Set new value */
+       lan_wr(PTP_TOD_SEC_MSB_TOD_SEC_MSB_SET(upper_32_bits(ts->tv_sec)),
+              lan966x, PTP_TOD_SEC_MSB(TOD_ACC_PIN));
+       lan_wr(lower_32_bits(ts->tv_sec),
+              lan966x, PTP_TOD_SEC_LSB(TOD_ACC_PIN));
+       lan_wr(ts->tv_nsec, lan966x, PTP_TOD_NSEC(TOD_ACC_PIN));
+
+       /* Apply new values */
+       lan_rmw(PTP_PIN_CFG_PIN_ACTION_SET(PTP_PIN_ACTION_LOAD) |
+               PTP_PIN_CFG_PIN_DOM_SET(phc->index) |
+               PTP_PIN_CFG_PIN_SYNC_SET(0),
+               PTP_PIN_CFG_PIN_ACTION |
+               PTP_PIN_CFG_PIN_DOM |
+               PTP_PIN_CFG_PIN_SYNC,
+               lan966x, PTP_PIN_CFG(TOD_ACC_PIN));
+
+       spin_unlock_irqrestore(&lan966x->ptp_clock_lock, flags);
+
+       return 0;
+}
+
+static int lan966x_ptp_gettime64(struct ptp_clock_info *ptp,
+                                struct timespec64 *ts)
+{
+       struct lan966x_phc *phc = container_of(ptp, struct lan966x_phc, info);
+       struct lan966x *lan966x = phc->lan966x;
+       unsigned long flags;
+       time64_t s;
+       s64 ns;
+
+       spin_lock_irqsave(&lan966x->ptp_clock_lock, flags);
+
+       lan_rmw(PTP_PIN_CFG_PIN_ACTION_SET(PTP_PIN_ACTION_SAVE) |
+               PTP_PIN_CFG_PIN_DOM_SET(phc->index) |
+               PTP_PIN_CFG_PIN_SYNC_SET(0),
+               PTP_PIN_CFG_PIN_ACTION |
+               PTP_PIN_CFG_PIN_DOM |
+               PTP_PIN_CFG_PIN_SYNC,
+               lan966x, PTP_PIN_CFG(TOD_ACC_PIN));
+
+       s = lan_rd(lan966x, PTP_TOD_SEC_MSB(TOD_ACC_PIN));
+       s <<= 32;
+       s |= lan_rd(lan966x, PTP_TOD_SEC_LSB(TOD_ACC_PIN));
+       ns = lan_rd(lan966x, PTP_TOD_NSEC(TOD_ACC_PIN));
+       ns &= PTP_TOD_NSEC_TOD_NSEC;
+
+       spin_unlock_irqrestore(&lan966x->ptp_clock_lock, flags);
+
+       /* Deal with negative values */
+       if ((ns & 0xFFFFFFF0) == 0x3FFFFFF0) {
+               s--;
+               ns &= 0xf;
+               ns += 999999984;
+       }
+
+       set_normalized_timespec64(ts, s, ns);
+       return 0;
+}
+
+static int lan966x_ptp_adjtime(struct ptp_clock_info *ptp, s64 delta)
+{
+       struct lan966x_phc *phc = container_of(ptp, struct lan966x_phc, info);
+       struct lan966x *lan966x = phc->lan966x;
+
+       if (delta > -(NSEC_PER_SEC / 2) && delta < (NSEC_PER_SEC / 2)) {
+               unsigned long flags;
+
+               spin_lock_irqsave(&lan966x->ptp_clock_lock, flags);
+
+               /* Must be in IDLE mode before the time can be loaded */
+               lan_rmw(PTP_PIN_CFG_PIN_ACTION_SET(PTP_PIN_ACTION_IDLE) |
+                       PTP_PIN_CFG_PIN_DOM_SET(phc->index) |
+                       PTP_PIN_CFG_PIN_SYNC_SET(0),
+                       PTP_PIN_CFG_PIN_ACTION |
+                       PTP_PIN_CFG_PIN_DOM |
+                       PTP_PIN_CFG_PIN_SYNC,
+                       lan966x, PTP_PIN_CFG(TOD_ACC_PIN));
+
+               lan_wr(PTP_TOD_NSEC_TOD_NSEC_SET(delta),
+                      lan966x, PTP_TOD_NSEC(TOD_ACC_PIN));
+
+               /* Adjust time with the value of PTP_TOD_NSEC */
+               lan_rmw(PTP_PIN_CFG_PIN_ACTION_SET(PTP_PIN_ACTION_DELTA) |
+                       PTP_PIN_CFG_PIN_DOM_SET(phc->index) |
+                       PTP_PIN_CFG_PIN_SYNC_SET(0),
+                       PTP_PIN_CFG_PIN_ACTION |
+                       PTP_PIN_CFG_PIN_DOM |
+                       PTP_PIN_CFG_PIN_SYNC,
+                       lan966x, PTP_PIN_CFG(TOD_ACC_PIN));
+
+               spin_unlock_irqrestore(&lan966x->ptp_clock_lock, flags);
+       } else {
+               /* Fall back to lan966x_ptp_settime64(), which is not exact */
+               struct timespec64 ts;
+               u64 now;
+
+               lan966x_ptp_gettime64(ptp, &ts);
+
+               now = ktime_to_ns(timespec64_to_ktime(ts));
+               ts = ns_to_timespec64(now + delta);
+
+               lan966x_ptp_settime64(ptp, &ts);
+       }
+
+       return 0;
+}
+
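Steps smaller than half a second take the in-hardware PTP_PIN_ACTION_DELTA path above; larger ones fall back to the gettime/settime sequence. Sketch of a small step through the standard API, with clkid obtained as in the previous example:

#include <sys/timex.h>

/* Step the PHC forward by 100 ms; |delta| < 0.5 s keeps the driver on
 * the exact in-hardware delta path.
 */
static int phc_step_100ms(clockid_t clkid)
{
	struct timex tx = {
		.modes = ADJ_SETOFFSET | ADJ_NANO,
		/* with ADJ_NANO, tv_usec carries nanoseconds */
		.time  = { .tv_sec = 0, .tv_usec = 100000000 },
	};

	return clock_adjtime(clkid, &tx);
}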
+static struct ptp_clock_info lan966x_ptp_clock_info = {
+       .owner          = THIS_MODULE,
+       .name           = "lan966x ptp",
+       .max_adj        = 200000,
+       .gettime64      = lan966x_ptp_gettime64,
+       .settime64      = lan966x_ptp_settime64,
+       .adjtime        = lan966x_ptp_adjtime,
+       .adjfine        = lan966x_ptp_adjfine,
+};
+
+static int lan966x_ptp_phc_init(struct lan966x *lan966x,
+                               int index,
+                               struct ptp_clock_info *clock_info)
+{
+       struct lan966x_phc *phc = &lan966x->phc[index];
+
+       phc->info = *clock_info;
+       phc->clock = ptp_clock_register(&phc->info, lan966x->dev);
+       if (IS_ERR(phc->clock))
+               return PTR_ERR(phc->clock);
+
+       phc->index = index;
+       phc->lan966x = lan966x;
+
+       /* PTP Rx stamping is always enabled. */
+       phc->hwtstamp_config.rx_filter = HWTSTAMP_FILTER_PTP_V2_EVENT;
+
+       return 0;
+}
+
+int lan966x_ptp_init(struct lan966x *lan966x)
+{
+       u64 tod_adj = lan966x_ptp_get_nominal_value();
+       struct lan966x_port *port;
+       int err, i;
+
+       if (!lan966x->ptp)
+               return 0;
+
+       for (i = 0; i < LAN966X_PHC_COUNT; ++i) {
+               err = lan966x_ptp_phc_init(lan966x, i, &lan966x_ptp_clock_info);
+               if (err)
+                       return err;
+       }
+
+       spin_lock_init(&lan966x->ptp_clock_lock);
+       spin_lock_init(&lan966x->ptp_ts_id_lock);
+       mutex_init(&lan966x->ptp_lock);
+
+       /* Disable master counters */
+       lan_wr(PTP_DOM_CFG_ENA_SET(0), lan966x, PTP_DOM_CFG);
+
+       /* Configure the nominal TOD increment per clock cycle */
+       lan_rmw(PTP_DOM_CFG_CLKCFG_DIS_SET(0x7),
+               PTP_DOM_CFG_CLKCFG_DIS,
+               lan966x, PTP_DOM_CFG);
+
+       for (i = 0; i < LAN966X_PHC_COUNT; ++i) {
+               lan_wr((u32)tod_adj & 0xFFFFFFFF, lan966x,
+                      PTP_CLK_PER_CFG(i, 0));
+               lan_wr((u32)(tod_adj >> 32), lan966x,
+                      PTP_CLK_PER_CFG(i, 1));
+       }
+
+       lan_rmw(PTP_DOM_CFG_CLKCFG_DIS_SET(0),
+               PTP_DOM_CFG_CLKCFG_DIS,
+               lan966x, PTP_DOM_CFG);
+
+       /* Enable master counters */
+       lan_wr(PTP_DOM_CFG_ENA_SET(0x7), lan966x, PTP_DOM_CFG);
+
+       for (i = 0; i < lan966x->num_phys_ports; i++) {
+               port = lan966x->ports[i];
+               if (!port)
+                       continue;
+
+               skb_queue_head_init(&port->tx_skbs);
+       }
+
+       return 0;
+}
+
+void lan966x_ptp_deinit(struct lan966x *lan966x)
+{
+       struct lan966x_port *port;
+       int i;
+
+       for (i = 0; i < lan966x->num_phys_ports; i++) {
+               port = lan966x->ports[i];
+               if (!port)
+                       continue;
+
+               skb_queue_purge(&port->tx_skbs);
+       }
+
+       for (i = 0; i < LAN966X_PHC_COUNT; ++i)
+               ptp_clock_unregister(lan966x->phc[i].clock);
+}
+
+void lan966x_ptp_rxtstamp(struct lan966x *lan966x, struct sk_buff *skb,
+                         u64 timestamp)
+{
+       struct skb_shared_hwtstamps *shhwtstamps;
+       struct lan966x_phc *phc;
+       struct timespec64 ts;
+       u64 full_ts_in_ns;
+
+       if (!lan966x->ptp)
+               return;
+
+       phc = &lan966x->phc[LAN966X_PHC_PORT];
+       lan966x_ptp_gettime64(&phc->info, &ts);
+
+       /* Drop the sub-ns precision */
+       timestamp = timestamp >> 2;
+       if (ts.tv_nsec < timestamp)
+               ts.tv_sec--;
+       ts.tv_nsec = timestamp;
+       full_ts_in_ns = ktime_set(ts.tv_sec, ts.tv_nsec);
+
+       shhwtstamps = skb_hwtstamps(skb);
+       shhwtstamps->hwtstamp = full_ts_in_ns;
+}
index 7975601..0c0b3e1 100644 (file)
@@ -19,6 +19,7 @@ enum lan966x_target {
        TARGET_DEV = 13,
        TARGET_GCB = 27,
        TARGET_ORG = 36,
+       TARGET_PTP = 41,
        TARGET_QS = 42,
        TARGET_QSYS = 46,
        TARGET_REW = 47,
@@ -298,6 +299,24 @@ enum lan966x_target {
 /*      ANA:PORT:CPU_FWD_CFG */
 #define ANA_CPU_FWD_CFG(g)        __REG(TARGET_ANA, 0, 1, 28672, g, 9, 128, 96, 0, 1, 4)
 
+#define ANA_CPU_FWD_CFG_MLD_REDIR_ENA            BIT(6)
+#define ANA_CPU_FWD_CFG_MLD_REDIR_ENA_SET(x)\
+       FIELD_PREP(ANA_CPU_FWD_CFG_MLD_REDIR_ENA, x)
+#define ANA_CPU_FWD_CFG_MLD_REDIR_ENA_GET(x)\
+       FIELD_GET(ANA_CPU_FWD_CFG_MLD_REDIR_ENA, x)
+
+#define ANA_CPU_FWD_CFG_IGMP_REDIR_ENA           BIT(5)
+#define ANA_CPU_FWD_CFG_IGMP_REDIR_ENA_SET(x)\
+       FIELD_PREP(ANA_CPU_FWD_CFG_IGMP_REDIR_ENA, x)
+#define ANA_CPU_FWD_CFG_IGMP_REDIR_ENA_GET(x)\
+       FIELD_GET(ANA_CPU_FWD_CFG_IGMP_REDIR_ENA, x)
+
+#define ANA_CPU_FWD_CFG_IPMC_CTRL_COPY_ENA       BIT(4)
+#define ANA_CPU_FWD_CFG_IPMC_CTRL_COPY_ENA_SET(x)\
+       FIELD_PREP(ANA_CPU_FWD_CFG_IPMC_CTRL_COPY_ENA, x)
+#define ANA_CPU_FWD_CFG_IPMC_CTRL_COPY_ENA_GET(x)\
+       FIELD_GET(ANA_CPU_FWD_CFG_IPMC_CTRL_COPY_ENA, x)
+
 #define ANA_CPU_FWD_CFG_SRC_COPY_ENA             BIT(3)
 #define ANA_CPU_FWD_CFG_SRC_COPY_ENA_SET(x)\
        FIELD_PREP(ANA_CPU_FWD_CFG_SRC_COPY_ENA, x)
@@ -559,6 +578,108 @@ enum lan966x_target {
 #define DEV_PCS1G_STICKY_LINK_DOWN_STICKY_GET(x)\
        FIELD_GET(DEV_PCS1G_STICKY_LINK_DOWN_STICKY, x)
 
+/*      PTP:PTP_CFG:PTP_DOM_CFG */
+#define PTP_DOM_CFG               __REG(TARGET_PTP, 0, 1, 512, 0, 1, 16, 12, 0, 1, 4)
+
+#define PTP_DOM_CFG_ENA                          GENMASK(11, 9)
+#define PTP_DOM_CFG_ENA_SET(x)\
+       FIELD_PREP(PTP_DOM_CFG_ENA, x)
+#define PTP_DOM_CFG_ENA_GET(x)\
+       FIELD_GET(PTP_DOM_CFG_ENA, x)
+
+#define PTP_DOM_CFG_CLKCFG_DIS                   GENMASK(2, 0)
+#define PTP_DOM_CFG_CLKCFG_DIS_SET(x)\
+       FIELD_PREP(PTP_DOM_CFG_CLKCFG_DIS, x)
+#define PTP_DOM_CFG_CLKCFG_DIS_GET(x)\
+       FIELD_GET(PTP_DOM_CFG_CLKCFG_DIS, x)
+
+/*      PTP:PTP_TOD_DOMAINS:CLK_PER_CFG */
+#define PTP_CLK_PER_CFG(g, r)     __REG(TARGET_PTP, 0, 1, 528, g, 3, 28, 0, r, 2, 4)
+
+/*      PTP:PTP_PINS:PTP_PIN_CFG */
+#define PTP_PIN_CFG(g)            __REG(TARGET_PTP, 0, 1, 0, g, 8, 64, 0, 0, 1, 4)
+
+#define PTP_PIN_CFG_PIN_ACTION                   GENMASK(29, 27)
+#define PTP_PIN_CFG_PIN_ACTION_SET(x)\
+       FIELD_PREP(PTP_PIN_CFG_PIN_ACTION, x)
+#define PTP_PIN_CFG_PIN_ACTION_GET(x)\
+       FIELD_GET(PTP_PIN_CFG_PIN_ACTION, x)
+
+#define PTP_PIN_CFG_PIN_SYNC                     GENMASK(26, 25)
+#define PTP_PIN_CFG_PIN_SYNC_SET(x)\
+       FIELD_PREP(PTP_PIN_CFG_PIN_SYNC, x)
+#define PTP_PIN_CFG_PIN_SYNC_GET(x)\
+       FIELD_GET(PTP_PIN_CFG_PIN_SYNC, x)
+
+#define PTP_PIN_CFG_PIN_DOM                      GENMASK(17, 16)
+#define PTP_PIN_CFG_PIN_DOM_SET(x)\
+       FIELD_PREP(PTP_PIN_CFG_PIN_DOM, x)
+#define PTP_PIN_CFG_PIN_DOM_GET(x)\
+       FIELD_GET(PTP_PIN_CFG_PIN_DOM, x)
+
+/*      PTP:PTP_PINS:PTP_TOD_SEC_MSB */
+#define PTP_TOD_SEC_MSB(g)        __REG(TARGET_PTP, 0, 1, 0, g, 8, 64, 4, 0, 1, 4)
+
+#define PTP_TOD_SEC_MSB_TOD_SEC_MSB              GENMASK(15, 0)
+#define PTP_TOD_SEC_MSB_TOD_SEC_MSB_SET(x)\
+       FIELD_PREP(PTP_TOD_SEC_MSB_TOD_SEC_MSB, x)
+#define PTP_TOD_SEC_MSB_TOD_SEC_MSB_GET(x)\
+       FIELD_GET(PTP_TOD_SEC_MSB_TOD_SEC_MSB, x)
+
+/*      PTP:PTP_PINS:PTP_TOD_SEC_LSB */
+#define PTP_TOD_SEC_LSB(g)        __REG(TARGET_PTP, 0, 1, 0, g, 8, 64, 8, 0, 1, 4)
+
+/*      PTP:PTP_PINS:PTP_TOD_NSEC */
+#define PTP_TOD_NSEC(g)           __REG(TARGET_PTP, 0, 1, 0, g, 8, 64, 12, 0, 1, 4)
+
+#define PTP_TOD_NSEC_TOD_NSEC                    GENMASK(29, 0)
+#define PTP_TOD_NSEC_TOD_NSEC_SET(x)\
+       FIELD_PREP(PTP_TOD_NSEC_TOD_NSEC, x)
+#define PTP_TOD_NSEC_TOD_NSEC_GET(x)\
+       FIELD_GET(PTP_TOD_NSEC_TOD_NSEC, x)
+
+/*      PTP:PTP_TS_FIFO:PTP_TWOSTEP_CTRL */
+#define PTP_TWOSTEP_CTRL          __REG(TARGET_PTP, 0, 1, 612, 0, 1, 12, 0, 0, 1, 4)
+
+#define PTP_TWOSTEP_CTRL_NXT                     BIT(11)
+#define PTP_TWOSTEP_CTRL_NXT_SET(x)\
+       FIELD_PREP(PTP_TWOSTEP_CTRL_NXT, x)
+#define PTP_TWOSTEP_CTRL_NXT_GET(x)\
+       FIELD_GET(PTP_TWOSTEP_CTRL_NXT, x)
+
+#define PTP_TWOSTEP_CTRL_VLD                     BIT(10)
+#define PTP_TWOSTEP_CTRL_VLD_SET(x)\
+       FIELD_PREP(PTP_TWOSTEP_CTRL_VLD, x)
+#define PTP_TWOSTEP_CTRL_VLD_GET(x)\
+       FIELD_GET(PTP_TWOSTEP_CTRL_VLD, x)
+
+#define PTP_TWOSTEP_CTRL_STAMP_TX                BIT(9)
+#define PTP_TWOSTEP_CTRL_STAMP_TX_SET(x)\
+       FIELD_PREP(PTP_TWOSTEP_CTRL_STAMP_TX, x)
+#define PTP_TWOSTEP_CTRL_STAMP_TX_GET(x)\
+       FIELD_GET(PTP_TWOSTEP_CTRL_STAMP_TX, x)
+
+#define PTP_TWOSTEP_CTRL_STAMP_PORT              GENMASK(8, 1)
+#define PTP_TWOSTEP_CTRL_STAMP_PORT_SET(x)\
+       FIELD_PREP(PTP_TWOSTEP_CTRL_STAMP_PORT, x)
+#define PTP_TWOSTEP_CTRL_STAMP_PORT_GET(x)\
+       FIELD_GET(PTP_TWOSTEP_CTRL_STAMP_PORT, x)
+
+#define PTP_TWOSTEP_CTRL_OVFL                    BIT(0)
+#define PTP_TWOSTEP_CTRL_OVFL_SET(x)\
+       FIELD_PREP(PTP_TWOSTEP_CTRL_OVFL, x)
+#define PTP_TWOSTEP_CTRL_OVFL_GET(x)\
+       FIELD_GET(PTP_TWOSTEP_CTRL_OVFL, x)
+
+/*      PTP:PTP_TS_FIFO:PTP_TWOSTEP_STAMP */
+#define PTP_TWOSTEP_STAMP         __REG(TARGET_PTP, 0, 1, 612, 0, 1, 12, 4, 0, 1, 4)
+
+#define PTP_TWOSTEP_STAMP_STAMP_NSEC             GENMASK(31, 2)
+#define PTP_TWOSTEP_STAMP_STAMP_NSEC_SET(x)\
+       FIELD_PREP(PTP_TWOSTEP_STAMP_STAMP_NSEC, x)
+#define PTP_TWOSTEP_STAMP_STAMP_NSEC_GET(x)\
+       FIELD_GET(PTP_TWOSTEP_STAMP_STAMP_NSEC, x)
+
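All of these accessors follow the standard <linux/bitfield.h> pattern: GENMASK() names the field and FIELD_PREP()/FIELD_GET() shift values into and out of it. A small illustrative sketch using the STAMP_PORT field defined above:

#include <linux/bitfield.h>

/* Extract the TX port (bits 8:1) from a PTP_TWOSTEP_CTRL readback. */
static inline u32 twostep_stamp_port(u32 regval)
{
	return FIELD_GET(PTP_TWOSTEP_CTRL_STAMP_PORT, regval);
}

/* Build a write value with the port placed into bits 8:1. */
static inline u32 twostep_stamp_port_val(u32 port)
{
	return FIELD_PREP(PTP_TWOSTEP_CTRL_STAMP_PORT, port);
}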
 /*      DEVCPU_QS:XTR:XTR_GRP_CFG */
 #define QS_XTR_GRP_CFG(r)         __REG(TARGET_QS, 0, 1, 0, 0, 1, 36, 0, r, 2, 4)
 
index 7de55f6..9fce865 100644 (file)
@@ -9,6 +9,37 @@ static struct notifier_block lan966x_netdevice_nb __read_mostly;
 static struct notifier_block lan966x_switchdev_nb __read_mostly;
 static struct notifier_block lan966x_switchdev_blocking_nb __read_mostly;
 
+static void lan966x_port_set_mcast_ip_flood(struct lan966x_port *port,
+                                           u32 pgid_ip)
+{
+       struct lan966x *lan966x = port->lan966x;
+       u32 flood_mask_ip;
+
+       flood_mask_ip = lan_rd(lan966x, ANA_PGID(pgid_ip));
+       flood_mask_ip = ANA_PGID_PGID_GET(flood_mask_ip);
+
+       /* If mcast snooping is not enabled, use the mcast flood mask
+        * to decide whether to enable multicast flooding.
+        */
+       if (!port->mcast_ena) {
+               u32 flood_mask;
+
+               flood_mask = lan_rd(lan966x, ANA_PGID(PGID_MC));
+               flood_mask = ANA_PGID_PGID_GET(flood_mask);
+
+               if (flood_mask & BIT(port->chip_port))
+                       flood_mask_ip |= BIT(port->chip_port);
+               else
+                       flood_mask_ip &= ~BIT(port->chip_port);
+       } else {
+               flood_mask_ip &= ~BIT(port->chip_port);
+       }
+
+       lan_rmw(ANA_PGID_PGID_SET(flood_mask_ip),
+               ANA_PGID_PGID,
+               lan966x, ANA_PGID(pgid_ip));
+}
+
 static void lan966x_port_set_mcast_flood(struct lan966x_port *port,
                                         bool enabled)
 {
@@ -23,6 +54,11 @@ static void lan966x_port_set_mcast_flood(struct lan966x_port *port,
        lan_rmw(ANA_PGID_PGID_SET(val),
                ANA_PGID_PGID,
                port->lan966x, ANA_PGID(PGID_MC));
+
+       if (!port->mcast_ena) {
+               lan966x_port_set_mcast_ip_flood(port, PGID_MCIPV4);
+               lan966x_port_set_mcast_ip_flood(port, PGID_MCIPV6);
+       }
 }
 
 static void lan966x_port_set_ucast_flood(struct lan966x_port *port,
@@ -144,6 +180,28 @@ static void lan966x_port_ageing_set(struct lan966x_port *port,
        lan966x_mac_set_ageing(port->lan966x, ageing_time);
 }
 
+static void lan966x_port_mc_set(struct lan966x_port *port, bool mcast_ena)
+{
+       struct lan966x *lan966x = port->lan966x;
+
+       port->mcast_ena = mcast_ena;
+       if (mcast_ena)
+               lan966x_mdb_restore_entries(lan966x);
+       else
+               lan966x_mdb_clear_entries(lan966x);
+
+       lan_rmw(ANA_CPU_FWD_CFG_IGMP_REDIR_ENA_SET(mcast_ena) |
+               ANA_CPU_FWD_CFG_MLD_REDIR_ENA_SET(mcast_ena) |
+               ANA_CPU_FWD_CFG_IPMC_CTRL_COPY_ENA_SET(mcast_ena),
+               ANA_CPU_FWD_CFG_IGMP_REDIR_ENA |
+               ANA_CPU_FWD_CFG_MLD_REDIR_ENA |
+               ANA_CPU_FWD_CFG_IPMC_CTRL_COPY_ENA,
+               lan966x, ANA_CPU_FWD_CFG(port->chip_port));
+
+       lan966x_port_set_mcast_ip_flood(port, PGID_MCIPV4);
+       lan966x_port_set_mcast_ip_flood(port, PGID_MCIPV6);
+}
+
 static int lan966x_port_attr_set(struct net_device *dev, const void *ctx,
                                 const struct switchdev_attr *attr,
                                 struct netlink_ext_ack *extack)
@@ -171,6 +229,9 @@ static int lan966x_port_attr_set(struct net_device *dev, const void *ctx,
                lan966x_vlan_port_set_vlan_aware(port, attr->u.vlan_filtering);
                lan966x_vlan_port_apply(port);
                break;
+       case SWITCHDEV_ATTR_ID_BRIDGE_MC_DISABLED:
+               lan966x_port_mc_set(port, !attr->u.mc_disabled);
+               break;
        default:
                err = -EOPNOTSUPP;
                break;
index 1626627..394de85 100644 (file)
@@ -291,7 +291,6 @@ static int sparx5_create_port(struct sparx5 *sparx5,
        /* Create a phylink for PHY management.  Also handles SFPs */
        spx5_port->phylink_config.dev = &spx5_port->ndev->dev;
        spx5_port->phylink_config.type = PHYLINK_NETDEV;
-       spx5_port->phylink_config.pcs_poll = true;
        spx5_port->phylink_config.mac_capabilities = MAC_ASYM_PAUSE |
                MAC_SYM_PAUSE | MAC_10 | MAC_100 | MAC_1000FD |
                MAC_2500FD | MAC_5000FD | MAC_10000FD | MAC_25000FD;
@@ -328,7 +327,6 @@ static int sparx5_create_port(struct sparx5 *sparx5,
                return PTR_ERR(phylink);
 
        spx5_port->phylink = phylink;
-       phylink_set_pcs(phylink, &spx5_port->phylink_pcs);
 
        return 0;
 }
index 8ba33bc..830da0e 100644 (file)
@@ -26,6 +26,15 @@ static bool port_conf_has_changed(struct sparx5_port_config *a, struct sparx5_po
        return false;
 }
 
+static struct phylink_pcs *
+sparx5_phylink_mac_select_pcs(struct phylink_config *config,
+                             phy_interface_t interface)
+{
+       struct sparx5_port *port = netdev_priv(to_net_dev(config->dev));
+
+       return &port->phylink_pcs;
+}
+
 static void sparx5_phylink_mac_config(struct phylink_config *config,
                                      unsigned int mode,
                                      const struct phylink_link_state *state)
@@ -130,6 +139,7 @@ const struct phylink_pcs_ops sparx5_phylink_pcs_ops = {
 
 const struct phylink_mac_ops sparx5_phylink_mac_ops = {
        .validate = phylink_generic_validate,
+       .mac_select_pcs = sparx5_phylink_mac_select_pcs,
        .mac_config = sparx5_phylink_mac_config,
        .mac_link_down = sparx5_phylink_mac_link_down,
        .mac_link_up = sparx5_phylink_mac_link_up,
index 636dfef..49b85ca 100644 (file)
@@ -663,7 +663,7 @@ static int mana_gd_create_dma_region(struct gdma_dev *gd,
        struct gdma_context *gc = gd->gdma_context;
        struct hw_channel_context *hwc;
        u32 length = gmi->length;
-       u32 req_msg_size;
+       size_t req_msg_size;
        int err;
        int i;
 
@@ -674,7 +674,7 @@ static int mana_gd_create_dma_region(struct gdma_dev *gd,
                return -EINVAL;
 
        hwc = gc->hwc.driver_data;
-       req_msg_size = sizeof(*req) + num_page * sizeof(u64);
+       req_msg_size = struct_size(req, page_addr_list, num_page);
        if (req_msg_size > hwc->max_req_msg_size)
                return -EINVAL;
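The struct_size() conversion is the standard <linux/overflow.h> idiom for flexible-array allocations: it computes sizeof(*req) plus num_page trailing elements and saturates to SIZE_MAX on overflow, so the max_req_msg_size check also catches a wrapped multiplication. Illustrative shape of the pattern, with the member name taken from the call above but the struct layout assumed:

#include <linux/overflow.h>

struct example_req {
	u32 hdr[4];
	u64 page_addr_list[];	/* flexible array member */
};

static size_t example_req_size(struct example_req *req, int num_page)
{
	/* sizeof(*req) + num_page * sizeof(req->page_addr_list[0]),
	 * saturated to SIZE_MAX if the math would overflow.
	 */
	return struct_size(req, page_addr_list, num_page);
}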
 
index 9a12607..d36405a 100644 (file)
@@ -48,7 +48,15 @@ enum TRI_STATE {
 
 #define MAX_PORTS_IN_MANA_DEV 256
 
-struct mana_stats {
+struct mana_stats_rx {
+       u64 packets;
+       u64 bytes;
+       u64 xdp_drop;
+       u64 xdp_tx;
+       struct u64_stats_sync syncp;
+};
+
+struct mana_stats_tx {
        u64 packets;
        u64 bytes;
        struct u64_stats_sync syncp;
@@ -76,7 +84,7 @@ struct mana_txq {
 
        atomic_t pending_sends;
 
-       struct mana_stats stats;
+       struct mana_stats_tx stats;
 };
 
 /* skb data and frags dma mappings */
@@ -298,10 +306,11 @@ struct mana_rxq {
 
        u32 buf_index;
 
-       struct mana_stats stats;
+       struct mana_stats_rx stats;
 
        struct bpf_prog __rcu *bpf_prog;
        struct xdp_rxq_info xdp_rxq;
+       struct page *xdp_save_page;
 
        /* MUST BE THE LAST MEMBER:
         * Each receive buffer has an associated mana_recv_buf_oob.
index 498d0f9..b7d3ba1 100644 (file)
@@ -136,7 +136,7 @@ int mana_start_xmit(struct sk_buff *skb, struct net_device *ndev)
        bool ipv4 = false, ipv6 = false;
        struct mana_tx_package pkg = {};
        struct netdev_queue *net_txq;
-       struct mana_stats *tx_stats;
+       struct mana_stats_tx *tx_stats;
        struct gdma_queue *gdma_sq;
        unsigned int csum_type;
        struct mana_txq *txq;
@@ -299,7 +299,8 @@ static void mana_get_stats64(struct net_device *ndev,
 {
        struct mana_port_context *apc = netdev_priv(ndev);
        unsigned int num_queues = apc->num_queues;
-       struct mana_stats *stats;
+       struct mana_stats_rx *rx_stats;
+       struct mana_stats_tx *tx_stats;
        unsigned int start;
        u64 packets, bytes;
        int q;
@@ -310,26 +311,26 @@ static void mana_get_stats64(struct net_device *ndev,
        netdev_stats_to_stats64(st, &ndev->stats);
 
        for (q = 0; q < num_queues; q++) {
-               stats = &apc->rxqs[q]->stats;
+               rx_stats = &apc->rxqs[q]->stats;
 
                do {
-                       start = u64_stats_fetch_begin_irq(&stats->syncp);
-                       packets = stats->packets;
-                       bytes = stats->bytes;
-               } while (u64_stats_fetch_retry_irq(&stats->syncp, start));
+                       start = u64_stats_fetch_begin_irq(&rx_stats->syncp);
+                       packets = rx_stats->packets;
+                       bytes = rx_stats->bytes;
+               } while (u64_stats_fetch_retry_irq(&rx_stats->syncp, start));
 
                st->rx_packets += packets;
                st->rx_bytes += bytes;
        }
 
        for (q = 0; q < num_queues; q++) {
-               stats = &apc->tx_qp[q].txq.stats;
+               tx_stats = &apc->tx_qp[q].txq.stats;
 
                do {
-                       start = u64_stats_fetch_begin_irq(&stats->syncp);
-                       packets = stats->packets;
-                       bytes = stats->bytes;
-               } while (u64_stats_fetch_retry_irq(&stats->syncp, start));
+                       start = u64_stats_fetch_begin_irq(&tx_stats->syncp);
+                       packets = tx_stats->packets;
+                       bytes = tx_stats->bytes;
+               } while (u64_stats_fetch_retry_irq(&tx_stats->syncp, start));
 
                st->tx_packets += packets;
                st->tx_bytes += bytes;
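The fetch_begin/fetch_retry loops pair with writer-side update_begin/update_end calls on the datapath; the seqcount guarantees a torn-free 64-bit snapshot on 32-bit SMP kernels, while on 64-bit builds the helpers compile away. Writer-side counterpart, sketched for the TX stats:

/* Datapath side of the snapshot loops above (illustrative sketch). */
static void mana_txq_stats_add(struct mana_stats_tx *tx_stats,
			       u64 packets, u64 bytes)
{
	u64_stats_update_begin(&tx_stats->syncp);
	tx_stats->packets += packets;
	tx_stats->bytes += bytes;
	u64_stats_update_end(&tx_stats->syncp);
}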
@@ -986,7 +987,7 @@ static struct sk_buff *mana_build_skb(void *buf_va, uint pkt_len,
 static void mana_rx_skb(void *buf_va, struct mana_rxcomp_oob *cqe,
                        struct mana_rxq *rxq)
 {
-       struct mana_stats *rx_stats = &rxq->stats;
+       struct mana_stats_rx *rx_stats = &rxq->stats;
        struct net_device *ndev = rxq->ndev;
        uint pkt_len = cqe->ppi[0].pkt_len;
        u16 rxq_idx = rxq->rxq_idx;
@@ -1007,7 +1008,7 @@ static void mana_rx_skb(void *buf_va, struct mana_rxcomp_oob *cqe,
        act = mana_run_xdp(ndev, rxq, &xdp, buf_va, pkt_len);
 
        if (act != XDP_PASS && act != XDP_TX)
-               goto drop;
+               goto drop_xdp;
 
        skb = mana_build_skb(buf_va, pkt_len, &xdp);
 
@@ -1034,6 +1035,14 @@ static void mana_rx_skb(void *buf_va, struct mana_rxcomp_oob *cqe,
                        skb_set_hash(skb, hash_value, PKT_HASH_TYPE_L3);
        }
 
+       u64_stats_update_begin(&rx_stats->syncp);
+       rx_stats->packets++;
+       rx_stats->bytes += pkt_len;
+
+       if (act == XDP_TX)
+               rx_stats->xdp_tx++;
+       u64_stats_update_end(&rx_stats->syncp);
+
        if (act == XDP_TX) {
                skb_set_queue_mapping(skb, rxq_idx);
                mana_xdp_tx(skb, ndev);
@@ -1042,15 +1051,19 @@ static void mana_rx_skb(void *buf_va, struct mana_rxcomp_oob *cqe,
 
        napi_gro_receive(napi, skb);
 
+       return;
+
+drop_xdp:
        u64_stats_update_begin(&rx_stats->syncp);
-       rx_stats->packets++;
-       rx_stats->bytes += pkt_len;
+       rx_stats->xdp_drop++;
        u64_stats_update_end(&rx_stats->syncp);
-       return;
 
 drop:
-       free_page((unsigned long)buf_va);
+       WARN_ON_ONCE(rxq->xdp_save_page);
+       rxq->xdp_save_page = virt_to_page(buf_va);
+
        ++ndev->stats.rx_dropped;
+
        return;
 }
 
@@ -1072,8 +1085,10 @@ static void mana_process_rx_cqe(struct mana_rxq *rxq, struct mana_cq *cq,
                break;
 
        case CQE_RX_TRUNCATED:
-               netdev_err(ndev, "Dropped a truncated packet\n");
-               return;
+               ++ndev->stats.rx_dropped;
+               rxbuf_oob = &rxq->rx_oobs[rxq->buf_index];
+               netdev_warn_once(ndev, "Dropped a truncated packet\n");
+               goto drop;
 
        case CQE_RX_COALESCED_4:
                netdev_err(ndev, "RX coalescing is unsupported\n");
@@ -1089,9 +1104,6 @@ static void mana_process_rx_cqe(struct mana_rxq *rxq, struct mana_cq *cq,
                return;
        }
 
-       if (oob->cqe_hdr.cqe_type != CQE_RX_OKAY)
-               return;
-
        pktlen = oob->ppi[0].pkt_len;
 
        if (pktlen == 0) {
@@ -1105,7 +1117,13 @@ static void mana_process_rx_cqe(struct mana_rxq *rxq, struct mana_cq *cq,
        rxbuf_oob = &rxq->rx_oobs[curr];
        WARN_ON_ONCE(rxbuf_oob->wqe_inf.wqe_size_in_bu != 1);
 
-       new_page = alloc_page(GFP_ATOMIC);
+       /* Reuse XDP dropped page if available */
+       if (rxq->xdp_save_page) {
+               new_page = rxq->xdp_save_page;
+               rxq->xdp_save_page = NULL;
+       } else {
+               new_page = alloc_page(GFP_ATOMIC);
+       }
 
        if (new_page) {
                da = dma_map_page(dev, new_page, XDP_PACKET_HEADROOM, rxq->datasize,
@@ -1135,6 +1153,7 @@ static void mana_process_rx_cqe(struct mana_rxq *rxq, struct mana_cq *cq,
 
        mana_rx_skb(old_buf, oob, rxq);
 
+drop:
        mana_move_wq_tail(rxq->gdma_rq, rxbuf_oob->wqe_inf.wqe_size_in_bu);
 
        mana_post_pkt_rxq(rxq);
@@ -1392,6 +1411,9 @@ static void mana_destroy_rxq(struct mana_port_context *apc,
 
        mana_deinit_cq(apc, &rxq->rx_cq);
 
+       if (rxq->xdp_save_page)
+               __free_page(rxq->xdp_save_page);
+
        for (i = 0; i < rxq->num_rx_buf; i++) {
                rx_oob = &rxq->rx_oobs[i];
 
index c3c81ae..e13f245 100644 (file)
@@ -23,7 +23,7 @@ static int mana_get_sset_count(struct net_device *ndev, int stringset)
        if (stringset != ETH_SS_STATS)
                return -EINVAL;
 
-       return ARRAY_SIZE(mana_eth_stats) + num_queues * 4;
+       return ARRAY_SIZE(mana_eth_stats) + num_queues * 6;
 }
 
 static void mana_get_strings(struct net_device *ndev, u32 stringset, u8 *data)
@@ -46,6 +46,10 @@ static void mana_get_strings(struct net_device *ndev, u32 stringset, u8 *data)
                p += ETH_GSTRING_LEN;
                sprintf(p, "rx_%d_bytes", i);
                p += ETH_GSTRING_LEN;
+               sprintf(p, "rx_%d_xdp_drop", i);
+               p += ETH_GSTRING_LEN;
+               sprintf(p, "rx_%d_xdp_tx", i);
+               p += ETH_GSTRING_LEN;
        }
 
        for (i = 0; i < num_queues; i++) {
@@ -62,9 +66,12 @@ static void mana_get_ethtool_stats(struct net_device *ndev,
        struct mana_port_context *apc = netdev_priv(ndev);
        unsigned int num_queues = apc->num_queues;
        void *eth_stats = &apc->eth_stats;
-       struct mana_stats *stats;
+       struct mana_stats_rx *rx_stats;
+       struct mana_stats_tx *tx_stats;
        unsigned int start;
        u64 packets, bytes;
+       u64 xdp_drop;
+       u64 xdp_tx;
        int q, i = 0;
 
        if (!apc->port_is_up)
@@ -74,26 +81,30 @@ static void mana_get_ethtool_stats(struct net_device *ndev,
                data[i++] = *(u64 *)(eth_stats + mana_eth_stats[q].offset);
 
        for (q = 0; q < num_queues; q++) {
-               stats = &apc->rxqs[q]->stats;
+               rx_stats = &apc->rxqs[q]->stats;
 
                do {
-                       start = u64_stats_fetch_begin_irq(&stats->syncp);
-                       packets = stats->packets;
-                       bytes = stats->bytes;
-               } while (u64_stats_fetch_retry_irq(&stats->syncp, start));
+                       start = u64_stats_fetch_begin_irq(&rx_stats->syncp);
+                       packets = rx_stats->packets;
+                       bytes = rx_stats->bytes;
+                       xdp_drop = rx_stats->xdp_drop;
+                       xdp_tx = rx_stats->xdp_tx;
+               } while (u64_stats_fetch_retry_irq(&rx_stats->syncp, start));
 
                data[i++] = packets;
                data[i++] = bytes;
+               data[i++] = xdp_drop;
+               data[i++] = xdp_tx;
        }
 
        for (q = 0; q < num_queues; q++) {
-               stats = &apc->tx_qp[q].txq.stats;
+               tx_stats = &apc->tx_qp[q].txq.stats;
 
                do {
-                       start = u64_stats_fetch_begin_irq(&stats->syncp);
-                       packets = stats->packets;
-                       bytes = stats->bytes;
-               } while (u64_stats_fetch_retry_irq(&stats->syncp, start));
+                       start = u64_stats_fetch_begin_irq(&tx_stats->syncp);
+                       packets = tx_stats->packets;
+                       bytes = tx_stats->bytes;
+               } while (u64_stats_fetch_retry_irq(&tx_stats->syncp, start));
 
                data[i++] = packets;
                data[i++] = bytes;
index 0a326e0..cd50db7 100644 (file)
@@ -356,7 +356,7 @@ __nfp_tun_add_route_to_cache(struct list_head *route_list,
                        return 0;
                }
 
-       entry = kmalloc(sizeof(*entry) + add_len, GFP_ATOMIC);
+       entry = kmalloc(struct_size(entry, ip_add, add_len), GFP_ATOMIC);
        if (!entry) {
                spin_unlock_bh(list_lock);
                return -ENOMEM;
index 3d61a8c..50007cc 100644 (file)
@@ -1,8 +1,7 @@
 /* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */
 /* Copyright (C) 2015-2018 Netronome Systems, Inc. */
 
-/*
- * nfp_net_ctrl.h
+/* nfp_net_ctrl.h
  * Netronome network device driver: Control BAR layout
  * Authors: Jakub Kicinski <jakub.kicinski@netronome.com>
  *          Jason McMullan <jason.mcmullan@netronome.com>
 
 #include <linux/types.h>
 
-/**
- * Configuration BAR size.
+/* Configuration BAR size.
  *
  * The configuration BAR is 8K in size, but due to
  * THB-350, 32k needs to be reserved.
  */
 #define NFP_NET_CFG_BAR_SZ             (32 * 1024)
 
-/**
- * Offset in Freelist buffer where packet starts on RX
- */
+/* Offset in Freelist buffer where packet starts on RX */
 #define NFP_NET_RX_OFFSET              32
 
-/**
- * LSO parameters
+/* LSO parameters
  * %NFP_NET_LSO_MAX_HDR_SZ:    Maximum header size supported for LSO frames
  * %NFP_NET_LSO_MAX_SEGS:      Maximum number of segments LSO frame can produce
  */
 #define NFP_NET_LSO_MAX_HDR_SZ         255
 #define NFP_NET_LSO_MAX_SEGS           64
 
-/**
- * Prepend field types
- */
+/* Prepend field types */
 #define NFP_NET_META_FIELD_SIZE                4
 #define NFP_NET_META_HASH              1 /* next field carries hash type */
 #define NFP_NET_META_MARK              2
@@ -49,9 +42,7 @@
 
 #define NFP_META_PORT_ID_CTRL          ~0U
 
-/**
- * Hash type pre-pended when a RSS hash was computed
- */
+/* Hash type pre-pended when a RSS hash was computed */
 #define NFP_NET_RSS_NONE               0
 #define NFP_NET_RSS_IPV4               1
 #define NFP_NET_RSS_IPV6               2
 #define NFP_NET_RSS_IPV6_UDP           8
 #define NFP_NET_RSS_IPV6_EX_UDP                9
 
-/**
- * Ring counts
+/* Ring counts
  * %NFP_NET_TXR_MAX:        Maximum number of TX rings
  * %NFP_NET_RXR_MAX:        Maximum number of RX rings
  */
 #define NFP_NET_TXR_MAX                        64
 #define NFP_NET_RXR_MAX                        64
 
-/**
- * Read/Write config words (0x0000 - 0x002c)
+/* Read/Write config words (0x0000 - 0x002c)
  * %NFP_NET_CFG_CTRL:       Global control
  * %NFP_NET_CFG_UPDATE:      Indicate which fields are updated
  * %NFP_NET_CFG_TXRS_ENABLE: Bitmask of enabled TX rings
 #define NFP_NET_CFG_LSC                        0x0020
 #define NFP_NET_CFG_MACADDR            0x0024
 
-/**
- * Read-only words (0x0030 - 0x0050):
+/* Read-only words (0x0030 - 0x0050):
  * %NFP_NET_CFG_VERSION:     Firmware version number
  * %NFP_NET_CFG_STS:        Status
  * %NFP_NET_CFG_CAP:        Capabilities (same bits as %NFP_NET_CFG_CTRL)
 #define NFP_NET_CFG_START_TXQ          0x0048
 #define NFP_NET_CFG_START_RXQ          0x004c
 
-/**
- * Prepend configuration
+/* Prepend configuration
  */
 #define NFP_NET_CFG_RX_OFFSET          0x0050
 #define NFP_NET_CFG_RX_OFFSET_DYNAMIC          0       /* Prepend mode */
 
-/**
- * RSS capabilities
+/* RSS capabilities
  * %NFP_NET_CFG_RSS_CAP_HFUNC: supported hash functions (same bits as
  *                             %NFP_NET_CFG_RSS_HFUNC)
  */
 #define NFP_NET_CFG_RSS_CAP            0x0054
 #define   NFP_NET_CFG_RSS_CAP_HFUNC      0xff000000
 
-/**
- * TLV area start
+/* TLV area start
  * %NFP_NET_CFG_TLV_BASE:      start anchor of the TLV area
  */
 #define NFP_NET_CFG_TLV_BASE           0x0058
 
-/**
- * VXLAN/UDP encap configuration
+/* VXLAN/UDP encap configuration
  * %NFP_NET_CFG_VXLAN_PORT:    Base address of table of tunnels' UDP dst ports
  * %NFP_NET_CFG_VXLAN_SZ:      Size of the UDP port table in bytes
  */
 #define NFP_NET_CFG_VXLAN_PORT         0x0060
 #define NFP_NET_CFG_VXLAN_SZ             0x0008
 
-/**
- * BPF section
+/* BPF section
  * %NFP_NET_CFG_BPF_ABI:       BPF ABI version
  * %NFP_NET_CFG_BPF_CAP:       BPF capabilities
  * %NFP_NET_CFG_BPF_MAX_LEN:   Maximum size of JITed BPF code in bytes
 #define   NFP_NET_CFG_BPF_CFG_MASK     7ULL
 #define   NFP_NET_CFG_BPF_ADDR_MASK    (~NFP_NET_CFG_BPF_CFG_MASK)
 
-/**
- * 40B reserved for future use (0x0098 - 0x00c0)
+/* 40B reserved for future use (0x0098 - 0x00c0)
  */
 #define NFP_NET_CFG_RESERVED           0x0098
 #define NFP_NET_CFG_RESERVED_SZ                0x0028
 
-/**
- * RSS configuration (0x0100 - 0x01ac):
+/* RSS configuration (0x0100 - 0x01ac):
  * Used only when NFP_NET_CFG_CTRL_RSS is enabled
  * %NFP_NET_CFG_RSS_CFG:     RSS configuration word
  * %NFP_NET_CFG_RSS_KEY:     RSS "secret" key
                                         NFP_NET_CFG_RSS_KEY_SZ)
 #define NFP_NET_CFG_RSS_ITBL_SZ                0x80
 
-/**
- * TX ring configuration (0x200 - 0x800)
+/* TX ring configuration (0x200 - 0x800)
  * %NFP_NET_CFG_TXR_BASE:    Base offset for TX ring configuration
  * %NFP_NET_CFG_TXR_ADDR:    Per TX ring DMA address (8B entries)
  * %NFP_NET_CFG_TXR_WB_ADDR: Per TX ring write back DMA address (8B entries)
 #define NFP_NET_CFG_TXR_IRQ_MOD(_x)    (NFP_NET_CFG_TXR_BASE + 0x500 + \
                                         ((_x) * 0x4))
 
-/**
- * RX ring configuration (0x0800 - 0x0c00)
+/* RX ring configuration (0x0800 - 0x0c00)
  * %NFP_NET_CFG_RXR_BASE:    Base offset for RX ring configuration
  * %NFP_NET_CFG_RXR_ADDR:    Per RX ring DMA address (8B entries)
  * %NFP_NET_CFG_RXR_SZ:      Per RX ring ring size (1B entries)
 #define NFP_NET_CFG_RXR_IRQ_MOD(_x)    (NFP_NET_CFG_RXR_BASE + 0x300 + \
                                         ((_x) * 0x4))
 
-/**
- * Interrupt Control/Cause registers (0x0c00 - 0x0d00)
+/* Interrupt Control/Cause registers (0x0c00 - 0x0d00)
  * These registers are only used when MSI-X auto-masking is not
  * enabled (%NFP_NET_CFG_CTRL_MSIXAUTO not set).  The array is index
  * by MSI-X entry and are 1B in size.  If an entry is zero, the
 #define   NFP_NET_CFG_ICR_RXTX         0x1
 #define   NFP_NET_CFG_ICR_LSC          0x2
 
-/**
- * General device stats (0x0d00 - 0x0d90)
+/* General device stats (0x0d00 - 0x0d90)
  * all counters are 64bit.
  */
 #define NFP_NET_CFG_STATS_BASE         0x0d00
 #define NFP_NET_CFG_STATS_APP3_FRAMES  (NFP_NET_CFG_STATS_BASE + 0xc0)
 #define NFP_NET_CFG_STATS_APP3_BYTES   (NFP_NET_CFG_STATS_BASE + 0xc8)
 
-/**
- * Per ring stats (0x1000 - 0x1800)
+/* Per ring stats (0x1000 - 0x1800)
  * options, 64bit per entry
  * %NFP_NET_CFG_TXR_STATS:   TX ring statistics (Packet and Byte count)
  * %NFP_NET_CFG_RXR_STATS:   RX ring statistics (Packet and Byte count)
 #define NFP_NET_CFG_RXR_STATS(_x)      (NFP_NET_CFG_RXR_STATS_BASE + \
                                         ((_x) * 0x10))
 
-/**
- * General use mailbox area (0x1800 - 0x19ff)
+/* General use mailbox area (0x1800 - 0x19ff)
  * 4B used for update command and 4B return code
  * followed by a max of 504B of variable length value
  */
 #define NFP_NET_CFG_MBOX_CMD_PCI_DSCP_PRIOMAP_SET      5
 #define NFP_NET_CFG_MBOX_CMD_TLV_CMSG                  6
 
-/**
- * VLAN filtering using general use mailbox
+/* VLAN filtering using general use mailbox
  * %NFP_NET_CFG_VLAN_FILTER:           Base address of VLAN filter mailbox
  * %NFP_NET_CFG_VLAN_FILTER_VID:       VLAN ID to filter
  * %NFP_NET_CFG_VLAN_FILTER_PROTO:     VLAN proto to filter
 #define  NFP_NET_CFG_VLAN_FILTER_PROTO  (NFP_NET_CFG_VLAN_FILTER + 2)
 #define NFP_NET_CFG_VLAN_FILTER_SZ      0x0004
 
-/**
- * TLV capabilities
+/* TLV capabilities
  * %NFP_NET_CFG_TLV_TYPE:      Offset of type within the TLV
  * %NFP_NET_CFG_TLV_TYPE_REQUIRED: Driver must be able to parse the TLV
  * %NFP_NET_CFG_TLV_LENGTH:    Offset of length within the TLV
 #define NFP_NET_CFG_TLV_HEADER_TYPE    0x7fff0000
 #define NFP_NET_CFG_TLV_HEADER_LENGTH  0x0000ffff
 
-/**
- * Capability TLV types
+/* Capability TLV types
  *
  * %NFP_NET_CFG_TLV_TYPE_UNKNOWN:
  * Special TLV type to catch bugs, should never be encountered.  Drivers should
 
 struct device;
 
-/**
- * struct nfp_net_tlv_caps - parsed control BAR TLV capabilities
+/* struct nfp_net_tlv_caps - parsed control BAR TLV capabilities
  * @me_freq_mhz:       ME clock_freq (MHz)
  * @mbox_off:          vNIC mailbox area offset
  * @mbox_len:          vNIC mailbox area length
index a3db0cb..786be58 100644 (file)
@@ -4,8 +4,7 @@
 #ifndef _NFP_NET_SRIOV_H_
 #define _NFP_NET_SRIOV_H_
 
-/**
- * SRIOV VF configuration.
+/* SRIOV VF configuration.
  * The configuration memory begins with a mailbox region for communication with
  * the firmware followed by individual VF entries.
  */
index ae4da18..df316b9 100644 (file)
@@ -132,8 +132,7 @@ void nfp_devlink_port_unregister(struct nfp_port *port);
 void nfp_devlink_port_type_eth_set(struct nfp_port *port);
 void nfp_devlink_port_type_clear(struct nfp_port *port);
 
-/**
- * Mac stats (0x0000 - 0x0200)
+/* Mac stats (0x0000 - 0x0200)
  * all counters are 64bit.
  */
 #define NFP_MAC_STATS_BASE                0x0000
index 10e7d8b..730fea2 100644 (file)
@@ -513,7 +513,7 @@ nfp_nsp_command_buf_dma_sg(struct nfp_nsp *nsp,
        dma_size = BIT_ULL(dma_order);
        nseg = DIV_ROUND_UP(max_size, chunk_size);
 
-       chunks = kzalloc(array_size(sizeof(*chunks), nseg), GFP_KERNEL);
+       chunks = kcalloc(nseg, sizeof(*chunks), GFP_KERNEL);
        if (!chunks)
                return -ENOMEM;
 
index 5e25411..602f4d4 100644 (file)
@@ -18,7 +18,7 @@ struct ionic_lif;
 #define PCI_DEVICE_ID_PENSANDO_IONIC_ETH_PF    0x1002
 #define PCI_DEVICE_ID_PENSANDO_IONIC_ETH_VF    0x1003
 
-#define DEVCMD_TIMEOUT  10
+#define DEVCMD_TIMEOUT                 5
 #define IONIC_ADMINQ_TIME_SLICE                msecs_to_jiffies(100)
 
 #define IONIC_PHC_UPDATE_NS    10000000000         /* 10s in nanoseconds */
@@ -78,6 +78,9 @@ void ionic_adminq_netdev_err_print(struct ionic_lif *lif, u8 opcode,
                                   u8 status, int err);
 
 int ionic_dev_cmd_wait(struct ionic *ionic, unsigned long max_wait);
+int ionic_dev_cmd_wait_nomsg(struct ionic *ionic, unsigned long max_wait);
+void ionic_dev_cmd_dev_err_print(struct ionic *ionic, u8 opcode, u8 status,
+                                int err);
 int ionic_set_dma_mask(struct ionic *ionic);
 int ionic_setup(struct ionic *ionic);
 
@@ -89,4 +92,6 @@ int ionic_port_identify(struct ionic *ionic);
 int ionic_port_init(struct ionic *ionic);
 int ionic_port_reset(struct ionic *ionic);
 
+const char *ionic_vf_attr_to_str(enum ionic_vf_attr attr);
+
 #endif /* _IONIC_H_ */
index 7e296fa..6ffc62c 100644 (file)
@@ -109,8 +109,8 @@ void ionic_bus_unmap_dbpage(struct ionic *ionic, void __iomem *page)
 
 static void ionic_vf_dealloc_locked(struct ionic *ionic)
 {
+       struct ionic_vf_setattr_cmd vfc = { .attr = IONIC_VF_ATTR_STATSADDR };
        struct ionic_vf *v;
-       dma_addr_t dma = 0;
        int i;
 
        if (!ionic->vfs)
@@ -120,9 +120,8 @@ static void ionic_vf_dealloc_locked(struct ionic *ionic)
                v = &ionic->vfs[i];
 
                if (v->stats_pa) {
-                       (void)ionic_set_vf_config(ionic, i,
-                                                 IONIC_VF_ATTR_STATSADDR,
-                                                 (u8 *)&dma);
+                       vfc.stats_pa = 0;
+                       (void)ionic_set_vf_config(ionic, i, &vfc);
                        dma_unmap_single(ionic->dev, v->stats_pa,
                                         sizeof(v->stats), DMA_FROM_DEVICE);
                        v->stats_pa = 0;
@@ -143,6 +142,7 @@ static void ionic_vf_dealloc(struct ionic *ionic)
 
 static int ionic_vf_alloc(struct ionic *ionic, int num_vfs)
 {
+       struct ionic_vf_setattr_cmd vfc = { .attr = IONIC_VF_ATTR_STATSADDR };
        struct ionic_vf *v;
        int err = 0;
        int i;
@@ -166,9 +166,10 @@ static int ionic_vf_alloc(struct ionic *ionic, int num_vfs)
                }
 
                ionic->num_vfs++;
+
                /* ignore failures from older FW, we just won't get stats */
-               (void)ionic_set_vf_config(ionic, i, IONIC_VF_ATTR_STATSADDR,
-                                         (u8 *)&v->stats_pa);
+               vfc.stats_pa = cpu_to_le64(v->stats_pa);
+               (void)ionic_set_vf_config(ionic, i, &vfc);
        }
 
 out:
@@ -331,6 +332,9 @@ static int ionic_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
                goto err_out_deregister_lifs;
        }
 
+       mod_timer(&ionic->watchdog_timer,
+                 round_jiffies(jiffies + ionic->watchdog_period));
+
        return 0;
 
 err_out_deregister_lifs:
@@ -348,7 +352,6 @@ err_out_port_reset:
 err_out_reset:
        ionic_reset(ionic);
 err_out_teardown:
-       del_timer_sync(&ionic->watchdog_timer);
        pci_clear_master(pdev);
        /* Don't fail the probe for these errors, keep
         * the hw interface around for inspection
index d57e80d..52a1b5c 100644 (file)
@@ -33,7 +33,8 @@ static void ionic_watchdog_cb(struct timer_list *t)
            !test_bit(IONIC_LIF_F_FW_RESET, lif->state))
                ionic_link_status_check_request(lif, CAN_NOT_SLEEP);
 
-       if (test_bit(IONIC_LIF_F_FILTER_SYNC_NEEDED, lif->state)) {
+       if (test_bit(IONIC_LIF_F_FILTER_SYNC_NEEDED, lif->state) &&
+           !test_bit(IONIC_LIF_F_FW_RESET, lif->state)) {
                work = kzalloc(sizeof(*work), GFP_ATOMIC);
                if (!work) {
                        netdev_err(lif->netdev, "rxmode change dropped\n");
@@ -46,6 +47,24 @@ static void ionic_watchdog_cb(struct timer_list *t)
        }
 }
 
+static void ionic_watchdog_init(struct ionic *ionic)
+{
+       struct ionic_dev *idev = &ionic->idev;
+
+       timer_setup(&ionic->watchdog_timer, ionic_watchdog_cb, 0);
+       ionic->watchdog_period = IONIC_WATCHDOG_SECS * HZ;
+
+       /* set times to ensure the first check will proceed */
+       atomic_long_set(&idev->last_check_time, jiffies - 2 * HZ);
+       idev->last_hb_time = jiffies - 2 * ionic->watchdog_period;
+       /* init as ready, so no transition if the first check succeeds */
+       idev->last_fw_hb = 0;
+       idev->fw_hb_ready = true;
+       idev->fw_status_ready = true;
+       idev->fw_generation = IONIC_FW_STS_F_GENERATION &
+                             ioread8(&idev->dev_info_regs->fw_status);
+}
+
 void ionic_init_devinfo(struct ionic *ionic)
 {
        struct ionic_dev *idev = &ionic->idev;
@@ -109,21 +128,7 @@ int ionic_dev_setup(struct ionic *ionic)
                return -EFAULT;
        }
 
-       timer_setup(&ionic->watchdog_timer, ionic_watchdog_cb, 0);
-       ionic->watchdog_period = IONIC_WATCHDOG_SECS * HZ;
-
-       /* set times to ensure the first check will proceed */
-       atomic_long_set(&idev->last_check_time, jiffies - 2 * HZ);
-       idev->last_hb_time = jiffies - 2 * ionic->watchdog_period;
-       /* init as ready, so no transition if the first check succeeds */
-       idev->last_fw_hb = 0;
-       idev->fw_hb_ready = true;
-       idev->fw_status_ready = true;
-       idev->fw_generation = IONIC_FW_STS_F_GENERATION &
-                             ioread8(&idev->dev_info_regs->fw_status);
-
-       mod_timer(&ionic->watchdog_timer,
-                 round_jiffies(jiffies + ionic->watchdog_period));
+       ionic_watchdog_init(ionic);
 
        idev->db_pages = bar->vaddr;
        idev->phy_db_pages = bar->bus_addr;
@@ -132,10 +137,21 @@ int ionic_dev_setup(struct ionic *ionic)
 }
 
 /* Devcmd Interface */
+bool ionic_is_fw_running(struct ionic_dev *idev)
+{
+       u8 fw_status = ioread8(&idev->dev_info_regs->fw_status);
+
+       /* firmware is useful only if the running bit is set and
+        * fw_status != 0xff (bad PCI read)
+        */
+       return (fw_status != 0xff) && (fw_status & IONIC_FW_STS_F_RUNNING);
+}
+
 int ionic_heartbeat_check(struct ionic *ionic)
 {
-       struct ionic_dev *idev = &ionic->idev;
        unsigned long check_time, last_check_time;
+       struct ionic_dev *idev = &ionic->idev;
+       struct ionic_lif *lif = ionic->lif;
        bool fw_status_ready = true;
        bool fw_hb_ready;
        u8 fw_generation;
@@ -155,13 +171,10 @@ do_check_time:
                goto do_check_time;
        }
 
-       /* firmware is useful only if the running bit is set and
-        * fw_status != 0xff (bad PCI read)
-        * If fw_status is not ready don't bother with the generation.
-        */
        fw_status = ioread8(&idev->dev_info_regs->fw_status);
 
-       if (fw_status == 0xff || !(fw_status & IONIC_FW_STS_F_RUNNING)) {
+       /* If fw_status is not ready, don't bother with the generation */
+       if (!ionic_is_fw_running(idev)) {
                fw_status_ready = false;
        } else {
                fw_generation = fw_status & IONIC_FW_STS_F_GENERATION;
@@ -176,31 +189,41 @@ do_check_time:
                         * the down, the next watchdog will see the fw is up
                         * and the generation value stable, so will trigger
                         * the fw-up activity.
+                        *
+                        * If we had already moved to FW_RESET from a RESET event,
+                        * it is possible that we never saw the fw_status go to 0,
+                        * so we fake the current idev->fw_status_ready here to
+                        * force the transition and get FW up again.
                         */
-                       fw_status_ready = false;
+                       if (test_bit(IONIC_LIF_F_FW_RESET, lif->state))
+                               idev->fw_status_ready = false;  /* go to running */
+                       else
+                               fw_status_ready = false;        /* go to down */
                }
        }
 
        /* is this a transition? */
        if (fw_status_ready != idev->fw_status_ready) {
-               struct ionic_lif *lif = ionic->lif;
                bool trigger = false;
 
-               idev->fw_status_ready = fw_status_ready;
-
-               if (!fw_status_ready) {
-                       dev_info(ionic->dev, "FW stopped %u\n", fw_status);
-                       if (lif && !test_bit(IONIC_LIF_F_FW_RESET, lif->state))
-                               trigger = true;
-               } else {
-                       dev_info(ionic->dev, "FW running %u\n", fw_status);
-                       if (lif && test_bit(IONIC_LIF_F_FW_RESET, lif->state))
-                               trigger = true;
+               if (!fw_status_ready && lif &&
+                   !test_bit(IONIC_LIF_F_FW_RESET, lif->state) &&
+                   !test_and_set_bit(IONIC_LIF_F_FW_STOPPING, lif->state)) {
+                       dev_info(ionic->dev, "FW stopped 0x%02x\n", fw_status);
+                       trigger = true;
+
+               } else if (fw_status_ready && lif &&
+                          test_bit(IONIC_LIF_F_FW_RESET, lif->state) &&
+                          !test_bit(IONIC_LIF_F_FW_STOPPING, lif->state)) {
+                       dev_info(ionic->dev, "FW running 0x%02x\n", fw_status);
+                       trigger = true;
                }
 
                if (trigger) {
                        struct ionic_deferred_work *work;
 
+                       idev->fw_status_ready = fw_status_ready;
+
                        work = kzalloc(sizeof(*work), GFP_ATOMIC);
                        if (work) {
                                work->type = IONIC_DW_TYPE_LIF_RESET;
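
Switching the down-transition to test_and_set_bit() makes it a one-shot:
whichever path notices the stop first (this heartbeat check or the NotifyQ
RESET event later in the series) claims IONIC_LIF_F_FW_STOPPING, and every
other observer sees the bit already set and backs off. The idiom in
isolation (queue_fw_down_work() is a hypothetical helper):

	/* sketch: atomically claim the right to report this event once */
	if (!test_and_set_bit(IONIC_LIF_F_FW_STOPPING, lif->state))
		queue_fw_down_work(lif);	/* only one caller gets here */
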
@@ -210,12 +233,14 @@ do_check_time:
                }
        }
 
-       if (!fw_status_ready)
+       if (!idev->fw_status_ready)
                return -ENXIO;
 
-       /* wait at least one watchdog period since the last heartbeat */
+       /* Because of some variability in the actual FW heartbeat, we
+        * wait longer than the DEVCMD_TIMEOUT before checking again.
+        */
        last_check_time = idev->last_hb_time;
-       if (time_before(check_time, last_check_time + ionic->watchdog_period))
+       if (time_before(check_time, last_check_time + DEVCMD_TIMEOUT * 2 * HZ))
                return 0;
 
        fw_hb = ioread32(&idev->dev_info_regs->fw_heartbeat);
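
The heartbeat sampling interval is now derived from the device-command
timeout instead of the watchdog period. Note that jiffies arithmetic has to
go through time_before()/time_after() so the comparison stays correct
across counter wraparound; the pattern in isolation:

	/* sketch: wraparound-safe "has the interval elapsed?" test */
	unsigned long deadline = idev->last_hb_time + 2 * DEVCMD_TIMEOUT * HZ;

	if (time_before(jiffies, deadline))
		return 0;	/* too soon to sample the FW heartbeat again */
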
@@ -392,60 +417,63 @@ void ionic_dev_cmd_port_pause(struct ionic_dev *idev, u8 pause_type)
 }
 
 /* VF commands */
-int ionic_set_vf_config(struct ionic *ionic, int vf, u8 attr, u8 *data)
+int ionic_set_vf_config(struct ionic *ionic, int vf,
+                       struct ionic_vf_setattr_cmd *vfc)
 {
        union ionic_dev_cmd cmd = {
                .vf_setattr.opcode = IONIC_CMD_VF_SETATTR,
-               .vf_setattr.attr = attr,
+               .vf_setattr.attr = vfc->attr,
                .vf_setattr.vf_index = cpu_to_le16(vf),
        };
        int err;
 
+       memcpy(cmd.vf_setattr.pad, vfc->pad, sizeof(vfc->pad));
+
+       mutex_lock(&ionic->dev_cmd_lock);
+       ionic_dev_cmd_go(&ionic->idev, &cmd);
+       err = ionic_dev_cmd_wait(ionic, DEVCMD_TIMEOUT);
+       mutex_unlock(&ionic->dev_cmd_lock);
+
+       return err;
+}
+
+int ionic_dev_cmd_vf_getattr(struct ionic *ionic, int vf, u8 attr,
+                            struct ionic_vf_getattr_comp *comp)
+{
+       union ionic_dev_cmd cmd = {
+               .vf_getattr.opcode = IONIC_CMD_VF_GETATTR,
+               .vf_getattr.attr = attr,
+               .vf_getattr.vf_index = cpu_to_le16(vf),
+       };
+       int err;
+
+       if (vf >= ionic->num_vfs)
+               return -EINVAL;
+
        switch (attr) {
        case IONIC_VF_ATTR_SPOOFCHK:
-               cmd.vf_setattr.spoofchk = *data;
-               dev_dbg(ionic->dev, "%s: vf %d spoof %d\n",
-                       __func__, vf, *data);
-               break;
        case IONIC_VF_ATTR_TRUST:
-               cmd.vf_setattr.trust = *data;
-               dev_dbg(ionic->dev, "%s: vf %d trust %d\n",
-                       __func__, vf, *data);
-               break;
        case IONIC_VF_ATTR_LINKSTATE:
-               cmd.vf_setattr.linkstate = *data;
-               dev_dbg(ionic->dev, "%s: vf %d linkstate %d\n",
-                       __func__, vf, *data);
-               break;
        case IONIC_VF_ATTR_MAC:
-               ether_addr_copy(cmd.vf_setattr.macaddr, data);
-               dev_dbg(ionic->dev, "%s: vf %d macaddr %pM\n",
-                       __func__, vf, data);
-               break;
        case IONIC_VF_ATTR_VLAN:
-               cmd.vf_setattr.vlanid = cpu_to_le16(*(u16 *)data);
-               dev_dbg(ionic->dev, "%s: vf %d vlan %d\n",
-                       __func__, vf, *(u16 *)data);
-               break;
        case IONIC_VF_ATTR_RATE:
-               cmd.vf_setattr.maxrate = cpu_to_le32(*(u32 *)data);
-               dev_dbg(ionic->dev, "%s: vf %d maxrate %d\n",
-                       __func__, vf, *(u32 *)data);
                break;
        case IONIC_VF_ATTR_STATSADDR:
-               cmd.vf_setattr.stats_pa = cpu_to_le64(*(u64 *)data);
-               dev_dbg(ionic->dev, "%s: vf %d stats_pa 0x%08llx\n",
-                       __func__, vf, *(u64 *)data);
-               break;
        default:
                return -EINVAL;
        }
 
        mutex_lock(&ionic->dev_cmd_lock);
        ionic_dev_cmd_go(&ionic->idev, &cmd);
-       err = ionic_dev_cmd_wait(ionic, DEVCMD_TIMEOUT);
+       err = ionic_dev_cmd_wait_nomsg(ionic, DEVCMD_TIMEOUT);
+       memcpy_fromio(comp, &ionic->idev.dev_cmd_regs->comp.vf_getattr,
+                     sizeof(*comp));
        mutex_unlock(&ionic->dev_cmd_lock);
 
+       if (err && comp->status != IONIC_RC_ENOSUPP)
+               ionic_dev_cmd_dev_err_print(ionic, cmd.vf_getattr.opcode,
+                                           comp->status, err);
+
        return err;
 }
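
The getattr completion lives in BAR-mapped device memory, hence the
memcpy_fromio() above rather than a plain struct copy, and a completion
status of IONIC_RC_ENOSUPP is treated as "older firmware, attribute not
implemented" rather than a hard failure. A caller sketch under those
conventions (mac being a caller-owned buffer):

	/* sketch: query one VF attribute, tolerating older firmware */
	struct ionic_vf_getattr_comp comp = { 0 };
	int err;

	err = ionic_dev_cmd_vf_getattr(ionic, vf, IONIC_VF_ATTR_MAC, &comp);
	if (!err)
		ether_addr_copy(mac, comp.macaddr);
	else if (comp.status == IONIC_RC_ENOSUPP)
		err = 0;	/* attribute unknown: keep the cached value */
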
 
index e5acf3b..563c302 100644
@@ -318,7 +318,10 @@ void ionic_dev_cmd_port_autoneg(struct ionic_dev *idev, u8 an_enable);
 void ionic_dev_cmd_port_fec(struct ionic_dev *idev, u8 fec_type);
 void ionic_dev_cmd_port_pause(struct ionic_dev *idev, u8 pause_type);
 
-int ionic_set_vf_config(struct ionic *ionic, int vf, u8 attr, u8 *data);
+int ionic_set_vf_config(struct ionic *ionic, int vf,
+                       struct ionic_vf_setattr_cmd *vfc);
+int ionic_dev_cmd_vf_getattr(struct ionic *ionic, int vf, u8 attr,
+                            struct ionic_vf_getattr_comp *comp);
 void ionic_dev_cmd_queue_identify(struct ionic_dev *idev,
                                  u16 lif_type, u8 qtype, u8 qver);
 void ionic_dev_cmd_lif_identify(struct ionic_dev *idev, u8 type, u8 ver);
@@ -353,5 +356,6 @@ void ionic_q_rewind(struct ionic_queue *q, struct ionic_desc_info *start);
 void ionic_q_service(struct ionic_queue *q, struct ionic_cq_info *cq_info,
                     unsigned int stop_index);
 int ionic_heartbeat_check(struct ionic *ionic);
+bool ionic_is_fw_running(struct ionic_dev *idev);
 
 #endif /* _IONIC_DEV_H_ */
index 2ff7be1..542e395 100644
@@ -1112,12 +1112,17 @@ static bool ionic_notifyq_service(struct ionic_cq *cq,
                ionic_link_status_check_request(lif, CAN_NOT_SLEEP);
                break;
        case IONIC_EVENT_RESET:
-               work = kzalloc(sizeof(*work), GFP_ATOMIC);
-               if (!work) {
-                       netdev_err(lif->netdev, "Reset event dropped\n");
-               } else {
-                       work->type = IONIC_DW_TYPE_LIF_RESET;
-                       ionic_lif_deferred_enqueue(&lif->deferred, work);
+               if (lif->ionic->idev.fw_status_ready &&
+                   !test_bit(IONIC_LIF_F_FW_RESET, lif->state) &&
+                   !test_and_set_bit(IONIC_LIF_F_FW_STOPPING, lif->state)) {
+                       work = kzalloc(sizeof(*work), GFP_ATOMIC);
+                       if (!work) {
+                               netdev_err(lif->netdev, "Reset event dropped\n");
+                               clear_bit(IONIC_LIF_F_FW_STOPPING, lif->state);
+                       } else {
+                               work->type = IONIC_DW_TYPE_LIF_RESET;
+                               ionic_lif_deferred_enqueue(&lif->deferred, work);
+                       }
                }
                break;
        default:
@@ -1782,7 +1787,7 @@ static void ionic_lif_quiesce(struct ionic_lif *lif)
 
        err = ionic_adminq_post_wait(lif, &ctx);
        if (err)
-               netdev_err(lif->netdev, "lif quiesce failed %d\n", err);
+               netdev_dbg(lif->netdev, "lif quiesce failed %d\n", err);
 }
 
 static void ionic_txrx_disable(struct ionic_lif *lif)
@@ -2152,6 +2157,76 @@ static int ionic_eth_ioctl(struct net_device *netdev, struct ifreq *ifr, int cmd
        }
 }
 
+static int ionic_update_cached_vf_config(struct ionic *ionic, int vf)
+{
+       struct ionic_vf_getattr_comp comp = { 0 };
+       int err;
+       u8 attr;
+
+       attr = IONIC_VF_ATTR_VLAN;
+       err = ionic_dev_cmd_vf_getattr(ionic, vf, attr, &comp);
+       if (err && comp.status != IONIC_RC_ENOSUPP)
+               goto err_out;
+       if (!err)
+               ionic->vfs[vf].vlanid = comp.vlanid;
+
+       attr = IONIC_VF_ATTR_SPOOFCHK;
+       err = ionic_dev_cmd_vf_getattr(ionic, vf, attr, &comp);
+       if (err && comp.status != IONIC_RC_ENOSUPP)
+               goto err_out;
+       if (!err)
+               ionic->vfs[vf].spoofchk = comp.spoofchk;
+
+       attr = IONIC_VF_ATTR_LINKSTATE;
+       err = ionic_dev_cmd_vf_getattr(ionic, vf, attr, &comp);
+       if (err && comp.status != IONIC_RC_ENOSUPP)
+               goto err_out;
+       if (!err) {
+               switch (comp.linkstate) {
+               case IONIC_VF_LINK_STATUS_UP:
+                       ionic->vfs[vf].linkstate = IFLA_VF_LINK_STATE_ENABLE;
+                       break;
+               case IONIC_VF_LINK_STATUS_DOWN:
+                       ionic->vfs[vf].linkstate = IFLA_VF_LINK_STATE_DISABLE;
+                       break;
+               case IONIC_VF_LINK_STATUS_AUTO:
+                       ionic->vfs[vf].linkstate = IFLA_VF_LINK_STATE_AUTO;
+                       break;
+               default:
+                       dev_warn(ionic->dev, "Unexpected link state %u\n", comp.linkstate);
+                       break;
+               }
+       }
+
+       attr = IONIC_VF_ATTR_RATE;
+       err = ionic_dev_cmd_vf_getattr(ionic, vf, attr, &comp);
+       if (err && comp.status != IONIC_RC_ENOSUPP)
+               goto err_out;
+       if (!err)
+               ionic->vfs[vf].maxrate = comp.maxrate;
+
+       attr = IONIC_VF_ATTR_TRUST;
+       err = ionic_dev_cmd_vf_getattr(ionic, vf, attr, &comp);
+       if (err && comp.status != IONIC_RC_ENOSUPP)
+               goto err_out;
+       if (!err)
+               ionic->vfs[vf].trusted = comp.trust;
+
+       attr = IONIC_VF_ATTR_MAC;
+       err = ionic_dev_cmd_vf_getattr(ionic, vf, attr, &comp);
+       if (err && comp.status != IONIC_RC_ENOSUPP)
+               goto err_out;
+       if (!err)
+               ether_addr_copy(ionic->vfs[vf].macaddr, comp.macaddr);
+
+err_out:
+       if (err)
+               dev_err(ionic->dev, "Failed to get %s for VF %d\n",
+                       ionic_vf_attr_to_str(attr), vf);
+
+       return err;
+}
+
 static int ionic_get_vf_config(struct net_device *netdev,
                               int vf, struct ifla_vf_info *ivf)
 {
@@ -2167,14 +2242,18 @@ static int ionic_get_vf_config(struct net_device *netdev,
        if (vf >= pci_num_vf(ionic->pdev) || !ionic->vfs) {
                ret = -EINVAL;
        } else {
-               ivf->vf           = vf;
-               ivf->vlan         = le16_to_cpu(ionic->vfs[vf].vlanid);
-               ivf->qos          = 0;
-               ivf->spoofchk     = ionic->vfs[vf].spoofchk;
-               ivf->linkstate    = ionic->vfs[vf].linkstate;
-               ivf->max_tx_rate  = le32_to_cpu(ionic->vfs[vf].maxrate);
-               ivf->trusted      = ionic->vfs[vf].trusted;
-               ether_addr_copy(ivf->mac, ionic->vfs[vf].macaddr);
+               ivf->vf = vf;
+               ivf->qos = 0;
+
+               ret = ionic_update_cached_vf_config(ionic, vf);
+               if (!ret) {
+                       ivf->vlan         = le16_to_cpu(ionic->vfs[vf].vlanid);
+                       ivf->spoofchk     = ionic->vfs[vf].spoofchk;
+                       ivf->linkstate    = ionic->vfs[vf].linkstate;
+                       ivf->max_tx_rate  = le32_to_cpu(ionic->vfs[vf].maxrate);
+                       ivf->trusted      = ionic->vfs[vf].trusted;
+                       ether_addr_copy(ivf->mac, ionic->vfs[vf].macaddr);
+               }
        }
 
        up_read(&ionic->vf_op_lock);
@@ -2220,6 +2299,7 @@ static int ionic_get_vf_stats(struct net_device *netdev, int vf,
 
 static int ionic_set_vf_mac(struct net_device *netdev, int vf, u8 *mac)
 {
+       struct ionic_vf_setattr_cmd vfc = { .attr = IONIC_VF_ATTR_MAC };
        struct ionic_lif *lif = netdev_priv(netdev);
        struct ionic *ionic = lif->ionic;
        int ret;
@@ -2235,7 +2315,11 @@ static int ionic_set_vf_mac(struct net_device *netdev, int vf, u8 *mac)
        if (vf >= pci_num_vf(ionic->pdev) || !ionic->vfs) {
                ret = -EINVAL;
        } else {
-               ret = ionic_set_vf_config(ionic, vf, IONIC_VF_ATTR_MAC, mac);
+               ether_addr_copy(vfc.macaddr, mac);
+               dev_dbg(ionic->dev, "%s: vf %d macaddr %pM\n",
+                       __func__, vf, vfc.macaddr);
+
+               ret = ionic_set_vf_config(ionic, vf, &vfc);
                if (!ret)
                        ether_addr_copy(ionic->vfs[vf].macaddr, mac);
        }
@@ -2247,6 +2331,7 @@ static int ionic_set_vf_mac(struct net_device *netdev, int vf, u8 *mac)
 static int ionic_set_vf_vlan(struct net_device *netdev, int vf, u16 vlan,
                             u8 qos, __be16 proto)
 {
+       struct ionic_vf_setattr_cmd vfc = { .attr = IONIC_VF_ATTR_VLAN };
        struct ionic_lif *lif = netdev_priv(netdev);
        struct ionic *ionic = lif->ionic;
        int ret;
@@ -2269,8 +2354,11 @@ static int ionic_set_vf_vlan(struct net_device *netdev, int vf, u16 vlan,
        if (vf >= pci_num_vf(ionic->pdev) || !ionic->vfs) {
                ret = -EINVAL;
        } else {
-               ret = ionic_set_vf_config(ionic, vf,
-                                         IONIC_VF_ATTR_VLAN, (u8 *)&vlan);
+               vfc.vlanid = cpu_to_le16(vlan);
+               dev_dbg(ionic->dev, "%s: vf %d vlan %d\n",
+                       __func__, vf, le16_to_cpu(vfc.vlanid));
+
+               ret = ionic_set_vf_config(ionic, vf, &vfc);
                if (!ret)
                        ionic->vfs[vf].vlanid = cpu_to_le16(vlan);
        }
@@ -2282,6 +2370,7 @@ static int ionic_set_vf_vlan(struct net_device *netdev, int vf, u16 vlan,
 static int ionic_set_vf_rate(struct net_device *netdev, int vf,
                             int tx_min, int tx_max)
 {
+       struct ionic_vf_setattr_cmd vfc = { .attr = IONIC_VF_ATTR_RATE };
        struct ionic_lif *lif = netdev_priv(netdev);
        struct ionic *ionic = lif->ionic;
        int ret;
@@ -2298,8 +2387,11 @@ static int ionic_set_vf_rate(struct net_device *netdev, int vf,
        if (vf >= pci_num_vf(ionic->pdev) || !ionic->vfs) {
                ret = -EINVAL;
        } else {
-               ret = ionic_set_vf_config(ionic, vf,
-                                         IONIC_VF_ATTR_RATE, (u8 *)&tx_max);
+               vfc.maxrate = cpu_to_le32(tx_max);
+               dev_dbg(ionic->dev, "%s: vf %d maxrate %d\n",
+                       __func__, vf, le32_to_cpu(vfc.maxrate));
+
+               ret = ionic_set_vf_config(ionic, vf, &vfc);
                if (!ret)
                        lif->ionic->vfs[vf].maxrate = cpu_to_le32(tx_max);
        }
@@ -2310,9 +2402,9 @@ static int ionic_set_vf_rate(struct net_device *netdev, int vf,
 
 static int ionic_set_vf_spoofchk(struct net_device *netdev, int vf, bool set)
 {
+       struct ionic_vf_setattr_cmd vfc = { .attr = IONIC_VF_ATTR_SPOOFCHK };
        struct ionic_lif *lif = netdev_priv(netdev);
        struct ionic *ionic = lif->ionic;
-       u8 data = set;  /* convert to u8 for config */
        int ret;
 
        if (!netif_device_present(netdev))
@@ -2323,10 +2415,13 @@ static int ionic_set_vf_spoofchk(struct net_device *netdev, int vf, bool set)
        if (vf >= pci_num_vf(ionic->pdev) || !ionic->vfs) {
                ret = -EINVAL;
        } else {
-               ret = ionic_set_vf_config(ionic, vf,
-                                         IONIC_VF_ATTR_SPOOFCHK, &data);
+               vfc.spoofchk = set;
+               dev_dbg(ionic->dev, "%s: vf %d spoof %d\n",
+                       __func__, vf, vfc.spoofchk);
+
+               ret = ionic_set_vf_config(ionic, vf, &vfc);
                if (!ret)
-                       ionic->vfs[vf].spoofchk = data;
+                       ionic->vfs[vf].spoofchk = set;
        }
 
        up_write(&ionic->vf_op_lock);
@@ -2335,9 +2430,9 @@ static int ionic_set_vf_spoofchk(struct net_device *netdev, int vf, bool set)
 
 static int ionic_set_vf_trust(struct net_device *netdev, int vf, bool set)
 {
+       struct ionic_vf_setattr_cmd vfc = { .attr = IONIC_VF_ATTR_TRUST };
        struct ionic_lif *lif = netdev_priv(netdev);
        struct ionic *ionic = lif->ionic;
-       u8 data = set;  /* convert to u8 for config */
        int ret;
 
        if (!netif_device_present(netdev))
@@ -2348,10 +2443,13 @@ static int ionic_set_vf_trust(struct net_device *netdev, int vf, bool set)
        if (vf >= pci_num_vf(ionic->pdev) || !ionic->vfs) {
                ret = -EINVAL;
        } else {
-               ret = ionic_set_vf_config(ionic, vf,
-                                         IONIC_VF_ATTR_TRUST, &data);
+               vfc.trust = set;
+               dev_dbg(ionic->dev, "%s: vf %d trust %d\n",
+                       __func__, vf, vfc.trust);
+
+               ret = ionic_set_vf_config(ionic, vf, &vfc);
                if (!ret)
-                       ionic->vfs[vf].trusted = data;
+                       ionic->vfs[vf].trusted = set;
        }
 
        up_write(&ionic->vf_op_lock);
@@ -2360,20 +2458,21 @@ static int ionic_set_vf_trust(struct net_device *netdev, int vf, bool set)
 
 static int ionic_set_vf_link_state(struct net_device *netdev, int vf, int set)
 {
+       struct ionic_vf_setattr_cmd vfc = { .attr = IONIC_VF_ATTR_LINKSTATE };
        struct ionic_lif *lif = netdev_priv(netdev);
        struct ionic *ionic = lif->ionic;
-       u8 data;
+       u8 vfls;
        int ret;
 
        switch (set) {
        case IFLA_VF_LINK_STATE_ENABLE:
-               data = IONIC_VF_LINK_STATUS_UP;
+               vfls = IONIC_VF_LINK_STATUS_UP;
                break;
        case IFLA_VF_LINK_STATE_DISABLE:
-               data = IONIC_VF_LINK_STATUS_DOWN;
+               vfls = IONIC_VF_LINK_STATUS_DOWN;
                break;
        case IFLA_VF_LINK_STATE_AUTO:
-               data = IONIC_VF_LINK_STATUS_AUTO;
+               vfls = IONIC_VF_LINK_STATUS_AUTO;
                break;
        default:
                return -EINVAL;
@@ -2387,8 +2486,11 @@ static int ionic_set_vf_link_state(struct net_device *netdev, int vf, int set)
        if (vf >= pci_num_vf(ionic->pdev) || !ionic->vfs) {
                ret = -EINVAL;
        } else {
-               ret = ionic_set_vf_config(ionic, vf,
-                                         IONIC_VF_ATTR_LINKSTATE, &data);
+               vfc.linkstate = vfls;
+               dev_dbg(ionic->dev, "%s: vf %d linkstate %d\n",
+                       __func__, vf, vfc.linkstate);
+
+               ret = ionic_set_vf_config(ionic, vf, &vfc);
                if (!ret)
                        ionic->vfs[vf].linkstate = set;
        }
@@ -2835,6 +2937,7 @@ static void ionic_lif_handle_fw_down(struct ionic_lif *lif)
 
        mutex_unlock(&lif->queue_lock);
 
+       clear_bit(IONIC_LIF_F_FW_STOPPING, lif->state);
        dev_info(ionic->dev, "FW Down: LIFs stopped\n");
 }
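
Clearing IONIC_LIF_F_FW_STOPPING once the LIFs are quiesced completes the
new state machine; pieced together from the hunks above (FW_RESET itself is
managed outside this excerpt, so read this as an inferred summary):

	/* sketch of the resulting firmware state flow:
	 *
	 *   running --(heartbeat lost)--> FW_STOPPING
	 *   FW_STOPPING --(LIFs stopped)--> FW_RESET
	 *   FW_RESET --(heartbeat back, generation stable)--> running
	 *
	 * FW_STOPPING is claimed via test_and_set_bit() by whichever path
	 * detects the stop, and cleared here once the down work is done.
	 */
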
 
@@ -2934,8 +3037,6 @@ void ionic_lif_free(struct ionic_lif *lif)
        /* unmap doorbell page */
        ionic_bus_unmap_dbpage(lif->ionic, lif->kern_dbpage);
        lif->kern_dbpage = NULL;
-       kfree(lif->dbid_inuse);
-       lif->dbid_inuse = NULL;
 
        mutex_destroy(&lif->config_lock);
        mutex_destroy(&lif->queue_lock);
@@ -3135,22 +3236,12 @@ int ionic_lif_init(struct ionic_lif *lif)
                return -EINVAL;
        }
 
-       lif->dbid_inuse = bitmap_zalloc(lif->dbid_count, GFP_KERNEL);
-       if (!lif->dbid_inuse) {
-               dev_err(dev, "Failed alloc doorbell id bitmap, aborting\n");
-               return -ENOMEM;
-       }
-
-       /* first doorbell id reserved for kernel (dbid aka pid == zero) */
-       set_bit(0, lif->dbid_inuse);
        lif->kern_pid = 0;
-
        dbpage_num = ionic_db_page_num(lif, lif->kern_pid);
        lif->kern_dbpage = ionic_bus_map_dbpage(lif->ionic, dbpage_num);
        if (!lif->kern_dbpage) {
                dev_err(dev, "Cannot map dbpage, aborting\n");
-               err = -ENOMEM;
-               goto err_out_free_dbid;
+               return -ENOMEM;
        }
 
        err = ionic_lif_adminq_init(lif);
@@ -3186,15 +3277,13 @@ int ionic_lif_init(struct ionic_lif *lif)
        return 0;
 
 err_out_notifyq_deinit:
+       napi_disable(&lif->adminqcq->napi);
        ionic_lif_qcq_deinit(lif, lif->notifyqcq);
 err_out_adminq_deinit:
        ionic_lif_qcq_deinit(lif, lif->adminqcq);
        ionic_lif_reset(lif);
        ionic_bus_unmap_dbpage(lif->ionic, lif->kern_dbpage);
        lif->kern_dbpage = NULL;
-err_out_free_dbid:
-       kfree(lif->dbid_inuse);
-       lif->dbid_inuse = NULL;
 
        return err;
 }
index 9f7ab2f..a53984b 100644
@@ -135,6 +135,7 @@ enum ionic_lif_state_flags {
        IONIC_LIF_F_LINK_CHECK_REQUESTED,
        IONIC_LIF_F_FILTER_SYNC_NEEDED,
        IONIC_LIF_F_FW_RESET,
+       IONIC_LIF_F_FW_STOPPING,
        IONIC_LIF_F_SPLIT_INTR,
        IONIC_LIF_F_BROKEN,
        IONIC_LIF_F_TX_DIM_INTR,
@@ -213,7 +214,6 @@ struct ionic_lif {
        u32 rx_coalesce_hw;             /* what the hw is using */
        u32 tx_coalesce_usecs;          /* what the user asked for */
        u32 tx_coalesce_hw;             /* what the hw is using */
-       unsigned long *dbid_inuse;
        unsigned int dbid_count;
 
        struct ionic_phc *phc;
index 875f4ec..4029b4e 100644
@@ -188,6 +188,28 @@ static const char *ionic_opcode_to_str(enum ionic_cmd_opcode opcode)
        }
 }
 
+const char *ionic_vf_attr_to_str(enum ionic_vf_attr attr)
+{
+       switch (attr) {
+       case IONIC_VF_ATTR_SPOOFCHK:
+               return "IONIC_VF_ATTR_SPOOFCHK";
+       case IONIC_VF_ATTR_TRUST:
+               return "IONIC_VF_ATTR_TRUST";
+       case IONIC_VF_ATTR_LINKSTATE:
+               return "IONIC_VF_ATTR_LINKSTATE";
+       case IONIC_VF_ATTR_MAC:
+               return "IONIC_VF_ATTR_MAC";
+       case IONIC_VF_ATTR_VLAN:
+               return "IONIC_VF_ATTR_VLAN";
+       case IONIC_VF_ATTR_RATE:
+               return "IONIC_VF_ATTR_RATE";
+       case IONIC_VF_ATTR_STATSADDR:
+               return "IONIC_VF_ATTR_STATSADDR";
+       default:
+               return "IONIC_VF_ATTR_UNKNOWN";
+       }
+}
+
 static void ionic_adminq_flush(struct ionic_lif *lif)
 {
        struct ionic_desc_info *desc_info;
@@ -215,9 +237,13 @@ static void ionic_adminq_flush(struct ionic_lif *lif)
 void ionic_adminq_netdev_err_print(struct ionic_lif *lif, u8 opcode,
                                   u8 status, int err)
 {
+       const char *stat_str;
+
+       stat_str = (err == -ETIMEDOUT) ? "TIMEOUT" :
+                                        ionic_error_to_str(status);
+
        netdev_err(lif->netdev, "%s (%d) failed: %s (%d)\n",
-                  ionic_opcode_to_str(opcode), opcode,
-                  ionic_error_to_str(status), err);
+                  ionic_opcode_to_str(opcode), opcode, stat_str, err);
 }
 
 static int ionic_adminq_check_err(struct ionic_lif *lif,
@@ -318,6 +344,7 @@ int ionic_adminq_wait(struct ionic_lif *lif, struct ionic_admin_ctx *ctx,
                if (do_msg && !test_bit(IONIC_LIF_F_FW_RESET, lif->state))
                        netdev_err(netdev, "Posting of %s (%d) failed: %d\n",
                                   name, ctx->cmd.cmd.opcode, err);
+               ctx->comp.comp.status = IONIC_RC_ERROR;
                return err;
        }
 
@@ -331,11 +358,15 @@ int ionic_adminq_wait(struct ionic_lif *lif, struct ionic_admin_ctx *ctx,
                if (remaining)
                        break;
 
-               /* interrupt the wait if FW stopped */
-               if (test_bit(IONIC_LIF_F_FW_RESET, lif->state)) {
+               /* force a check of FW status and break out if FW reset */
+               (void)ionic_heartbeat_check(lif->ionic);
+               if ((test_bit(IONIC_LIF_F_FW_RESET, lif->state) &&
+                    !lif->ionic->idev.fw_status_ready) ||
+                   test_bit(IONIC_LIF_F_FW_STOPPING, lif->state)) {
                        if (do_msg)
-                               netdev_err(netdev, "%s (%d) interrupted, FW in reset\n",
-                                          name, ctx->cmd.cmd.opcode);
+                               netdev_warn(netdev, "%s (%d) interrupted, FW in reset\n",
+                                           name, ctx->cmd.cmd.opcode);
+                       ctx->comp.comp.status = IONIC_RC_ERROR;
                        return -ENXIO;
                }
 
@@ -370,21 +401,34 @@ int ionic_adminq_post_wait_nomsg(struct ionic_lif *lif, struct ionic_admin_ctx *
 
 static void ionic_dev_cmd_clean(struct ionic *ionic)
 {
-       union __iomem ionic_dev_cmd_regs *regs = ionic->idev.dev_cmd_regs;
+       struct ionic_dev *idev = &ionic->idev;
 
-       iowrite32(0, &regs->doorbell);
-       memset_io(&regs->cmd, 0, sizeof(regs->cmd));
+       iowrite32(0, &idev->dev_cmd_regs->doorbell);
+       memset_io(&idev->dev_cmd_regs->cmd, 0, sizeof(idev->dev_cmd_regs->cmd));
 }
 
-int ionic_dev_cmd_wait(struct ionic *ionic, unsigned long max_seconds)
+void ionic_dev_cmd_dev_err_print(struct ionic *ionic, u8 opcode, u8 status,
+                                int err)
+{
+       const char *stat_str;
+
+       stat_str = (err == -ETIMEDOUT) ? "TIMEOUT" :
+                                        ionic_error_to_str(status);
+
+       dev_err(ionic->dev, "DEV_CMD %s (%d) error, %s (%d) failed\n",
+               ionic_opcode_to_str(opcode), opcode, stat_str, err);
+}
+
+static int __ionic_dev_cmd_wait(struct ionic *ionic, unsigned long max_seconds,
+                               const bool do_msg)
 {
        struct ionic_dev *idev = &ionic->idev;
        unsigned long start_time;
        unsigned long max_wait;
        unsigned long duration;
+       int done = 0;
+       bool fw_up;
        int opcode;
-       int hb = 0;
-       int done;
        int err;
 
        /* Wait for dev cmd to complete, retrying if we get EAGAIN,
@@ -394,31 +438,24 @@ int ionic_dev_cmd_wait(struct ionic *ionic, unsigned long max_seconds)
 try_again:
        opcode = readb(&idev->dev_cmd_regs->cmd.cmd.opcode);
        start_time = jiffies;
-       do {
+       for (fw_up = ionic_is_fw_running(idev);
+            !done && fw_up && time_before(jiffies, max_wait);
+            fw_up = ionic_is_fw_running(idev)) {
                done = ionic_dev_cmd_done(idev);
                if (done)
                        break;
                usleep_range(100, 200);
-
-               /* Don't check the heartbeat on FW_CONTROL commands as they are
-                * notorious for interrupting the firmware's heartbeat update.
-                */
-               if (opcode != IONIC_CMD_FW_CONTROL)
-                       hb = ionic_heartbeat_check(ionic);
-       } while (!done && !hb && time_before(jiffies, max_wait));
+       }
        duration = jiffies - start_time;
 
        dev_dbg(ionic->dev, "DEVCMD %s (%d) done=%d took %ld secs (%ld jiffies)\n",
                ionic_opcode_to_str(opcode), opcode,
                done, duration / HZ, duration);
 
-       if (!done && hb) {
-               /* It is possible (but unlikely) that FW was busy and missed a
-                * heartbeat check but is still alive and will process this
-                * request, so don't clean the dev_cmd in this case.
-                */
-               dev_dbg(ionic->dev, "DEVCMD %s (%d) failed - FW halted\n",
-                       ionic_opcode_to_str(opcode), opcode);
+       if (!done && !fw_up) {
+               ionic_dev_cmd_clean(ionic);
+               dev_warn(ionic->dev, "DEVCMD %s (%d) interrupted - FW is down\n",
+                        ionic_opcode_to_str(opcode), opcode);
                return -ENXIO;
        }
 
@@ -444,9 +481,9 @@ try_again:
                }
 
                if (!(opcode == IONIC_CMD_FW_CONTROL && err == IONIC_RC_EAGAIN))
-                       dev_err(ionic->dev, "DEV_CMD %s (%d) error, %s (%d) failed\n",
-                               ionic_opcode_to_str(opcode), opcode,
-                               ionic_error_to_str(err), err);
+                       if (do_msg)
+                               ionic_dev_cmd_dev_err_print(ionic, opcode, err,
+                                                           ionic_error_to_errno(err));
 
                return ionic_error_to_errno(err);
        }
@@ -454,6 +491,16 @@ try_again:
        return 0;
 }
 
+int ionic_dev_cmd_wait(struct ionic *ionic, unsigned long max_seconds)
+{
+       return __ionic_dev_cmd_wait(ionic, max_seconds, true);
+}
+
+int ionic_dev_cmd_wait_nomsg(struct ionic *ionic, unsigned long max_seconds)
+{
+       return __ionic_dev_cmd_wait(ionic, max_seconds, false);
+}
+
 int ionic_setup(struct ionic *ionic)
 {
        int err;
@@ -540,6 +587,9 @@ int ionic_reset(struct ionic *ionic)
        struct ionic_dev *idev = &ionic->idev;
        int err;
 
+       if (!ionic_is_fw_running(idev))
+               return 0;
+
        mutex_lock(&ionic->dev_cmd_lock);
        ionic_dev_cmd_reset(idev);
        err = ionic_dev_cmd_wait(ionic, DEVCMD_TIMEOUT);
@@ -612,15 +662,17 @@ int ionic_port_init(struct ionic *ionic)
 int ionic_port_reset(struct ionic *ionic)
 {
        struct ionic_dev *idev = &ionic->idev;
-       int err;
+       int err = 0;
 
        if (!idev->port_info)
                return 0;
 
-       mutex_lock(&ionic->dev_cmd_lock);
-       ionic_dev_cmd_port_reset(idev);
-       err = ionic_dev_cmd_wait(ionic, DEVCMD_TIMEOUT);
-       mutex_unlock(&ionic->dev_cmd_lock);
+       if (ionic_is_fw_running(idev)) {
+               mutex_lock(&ionic->dev_cmd_lock);
+               ionic_dev_cmd_port_reset(idev);
+               err = ionic_dev_cmd_wait(ionic, DEVCMD_TIMEOUT);
+               mutex_unlock(&ionic->dev_cmd_lock);
+       }
 
        dma_free_coherent(ionic->dev, idev->port_info_sz,
                          idev->port_info, idev->port_info_pa);
@@ -628,9 +680,6 @@ int ionic_port_reset(struct ionic *ionic)
        idev->port_info = NULL;
        idev->port_info_pa = 0;
 
-       if (err)
-               dev_err(ionic->dev, "Failed to reset port\n");
-
        return err;
 }
 
index f6e785f..b736337 100644
@@ -376,10 +376,24 @@ static int ionic_lif_filter_add(struct ionic_lif *lif,
 
                spin_unlock_bh(&lif->rx_filters.lock);
 
-               if (err == -ENOSPC) {
-                       if (le16_to_cpu(ctx.cmd.rx_filter_add.match) == IONIC_RX_FILTER_MATCH_VLAN)
-                               lif->max_vlans = lif->nvlans;
+               /* store the max_vlans limit that we found */
+               if (err == -ENOSPC &&
+                   le16_to_cpu(ctx.cmd.rx_filter_add.match) == IONIC_RX_FILTER_MATCH_VLAN)
+                       lif->max_vlans = lif->nvlans;
+
+               /* Prevent unnecessary error messages on recoverable
+                * errors as the filter will get retried on the next
+                * sync attempt.
+                */
+               switch (err) {
+               case -ENOSPC:
+               case -ENXIO:
+               case -ETIMEDOUT:
+               case -EAGAIN:
+               case -EBUSY:
                        return 0;
+               default:
+                       break;
                }
 
                ionic_adminq_netdev_err_print(lif, ctx.cmd.cmd.opcode,
@@ -494,9 +508,22 @@ static int ionic_lif_filter_del(struct ionic_lif *lif,
        spin_unlock_bh(&lif->rx_filters.lock);
 
        if (state != IONIC_FILTER_STATE_NEW) {
-               err = ionic_adminq_post_wait(lif, &ctx);
-               if (err && err != -EEXIST)
+               err = ionic_adminq_post_wait_nomsg(lif, &ctx);
+
+               switch (err) {
+                       /* ignore these errors */
+               case -EEXIST:
+               case -ENXIO:
+               case -ETIMEDOUT:
+               case -EAGAIN:
+               case -EBUSY:
+               case 0:
+                       break;
+               default:
+                       ionic_adminq_netdev_err_print(lif, ctx.cmd.cmd.opcode,
+                                                     ctx.comp.comp.status, err);
                        return err;
+               }
        }
 
        return 0;
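
Both hunks above treat a specific set of errnos as transient (device in
reset, adminq busy, timeout): the filter stays marked for retry on the next
sync attempt and no error is printed, with the delete path additionally
ignoring -EEXIST. A hypothetical helper expressing the add-path policy, not
part of the patch itself:

	/* sketch: classify adminq errors the filter sync can retry later */
	static bool ionic_filter_err_is_transient(int err)
	{
		switch (err) {
		case -ENOSPC:
		case -ENXIO:
		case -ETIMEDOUT:
		case -EAGAIN:
		case -EBUSY:
			return true;
		default:
			return false;
		}
	}
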
index 94384f5..d197a70 100644
@@ -669,27 +669,37 @@ dma_fail:
        return -EIO;
 }
 
+static void ionic_tx_desc_unmap_bufs(struct ionic_queue *q,
+                                    struct ionic_desc_info *desc_info)
+{
+       struct ionic_buf_info *buf_info = desc_info->bufs;
+       struct device *dev = q->dev;
+       unsigned int i;
+
+       if (!desc_info->nbufs)
+               return;
+
+       dma_unmap_single(dev, (dma_addr_t)buf_info->dma_addr,
+                        buf_info->len, DMA_TO_DEVICE);
+       buf_info++;
+       for (i = 1; i < desc_info->nbufs; i++, buf_info++)
+               dma_unmap_page(dev, (dma_addr_t)buf_info->dma_addr,
+                              buf_info->len, DMA_TO_DEVICE);
+
+       desc_info->nbufs = 0;
+}
+
 static void ionic_tx_clean(struct ionic_queue *q,
                           struct ionic_desc_info *desc_info,
                           struct ionic_cq_info *cq_info,
                           void *cb_arg)
 {
-       struct ionic_buf_info *buf_info = desc_info->bufs;
        struct ionic_tx_stats *stats = q_to_tx_stats(q);
        struct ionic_qcq *qcq = q_to_qcq(q);
        struct sk_buff *skb = cb_arg;
-       struct device *dev = q->dev;
-       unsigned int i;
        u16 qi;
 
-       if (desc_info->nbufs) {
-               dma_unmap_single(dev, (dma_addr_t)buf_info->dma_addr,
-                                buf_info->len, DMA_TO_DEVICE);
-               buf_info++;
-               for (i = 1; i < desc_info->nbufs; i++, buf_info++)
-                       dma_unmap_page(dev, (dma_addr_t)buf_info->dma_addr,
-                                      buf_info->len, DMA_TO_DEVICE);
-       }
+       ionic_tx_desc_unmap_bufs(q, desc_info);
 
        if (!skb)
                return;
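
Factoring the unmap loop into ionic_tx_desc_unmap_bufs() gives error paths
a way to undo the DMA mapping; note the asymmetry it preserves: the first
buffer was mapped with dma_map_single(), the rest with dma_map_page(), so
the unmap calls must match one-for-one. The call shape on a failure path,
as the TSO fix below uses it:

	/* sketch: keep map and unmap symmetric on the send path */
	if (unlikely(ionic_tx_map_skb(q, skb, desc_info)))
		return -EIO;		/* nothing left mapped on failure */

	err = ionic_tx_tcp_pseudo_csum(skb);	/* can fail after mapping */
	if (err) {
		ionic_tx_desc_unmap_bufs(q, desc_info);
		return err;
	}
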
@@ -931,8 +941,11 @@ static int ionic_tx_tso(struct ionic_queue *q, struct sk_buff *skb)
                err = ionic_tx_tcp_inner_pseudo_csum(skb);
        else
                err = ionic_tx_tcp_pseudo_csum(skb);
-       if (err)
+       if (err) {
+               /* clean up mapping from ionic_tx_map_skb */
+               ionic_tx_desc_unmap_bufs(q, desc_info);
                return err;
+       }
 
        if (encap)
                hdrlen = skb_inner_transport_header(skb) - skb->data +
@@ -1003,8 +1016,8 @@ static int ionic_tx_tso(struct ionic_queue *q, struct sk_buff *skb)
        return 0;
 }
 
-static int ionic_tx_calc_csum(struct ionic_queue *q, struct sk_buff *skb,
-                             struct ionic_desc_info *desc_info)
+static void ionic_tx_calc_csum(struct ionic_queue *q, struct sk_buff *skb,
+                              struct ionic_desc_info *desc_info)
 {
        struct ionic_txq_desc *desc = desc_info->txq_desc;
        struct ionic_buf_info *buf_info = desc_info->bufs;
@@ -1038,12 +1051,10 @@ static int ionic_tx_calc_csum(struct ionic_queue *q, struct sk_buff *skb,
                stats->crc32_csum++;
        else
                stats->csum++;
-
-       return 0;
 }
 
-static int ionic_tx_calc_no_csum(struct ionic_queue *q, struct sk_buff *skb,
-                                struct ionic_desc_info *desc_info)
+static void ionic_tx_calc_no_csum(struct ionic_queue *q, struct sk_buff *skb,
+                                 struct ionic_desc_info *desc_info)
 {
        struct ionic_txq_desc *desc = desc_info->txq_desc;
        struct ionic_buf_info *buf_info = desc_info->bufs;
@@ -1074,12 +1085,10 @@ static int ionic_tx_calc_no_csum(struct ionic_queue *q, struct sk_buff *skb,
        desc->csum_offset = 0;
 
        stats->csum_none++;
-
-       return 0;
 }
 
-static int ionic_tx_skb_frags(struct ionic_queue *q, struct sk_buff *skb,
-                             struct ionic_desc_info *desc_info)
+static void ionic_tx_skb_frags(struct ionic_queue *q, struct sk_buff *skb,
+                              struct ionic_desc_info *desc_info)
 {
        struct ionic_txq_sg_desc *sg_desc = desc_info->txq_sg_desc;
        struct ionic_buf_info *buf_info = &desc_info->bufs[1];
@@ -1093,31 +1102,24 @@ static int ionic_tx_skb_frags(struct ionic_queue *q, struct sk_buff *skb,
        }
 
        stats->frags += skb_shinfo(skb)->nr_frags;
-
-       return 0;
 }
 
 static int ionic_tx(struct ionic_queue *q, struct sk_buff *skb)
 {
        struct ionic_desc_info *desc_info = &q->info[q->head_idx];
        struct ionic_tx_stats *stats = q_to_tx_stats(q);
-       int err;
 
        if (unlikely(ionic_tx_map_skb(q, skb, desc_info)))
                return -EIO;
 
        /* set up the initial descriptor */
        if (skb->ip_summed == CHECKSUM_PARTIAL)
-               err = ionic_tx_calc_csum(q, skb, desc_info);
+               ionic_tx_calc_csum(q, skb, desc_info);
        else
-               err = ionic_tx_calc_no_csum(q, skb, desc_info);
-       if (err)
-               return err;
+               ionic_tx_calc_no_csum(q, skb, desc_info);
 
        /* add frags */
-       err = ionic_tx_skb_frags(q, skb, desc_info);
-       if (err)
-               return err;
+       ionic_tx_skb_frags(q, skb, desc_info);
 
        skb_tx_timestamp(skb);
        stats->pkts++;
index cc4ec2b..672480c 100644
@@ -3098,6 +3098,9 @@ int qed_hw_init(struct qed_dev *cdev, struct qed_hw_init_params *p_params)
                        continue;
                }
 
+               /* Some flows may leave this variable set */
+               p_hwfn->mcp_info->mcp_handling_status = 0;
+
                rc = qed_calc_hw_mode(p_hwfn);
                if (rc)
                        return rc;
index da1eada..9fb1fa4 100644
@@ -140,7 +140,7 @@ static struct qed_mcp_cmd_elem *qed_mcp_cmd_get_elem(struct qed_hwfn *p_hwfn,
 int qed_mcp_free(struct qed_hwfn *p_hwfn)
 {
        if (p_hwfn->mcp_info) {
-               struct qed_mcp_cmd_elem *p_cmd_elem, *p_tmp;
+               struct qed_mcp_cmd_elem *p_cmd_elem = NULL, *p_tmp;
 
                kfree(p_hwfn->mcp_info->mfw_mb_cur);
                kfree(p_hwfn->mcp_info->mfw_mb_shadow);
@@ -249,6 +249,7 @@ int qed_mcp_cmd_init(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt)
        /* Initialize the MFW spinlock */
        spin_lock_init(&p_info->cmd_lock);
        spin_lock_init(&p_info->link_lock);
+       spin_lock_init(&p_info->unload_lock);
 
        INIT_LIST_HEAD(&p_info->cmd_list);
 
@@ -614,12 +615,13 @@ static int qed_mcp_cmd_and_union(struct qed_hwfn *p_hwfn,
                                      usecs);
 }
 
-int qed_mcp_cmd(struct qed_hwfn *p_hwfn,
-               struct qed_ptt *p_ptt,
-               u32 cmd,
-               u32 param,
-               u32 *o_mcp_resp,
-               u32 *o_mcp_param)
+static int _qed_mcp_cmd(struct qed_hwfn *p_hwfn,
+                       struct qed_ptt *p_ptt,
+                       u32 cmd,
+                       u32 param,
+                       u32 *o_mcp_resp,
+                       u32 *o_mcp_param,
+                       bool can_sleep)
 {
        struct qed_mcp_mb_params mb_params;
        int rc;
@@ -627,6 +629,7 @@ int qed_mcp_cmd(struct qed_hwfn *p_hwfn,
        memset(&mb_params, 0, sizeof(mb_params));
        mb_params.cmd = cmd;
        mb_params.param = param;
+       mb_params.flags = can_sleep ? QED_MB_FLAG_CAN_SLEEP : 0;
 
        rc = qed_mcp_cmd_and_union(p_hwfn, p_ptt, &mb_params);
        if (rc)
@@ -638,6 +641,28 @@ int qed_mcp_cmd(struct qed_hwfn *p_hwfn,
        return 0;
 }
 
+int qed_mcp_cmd(struct qed_hwfn *p_hwfn,
+               struct qed_ptt *p_ptt,
+               u32 cmd,
+               u32 param,
+               u32 *o_mcp_resp,
+               u32 *o_mcp_param)
+{
+       return (_qed_mcp_cmd(p_hwfn, p_ptt, cmd, param,
+                            o_mcp_resp, o_mcp_param, true));
+}
+
+int qed_mcp_cmd_nosleep(struct qed_hwfn *p_hwfn,
+                       struct qed_ptt *p_ptt,
+                       u32 cmd,
+                       u32 param,
+                       u32 *o_mcp_resp,
+                       u32 *o_mcp_param)
+{
+       return (_qed_mcp_cmd(p_hwfn, p_ptt, cmd, param,
+                            o_mcp_resp, o_mcp_param, false));
+}
+
 static int
 qed_mcp_nvm_wr_cmd(struct qed_hwfn *p_hwfn,
                   struct qed_ptt *p_ptt,
@@ -1071,10 +1096,15 @@ int qed_mcp_load_done(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt)
        return 0;
 }
 
+#define MFW_COMPLETION_MAX_ITER 5000
+#define MFW_COMPLETION_INTERVAL_MS 1
+
 int qed_mcp_unload_req(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt)
 {
        struct qed_mcp_mb_params mb_params;
+       u32 cnt = MFW_COMPLETION_MAX_ITER;
        u32 wol_param;
+       int rc;
 
        switch (p_hwfn->cdev->wol_config) {
        case QED_OV_WOL_DISABLED:
@@ -1097,7 +1127,23 @@ int qed_mcp_unload_req(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt)
        mb_params.param = wol_param;
        mb_params.flags = QED_MB_FLAG_CAN_SLEEP | QED_MB_FLAG_AVOID_BLOCK;
 
-       return qed_mcp_cmd_and_union(p_hwfn, p_ptt, &mb_params);
+       spin_lock_bh(&p_hwfn->mcp_info->unload_lock);
+       set_bit(QED_MCP_BYPASS_PROC_BIT,
+               &p_hwfn->mcp_info->mcp_handling_status);
+       spin_unlock_bh(&p_hwfn->mcp_info->unload_lock);
+
+       rc = qed_mcp_cmd_and_union(p_hwfn, p_ptt, &mb_params);
+
+       while (test_bit(QED_MCP_IN_PROCESSING_BIT,
+                       &p_hwfn->mcp_info->mcp_handling_status) && --cnt)
+               msleep(MFW_COMPLETION_INTERVAL_MS);
+
+       if (!cnt)
+               DP_NOTICE(p_hwfn,
+                         "Failed to wait for MFW event completion after %d msec\n",
+                         MFW_COMPLETION_MAX_ITER * MFW_COMPLETION_INTERVAL_MS);
+
+       return rc;
 }
 
 int qed_mcp_unload_done(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt)
@@ -1728,8 +1774,8 @@ static void qed_mcp_update_bw(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt)
        qed_configure_pf_max_bandwidth(p_hwfn->cdev, p_info->bandwidth_max);
 
        /* Acknowledge the MFW */
-       qed_mcp_cmd(p_hwfn, p_ptt, DRV_MSG_CODE_BW_UPDATE_ACK, 0, &resp,
-                   &param);
+       qed_mcp_cmd_nosleep(p_hwfn, p_ptt, DRV_MSG_CODE_BW_UPDATE_ACK, 0, &resp,
+                           &param);
 }
 
 static void qed_mcp_update_stag(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt)
@@ -1766,8 +1812,8 @@ static void qed_mcp_update_stag(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt)
                   p_hwfn->mcp_info->func_info.ovlan, p_hwfn->hw_info.hw_mode);
 
        /* Acknowledge the MFW */
-       qed_mcp_cmd(p_hwfn, p_ptt, DRV_MSG_CODE_S_TAG_UPDATE_ACK, 0,
-                   &resp, &param);
+       qed_mcp_cmd_nosleep(p_hwfn, p_ptt, DRV_MSG_CODE_S_TAG_UPDATE_ACK, 0,
+                           &resp, &param);
 }
 
 static void qed_mcp_handle_fan_failure(struct qed_hwfn *p_hwfn,
@@ -1997,6 +2043,19 @@ int qed_mcp_handle_events(struct qed_hwfn *p_hwfn,
                           "Msg [%d] - old CMD 0x%02x, new CMD 0x%02x\n",
                           i, info->mfw_mb_shadow[i], info->mfw_mb_cur[i]);
 
+               spin_lock_bh(&p_hwfn->mcp_info->unload_lock);
+               if (test_bit(QED_MCP_BYPASS_PROC_BIT,
+                            &p_hwfn->mcp_info->mcp_handling_status)) {
+                       spin_unlock_bh(&p_hwfn->mcp_info->unload_lock);
+                       DP_INFO(p_hwfn,
+                               "Msg [%d] is bypassed on unload flow\n", i);
+                       continue;
+               }
+
+               set_bit(QED_MCP_IN_PROCESSING_BIT,
+                       &p_hwfn->mcp_info->mcp_handling_status);
+               spin_unlock_bh(&p_hwfn->mcp_info->unload_lock);
+
                switch (i) {
                case MFW_DRV_MSG_LINK_CHANGE:
                        qed_mcp_handle_link_change(p_hwfn, p_ptt, false);
@@ -2050,6 +2109,9 @@ int qed_mcp_handle_events(struct qed_hwfn *p_hwfn,
                        DP_INFO(p_hwfn, "Unimplemented MFW message %d\n", i);
                        rc = -EINVAL;
                }
+
+               clear_bit(QED_MCP_IN_PROCESSING_BIT,
+                         &p_hwfn->mcp_info->mcp_handling_status);
        }
 
        /* ACK everything */
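
Taken together with the unload-request side earlier in this file, the two
bits implement a small handshake: the unload path blocks new event
processing, issues the unload request, and then drains any handler already
running. Summarized as a sketch:

	/* sketch of the unload vs. MFW-event handshake:
	 *
	 *   unload path                      event handler (per message)
	 *   -----------                      ---------------------------
	 *   lock(unload_lock)                lock(unload_lock)
	 *   set BYPASS_PROC                  if BYPASS_PROC: unlock, skip
	 *   unlock                           set IN_PROCESSING, unlock
	 *   send UNLOAD_REQ                  ... handle the message ...
	 *   poll until !IN_PROCESSING        clear IN_PROCESSING
	 */
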
@@ -3675,8 +3737,8 @@ static int qed_mcp_resource_cmd(struct qed_hwfn *p_hwfn,
 {
        int rc;
 
-       rc = qed_mcp_cmd(p_hwfn, p_ptt, DRV_MSG_CODE_RESOURCE_CMD, param,
-                        p_mcp_resp, p_mcp_param);
+       rc = qed_mcp_cmd_nosleep(p_hwfn, p_ptt, DRV_MSG_CODE_RESOURCE_CMD,
+                                param, p_mcp_resp, p_mcp_param);
        if (rc)
                return rc;
 
index 369e189..9bd0565 100644
@@ -393,11 +393,12 @@ int qed_mcp_get_board_config(struct qed_hwfn *p_hwfn,
                             struct qed_ptt *p_ptt, u32 *p_board_config);
 
 /**
- * qed_mcp_cmd(): General function for sending commands to the MCP
+ * qed_mcp_cmd(): Sleepable function for sending commands to the MCP
  *                mailbox. It acquires a mutex lock for the entire
  *                operation, from sending the request until the MCP
  *                response. The wait for the MCP response is checked for up
- *                to 5 seconds every 5ms.
+ *                to 5 seconds every 10ms. Should not be called from atomic
+ *                context.
  *
  * @p_hwfn: HW device data.
  * @p_ptt: PTT required for register access.
@@ -417,6 +418,31 @@ int qed_mcp_cmd(struct qed_hwfn *p_hwfn,
                u32 *o_mcp_param);
 
 /**
+ * qed_mcp_cmd_nosleep(): Function for sending commands to the MCP
+ *                        mailbox. It acquires a mutex lock for the entire
+ *                        operation, from sending the request until the MCP
+ *                        response. The wait for the MCP response is checked
+ *                        for up to 5 seconds every 10us. Should be called
+ *                        when sleep is not allowed.
+ *
+ * @p_hwfn: HW device data.
+ * @p_ptt: PTT required for register access.
+ * @cmd: command to be sent to the MCP.
+ * @param: Optional param
+ * @o_mcp_resp: The MCP response code (exclude sequence).
+ * @o_mcp_param: Optional parameter provided by the MCP
+ *                     response
+ *
+ * Return: Int - 0 - Operation was successful.
+ */
+int qed_mcp_cmd_nosleep(struct qed_hwfn *p_hwfn,
+                       struct qed_ptt *p_ptt,
+                       u32 cmd,
+                       u32 param,
+                       u32 *o_mcp_resp,
+                       u32 *o_mcp_param);
+
+/**
  * qed_mcp_drain(): drains the nig, allowing completion to pass in
  *                  case of pauses.
  *                  (Should be called only from sleepable context)
@@ -762,6 +788,14 @@ struct qed_mcp_info {
 
        /* S/N for debug data mailbox commands */
        atomic_t dbg_data_seq;
+
+       /* Spinlock used to synchronize the mcp_handling_status flags
+        * with the MFW events handler
+        */
+       spinlock_t unload_lock;
+       unsigned long mcp_handling_status;
+#define QED_MCP_BYPASS_PROC_BIT 0
+#define QED_MCP_IN_PROCESSING_BIT       1
 };
 
 struct qed_mcp_mb_params {
index 19e2621..67014eb 100644
@@ -2667,10 +2667,7 @@ static void rtl_enable_exit_l1(struct rtl8169_private *tp)
        case RTL_GIGA_MAC_VER_37 ... RTL_GIGA_MAC_VER_38:
                rtl_eri_set_bits(tp, 0xd4, 0x0c00);
                break;
-       case RTL_GIGA_MAC_VER_40 ... RTL_GIGA_MAC_VER_53:
-               rtl_eri_set_bits(tp, 0xd4, 0x1f80);
-               break;
-       case RTL_GIGA_MAC_VER_60 ... RTL_GIGA_MAC_VER_63:
+       case RTL_GIGA_MAC_VER_40 ... RTL_GIGA_MAC_VER_63:
                r8168_mac_ocp_modify(tp, 0xc0ac, 0, 0x1f80);
                break;
        default:
@@ -2678,13 +2675,48 @@ static void rtl_enable_exit_l1(struct rtl8169_private *tp)
        }
 }
 
+static void rtl_disable_exit_l1(struct rtl8169_private *tp)
+{
+       switch (tp->mac_version) {
+       case RTL_GIGA_MAC_VER_34 ... RTL_GIGA_MAC_VER_38:
+               rtl_eri_clear_bits(tp, 0xd4, 0x1f00);
+               break;
+       case RTL_GIGA_MAC_VER_40 ... RTL_GIGA_MAC_VER_63:
+               r8168_mac_ocp_modify(tp, 0xc0ac, 0x1f80, 0);
+               break;
+       default:
+               break;
+       }
+}
+
 static void rtl_hw_aspm_clkreq_enable(struct rtl8169_private *tp, bool enable)
 {
        /* Don't enable ASPM in the chip if OS can't control ASPM */
        if (enable && tp->aspm_manageable) {
                RTL_W8(tp, Config5, RTL_R8(tp, Config5) | ASPM_en);
                RTL_W8(tp, Config2, RTL_R8(tp, Config2) | ClkReqEn);
+
+               switch (tp->mac_version) {
+               case RTL_GIGA_MAC_VER_45 ... RTL_GIGA_MAC_VER_48:
+               case RTL_GIGA_MAC_VER_60 ... RTL_GIGA_MAC_VER_63:
+                       /* reset ephy tx/rx disable timer */
+                       r8168_mac_ocp_modify(tp, 0xe094, 0xff00, 0);
+                       /* chip can trigger L1.2 */
+                       r8168_mac_ocp_modify(tp, 0xe092, 0x00ff, BIT(2));
+                       break;
+               default:
+                       break;
+               }
        } else {
+               switch (tp->mac_version) {
+               case RTL_GIGA_MAC_VER_45 ... RTL_GIGA_MAC_VER_48:
+               case RTL_GIGA_MAC_VER_60 ... RTL_GIGA_MAC_VER_63:
+                       r8168_mac_ocp_modify(tp, 0xe092, 0x00ff, 0);
+                       break;
+               default:
+                       break;
+               }
+
                RTL_W8(tp, Config2, RTL_R8(tp, Config2) & ~ClkReqEn);
                RTL_W8(tp, Config5, RTL_R8(tp, Config5) & ~ASPM_en);
        }
@@ -4683,7 +4715,7 @@ static void rtl8169_down(struct rtl8169_private *tp)
        rtl_pci_commit(tp);
 
        rtl8169_cleanup(tp, true);
-
+       rtl_disable_exit_l1(tp);
        rtl_prepare_power_down(tp);
 }
 
@@ -4843,8 +4875,6 @@ static void rtl8169_net_suspend(struct rtl8169_private *tp)
                rtl8169_down(tp);
 }
 
-#ifdef CONFIG_PM
-
 static int rtl8169_runtime_resume(struct device *dev)
 {
        struct rtl8169_private *tp = dev_get_drvdata(dev);
@@ -4860,7 +4890,7 @@ static int rtl8169_runtime_resume(struct device *dev)
        return 0;
 }
 
-static int __maybe_unused rtl8169_suspend(struct device *device)
+static int rtl8169_suspend(struct device *device)
 {
        struct rtl8169_private *tp = dev_get_drvdata(device);
 
@@ -4873,7 +4903,7 @@ static int __maybe_unused rtl8169_suspend(struct device *device)
        return 0;
 }
 
-static int __maybe_unused rtl8169_resume(struct device *device)
+static int rtl8169_resume(struct device *device)
 {
        struct rtl8169_private *tp = dev_get_drvdata(device);
 
@@ -4915,13 +4945,11 @@ static int rtl8169_runtime_idle(struct device *device)
 }
 
 static const struct dev_pm_ops rtl8169_pm_ops = {
-       SET_SYSTEM_SLEEP_PM_OPS(rtl8169_suspend, rtl8169_resume)
-       SET_RUNTIME_PM_OPS(rtl8169_runtime_suspend, rtl8169_runtime_resume,
-                          rtl8169_runtime_idle)
+       SYSTEM_SLEEP_PM_OPS(rtl8169_suspend, rtl8169_resume)
+       RUNTIME_PM_OPS(rtl8169_runtime_suspend, rtl8169_runtime_resume,
+                      rtl8169_runtime_idle)
 };
 
-#endif /* CONFIG_PM */
-
 static void rtl_wol_shutdown_quirk(struct rtl8169_private *tp)
 {
        /* WoL fails with 8168b when the receiver is disabled. */
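
The switch from SET_SYSTEM_SLEEP_PM_OPS()/SET_RUNTIME_PM_OPS() to the
non-SET_ variants plus pm_ptr() lets the compiler see the PM callbacks and
then discard them when CONFIG_PM is disabled, so the #ifdef guards and
__maybe_unused annotations can all go. The general shape, with hypothetical
foo_* names:

	/* sketch: PM wiring that needs no #ifdef CONFIG_PM */
	static int foo_suspend(struct device *dev) { return 0; }
	static int foo_resume(struct device *dev) { return 0; }

	static const struct dev_pm_ops foo_pm_ops = {
		SYSTEM_SLEEP_PM_OPS(foo_suspend, foo_resume)
	};

	static struct pci_driver foo_driver = {
		/* ... probe/remove/id_table ... */
		.driver.pm = pm_ptr(&foo_pm_ops), /* NULL when !CONFIG_PM */
	};
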
@@ -5255,6 +5283,16 @@ done:
        rtl_rar_set(tp, mac_addr);
 }
 
+/* register bits are set if the system vendor successfully tested ASPM 1.2 */
+static bool rtl_aspm_is_safe(struct rtl8169_private *tp)
+{
+       if (tp->mac_version >= RTL_GIGA_MAC_VER_60 &&
+           r8168_mac_ocp_read(tp, 0xc0b2) & 0xf)
+               return true;
+
+       return false;
+}
+
 static int rtl_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
 {
        struct rtl8169_private *tp;
@@ -5333,7 +5371,9 @@ static int rtl_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
         * Chips from RTL8168h partially have issues with L1.2, but seem
         * to work fine with L1 and L1.1.
         */
-       if (tp->mac_version >= RTL_GIGA_MAC_VER_45)
+       if (rtl_aspm_is_safe(tp))
+               rc = 0;
+       else if (tp->mac_version >= RTL_GIGA_MAC_VER_45)
                rc = pci_disable_link_state(pdev, PCIE_LINK_STATE_L1_2);
        else
                rc = pci_disable_link_state(pdev, PCIE_LINK_STATE_L1);
@@ -5460,9 +5500,7 @@ static struct pci_driver rtl8169_pci_driver = {
        .probe          = rtl_init_one,
        .remove         = rtl_remove_one,
        .shutdown       = rtl_shutdown,
-#ifdef CONFIG_PM
-       .driver.pm      = &rtl8169_pm_ops,
-#endif
+       .driver.pm      = pm_ptr(&rtl8169_pm_ops),
 };
 
 module_pci_driver(rtl8169_pci_driver);
index f7ad548..15c295f 100644
@@ -429,15 +429,6 @@ static const struct phy_reg rtl8168d_1_phy_reg_init_0[] = {
        { 0x0d, 0xf880 }
 };
 
-static const struct phy_reg rtl8168d_1_phy_reg_init_1[] = {
-       { 0x1f, 0x0002 },
-       { 0x05, 0x669a },
-       { 0x1f, 0x0005 },
-       { 0x05, 0x8330 },
-       { 0x06, 0x669a },
-       { 0x1f, 0x0002 }
-};
-
 static void rtl8168d_apply_firmware_cond(struct rtl8169_private *tp,
                                         struct phy_device *phydev,
                                         u16 val)
@@ -455,6 +446,29 @@ static void rtl8168d_apply_firmware_cond(struct rtl8169_private *tp,
                r8169_apply_firmware(tp);
 }
 
+static void rtl8168d_1_common(struct phy_device *phydev)
+{
+       u16 val;
+
+       phy_write_paged(phydev, 0x0002, 0x05, 0x669a);
+       r8168d_phy_param(phydev, 0x8330, 0xffff, 0x669a);
+       phy_write(phydev, 0x1f, 0x0002);
+
+       val = phy_read(phydev, 0x0d);
+
+       if ((val & 0x00ff) != 0x006c) {
+               static const u16 set[] = {
+                       0x0065, 0x0066, 0x0067, 0x0068,
+                       0x0069, 0x006a, 0x006b, 0x006c
+               };
+               int i;
+
+               val &= 0xff00;
+               for (i = 0; i < ARRAY_SIZE(set); i++)
+                       phy_write(phydev, 0x0d, val | set[i]);
+       }
+}
+
 static void rtl8168d_1_hw_phy_config(struct rtl8169_private *tp,
                                     struct phy_device *phydev)
 {
@@ -469,25 +483,7 @@ static void rtl8168d_1_hw_phy_config(struct rtl8169_private *tp,
        phy_modify(phydev, 0x0c, 0x5d00, 0xa200);
 
        if (rtl8168d_efuse_read(tp, 0x01) == 0xb1) {
-               int val;
-
-               rtl_writephy_batch(phydev, rtl8168d_1_phy_reg_init_1);
-
-               val = phy_read(phydev, 0x0d);
-
-               if ((val & 0x00ff) != 0x006c) {
-                       static const u32 set[] = {
-                               0x0065, 0x0066, 0x0067, 0x0068,
-                               0x0069, 0x006a, 0x006b, 0x006c
-                       };
-                       int i;
-
-                       phy_write(phydev, 0x1f, 0x0002);
-
-                       val &= 0xff00;
-                       for (i = 0; i < ARRAY_SIZE(set); i++)
-                               phy_write(phydev, 0x0d, val | set[i]);
-               }
+               rtl8168d_1_common(phydev);
        } else {
                phy_write_paged(phydev, 0x0002, 0x05, 0x6662);
                r8168d_phy_param(phydev, 0x8330, 0xffff, 0x6662);
@@ -513,24 +509,7 @@ static void rtl8168d_2_hw_phy_config(struct rtl8169_private *tp,
        rtl_writephy_batch(phydev, rtl8168d_1_phy_reg_init_0);
 
        if (rtl8168d_efuse_read(tp, 0x01) == 0xb1) {
-               int val;
-
-               rtl_writephy_batch(phydev, rtl8168d_1_phy_reg_init_1);
-
-               val = phy_read(phydev, 0x0d);
-               if ((val & 0x00ff) != 0x006c) {
-                       static const u32 set[] = {
-                               0x0065, 0x0066, 0x0067, 0x0068,
-                               0x0069, 0x006a, 0x006b, 0x006c
-                       };
-                       int i;
-
-                       phy_write(phydev, 0x1f, 0x0002);
-
-                       val &= 0xff00;
-                       for (i = 0; i < ARRAY_SIZE(set); i++)
-                               phy_write(phydev, 0x0d, val | set[i]);
-               }
+               rtl8168d_1_common(phydev);
        } else {
                phy_write_paged(phydev, 0x0002, 0x05, 0x2642);
                r8168d_phy_param(phydev, 0x8330, 0xffff, 0x2642);
index b215cde..24e2635 100644 (file)
@@ -1432,11 +1432,7 @@ static int ravb_phy_init(struct net_device *ndev)
         * at this time.
         */
        if (soc_device_match(r8a7795es10)) {
-               err = phy_set_max_speed(phydev, SPEED_100);
-               if (err) {
-                       netdev_err(ndev, "failed to limit PHY to 100Mbit/s\n");
-                       goto err_phy_disconnect;
-               }
+               phy_set_max_speed(phydev, SPEED_100);
 
                netdev_info(ndev, "limited PHY to 100Mbit/s\n");
        }
@@ -1457,8 +1453,6 @@ static int ravb_phy_init(struct net_device *ndev)
 
        return 0;
 
-err_phy_disconnect:
-       phy_disconnect(phydev);
 err_deregister_fixed_link:
        if (of_phy_is_fixed_link(np))
                of_phy_deregister_fixed_link(np);
@@ -2854,7 +2848,6 @@ static int ravb_wol_restore(struct net_device *ndev)
 {
        struct ravb_private *priv = netdev_priv(ndev);
        const struct ravb_hw_info *info = priv->info;
-       int ret;
 
        if (info->nc_queues)
                napi_enable(&priv->napi[RAVB_NC]);
@@ -2863,9 +2856,7 @@ static int ravb_wol_restore(struct net_device *ndev)
        /* Disable MagicPacket */
        ravb_modify(ndev, ECMR, ECMR_MPDE, 0);
 
-       ret = ravb_close(ndev);
-       if (ret < 0)
-               return ret;
+       ravb_close(ndev);
 
        return disable_irq_wake(priv->emac_irq);
 }
index d947a62..67ade78 100644 (file)
@@ -2026,14 +2026,8 @@ static int sh_eth_phy_init(struct net_device *ndev)
        }
 
        /* mask with MAC supported features */
-       if (mdp->cd->register_type != SH_ETH_REG_GIGABIT) {
-               int err = phy_set_max_speed(phydev, SPEED_100);
-               if (err) {
-                       netdev_err(ndev, "failed to limit PHY to 100 Mbit/s\n");
-                       phy_disconnect(phydev);
-                       return err;
-               }
-       }
+       if (mdp->cd->register_type != SH_ETH_REG_GIGABIT)
+               phy_set_max_speed(phydev, SPEED_100);
 
        phy_attached_info(phydev);
 
@@ -3450,9 +3444,7 @@ static int sh_eth_wol_restore(struct net_device *ndev)
         * both be reset and all registers restored. This is what
         * happens during suspend and resume without WoL enabled.
         */
-       ret = sh_eth_close(ndev);
-       if (ret < 0)
-               return ret;
+       sh_eth_close(ndev);
        ret = sh_eth_open(ndev);
        if (ret < 0)
                return ret;
@@ -3464,7 +3456,7 @@ static int sh_eth_suspend(struct device *dev)
 {
        struct net_device *ndev = dev_get_drvdata(dev);
        struct sh_eth_private *mdp = netdev_priv(ndev);
-       int ret = 0;
+       int ret;
 
        if (!netif_running(ndev))
                return 0;
@@ -3483,7 +3475,7 @@ static int sh_eth_resume(struct device *dev)
 {
        struct net_device *ndev = dev_get_drvdata(dev);
        struct sh_eth_private *mdp = netdev_priv(ndev);
-       int ret = 0;
+       int ret;
 
        if (!netif_running(ndev))
                return 0;
index 32161a5..77a0d9d 100644 (file)
@@ -127,7 +127,7 @@ bool sxgbe_eee_init(struct sxgbe_priv_data * const priv)
        /* MAC core supports the EEE feature. */
        if (priv->hw_cap.eee) {
                /* Check if the PHY supports EEE */
-               if (phy_init_eee(ndev->phydev, 1))
+               if (phy_init_eee(ndev->phydev, true))
                        return false;
 
                priv->eee_active = 1;
index cf366ed..50d5359 100644 (file)
@@ -3990,6 +3990,30 @@ static unsigned int ef10_check_caps(const struct efx_nic *efx,
        }
 }
 
+static unsigned int efx_ef10_recycle_ring_size(const struct efx_nic *efx)
+{
+       unsigned int ret = EFX_RECYCLE_RING_SIZE_10G;
+
+       /* There is no difference between PFs and VFs. The size is based on
+        * the maximum link speed of a given NIC.
+        */
+       switch (efx->pci_dev->device & 0xfff) {
+       case 0x0903:    /* Farmingdale can do up to 10G */
+               break;
+       case 0x0923:    /* Greenport can do up to 40G */
+       case 0x0a03:    /* Medford can do up to 40G */
+               ret *= 4;
+               break;
+       default:        /* Medford2 can do up to 100G */
+               ret *= 10;
+       }
+
+       if (IS_ENABLED(CONFIG_PPC64))
+               ret *= 4;
+
+       return ret;
+}
+
 #define EF10_OFFLOAD_FEATURES          \
        (NETIF_F_IP_CSUM |              \
         NETIF_F_HW_VLAN_CTAG_FILTER |  \
@@ -4106,6 +4130,7 @@ const struct efx_nic_type efx_hunt_a0_vf_nic_type = {
        .check_caps = ef10_check_caps,
        .print_additional_fwver = efx_ef10_print_additional_fwver,
        .sensor_event = efx_mcdi_sensor_event,
+       .rx_recycle_ring_size = efx_ef10_recycle_ring_size,
 };
 
 const struct efx_nic_type efx_hunt_a0_nic_type = {
@@ -4243,4 +4268,5 @@ const struct efx_nic_type efx_hunt_a0_nic_type = {
        .check_caps = ef10_check_caps,
        .print_additional_fwver = efx_ef10_print_additional_fwver,
        .sensor_event = efx_mcdi_sensor_event,
+       .rx_recycle_ring_size = efx_ef10_recycle_ring_size,
 };
index f79b14a..a07cbf4 100644 (file)
@@ -23,6 +23,7 @@
 #include "ef100_rx.h"
 #include "ef100_tx.h"
 #include "ef100_netdev.h"
+#include "rx_common.h"
 
 #define EF100_MAX_VIS 4096
 #define EF100_NUM_MCDI_BUFFERS 1
@@ -696,6 +697,12 @@ static unsigned int ef100_check_caps(const struct efx_nic *efx,
        }
 }
 
+static unsigned int efx_ef100_recycle_ring_size(const struct efx_nic *efx)
+{
+       /* Maximum link speed for Riverhead is 100G */
+       return 10 * EFX_RECYCLE_RING_SIZE_10G;
+}
+
 /*     NIC level access functions
  */
 #define EF100_OFFLOAD_FEATURES (NETIF_F_HW_CSUM | NETIF_F_RXCSUM |     \
@@ -770,6 +777,7 @@ const struct efx_nic_type ef100_pf_nic_type = {
        .rx_push_rss_context_config = efx_mcdi_rx_push_rss_context_config,
        .rx_pull_rss_context_config = efx_mcdi_rx_pull_rss_context_config,
        .rx_restore_rss_contexts = efx_mcdi_rx_restore_rss_contexts,
+       .rx_recycle_ring_size = efx_ef100_recycle_ring_size,
 
        .reconfigure_mac = ef100_reconfigure_mac,
        .reconfigure_port = efx_mcdi_port_reconfigure,
@@ -849,6 +857,7 @@ const struct efx_nic_type ef100_vf_nic_type = {
        .rx_pull_rss_config = efx_mcdi_rx_pull_rss_config,
        .rx_push_rss_config = efx_mcdi_pf_rx_push_rss_config,
        .rx_restore_rss_contexts = efx_mcdi_rx_restore_rss_contexts,
+       .rx_recycle_ring_size = efx_ef100_recycle_ring_size,
 
        .reconfigure_mac = ef100_reconfigure_mac,
        .test_nvram = efx_new_mcdi_nvram_test_all,
index cc15ee8..c75dc75 100644 (file)
@@ -1282,6 +1282,7 @@ struct efx_udp_tunnel {
  * @udp_tnl_has_port: Check if a port has been added as UDP tunnel
  * @print_additional_fwver: Dump NIC-specific additional FW version info
  * @sensor_event: Handle a sensor event from MCDI
+ * @rx_recycle_ring_size: Size of the RX recycle ring
  * @revision: Hardware architecture revision
  * @txd_ptr_tbl_base: TX descriptor ring base address
  * @rxd_ptr_tbl_base: RX descriptor ring base address
@@ -1460,6 +1461,7 @@ struct efx_nic_type {
        size_t (*print_additional_fwver)(struct efx_nic *efx, char *buf,
                                         size_t len);
        void (*sensor_event)(struct efx_nic *efx, efx_qword_t *ev);
+       unsigned int (*rx_recycle_ring_size)(const struct efx_nic *efx);
 
        int revision;
        unsigned int txd_ptr_tbl_base;
index b9cafe9..0cef35c 100644 (file)
@@ -195,6 +195,11 @@ static inline void efx_sensor_event(struct efx_nic *efx, efx_qword_t *ev)
                efx->type->sensor_event(efx, ev);
 }
 
+static inline unsigned int efx_rx_recycle_ring_size(const struct efx_nic *efx)
+{
+       return efx->type->rx_recycle_ring_size(efx);
+}
+
 /* Some statistics are computed as A - B where A and B each increase
  * linearly with some hardware counter(s) and the counters are read
  * asynchronously.  If the counters contributing to B are always read
index 633ca77..1b22c7b 100644 (file)
@@ -23,13 +23,6 @@ module_param(rx_refill_threshold, uint, 0444);
 MODULE_PARM_DESC(rx_refill_threshold,
                 "RX descriptor ring refill threshold (%)");
 
-/* Number of RX buffers to recycle pages for.  When creating the RX page recycle
- * ring, this number is divided by the number of buffers per page to calculate
- * the number of pages to store in the RX page recycle ring.
- */
-#define EFX_RECYCLE_RING_SIZE_IOMMU 4096
-#define EFX_RECYCLE_RING_SIZE_NOIOMMU (2 * EFX_RX_PREFERRED_BATCH)
-
 /* RX maximum head room required.
  *
  * This must be at least 1 to prevent overflow, plus one packet-worth
@@ -141,16 +134,7 @@ static void efx_init_rx_recycle_ring(struct efx_rx_queue *rx_queue)
        unsigned int bufs_in_recycle_ring, page_ring_size;
        struct efx_nic *efx = rx_queue->efx;
 
-       /* Set the RX recycle ring size */
-#ifdef CONFIG_PPC64
-       bufs_in_recycle_ring = EFX_RECYCLE_RING_SIZE_IOMMU;
-#else
-       if (iommu_present(&pci_bus_type))
-               bufs_in_recycle_ring = EFX_RECYCLE_RING_SIZE_IOMMU;
-       else
-               bufs_in_recycle_ring = EFX_RECYCLE_RING_SIZE_NOIOMMU;
-#endif /* CONFIG_PPC64 */
-
+       bufs_in_recycle_ring = efx_rx_recycle_ring_size(efx);
        page_ring_size = roundup_pow_of_two(bufs_in_recycle_ring /
                                            efx->rx_bufs_per_page);
        rx_queue->page_ring = kcalloc(page_ring_size,
index 207ccd8..fbd2769 100644 (file)
 #define EFX_RX_MAX_FRAGS DIV_ROUND_UP(EFX_MAX_FRAME_LEN(EFX_MAX_MTU), \
                                      EFX_RX_USR_BUF_SIZE)
 
+/* Number of RX buffers to recycle pages for.  When creating the RX page recycle
+ * ring, this number is divided by the number of buffers per page to calculate
+ * the number of pages to store in the RX page recycle ring.
+ */
+#define EFX_RECYCLE_RING_SIZE_10G      256
+
 static inline u8 *efx_rx_buf_va(struct efx_rx_buffer *buf)
 {
        return page_address(buf->page) + buf->page_offset;
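
Taken together with the per-NIC hooks in the hunks above, this define makes the sizing arithmetic explicit. A worked example under assumed conditions (a Medford2-class 100G part, PPC64, and an assumed two buffers per page):

	unsigned int bufs  = EFX_RECYCLE_RING_SIZE_10G * 10 * 4; /* 100G, PPC64: 10240 */
	unsigned int pages = roundup_pow_of_two(bufs / 2);       /* 2 bufs/page: 8192 */

The old code picked 4096 or 2 * EFX_RX_PREFERRED_BATCH based purely on IOMMU presence; the replacement scales a 10G baseline by link speed and keeps a PPC64 multiplier for its comparatively expensive IOMMU.
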
index 16347a6..ce3060e 100644 (file)
@@ -25,6 +25,7 @@
 #include "mcdi_port_common.h"
 #include "selftest.h"
 #include "siena_sriov.h"
+#include "rx_common.h"
 
 /* Hardware control for SFC9000 family including SFL9021 (aka Siena). */
 
@@ -958,6 +959,12 @@ static unsigned int siena_check_caps(const struct efx_nic *efx,
        return 0;
 }
 
+static unsigned int efx_siena_recycle_ring_size(const struct efx_nic *efx)
+{
+       /* Maximum link speed is 10G */
+       return EFX_RECYCLE_RING_SIZE_10G;
+}
+
 /**************************************************************************
  *
  * Revision-dependent attributes used by efx.c and nic.c
@@ -1098,4 +1105,5 @@ const struct efx_nic_type siena_a0_nic_type = {
        .rx_hash_key_size = 16,
        .check_caps = siena_check_caps,
        .sensor_event = efx_mcdi_sensor_event,
+       .rx_recycle_ring_size = efx_siena_recycle_ring_size,
 };
index 8e8778c..5943ff9 100644 (file)
@@ -383,10 +383,10 @@ static int intel_crosststamp(ktime_t *device,
 
        /* Repeat until the timestamps are from the FIFO last segment */
        for (i = 0; i < num_snapshot; i++) {
-               spin_lock_irqsave(&priv->ptp_lock, flags);
+               read_lock_irqsave(&priv->ptp_lock, flags);
                stmmac_get_ptptime(priv, ptpaddr, &ptp_time);
                *device = ns_to_ktime(ptp_time);
-               spin_unlock_irqrestore(&priv->ptp_lock, flags);
+               read_unlock_irqrestore(&priv->ptp_lock, flags);
                get_arttime(priv->mii, intel_priv->mdio_adhoc_addr, &art_time);
                *system = convert_art_to_tsc(art_time);
        }
index 09644ab..f86cc83 100644 (file)
@@ -16,6 +16,7 @@
 #include <linux/of_net.h>
 #include <linux/phy.h>
 #include <linux/platform_device.h>
+#include <linux/pm_runtime.h>
 #include <linux/regulator/consumer.h>
 #include <linux/regmap.h>
 #include <linux/stmmac.h>
@@ -57,7 +58,6 @@ struct emac_variant {
 };
 
 /* struct sunxi_priv_data - hold all sunxi private data
- * @tx_clk:    reference to MAC TX clock
  * @ephy_clk:  reference to the optional EPHY clock for the internal PHY
  * @regulator: reference to the optional regulator
  * @rst_ephy:  reference to the optional EPHY reset for the internal PHY
@@ -68,7 +68,6 @@ struct emac_variant {
  * @mux_handle:        Internal pointer used by mdio-mux lib
  */
 struct sunxi_priv_data {
-       struct clk *tx_clk;
        struct clk *ephy_clk;
        struct regulator *regulator;
        struct reset_control *rst_ephy;
@@ -579,22 +578,14 @@ static int sun8i_dwmac_init(struct platform_device *pdev, void *priv)
                }
        }
 
-       ret = clk_prepare_enable(gmac->tx_clk);
-       if (ret) {
-               dev_err(&pdev->dev, "Could not enable AHB clock\n");
-               goto err_disable_regulator;
-       }
-
        if (gmac->use_internal_phy) {
                ret = sun8i_dwmac_power_internal_phy(netdev_priv(ndev));
                if (ret)
-                       goto err_disable_clk;
+                       goto err_disable_regulator;
        }
 
        return 0;
 
-err_disable_clk:
-       clk_disable_unprepare(gmac->tx_clk);
 err_disable_regulator:
        if (gmac->regulator)
                regulator_disable(gmac->regulator);
@@ -1043,8 +1034,6 @@ static void sun8i_dwmac_exit(struct platform_device *pdev, void *priv)
        if (gmac->variant->soc_has_internal_phy)
                sun8i_dwmac_unpower_internal_phy(gmac);
 
-       clk_disable_unprepare(gmac->tx_clk);
-
        if (gmac->regulator)
                regulator_disable(gmac->regulator);
 }
@@ -1167,12 +1156,6 @@ static int sun8i_dwmac_probe(struct platform_device *pdev)
                return -EINVAL;
        }
 
-       gmac->tx_clk = devm_clk_get(dev, "stmmaceth");
-       if (IS_ERR(gmac->tx_clk)) {
-               dev_err(dev, "Could not get TX clock\n");
-               return PTR_ERR(gmac->tx_clk);
-       }
-
        /* Optional regulator for PHY */
        gmac->regulator = devm_regulator_get_optional(dev, "phy");
        if (IS_ERR(gmac->regulator)) {
@@ -1254,6 +1237,12 @@ static int sun8i_dwmac_probe(struct platform_device *pdev)
        ndev = dev_get_drvdata(&pdev->dev);
        priv = netdev_priv(ndev);
 
+       /* The MAC is runtime suspended after stmmac_dvr_probe(), so we
+        * need to ensure it is resumed before other operations such as
+        * reset are performed.
+        */
+       pm_runtime_get_sync(&pdev->dev);
+
        /* The mux must be registered after parent MDIO
         * so after stmmac_dvr_probe()
         */
@@ -1272,12 +1261,15 @@ static int sun8i_dwmac_probe(struct platform_device *pdev)
                        goto dwmac_remove;
        }
 
+       pm_runtime_put(&pdev->dev);
+
        return 0;
 
 dwmac_mux:
        reset_control_put(gmac->rst_ephy);
        clk_put(gmac->ephy_clk);
 dwmac_remove:
+       pm_runtime_put_noidle(&pdev->dev);
        stmmac_dvr_remove(&pdev->dev);
 dwmac_exit:
        sun8i_dwmac_exit(pdev, gmac);
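
The probe changes above bracket the post-probe setup in a runtime-PM reference, because stmmac_dvr_probe() leaves the MAC runtime-suspended. A condensed sketch of the bracket, with do_post_probe_setup() as a hypothetical stand-in for the MDIO-mux and EPHY work:

	pm_runtime_get_sync(&pdev->dev);	/* resume the MAC before touching it */

	ret = do_post_probe_setup(pdev);	/* hypothetical helper */
	if (ret) {
		pm_runtime_put_noidle(&pdev->dev);	/* drop the ref without idling */
		goto remove;
	}

	pm_runtime_put(&pdev->dev);		/* let the MAC suspend again */
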
index 5b195d5..57970ae 100644 (file)
@@ -263,7 +263,7 @@ struct stmmac_priv {
        u32 adv_ts;
        int use_riwt;
        int irq_wake;
-       spinlock_t ptp_lock;
+       rwlock_t ptp_lock;
        /* Protects auxiliary snapshot registers from concurrent access. */
        struct mutex aux_ts_lock;
 
index a7ec9f4..22fea0f 100644 (file)
@@ -196,9 +196,9 @@ static void timestamp_interrupt(struct stmmac_priv *priv)
                       GMAC_TIMESTAMP_ATSNS_SHIFT;
 
        for (i = 0; i < num_snapshot; i++) {
-               spin_lock_irqsave(&priv->ptp_lock, flags);
+               read_lock_irqsave(&priv->ptp_lock, flags);
                get_ptptime(priv->ptpaddr, &ptp_time);
-               spin_unlock_irqrestore(&priv->ptp_lock, flags);
+               read_unlock_irqrestore(&priv->ptp_lock, flags);
                event.type = PTP_CLOCK_EXTTS;
                event.index = 0;
                event.timestamp = ptp_time;
index bde76ea..b745d62 100644 (file)
@@ -938,105 +938,15 @@ static void stmmac_mac_flow_ctrl(struct stmmac_priv *priv, u32 duplex)
                        priv->pause, tx_cnt);
 }
 
-static void stmmac_validate(struct phylink_config *config,
-                           unsigned long *supported,
-                           struct phylink_link_state *state)
+static struct phylink_pcs *stmmac_mac_select_pcs(struct phylink_config *config,
+                                                phy_interface_t interface)
 {
        struct stmmac_priv *priv = netdev_priv(to_net_dev(config->dev));
-       __ETHTOOL_DECLARE_LINK_MODE_MASK(mac_supported) = { 0, };
-       __ETHTOOL_DECLARE_LINK_MODE_MASK(mask) = { 0, };
-       int tx_cnt = priv->plat->tx_queues_to_use;
-       int max_speed = priv->plat->max_speed;
-
-       phylink_set(mac_supported, 10baseT_Half);
-       phylink_set(mac_supported, 10baseT_Full);
-       phylink_set(mac_supported, 100baseT_Half);
-       phylink_set(mac_supported, 100baseT_Full);
-       phylink_set(mac_supported, 1000baseT_Half);
-       phylink_set(mac_supported, 1000baseT_Full);
-       phylink_set(mac_supported, 1000baseKX_Full);
-
-       phylink_set(mac_supported, Autoneg);
-       phylink_set(mac_supported, Pause);
-       phylink_set(mac_supported, Asym_Pause);
-       phylink_set_port_modes(mac_supported);
-
-       /* Cut down 1G if asked to */
-       if ((max_speed > 0) && (max_speed < 1000)) {
-               phylink_set(mask, 1000baseT_Full);
-               phylink_set(mask, 1000baseX_Full);
-       } else if (priv->plat->has_gmac4) {
-               if (!max_speed || max_speed >= 2500) {
-                       phylink_set(mac_supported, 2500baseT_Full);
-                       phylink_set(mac_supported, 2500baseX_Full);
-               }
-       } else if (priv->plat->has_xgmac) {
-               if (!max_speed || (max_speed >= 2500)) {
-                       phylink_set(mac_supported, 2500baseT_Full);
-                       phylink_set(mac_supported, 2500baseX_Full);
-               }
-               if (!max_speed || (max_speed >= 5000)) {
-                       phylink_set(mac_supported, 5000baseT_Full);
-               }
-               if (!max_speed || (max_speed >= 10000)) {
-                       phylink_set(mac_supported, 10000baseSR_Full);
-                       phylink_set(mac_supported, 10000baseLR_Full);
-                       phylink_set(mac_supported, 10000baseER_Full);
-                       phylink_set(mac_supported, 10000baseLRM_Full);
-                       phylink_set(mac_supported, 10000baseT_Full);
-                       phylink_set(mac_supported, 10000baseKX4_Full);
-                       phylink_set(mac_supported, 10000baseKR_Full);
-               }
-               if (!max_speed || (max_speed >= 25000)) {
-                       phylink_set(mac_supported, 25000baseCR_Full);
-                       phylink_set(mac_supported, 25000baseKR_Full);
-                       phylink_set(mac_supported, 25000baseSR_Full);
-               }
-               if (!max_speed || (max_speed >= 40000)) {
-                       phylink_set(mac_supported, 40000baseKR4_Full);
-                       phylink_set(mac_supported, 40000baseCR4_Full);
-                       phylink_set(mac_supported, 40000baseSR4_Full);
-                       phylink_set(mac_supported, 40000baseLR4_Full);
-               }
-               if (!max_speed || (max_speed >= 50000)) {
-                       phylink_set(mac_supported, 50000baseCR2_Full);
-                       phylink_set(mac_supported, 50000baseKR2_Full);
-                       phylink_set(mac_supported, 50000baseSR2_Full);
-                       phylink_set(mac_supported, 50000baseKR_Full);
-                       phylink_set(mac_supported, 50000baseSR_Full);
-                       phylink_set(mac_supported, 50000baseCR_Full);
-                       phylink_set(mac_supported, 50000baseLR_ER_FR_Full);
-                       phylink_set(mac_supported, 50000baseDR_Full);
-               }
-               if (!max_speed || (max_speed >= 100000)) {
-                       phylink_set(mac_supported, 100000baseKR4_Full);
-                       phylink_set(mac_supported, 100000baseSR4_Full);
-                       phylink_set(mac_supported, 100000baseCR4_Full);
-                       phylink_set(mac_supported, 100000baseLR4_ER4_Full);
-                       phylink_set(mac_supported, 100000baseKR2_Full);
-                       phylink_set(mac_supported, 100000baseSR2_Full);
-                       phylink_set(mac_supported, 100000baseCR2_Full);
-                       phylink_set(mac_supported, 100000baseLR2_ER2_FR2_Full);
-                       phylink_set(mac_supported, 100000baseDR2_Full);
-               }
-       }
-
-       /* Half-Duplex can only work with single queue */
-       if (tx_cnt > 1) {
-               phylink_set(mask, 10baseT_Half);
-               phylink_set(mask, 100baseT_Half);
-               phylink_set(mask, 1000baseT_Half);
-       }
-
-       linkmode_and(supported, supported, mac_supported);
-       linkmode_andnot(supported, supported, mask);
 
-       linkmode_and(state->advertising, state->advertising, mac_supported);
-       linkmode_andnot(state->advertising, state->advertising, mask);
+       if (!priv->hw->xpcs)
+               return NULL;
 
-       /* If PCS is supported, check which modes it supports. */
-       if (priv->hw->xpcs)
-               xpcs_validate(priv->hw->xpcs, supported, state);
+       return &priv->hw->xpcs->pcs;
 }
 
 static void stmmac_mac_config(struct phylink_config *config, unsigned int mode,
@@ -1175,7 +1085,8 @@ static void stmmac_mac_link_up(struct phylink_config *config,
 }
 
 static const struct phylink_mac_ops stmmac_phylink_mac_ops = {
-       .validate = stmmac_validate,
+       .validate = phylink_generic_validate,
+       .mac_select_pcs = stmmac_mac_select_pcs,
        .mac_config = stmmac_mac_config,
        .mac_link_down = stmmac_mac_link_down,
        .mac_link_up = stmmac_mac_link_up,
@@ -1255,12 +1166,12 @@ static int stmmac_phy_setup(struct stmmac_priv *priv)
 {
        struct stmmac_mdio_bus_data *mdio_bus_data = priv->plat->mdio_bus_data;
        struct fwnode_handle *fwnode = of_fwnode_handle(priv->plat->phylink_node);
+       int max_speed = priv->plat->max_speed;
        int mode = priv->plat->phy_interface;
        struct phylink *phylink;
 
        priv->phylink_config.dev = &priv->dev->dev;
        priv->phylink_config.type = PHYLINK_NETDEV;
-       priv->phylink_config.pcs_poll = true;
        if (priv->plat->mdio_bus_data)
                priv->phylink_config.ovr_an_inband =
                        mdio_bus_data->xpcs_an_inband;
@@ -1268,14 +1179,50 @@ static int stmmac_phy_setup(struct stmmac_priv *priv)
        if (!fwnode)
                fwnode = dev_fwnode(priv->device);
 
+       /* Set the platform/firmware specified interface mode */
+       __set_bit(mode, priv->phylink_config.supported_interfaces);
+
+       /* If we have an xpcs, it defines which PHY interfaces are supported. */
+       if (priv->hw->xpcs)
+               xpcs_get_interfaces(priv->hw->xpcs,
+                                   priv->phylink_config.supported_interfaces);
+
+       priv->phylink_config.mac_capabilities = MAC_ASYM_PAUSE | MAC_SYM_PAUSE |
+               MAC_10 | MAC_100;
+
+       if (!max_speed || max_speed >= 1000)
+               priv->phylink_config.mac_capabilities |= MAC_1000;
+
+       if (priv->plat->has_gmac4) {
+               if (!max_speed || max_speed >= 2500)
+                       priv->phylink_config.mac_capabilities |= MAC_2500FD;
+       } else if (priv->plat->has_xgmac) {
+               if (!max_speed || max_speed >= 2500)
+                       priv->phylink_config.mac_capabilities |= MAC_2500FD;
+               if (!max_speed || max_speed >= 5000)
+                       priv->phylink_config.mac_capabilities |= MAC_5000FD;
+               if (!max_speed || max_speed >= 10000)
+                       priv->phylink_config.mac_capabilities |= MAC_10000FD;
+               if (!max_speed || max_speed >= 25000)
+                       priv->phylink_config.mac_capabilities |= MAC_25000FD;
+               if (!max_speed || max_speed >= 40000)
+                       priv->phylink_config.mac_capabilities |= MAC_40000FD;
+               if (!max_speed || max_speed >= 50000)
+                       priv->phylink_config.mac_capabilities |= MAC_50000FD;
+               if (!max_speed || max_speed >= 100000)
+                       priv->phylink_config.mac_capabilities |= MAC_100000FD;
+       }
+
+       /* Half-Duplex can only work with a single queue */
+       if (priv->plat->tx_queues_to_use > 1)
+               priv->phylink_config.mac_capabilities &=
+                       ~(MAC_10HD | MAC_100HD | MAC_1000HD);
+
        phylink = phylink_create(&priv->phylink_config, fwnode,
                                 mode, &stmmac_phylink_mac_ops);
        if (IS_ERR(phylink))
                return PTR_ERR(phylink);
 
-       if (priv->hw->xpcs)
-               phylink_set_pcs(phylink, &priv->hw->xpcs->pcs);
-
        priv->phylink = phylink;
        return 0;
 }
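
With the open-coded validate callback gone, the MAC now describes itself declaratively and phylink derives the supported link modes itself. A minimal sketch of the pattern, with foo_* names as placeholders rather than stmmac code:

	static const struct phylink_mac_ops foo_phylink_ops = {
		.validate	= phylink_generic_validate,
		.mac_select_pcs	= foo_mac_select_pcs,	/* returns a phylink_pcs or NULL */
		/* .mac_config / .mac_link_up / .mac_link_down as before */
	};

	/* at setup time: declare interfaces and speed/duplex capabilities */
	__set_bit(PHY_INTERFACE_MODE_SGMII, cfg->supported_interfaces);
	cfg->mac_capabilities = MAC_SYM_PAUSE | MAC_ASYM_PAUSE |
				MAC_10 | MAC_100 | MAC_1000FD;
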
index 1c9f02f..e45fb19 100644 (file)
@@ -39,9 +39,9 @@ static int stmmac_adjust_freq(struct ptp_clock_info *ptp, s32 ppb)
        diff = div_u64(adj, 1000000000ULL);
        addend = neg_adj ? (addend - diff) : (addend + diff);
 
-       spin_lock_irqsave(&priv->ptp_lock, flags);
+       write_lock_irqsave(&priv->ptp_lock, flags);
        stmmac_config_addend(priv, priv->ptpaddr, addend);
-       spin_unlock_irqrestore(&priv->ptp_lock, flags);
+       write_unlock_irqrestore(&priv->ptp_lock, flags);
 
        return 0;
 }
@@ -86,9 +86,9 @@ static int stmmac_adjust_time(struct ptp_clock_info *ptp, s64 delta)
                mutex_unlock(&priv->plat->est->lock);
        }
 
-       spin_lock_irqsave(&priv->ptp_lock, flags);
+       write_lock_irqsave(&priv->ptp_lock, flags);
        stmmac_adjust_systime(priv, priv->ptpaddr, sec, nsec, neg_adj, xmac);
-       spin_unlock_irqrestore(&priv->ptp_lock, flags);
+       write_unlock_irqrestore(&priv->ptp_lock, flags);
 
        /* Calculate new basetime and re-configure EST after PTP time adjustment. */
        if (est_rst) {
@@ -137,9 +137,9 @@ static int stmmac_get_time(struct ptp_clock_info *ptp, struct timespec64 *ts)
        unsigned long flags;
        u64 ns = 0;
 
-       spin_lock_irqsave(&priv->ptp_lock, flags);
+       read_lock_irqsave(&priv->ptp_lock, flags);
        stmmac_get_systime(priv, priv->ptpaddr, &ns);
-       spin_unlock_irqrestore(&priv->ptp_lock, flags);
+       read_unlock_irqrestore(&priv->ptp_lock, flags);
 
        *ts = ns_to_timespec64(ns);
 
@@ -162,9 +162,9 @@ static int stmmac_set_time(struct ptp_clock_info *ptp,
            container_of(ptp, struct stmmac_priv, ptp_clock_ops);
        unsigned long flags;
 
-       spin_lock_irqsave(&priv->ptp_lock, flags);
+       write_lock_irqsave(&priv->ptp_lock, flags);
        stmmac_init_systime(priv, priv->ptpaddr, ts->tv_sec, ts->tv_nsec);
-       spin_unlock_irqrestore(&priv->ptp_lock, flags);
+       write_unlock_irqrestore(&priv->ptp_lock, flags);
 
        return 0;
 }
@@ -194,12 +194,12 @@ static int stmmac_enable(struct ptp_clock_info *ptp,
                cfg->period.tv_sec = rq->perout.period.sec;
                cfg->period.tv_nsec = rq->perout.period.nsec;
 
-               spin_lock_irqsave(&priv->ptp_lock, flags);
+               write_lock_irqsave(&priv->ptp_lock, flags);
                ret = stmmac_flex_pps_config(priv, priv->ioaddr,
                                             rq->perout.index, cfg, on,
                                             priv->sub_second_inc,
                                             priv->systime_flags);
-               spin_unlock_irqrestore(&priv->ptp_lock, flags);
+               write_unlock_irqrestore(&priv->ptp_lock, flags);
                break;
        case PTP_CLK_REQ_EXTTS:
                priv->plat->ext_snapshot_en = on;
@@ -314,7 +314,7 @@ void stmmac_ptp_register(struct stmmac_priv *priv)
        stmmac_ptp_clock_ops.n_per_out = priv->dma_cap.pps_out_num;
        stmmac_ptp_clock_ops.n_ext_ts = priv->dma_cap.aux_snapshot_n;
 
-       spin_lock_init(&priv->ptp_lock);
+       rwlock_init(&priv->ptp_lock);
        mutex_init(&priv->aux_ts_lock);
        priv->ptp_clock_ops = stmmac_ptp_clock_ops;
 
index be3cb63..9f17595 100644 (file)
@@ -1777,9 +1777,9 @@ static int stmmac_test_tbs(struct stmmac_priv *priv)
        if (ret)
                return ret;
 
-       spin_lock_irqsave(&priv->ptp_lock, flags);
+       read_lock_irqsave(&priv->ptp_lock, flags);
        stmmac_get_systime(priv, priv->ptpaddr, &curr_time);
-       spin_unlock_irqrestore(&priv->ptp_lock, flags);
+       read_unlock_irqrestore(&priv->ptp_lock, flags);
 
        if (!curr_time) {
                ret = -EOPNOTSUPP;
@@ -1799,9 +1799,9 @@ static int stmmac_test_tbs(struct stmmac_priv *priv)
                goto fail_disable;
 
        /* Check if expected time has elapsed */
-       spin_lock_irqsave(&priv->ptp_lock, flags);
+       read_lock_irqsave(&priv->ptp_lock, flags);
        stmmac_get_systime(priv, priv->ptpaddr, &curr_time);
-       spin_unlock_irqrestore(&priv->ptp_lock, flags);
+       read_unlock_irqrestore(&priv->ptp_lock, flags);
 
        if ((curr_time - start_time) < STMMAC_TBS_LT_OFFSET)
                ret = -EINVAL;
index 5b4d153..4010896 100644 (file)
@@ -386,6 +386,7 @@ struct axidma_bd {
  * @phylink:   Pointer to phylink instance
  * @phylink_config: phylink configuration settings
  * @pcs_phy:   Reference to PCS/PMA PHY if used
+ * @pcs:       phylink PCS structure for the PCS PHY
  * @switch_x_sgmii: Whether switchable 1000BaseX/SGMII mode is enabled in the core
  * @axi_clk:   AXI4-Lite bus clock
  * @misc_clks: Misc ethernet clocks (AXI4-Stream, Ref, MGT clocks)
@@ -434,6 +435,7 @@ struct axienet_local {
        struct phylink_config phylink_config;
 
        struct mdio_device *pcs_phy;
+       struct phylink_pcs pcs;
 
        bool switch_x_sgmii;
 
index 377c94e..de0a637 100644 (file)
@@ -1537,78 +1537,78 @@ static const struct ethtool_ops axienet_ethtool_ops = {
        .nway_reset     = axienet_ethtools_nway_reset,
 };
 
-static void axienet_mac_pcs_get_state(struct phylink_config *config,
-                                     struct phylink_link_state *state)
+static struct axienet_local *pcs_to_axienet_local(struct phylink_pcs *pcs)
 {
-       struct net_device *ndev = to_net_dev(config->dev);
-       struct axienet_local *lp = netdev_priv(ndev);
+       return container_of(pcs, struct axienet_local, pcs);
+}
 
-       switch (state->interface) {
-       case PHY_INTERFACE_MODE_SGMII:
-       case PHY_INTERFACE_MODE_1000BASEX:
-               phylink_mii_c22_pcs_get_state(lp->pcs_phy, state);
-               break;
-       default:
-               break;
-       }
+static void axienet_pcs_get_state(struct phylink_pcs *pcs,
+                                 struct phylink_link_state *state)
+{
+       struct mdio_device *pcs_phy = pcs_to_axienet_local(pcs)->pcs_phy;
+
+       phylink_mii_c22_pcs_get_state(pcs_phy, state);
 }
 
-static void axienet_mac_an_restart(struct phylink_config *config)
+static void axienet_pcs_an_restart(struct phylink_pcs *pcs)
 {
-       struct net_device *ndev = to_net_dev(config->dev);
-       struct axienet_local *lp = netdev_priv(ndev);
+       struct mdio_device *pcs_phy = pcs_to_axienet_local(pcs)->pcs_phy;
 
-       phylink_mii_c22_pcs_an_restart(lp->pcs_phy);
+       phylink_mii_c22_pcs_an_restart(pcs_phy);
 }
 
-static int axienet_mac_prepare(struct phylink_config *config, unsigned int mode,
-                              phy_interface_t iface)
+static int axienet_pcs_config(struct phylink_pcs *pcs, unsigned int mode,
+                             phy_interface_t interface,
+                             const unsigned long *advertising,
+                             bool permit_pause_to_mac)
 {
-       struct net_device *ndev = to_net_dev(config->dev);
+       struct mdio_device *pcs_phy = pcs_to_axienet_local(pcs)->pcs_phy;
+       struct net_device *ndev = pcs_to_axienet_local(pcs)->ndev;
        struct axienet_local *lp = netdev_priv(ndev);
        int ret;
 
-       switch (iface) {
-       case PHY_INTERFACE_MODE_SGMII:
-       case PHY_INTERFACE_MODE_1000BASEX:
-               if (!lp->switch_x_sgmii)
-                       return 0;
-
-               ret = mdiobus_write(lp->pcs_phy->bus,
-                                   lp->pcs_phy->addr,
-                                   XLNX_MII_STD_SELECT_REG,
-                                   iface == PHY_INTERFACE_MODE_SGMII ?
+       if (lp->switch_x_sgmii) {
+               ret = mdiodev_write(pcs_phy, XLNX_MII_STD_SELECT_REG,
+                                   interface == PHY_INTERFACE_MODE_SGMII ?
                                        XLNX_MII_STD_SELECT_SGMII : 0);
-               if (ret < 0)
-                       netdev_warn(ndev, "Failed to switch PHY interface: %d\n",
+               if (ret < 0) {
+                       netdev_warn(ndev,
+                                   "Failed to switch PHY interface: %d\n",
                                    ret);
-               return ret;
-       default:
-               return 0;
+                       return ret;
+               }
        }
+
+       ret = phylink_mii_c22_pcs_config(pcs_phy, mode, interface, advertising);
+       if (ret < 0)
+               netdev_warn(ndev, "Failed to configure PCS: %d\n", ret);
+
+       return ret;
 }
 
-static void axienet_mac_config(struct phylink_config *config, unsigned int mode,
-                              const struct phylink_link_state *state)
+static const struct phylink_pcs_ops axienet_pcs_ops = {
+       .pcs_get_state = axienet_pcs_get_state,
+       .pcs_config = axienet_pcs_config,
+       .pcs_an_restart = axienet_pcs_an_restart,
+};
+
+static struct phylink_pcs *axienet_mac_select_pcs(struct phylink_config *config,
+                                                 phy_interface_t interface)
 {
        struct net_device *ndev = to_net_dev(config->dev);
        struct axienet_local *lp = netdev_priv(ndev);
-       int ret;
 
-       switch (state->interface) {
-       case PHY_INTERFACE_MODE_SGMII:
-       case PHY_INTERFACE_MODE_1000BASEX:
-               ret = phylink_mii_c22_pcs_config(lp->pcs_phy, mode,
-                                                state->interface,
-                                                state->advertising);
-               if (ret < 0)
-                       netdev_warn(ndev, "Failed to configure PCS: %d\n",
-                                   ret);
-               break;
+       if (interface == PHY_INTERFACE_MODE_1000BASEX ||
+           interface == PHY_INTERFACE_MODE_SGMII)
+               return &lp->pcs;
 
-       default:
-               break;
-       }
+       return NULL;
+}
+
+static void axienet_mac_config(struct phylink_config *config, unsigned int mode,
+                              const struct phylink_link_state *state)
+{
+       /* nothing meaningful to do */
 }
 
 static void axienet_mac_link_down(struct phylink_config *config,
@@ -1663,9 +1663,7 @@ static void axienet_mac_link_up(struct phylink_config *config,
 
 static const struct phylink_mac_ops axienet_phylink_ops = {
        .validate = phylink_generic_validate,
-       .mac_pcs_get_state = axienet_mac_pcs_get_state,
-       .mac_an_restart = axienet_mac_an_restart,
-       .mac_prepare = axienet_mac_prepare,
+       .mac_select_pcs = axienet_mac_select_pcs,
        .mac_config = axienet_mac_config,
        .mac_link_down = axienet_mac_link_down,
        .mac_link_up = axienet_mac_link_up,
@@ -2079,12 +2077,12 @@ static int axienet_probe(struct platform_device *pdev)
                        ret = -EPROBE_DEFER;
                        goto cleanup_mdio;
                }
-               lp->phylink_config.pcs_poll = true;
+               lp->pcs.ops = &axienet_pcs_ops;
+               lp->pcs.poll = true;
        }
 
        lp->phylink_config.dev = &ndev->dev;
        lp->phylink_config.type = PHYLINK_NETDEV;
-       lp->phylink_config.legacy_pre_march2020 = true;
        lp->phylink_config.mac_capabilities = MAC_SYM_PAUSE | MAC_ASYM_PAUSE |
                MAC_10FD | MAC_100FD | MAC_1000FD;
 
index ebd2870..5805e4a 100644 (file)
@@ -1514,10 +1514,9 @@ acpi_find_extended_socket_device(acpi_handle obj_handle, u32 level,
 {
        struct acpi_device *device;
        bool *found = context;
-       int result;
 
-       result = acpi_bus_get_device(obj_handle, &device);
-       if (result)
+       device = acpi_fetch_acpi_dev(obj_handle);
+       if (!device)
                return AE_OK;
 
        if (strcmp(acpi_device_hid(device), ACPI_MOTHERBOARD_RESOURCE_HID))
index afa81a9..e675d10 100644 (file)
@@ -154,19 +154,15 @@ static void free_netvsc_device(struct rcu_head *head)
 
        kfree(nvdev->extension);
 
-       if (nvdev->recv_original_buf) {
-               hv_unmap_memory(nvdev->recv_buf);
+       if (nvdev->recv_original_buf)
                vfree(nvdev->recv_original_buf);
-       } else {
+       else
                vfree(nvdev->recv_buf);
-       }
 
-       if (nvdev->send_original_buf) {
-               hv_unmap_memory(nvdev->send_buf);
+       if (nvdev->send_original_buf)
                vfree(nvdev->send_original_buf);
-       } else {
+       else
                vfree(nvdev->send_buf);
-       }
 
        bitmap_free(nvdev->send_section_map);
 
@@ -765,6 +761,12 @@ void netvsc_device_remove(struct hv_device *device)
                netvsc_teardown_send_gpadl(device, net_device, ndev);
        }
 
+       if (net_device->recv_original_buf)
+               hv_unmap_memory(net_device->recv_buf);
+
+       if (net_device->send_original_buf)
+               hv_unmap_memory(net_device->send_buf);
+
        /* Release all resources */
        free_netvsc_device_rcu(net_device);
 }
@@ -1821,6 +1823,12 @@ cleanup:
        netif_napi_del(&net_device->chan_table[0].napi);
 
 cleanup2:
+       if (net_device->recv_original_buf)
+               hv_unmap_memory(net_device->recv_buf);
+
+       if (net_device->send_original_buf)
+               hv_unmap_memory(net_device->send_buf);
+
        free_netvsc_device(&net_device->rcu);
 
        return ERR_PTR(ret);
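
free_netvsc_device() runs as an RCU callback (via free_netvsc_device_rcu()), an atomic context where a vunmap-style teardown such as hv_unmap_memory() presumably must not run; the hunks above therefore move the unmap to the remove and error paths that execute before the free is deferred. Generic shape of that fix, with foo_* placeholders:

	/* teardown: do any sleeping cleanup before deferring the free */
	if (dev->original_buf)
		foo_unmap(dev->mapped_buf);	/* may sleep; not RCU-callback safe */

	call_rcu(&dev->rcu, foo_free_rcu);	/* callback may only free memory */
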
index 2f5e7b3..07bafbf 100644 (file)
@@ -74,81 +74,6 @@ struct atusb_chip_data {
        int (*set_txpower)(struct ieee802154_hw*, s32);
 };
 
-/* ----- USB commands without data ----------------------------------------- */
-
-/* To reduce the number of error checks in the code, we record the first error
- * in atusb->err and reject all subsequent requests until the error is cleared.
- */
-
-static int atusb_control_msg(struct atusb *atusb, unsigned int pipe,
-                            __u8 request, __u8 requesttype,
-                            __u16 value, __u16 index,
-                            void *data, __u16 size, int timeout)
-{
-       struct usb_device *usb_dev = atusb->usb_dev;
-       int ret;
-
-       if (atusb->err)
-               return atusb->err;
-
-       ret = usb_control_msg(usb_dev, pipe, request, requesttype,
-                             value, index, data, size, timeout);
-       if (ret < size) {
-               ret = ret < 0 ? ret : -ENODATA;
-
-               atusb->err = ret;
-               dev_err(&usb_dev->dev,
-                       "%s: req 0x%02x val 0x%x idx 0x%x, error %d\n",
-                       __func__, request, value, index, ret);
-       }
-       return ret;
-}
-
-static int atusb_command(struct atusb *atusb, u8 cmd, u8 arg)
-{
-       struct usb_device *usb_dev = atusb->usb_dev;
-
-       dev_dbg(&usb_dev->dev, "%s: cmd = 0x%x\n", __func__, cmd);
-       return atusb_control_msg(atusb, usb_sndctrlpipe(usb_dev, 0),
-                                cmd, ATUSB_REQ_TO_DEV, arg, 0, NULL, 0, 1000);
-}
-
-static int atusb_write_reg(struct atusb *atusb, u8 reg, u8 value)
-{
-       struct usb_device *usb_dev = atusb->usb_dev;
-
-       dev_dbg(&usb_dev->dev, "%s: 0x%02x <- 0x%02x\n", __func__, reg, value);
-       return atusb_control_msg(atusb, usb_sndctrlpipe(usb_dev, 0),
-                                ATUSB_REG_WRITE, ATUSB_REQ_TO_DEV,
-                                value, reg, NULL, 0, 1000);
-}
-
-static int atusb_read_reg(struct atusb *atusb, u8 reg)
-{
-       struct usb_device *usb_dev = atusb->usb_dev;
-       int ret;
-       u8 *buffer;
-       u8 value;
-
-       buffer = kmalloc(1, GFP_KERNEL);
-       if (!buffer)
-               return -ENOMEM;
-
-       dev_dbg(&usb_dev->dev, "%s: reg = 0x%x\n", __func__, reg);
-       ret = atusb_control_msg(atusb, usb_rcvctrlpipe(usb_dev, 0),
-                               ATUSB_REG_READ, ATUSB_REQ_FROM_DEV,
-                               0, reg, buffer, 1, 1000);
-
-       if (ret >= 0) {
-               value = buffer[0];
-               kfree(buffer);
-               return value;
-       } else {
-               kfree(buffer);
-               return ret;
-       }
-}
-
 static int atusb_write_subreg(struct atusb *atusb, u8 reg, u8 mask,
                              u8 shift, u8 value)
 {
@@ -158,7 +83,10 @@ static int atusb_write_subreg(struct atusb *atusb, u8 reg, u8 mask,
 
        dev_dbg(&usb_dev->dev, "%s: 0x%02x <- 0x%02x\n", __func__, reg, value);
 
-       orig = atusb_read_reg(atusb, reg);
+       ret = usb_control_msg_recv(usb_dev, 0, ATUSB_REG_READ, ATUSB_REQ_FROM_DEV,
+                                  0, reg, &orig, 1, 1000, GFP_KERNEL);
+       if (ret < 0)
+               return ret;
 
        /* Write the value only into that part of the register which is allowed
         * by the mask. All other bits stay as before.
@@ -167,7 +95,8 @@ static int atusb_write_subreg(struct atusb *atusb, u8 reg, u8 mask,
        tmp |= (value << shift) & mask;
 
        if (tmp != orig)
-               ret = atusb_write_reg(atusb, reg, tmp);
+               ret = usb_control_msg_send(usb_dev, 0, ATUSB_REG_WRITE, ATUSB_REQ_TO_DEV,
+                                          tmp, reg, NULL, 0, 1000, GFP_KERNEL);
 
        return ret;
 }
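
The comment above describes a plain read-modify-write on a subregister field. Worked through with illustrative values, mask = 0x0f, shift = 0, orig = 0xe3, value = 0x7:

	tmp = orig & ~mask;			/* 0xe3 & 0xf0 -> 0xe0 */
	tmp |= (value << shift) & mask;		/* 0xe0 | 0x07 -> 0xe7 */

Bits outside the mask (here 7:4) keep their previous contents, so only the selected field changes.
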
@@ -176,12 +105,16 @@ static int atusb_read_subreg(struct atusb *lp,
                             unsigned int addr, unsigned int mask,
                             unsigned int shift)
 {
-       int rc;
+       int reg, ret;
+
+       ret = usb_control_msg_recv(lp->usb_dev, 0, ATUSB_REG_READ, ATUSB_REQ_FROM_DEV,
+                                  0, addr, &reg, 1, 1000, GFP_KERNEL);
+       if (ret < 0)
+               return ret;
 
-       rc = atusb_read_reg(lp, addr);
-       rc = (rc & mask) >> shift;
+       reg = (reg & mask) >> shift;
 
-       return rc;
+       return reg;
 }
 
 static int atusb_get_and_clear_error(struct atusb *atusb)
@@ -419,16 +352,22 @@ static int atusb_set_hw_addr_filt(struct ieee802154_hw *hw,
                u16 addr = le16_to_cpu(filt->short_addr);
 
                dev_vdbg(dev, "%s called for saddr\n", __func__);
-               atusb_write_reg(atusb, RG_SHORT_ADDR_0, addr);
-               atusb_write_reg(atusb, RG_SHORT_ADDR_1, addr >> 8);
+               usb_control_msg_send(atusb->usb_dev, 0, ATUSB_REG_WRITE, ATUSB_REQ_TO_DEV,
+                                    addr, RG_SHORT_ADDR_0, NULL, 0, 1000, GFP_KERNEL);
+
+               usb_control_msg_send(atusb->usb_dev, 0, ATUSB_REG_WRITE, ATUSB_REQ_TO_DEV,
+                                    addr >> 8, RG_SHORT_ADDR_1, NULL, 0, 1000, GFP_KERNEL);
        }
 
        if (changed & IEEE802154_AFILT_PANID_CHANGED) {
                u16 pan = le16_to_cpu(filt->pan_id);
 
                dev_vdbg(dev, "%s called for pan id\n", __func__);
-               atusb_write_reg(atusb, RG_PAN_ID_0, pan);
-               atusb_write_reg(atusb, RG_PAN_ID_1, pan >> 8);
+               usb_control_msg_send(atusb->usb_dev, 0, ATUSB_REG_WRITE, ATUSB_REQ_TO_DEV,
+                                    pan, RG_PAN_ID_0, NULL, 0, 1000, GFP_KERNEL);
+
+               usb_control_msg_send(atusb->usb_dev, 0, ATUSB_REG_WRITE, ATUSB_REQ_TO_DEV,
+                                    pan >> 8, RG_PAN_ID_1, NULL, 0, 1000, GFP_KERNEL);
        }
 
        if (changed & IEEE802154_AFILT_IEEEADDR_CHANGED) {
@@ -437,7 +376,9 @@ static int atusb_set_hw_addr_filt(struct ieee802154_hw *hw,
                memcpy(addr, &filt->ieee_addr, IEEE802154_EXTENDED_ADDR_LEN);
                dev_vdbg(dev, "%s called for IEEE addr\n", __func__);
                for (i = 0; i < 8; i++)
-                       atusb_write_reg(atusb, RG_IEEE_ADDR_0 + i, addr[i]);
+                       usb_control_msg_send(atusb->usb_dev, 0, ATUSB_REG_WRITE, ATUSB_REQ_TO_DEV,
+                                            addr[i], RG_IEEE_ADDR_0 + i, NULL, 0,
+                                            1000, GFP_KERNEL);
        }
 
        if (changed & IEEE802154_AFILT_PANC_CHANGED) {
@@ -459,7 +400,8 @@ static int atusb_start(struct ieee802154_hw *hw)
 
        dev_dbg(&usb_dev->dev, "%s\n", __func__);
        schedule_delayed_work(&atusb->work, 0);
-       atusb_command(atusb, ATUSB_RX_MODE, 1);
+       usb_control_msg_send(atusb->usb_dev, 0, ATUSB_RX_MODE, ATUSB_REQ_TO_DEV, 1, 0,
+                            NULL, 0, 1000, GFP_KERNEL);
        ret = atusb_get_and_clear_error(atusb);
        if (ret < 0)
                usb_kill_anchored_urbs(&atusb->idle_urbs);
@@ -473,7 +415,8 @@ static void atusb_stop(struct ieee802154_hw *hw)
 
        dev_dbg(&usb_dev->dev, "%s\n", __func__);
        usb_kill_anchored_urbs(&atusb->idle_urbs);
-       atusb_command(atusb, ATUSB_RX_MODE, 0);
+       usb_control_msg_send(atusb->usb_dev, 0, ATUSB_RX_MODE, ATUSB_REQ_TO_DEV, 0, 0,
+                            NULL, 0, 1000, GFP_KERNEL);
        atusb_get_and_clear_error(atusb);
 }
 
@@ -580,9 +523,11 @@ atusb_set_cca_mode(struct ieee802154_hw *hw, const struct wpan_phy_cca *cca)
 
 static int hulusb_set_cca_ed_level(struct atusb *lp, int rssi_base_val)
 {
-       unsigned int cca_ed_thres;
+       int cca_ed_thres;
 
        cca_ed_thres = atusb_read_subreg(lp, SR_CCA_ED_THRES);
+       if (cca_ed_thres < 0)
+               return cca_ed_thres;
 
        switch (rssi_base_val) {
        case -98:
@@ -799,18 +744,13 @@ static int atusb_get_and_show_revision(struct atusb *atusb)
 {
        struct usb_device *usb_dev = atusb->usb_dev;
        char *hw_name;
-       unsigned char *buffer;
+       unsigned char buffer[3];
        int ret;
 
-       buffer = kmalloc(3, GFP_KERNEL);
-       if (!buffer)
-               return -ENOMEM;
-
        /* Get a couple of the ATMega Firmware values */
-       ret = atusb_control_msg(atusb, usb_rcvctrlpipe(usb_dev, 0),
-                               ATUSB_ID, ATUSB_REQ_FROM_DEV, 0, 0,
-                               buffer, 3, 1000);
-       if (ret >= 0) {
+       ret = usb_control_msg_recv(atusb->usb_dev, 0, ATUSB_ID, ATUSB_REQ_FROM_DEV, 0, 0,
+                                  buffer, 3, 1000, GFP_KERNEL);
+       if (!ret) {
                atusb->fw_ver_maj = buffer[0];
                atusb->fw_ver_min = buffer[1];
                atusb->fw_hw_type = buffer[2];
@@ -849,7 +789,6 @@ static int atusb_get_and_show_revision(struct atusb *atusb)
                dev_info(&usb_dev->dev, "Please update to version 0.2 or newer");
        }
 
-       kfree(buffer);
        return ret;
 }
 
@@ -863,7 +802,6 @@ static int atusb_get_and_show_build(struct atusb *atusb)
        if (!build)
                return -ENOMEM;
 
-       /* We cannot call atusb_control_msg() here, since this request may read various length data */
        ret = usb_control_msg(atusb->usb_dev, usb_rcvctrlpipe(usb_dev, 0), ATUSB_BUILD,
                              ATUSB_REQ_FROM_DEV, 0, 0, build, ATUSB_BUILD_SIZE, 1000);
        if (ret >= 0) {
@@ -881,14 +819,27 @@ static int atusb_get_and_conf_chip(struct atusb *atusb)
        u8 man_id_0, man_id_1, part_num, version_num;
        const char *chip;
        struct ieee802154_hw *hw = atusb->hw;
+       int ret;
 
-       man_id_0 = atusb_read_reg(atusb, RG_MAN_ID_0);
-       man_id_1 = atusb_read_reg(atusb, RG_MAN_ID_1);
-       part_num = atusb_read_reg(atusb, RG_PART_NUM);
-       version_num = atusb_read_reg(atusb, RG_VERSION_NUM);
+       ret = usb_control_msg_recv(usb_dev, 0, ATUSB_REG_READ, ATUSB_REQ_FROM_DEV,
+                                  0, RG_MAN_ID_0, &man_id_0, 1, 1000, GFP_KERNEL);
+       if (ret < 0)
+               return ret;
 
-       if (atusb->err)
-               return atusb->err;
+       ret = usb_control_msg_recv(usb_dev, 0, ATUSB_REG_READ, ATUSB_REQ_FROM_DEV,
+                                  0, RG_MAN_ID_1, &man_id_1, 1, 1000, GFP_KERNEL);
+       if (ret < 0)
+               return ret;
+
+       ret = usb_control_msg_recv(usb_dev, 0, ATUSB_REG_READ, ATUSB_REQ_FROM_DEV,
+                                  0, RG_PART_NUM, &part_num, 1, 1000, GFP_KERNEL);
+       if (ret < 0)
+               return ret;
+
+       ret = usb_control_msg_recv(usb_dev, 0, ATUSB_REG_READ, ATUSB_REQ_FROM_DEV,
+                                  0, RG_VERSION_NUM, &version_num, 1, 1000, GFP_KERNEL);
+       if (ret < 0)
+               return ret;
 
        hw->flags = IEEE802154_HW_TX_OMIT_CKSUM | IEEE802154_HW_AFILT |
                    IEEE802154_HW_PROMISCUOUS | IEEE802154_HW_CSMA_PARAMS;
@@ -969,7 +920,7 @@ fail:
 static int atusb_set_extended_addr(struct atusb *atusb)
 {
        struct usb_device *usb_dev = atusb->usb_dev;
-       unsigned char *buffer;
+       unsigned char buffer[IEEE802154_EXTENDED_ADDR_LEN];
        __le64 extended_addr;
        u64 addr;
        int ret;
@@ -982,18 +933,12 @@ static int atusb_set_extended_addr(struct atusb *atusb)
                return 0;
        }
 
-       buffer = kmalloc(IEEE802154_EXTENDED_ADDR_LEN, GFP_KERNEL);
-       if (!buffer)
-               return -ENOMEM;
-
        /* Firmware is new enough so we fetch the address from EEPROM */
-       ret = atusb_control_msg(atusb, usb_rcvctrlpipe(usb_dev, 0),
-                               ATUSB_EUI64_READ, ATUSB_REQ_FROM_DEV, 0, 0,
-                               buffer, IEEE802154_EXTENDED_ADDR_LEN, 1000);
+       ret = usb_control_msg_recv(atusb->usb_dev, 0, ATUSB_EUI64_READ, ATUSB_REQ_FROM_DEV, 0, 0,
+                                  buffer, IEEE802154_EXTENDED_ADDR_LEN, 1000, GFP_KERNEL);
        if (ret < 0) {
                dev_err(&usb_dev->dev, "failed to fetch extended address, random address set\n");
                ieee802154_random_extended_addr(&atusb->hw->phy->perm_extended_addr);
-               kfree(buffer);
                return ret;
        }
 
@@ -1009,7 +954,6 @@ static int atusb_set_extended_addr(struct atusb *atusb)
                         &addr);
        }
 
-       kfree(buffer);
        return ret;
 }
 
@@ -1051,7 +995,8 @@ static int atusb_probe(struct usb_interface *interface,
 
        hw->parent = &usb_dev->dev;
 
-       atusb_command(atusb, ATUSB_RF_RESET, 0);
+       usb_control_msg_send(atusb->usb_dev, 0, ATUSB_RF_RESET, ATUSB_REQ_TO_DEV, 0, 0,
+                            NULL, 0, 1000, GFP_KERNEL);
        atusb_get_and_conf_chip(atusb);
        atusb_get_and_show_revision(atusb);
        atusb_get_and_show_build(atusb);
@@ -1076,7 +1021,9 @@ static int atusb_probe(struct usb_interface *interface,
         * explicitly. Any resets after that will send us straight to TRX_OFF,
         * making the command below redundant.
         */
-       atusb_write_reg(atusb, RG_TRX_STATE, STATE_FORCE_TRX_OFF);
+       usb_control_msg_send(atusb->usb_dev, 0, ATUSB_REG_WRITE, ATUSB_REQ_TO_DEV,
+                            STATE_FORCE_TRX_OFF, RG_TRX_STATE, NULL, 0, 1000, GFP_KERNEL);
+
        msleep(1);      /* reset => TRX_OFF, tTR13 = 37 us */
 
 #if 0
@@ -1104,7 +1051,8 @@ static int atusb_probe(struct usb_interface *interface,
 
        atusb_write_subreg(atusb, SR_RX_SAFE_MODE, 1);
 #endif
-       atusb_write_reg(atusb, RG_IRQ_MASK, 0xff);
+       usb_control_msg_send(atusb->usb_dev, 0, ATUSB_REG_WRITE, ATUSB_REQ_TO_DEV,
+                            0xff, RG_IRQ_MASK, NULL, 0, 1000, GFP_KERNEL);
 
        ret = atusb_get_and_clear_error(atusb);
        if (!ret)
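
Throughout this file the bespoke atusb_control_msg()/atusb_read_reg()/atusb_write_reg() wrappers give way to the USB core helpers, which accept on-stack buffers (they bounce through an internal allocation) and return 0 on a complete transfer rather than a byte count. Minimal sketch of the two calls, with VENDOR_REQ_* and REG_FOO as placeholder request and register names:

	u8 val;
	int ret;

	ret = usb_control_msg_recv(udev, 0, VENDOR_REQ_READ,
				   USB_DIR_IN | USB_TYPE_VENDOR | USB_RECIP_DEVICE,
				   0, REG_FOO, &val, 1, 1000, GFP_KERNEL);
	if (ret)		/* 0 on success, negative errno otherwise */
		return ret;

	ret = usb_control_msg_send(udev, 0, VENDOR_REQ_WRITE,
				   USB_DIR_OUT | USB_TYPE_VENDOR | USB_RECIP_DEVICE,
				   val | 0x01, REG_FOO, NULL, 0, 1000, GFP_KERNEL);
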
index 36f1c5a..38c217b 100644 (file)
@@ -791,7 +791,7 @@ static int hwsim_add_one(struct genl_info *info, struct device *dev,
        phy->idx = idx;
        INIT_LIST_HEAD(&phy->edges);
 
-       hw->flags = IEEE802154_HW_PROMISCUOUS;
+       hw->flags = IEEE802154_HW_PROMISCUOUS | IEEE802154_HW_RX_DROP_BAD_CKSUM;
        hw->parent = dev;
 
        err = ieee802154_register_hw(hw);
index 1544564..87e1d43 100644 (file)
@@ -320,6 +320,17 @@ gsi_trans_tre_release(struct gsi_trans_info *trans_info, u32 tre_count)
        atomic_add(tre_count, &trans_info->tre_avail);
 }
 
+/* Return true if no transactions are allocated, false otherwise */
+bool gsi_channel_trans_idle(struct gsi *gsi, u32 channel_id)
+{
+       u32 tre_max = gsi_channel_tre_max(gsi, channel_id);
+       struct gsi_trans_info *trans_info;
+
+       trans_info = &gsi->channel[channel_id].trans_info;
+
+       return atomic_read(&trans_info->tre_avail) == tre_max;
+}
+
 /* Allocate a GSI transaction on a channel */
 struct gsi_trans *gsi_channel_trans_alloc(struct gsi *gsi, u32 channel_id,
                                          u32 tre_count,
index 17fd182..af379b4 100644 (file)
@@ -130,6 +130,16 @@ void *gsi_trans_pool_alloc_dma(struct gsi_trans_pool *pool, dma_addr_t *addr);
 void gsi_trans_pool_exit_dma(struct device *dev, struct gsi_trans_pool *pool);
 
 /**
+ * gsi_channel_trans_idle() - Return whether no transactions are allocated
+ * @gsi:       GSI pointer
+ * @channel_id:        Channel to check for allocated transactions
+ *
+ * Return:     True if no transactions are allocated, false otherwise
+ *
+ */
+bool gsi_channel_trans_idle(struct gsi *gsi, u32 channel_id);
+
+/**
  * gsi_channel_trans_alloc() - Allocate a GSI transaction on a channel
  * @gsi:       GSI pointer
  * @channel_id:        Channel the transaction is associated with
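
A hypothetical caller of the new predicate, to show the intent: tre_avail climbs back to tre_max only once every transaction on the channel has been freed, so the check doubles as an is-anything-in-flight test before reconfiguration.

	/* hypothetical: reconfigure only while nothing is in flight */
	if (!gsi_channel_trans_idle(&ipa->gsi, endpoint->channel_id))
		return -EBUSY;

	/* no allocated transaction can reference the old settings now */
	foo_update_endpoint_config(endpoint);	/* placeholder */
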
index 06ddb85..8ff351a 100644 (file)
@@ -101,6 +101,7 @@ static const struct ipa_gsi_endpoint_data ipa_gsi_endpoint_data[] = {
                                .aggregation    = true,
                                .status_enable  = true,
                                .rx = {
+                                       .buffer_size    = 8192,
                                        .pad_align      = ilog2(sizeof(u32)),
                                },
                        },
@@ -148,6 +149,7 @@ static const struct ipa_gsi_endpoint_data ipa_gsi_endpoint_data[] = {
                                .qmap           = true,
                                .aggregation    = true,
                                .rx = {
+                                       .buffer_size    = 8192,
                                        .aggr_close_eof = true,
                                },
                        },
index 760c22b..d1c466a 100644 (file)
@@ -92,6 +92,7 @@ static const struct ipa_gsi_endpoint_data ipa_gsi_endpoint_data[] = {
                                .aggregation    = true,
                                .status_enable  = true,
                                .rx = {
+                                       .buffer_size    = 8192,
                                        .pad_align      = ilog2(sizeof(u32)),
                                },
                        },
@@ -140,6 +141,7 @@ static const struct ipa_gsi_endpoint_data ipa_gsi_endpoint_data[] = {
                                .qmap           = true,
                                .aggregation    = true,
                                .rx = {
+                                       .buffer_size    = 8192,
                                        .aggr_close_eof = true,
                                },
                        },
index fea9145..b1991cc 100644 (file)
@@ -86,6 +86,7 @@ static const struct ipa_gsi_endpoint_data ipa_gsi_endpoint_data[] = {
                                .aggregation    = true,
                                .status_enable  = true,
                                .rx = {
+                                       .buffer_size    = 8192,
                                        .pad_align      = ilog2(sizeof(u32)),
                                },
                        },
@@ -133,6 +134,7 @@ static const struct ipa_gsi_endpoint_data ipa_gsi_endpoint_data[] = {
                                .qmap           = true,
                                .aggregation    = true,
                                .rx = {
+                                       .buffer_size    = 32768,
                                        .aggr_close_eof = true,
                                },
                        },
index 2a231e7..1190a43 100644 (file)
@@ -82,6 +82,7 @@ static const struct ipa_gsi_endpoint_data ipa_gsi_endpoint_data[] = {
                                .aggregation    = true,
                                .status_enable  = true,
                                .rx = {
+                                       .buffer_size    = 8192,
                                        .pad_align      = ilog2(sizeof(u32)),
                                },
                        },
@@ -130,6 +131,7 @@ static const struct ipa_gsi_endpoint_data ipa_gsi_endpoint_data[] = {
                                .qmap           = true,
                                .aggregation    = true,
                                .rx = {
+                                       .buffer_size    = 8192,
                                        .aggr_close_eof = true,
                                },
                        },
index 2da2c41..944f72b 100644 (file)
@@ -95,6 +95,7 @@ static const struct ipa_gsi_endpoint_data ipa_gsi_endpoint_data[] = {
                                .aggregation    = true,
                                .status_enable  = true,
                                .rx = {
+                                       .buffer_size    = 8192,
                                        .pad_align      = ilog2(sizeof(u32)),
                                },
                        },
@@ -142,6 +143,7 @@ static const struct ipa_gsi_endpoint_data ipa_gsi_endpoint_data[] = {
                                .qmap           = true,
                                .aggregation    = true,
                                .rx = {
+                                       .buffer_size    = 8192,
                                        .aggr_close_eof = true,
                                },
                        },
index 2421b5a..16786bf 100644 (file)
@@ -87,6 +87,7 @@ static const struct ipa_gsi_endpoint_data ipa_gsi_endpoint_data[] = {
                                .aggregation    = true,
                                .status_enable  = true,
                                .rx = {
+                                       .buffer_size    = 8192,
                                        .pad_align      = ilog2(sizeof(u32)),
                                },
                        },
@@ -134,6 +135,7 @@ static const struct ipa_gsi_endpoint_data ipa_gsi_endpoint_data[] = {
                                .qmap           = true,
                                .aggregation    = true,
                                .rx = {
+                                       .buffer_size    = 8192,
                                        .aggr_close_eof = true,
                                },
                        },
index 6d329e9..dbbeecf 100644 (file)
@@ -112,6 +112,7 @@ struct ipa_endpoint_tx_data {
 
 /**
  * struct ipa_endpoint_rx_data - configuration data for RX endpoints
+ * @buffer_size: requested receive buffer size (bytes)
  * @pad_align: power-of-2 boundary to which packet payload is aligned
  * @aggr_close_eof: whether aggregation closes on end-of-frame
  *
@@ -125,6 +126,7 @@ struct ipa_endpoint_tx_data {
  * a "frame" consisting of several transfers has ended.
  */
 struct ipa_endpoint_rx_data {
+       u32 buffer_size;
        u32 pad_align;
        bool aggr_close_eof;
 };
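An illustrative endpoint definition using the new field, patterned on the ipa_data-v*.c updates above (the values are hypothetical, not taken from any specific platform):

    .endpoint = {
            .config = {
                    .aggregation    = true,
                    .rx = {
                            .buffer_size    = 8192, /* MTU + skb overhead */
                            .aggr_close_eof = true,
                    },
            },
    },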
index 68291a3..888e942 100644 (file)
 
 #define atomic_dec_not_zero(v) atomic_add_unless((v), -1, 0)
 
-#define IPA_REPLENISH_BATCH    16
-
-/* RX buffer is 1 page (or a power-of-2 contiguous pages) */
-#define IPA_RX_BUFFER_SIZE     8192    /* PAGE_SIZE > 4096 wastes a LOT */
+/* Hardware is told about receive buffers once a "batch" has been queued */
+#define IPA_REPLENISH_BATCH    16              /* Must be non-zero */
 
 /* The amount of RX buffer space consumed by standard skb overhead */
 #define IPA_RX_BUFFER_OVERHEAD (PAGE_SIZE - SKB_MAX_ORDER(NET_SKB_PAD, 0))
@@ -75,6 +73,14 @@ struct ipa_status {
 #define IPA_STATUS_FLAGS1_RT_RULE_ID_FMASK     GENMASK(31, 22)
 #define IPA_STATUS_FLAGS2_TAG_FMASK            GENMASK_ULL(63, 16)
 
+static u32 aggr_byte_limit_max(enum ipa_version version)
+{
+       if (version < IPA_VERSION_4_5)
+               return field_max(aggr_byte_limit_fmask(true));
+
+       return field_max(aggr_byte_limit_fmask(false));
+}
+
 static bool ipa_endpoint_data_valid_one(struct ipa *ipa, u32 count,
                            const struct ipa_gsi_endpoint_data *all_data,
                            const struct ipa_gsi_endpoint_data *data)
@@ -87,6 +93,9 @@ static bool ipa_endpoint_data_valid_one(struct ipa *ipa, u32 count,
                return true;
 
        if (!data->toward_ipa) {
+               u32 buffer_size;
+               u32 limit;
+
                if (data->endpoint.filter_support) {
                        dev_err(dev, "filtering not supported for "
                                        "RX endpoint %u\n",
@@ -94,6 +103,41 @@ static bool ipa_endpoint_data_valid_one(struct ipa *ipa, u32 count,
                        return false;
                }
 
+               /* Nothing more to check for non-AP RX */
+               if (data->ee_id != GSI_EE_AP)
+                       return true;
+
+               buffer_size = data->endpoint.config.rx.buffer_size;
+               /* The buffer size must hold an MTU plus overhead */
+               limit = IPA_MTU + IPA_RX_BUFFER_OVERHEAD;
+               if (buffer_size < limit) {
+                       dev_err(dev, "RX buffer size too small for RX endpoint %u (%u < %u)\n",
+                               data->endpoint_id, buffer_size, limit);
+                       return false;
+               }
+
+               /* For an endpoint supporting receive aggregation, the
+                * aggregation byte limit defines the point at which an
+                * aggregation window will close.  It is programmed into the
+                * IPA hardware as a number of KB.  We don't use "hard byte
+                * limit" aggregation, so we need to supply enough space in
+                * a receive buffer to hold a complete MTU plus normal skb
+                * overhead *after* that aggregation byte limit has been
+                * crossed.
+                *
+                * This check just ensures the receive buffer size doesn't
+                * exceed what's representable in the aggregation limit field.
+                */
+               if (data->endpoint.config.aggregation) {
+                       limit += SZ_1K * aggr_byte_limit_max(ipa->version);
+                       if (buffer_size > limit) {
+                               dev_err(dev, "RX buffer size too large for aggregated RX endpoint %u (%u > %u)\n",
+                                       data->endpoint_id, buffer_size, limit);
+
+                               return false;
+                       }
+               }
+
                return true;    /* Nothing more to check for RX */
        }
 
@@ -156,21 +200,12 @@ static bool ipa_endpoint_data_valid_one(struct ipa *ipa, u32 count,
        return true;
 }
 
-static u32 aggr_byte_limit_max(enum ipa_version version)
-{
-       if (version < IPA_VERSION_4_5)
-               return field_max(aggr_byte_limit_fmask(true));
-
-       return field_max(aggr_byte_limit_fmask(false));
-}
-
 static bool ipa_endpoint_data_valid(struct ipa *ipa, u32 count,
                                    const struct ipa_gsi_endpoint_data *data)
 {
        const struct ipa_gsi_endpoint_data *dp = data;
        struct device *dev = &ipa->pdev->dev;
        enum ipa_endpoint_name name;
-       u32 limit;
 
        if (count > IPA_ENDPOINT_COUNT) {
                dev_err(dev, "too many endpoints specified (%u > %u)\n",
@@ -178,26 +213,6 @@ static bool ipa_endpoint_data_valid(struct ipa *ipa, u32 count,
                return false;
        }
 
-       /* The aggregation byte limit defines the point at which an
-        * aggregation window will close.  It is programmed into the
-        * IPA hardware as a number of KB.  We don't use "hard byte
-        * limit" aggregation, which means that we need to supply
-        * enough space in a receive buffer to hold a complete MTU
-        * plus normal skb overhead *after* that aggregation byte
-        * limit has been crossed.
-        *
-        * This check ensures we don't define a receive buffer size
-        * that would exceed what we can represent in the field that
-        * is used to program its size.
-        */
-       limit = aggr_byte_limit_max(ipa->version) * SZ_1K;
-       limit += IPA_MTU + IPA_RX_BUFFER_OVERHEAD;
-       if (limit < IPA_RX_BUFFER_SIZE) {
-               dev_err(dev, "buffer size too big for aggregation (%u > %u)\n",
-                       IPA_RX_BUFFER_SIZE, limit);
-               return false;
-       }
-
        /* Make sure needed endpoints have defined data */
        if (ipa_gsi_endpoint_data_empty(&data[IPA_ENDPOINT_AP_COMMAND_TX])) {
                dev_err(dev, "command TX endpoint not defined\n");
@@ -723,13 +738,15 @@ static void ipa_endpoint_init_aggr(struct ipa_endpoint *endpoint)
 
        if (endpoint->data->aggregation) {
                if (!endpoint->toward_ipa) {
+                       const struct ipa_endpoint_rx_data *rx_data;
                        bool close_eof;
                        u32 limit;
 
+                       rx_data = &endpoint->data->rx;
                        val |= u32_encode_bits(IPA_ENABLE_AGGR, AGGR_EN_FMASK);
                        val |= u32_encode_bits(IPA_GENERIC, AGGR_TYPE_FMASK);
 
-                       limit = ipa_aggr_size_kb(IPA_RX_BUFFER_SIZE);
+                       limit = ipa_aggr_size_kb(rx_data->buffer_size);
                        val |= aggr_byte_limit_encoded(version, limit);
 
                        limit = IPA_AGGR_TIME_LIMIT;
@@ -737,7 +754,7 @@ static void ipa_endpoint_init_aggr(struct ipa_endpoint *endpoint)
 
                        /* AGGR_PKT_LIMIT is 0 (unlimited) */
 
-                       close_eof = endpoint->data->rx.aggr_close_eof;
+                       close_eof = rx_data->aggr_close_eof;
                        val |= aggr_sw_eof_active_encoded(version, close_eof);
 
                        /* AGGR_HARD_BYTE_LIMIT_ENABLE is 0 */
@@ -1020,134 +1037,98 @@ static void ipa_endpoint_status(struct ipa_endpoint *endpoint)
        iowrite32(val, ipa->reg_virt + offset);
 }
 
-static int ipa_endpoint_replenish_one(struct ipa_endpoint *endpoint)
+static int ipa_endpoint_replenish_one(struct ipa_endpoint *endpoint,
+                                     struct gsi_trans *trans)
 {
-       struct gsi_trans *trans;
-       bool doorbell = false;
        struct page *page;
+       u32 buffer_size;
        u32 offset;
        u32 len;
        int ret;
 
-       page = dev_alloc_pages(get_order(IPA_RX_BUFFER_SIZE));
+       buffer_size = endpoint->data->rx.buffer_size;
+       page = dev_alloc_pages(get_order(buffer_size));
        if (!page)
                return -ENOMEM;
 
-       trans = ipa_endpoint_trans_alloc(endpoint, 1);
-       if (!trans)
-               goto err_free_pages;
-
        /* Offset the buffer to make space for skb headroom */
        offset = NET_SKB_PAD;
-       len = IPA_RX_BUFFER_SIZE - offset;
+       len = buffer_size - offset;
 
        ret = gsi_trans_page_add(trans, page, len, offset);
        if (ret)
-               goto err_trans_free;
-       trans->data = page;     /* transaction owns page now */
-
-       if (++endpoint->replenish_ready == IPA_REPLENISH_BATCH) {
-               doorbell = true;
-               endpoint->replenish_ready = 0;
-       }
-
-       gsi_trans_commit(trans, doorbell);
-
-       return 0;
-
-err_trans_free:
-       gsi_trans_free(trans);
-err_free_pages:
-       __free_pages(page, get_order(IPA_RX_BUFFER_SIZE));
+               __free_pages(page, get_order(buffer_size));
+       else
+               trans->data = page;     /* transaction owns page now */
 
-       return -ENOMEM;
+       return ret;
 }
 
 /**
  * ipa_endpoint_replenish() - Replenish endpoint receive buffers
  * @endpoint:  Endpoint to be replenished
- * @add_one:   Whether this is replacing a just-consumed buffer
  *
  * The IPA hardware can hold a fixed number of receive buffers for an RX
  * endpoint, based on the number of entries in the underlying channel ring
  * buffer.  If an endpoint's "backlog" is non-zero, it indicates how many
  * more receive buffers can be supplied to the hardware.  Replenishing for
- * an endpoint can be disabled, in which case requests to replenish a
- * buffer are "saved", and transferred to the backlog once it is re-enabled
- * again.
+ * an endpoint can be disabled, in which case buffers are not queued to
+ * the hardware.
  */
-static void ipa_endpoint_replenish(struct ipa_endpoint *endpoint, bool add_one)
+static void ipa_endpoint_replenish(struct ipa_endpoint *endpoint)
 {
-       struct gsi *gsi;
-       u32 backlog;
-       int delta;
+       struct gsi_trans *trans;
 
-       if (!test_bit(IPA_REPLENISH_ENABLED, endpoint->replenish_flags)) {
-               if (add_one)
-                       atomic_inc(&endpoint->replenish_saved);
+       if (!test_bit(IPA_REPLENISH_ENABLED, endpoint->replenish_flags))
                return;
-       }
 
-       /* If already active, just update the backlog */
-       if (test_and_set_bit(IPA_REPLENISH_ACTIVE, endpoint->replenish_flags)) {
-               if (add_one)
-                       atomic_inc(&endpoint->replenish_backlog);
+       /* Skip it if it's already active */
+       if (test_and_set_bit(IPA_REPLENISH_ACTIVE, endpoint->replenish_flags))
                return;
-       }
 
-       while (atomic_dec_not_zero(&endpoint->replenish_backlog))
-               if (ipa_endpoint_replenish_one(endpoint))
+       while ((trans = ipa_endpoint_trans_alloc(endpoint, 1))) {
+               bool doorbell;
+
+               if (ipa_endpoint_replenish_one(endpoint, trans))
                        goto try_again_later;
 
-       clear_bit(IPA_REPLENISH_ACTIVE, endpoint->replenish_flags);
 
-       if (add_one)
-               atomic_inc(&endpoint->replenish_backlog);
+               /* Ring the doorbell if we've got a full batch */
+               doorbell = !(++endpoint->replenish_count % IPA_REPLENISH_BATCH);
+               gsi_trans_commit(trans, doorbell);
+       }
+
+       clear_bit(IPA_REPLENISH_ACTIVE, endpoint->replenish_flags);
 
        return;
 
 try_again_later:
+       gsi_trans_free(trans);
        clear_bit(IPA_REPLENISH_ACTIVE, endpoint->replenish_flags);
 
-       /* The last one didn't succeed, so fix the backlog */
-       delta = add_one ? 2 : 1;
-       backlog = atomic_add_return(delta, &endpoint->replenish_backlog);
-
        /* Whenever a receive buffer transaction completes we'll try to
         * replenish again.  It's unlikely, but if we fail to supply even
         * one buffer, nothing will trigger another replenish attempt.
-        * Receive buffer transactions use one TRE, so schedule work to
-        * try replenishing again if our backlog is *all* available TREs.
+        * If the hardware has no receive buffers queued, schedule work to
+        * try replenishing again.
         */
-       gsi = &endpoint->ipa->gsi;
-       if (backlog == gsi_channel_tre_max(gsi, endpoint->channel_id))
+       if (gsi_channel_trans_idle(&endpoint->ipa->gsi, endpoint->channel_id))
                schedule_delayed_work(&endpoint->replenish_work,
                                      msecs_to_jiffies(1));
 }
 
 static void ipa_endpoint_replenish_enable(struct ipa_endpoint *endpoint)
 {
-       struct gsi *gsi = &endpoint->ipa->gsi;
-       u32 max_backlog;
-       u32 saved;
-
        set_bit(IPA_REPLENISH_ENABLED, endpoint->replenish_flags);
-       while ((saved = atomic_xchg(&endpoint->replenish_saved, 0)))
-               atomic_add(saved, &endpoint->replenish_backlog);
 
        /* Start replenishing if hardware currently has no buffers */
-       max_backlog = gsi_channel_tre_max(gsi, endpoint->channel_id);
-       if (atomic_read(&endpoint->replenish_backlog) == max_backlog)
-               ipa_endpoint_replenish(endpoint, false);
+       if (gsi_channel_trans_idle(&endpoint->ipa->gsi, endpoint->channel_id))
+               ipa_endpoint_replenish(endpoint);
 }
 
 static void ipa_endpoint_replenish_disable(struct ipa_endpoint *endpoint)
 {
-       u32 backlog;
-
        clear_bit(IPA_REPLENISH_ENABLED, endpoint->replenish_flags);
-       while ((backlog = atomic_xchg(&endpoint->replenish_backlog, 0)))
-               atomic_add(backlog, &endpoint->replenish_saved);
 }
 
 static void ipa_endpoint_replenish_work(struct work_struct *work)
@@ -1157,7 +1138,7 @@ static void ipa_endpoint_replenish_work(struct work_struct *work)
 
        endpoint = container_of(dwork, struct ipa_endpoint, replenish_work);
 
-       ipa_endpoint_replenish(endpoint, false);
+       ipa_endpoint_replenish(endpoint);
 }
 
 static void ipa_endpoint_skb_copy(struct ipa_endpoint *endpoint,
@@ -1183,15 +1164,16 @@ static void ipa_endpoint_skb_copy(struct ipa_endpoint *endpoint,
 static bool ipa_endpoint_skb_build(struct ipa_endpoint *endpoint,
                                   struct page *page, u32 len)
 {
+       u32 buffer_size = endpoint->data->rx.buffer_size;
        struct sk_buff *skb;
 
        /* Nothing to do if there's no netdev */
        if (!endpoint->netdev)
                return false;
 
-       WARN_ON(len > SKB_WITH_OVERHEAD(IPA_RX_BUFFER_SIZE - NET_SKB_PAD));
+       WARN_ON(len > SKB_WITH_OVERHEAD(buffer_size - NET_SKB_PAD));
 
-       skb = build_skb(page_address(page), IPA_RX_BUFFER_SIZE);
+       skb = build_skb(page_address(page), buffer_size);
        if (skb) {
                /* Reserve the headroom and account for the data */
                skb_reserve(skb, NET_SKB_PAD);
@@ -1289,8 +1271,9 @@ static bool ipa_endpoint_status_drop(struct ipa_endpoint *endpoint,
 static void ipa_endpoint_status_parse(struct ipa_endpoint *endpoint,
                                      struct page *page, u32 total_len)
 {
+       u32 buffer_size = endpoint->data->rx.buffer_size;
        void *data = page_address(page) + NET_SKB_PAD;
-       u32 unused = IPA_RX_BUFFER_SIZE - total_len;
+       u32 unused = buffer_size - total_len;
        u32 resid = total_len;
 
        while (resid) {
@@ -1360,10 +1343,8 @@ static void ipa_endpoint_rx_complete(struct ipa_endpoint *endpoint,
 {
        struct page *page;
 
-       ipa_endpoint_replenish(endpoint, true);
-
        if (trans->cancelled)
-               return;
+               goto done;
 
        /* Parse or build a socket buffer using the actual received length */
        page = trans->data;
@@ -1371,6 +1352,8 @@ static void ipa_endpoint_rx_complete(struct ipa_endpoint *endpoint,
                ipa_endpoint_status_parse(endpoint, page, trans->len);
        else if (ipa_endpoint_skb_build(endpoint, page, trans->len))
                trans->data = NULL;     /* Pages have been consumed */
+done:
+       ipa_endpoint_replenish(endpoint);
 }
 
 void ipa_endpoint_trans_complete(struct ipa_endpoint *endpoint,
@@ -1398,8 +1381,11 @@ void ipa_endpoint_trans_release(struct ipa_endpoint *endpoint,
        } else {
                struct page *page = trans->data;
 
-               if (page)
-                       __free_pages(page, get_order(IPA_RX_BUFFER_SIZE));
+               if (page) {
+                       u32 buffer_size = endpoint->data->rx.buffer_size;
+
+                       __free_pages(page, get_order(buffer_size));
+               }
        }
 }
 
@@ -1704,9 +1690,6 @@ static void ipa_endpoint_setup_one(struct ipa_endpoint *endpoint)
                 */
                clear_bit(IPA_REPLENISH_ENABLED, endpoint->replenish_flags);
                clear_bit(IPA_REPLENISH_ACTIVE, endpoint->replenish_flags);
-               atomic_set(&endpoint->replenish_saved,
-                          gsi_channel_tre_max(gsi, endpoint->channel_id));
-               atomic_set(&endpoint->replenish_backlog, 0);
                INIT_DELAYED_WORK(&endpoint->replenish_work,
                                  ipa_endpoint_replenish_work);
        }
@@ -1882,6 +1865,8 @@ u32 ipa_endpoint_init(struct ipa *ipa, u32 count,
        enum ipa_endpoint_name name;
        u32 filter_map;
 
+       BUILD_BUG_ON(!IPA_REPLENISH_BATCH);
+
        if (!ipa_endpoint_data_valid(ipa, count, data))
                return 0;       /* Error */
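The RX buffer validation added above condenses to a pair of bounds; a restatement for clarity (a sketch, not code from the patch):

    /* Hypothetical condensed form of the checks in
     * ipa_endpoint_data_valid_one() above.
     */
    static bool rx_buffer_size_ok(struct ipa *ipa, bool aggregated, u32 size)
    {
            u32 limit = IPA_MTU + IPA_RX_BUFFER_OVERHEAD;

            if (size < limit)       /* must hold a full MTU plus overhead */
                    return false;
            if (!aggregated)
                    return true;
            /* aggregation limit is programmed in units of 1 KB; leave room
             * for an MTU plus overhead after the limit is crossed
             */
            return size <= limit + SZ_1K * aggr_byte_limit_max(ipa->version);
    }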
 
index 0313cdc..12fd5b1 100644 (file)
@@ -65,9 +65,7 @@ enum ipa_replenish_flag {
  * @evt_ring_id:       GSI event ring used by the endpoint
  * @netdev:            Network device pointer, if endpoint uses one
  * @replenish_flags:   Replenishing state flags
- * @replenish_ready:   Number of replenish transactions without doorbell
- * @replenish_saved:   Replenish requests held while disabled
- * @replenish_backlog: Number of buffers needed to fill hardware queue
+ * @replenish_count:   Total number of replenish transactions committed
  * @replenish_work:    Work item used for repeated replenish failures
  */
 struct ipa_endpoint {
@@ -86,9 +84,7 @@ struct ipa_endpoint {
 
        /* Receive buffer replenishing for RX endpoints */
        DECLARE_BITMAP(replenish_flags, IPA_REPLENISH_COUNT);
-       u32 replenish_ready;
-       atomic_t replenish_saved;
-       atomic_t replenish_backlog;
+       u64 replenish_count;
        struct delayed_work replenish_work;             /* global wq */
 };
 
index 7ab4e26..7aafc22 100644 (file)
@@ -285,7 +285,8 @@ static acpi_status acpi_register_phy(acpi_handle handle, u32 lvl,
        const union acpi_object *obj;
        u32 phy_addr;
 
-       if (acpi_bus_get_device(handle, &adev))
+       adev = acpi_fetch_acpi_dev(handle);
+       if (!adev)
                return AE_OK;
 
        if (acpi_dev_get_property(adev, "phy-channel", ACPI_TYPE_INTEGER, &obj))
index cd6742e..61418d4 100644 (file)
@@ -632,35 +632,43 @@ static void xpcs_resolve_pma(struct dw_xpcs *xpcs,
        }
 }
 
-void xpcs_validate(struct dw_xpcs *xpcs, unsigned long *supported,
-                  struct phylink_link_state *state)
+static int xpcs_validate(struct phylink_pcs *pcs, unsigned long *supported,
+                        const struct phylink_link_state *state)
 {
-       __ETHTOOL_DECLARE_LINK_MODE_MASK(xpcs_supported);
+       __ETHTOOL_DECLARE_LINK_MODE_MASK(xpcs_supported) = { 0, };
        const struct xpcs_compat *compat;
+       struct dw_xpcs *xpcs;
        int i;
 
-       /* phylink expects us to report all supported modes with
-        * PHY_INTERFACE_MODE_NA, just don't limit the supported and
-        * advertising masks and exit.
-        */
-       if (state->interface == PHY_INTERFACE_MODE_NA)
-               return;
-
-       linkmode_zero(xpcs_supported);
-
+       xpcs = phylink_pcs_to_xpcs(pcs);
        compat = xpcs_find_compat(xpcs->id, state->interface);
 
-       /* Populate the supported link modes for this
-        * PHY interface type
+       /* Populate the supported link modes for this PHY interface type.
+        * FIXME: what about the port modes and autoneg bit? This masks
+        * all those away.
         */
        if (compat)
                for (i = 0; compat->supported[i] != __ETHTOOL_LINK_MODE_MASK_NBITS; i++)
                        set_bit(compat->supported[i], xpcs_supported);
 
        linkmode_and(supported, supported, xpcs_supported);
-       linkmode_and(state->advertising, state->advertising, xpcs_supported);
+
+       return 0;
+}
+
+void xpcs_get_interfaces(struct dw_xpcs *xpcs, unsigned long *interfaces)
+{
+       int i, j;
+
+       for (i = 0; i < DW_XPCS_INTERFACE_MAX; i++) {
+               const struct xpcs_compat *compat = &xpcs->id->compat[i];
+
+               for (j = 0; j < compat->num_interfaces; j++)
+                       if (compat->interface[j] < PHY_INTERFACE_MODE_MAX)
+                               __set_bit(compat->interface[j], interfaces);
+       }
 }
-EXPORT_SYMBOL_GPL(xpcs_validate);
+EXPORT_SYMBOL_GPL(xpcs_get_interfaces);
 
 int xpcs_config_eee(struct dw_xpcs *xpcs, int mult_fact_100ns, int enable)
 {
@@ -1106,6 +1114,7 @@ static const struct xpcs_id xpcs_id_list[] = {
 };
 
 static const struct phylink_pcs_ops xpcs_phylink_ops = {
+       .pcs_validate = xpcs_validate,
        .pcs_config = xpcs_config,
        .pcs_get_state = xpcs_get_state,
        .pcs_link_up = xpcs_link_up,
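Callers are expected to feed the resulting bitmap into phylink's supported_interfaces mask; a minimal sketch, assuming a hypothetical driver private structure:

    /* Advertise every interface mode this XPCS instance can drive */
    xpcs_get_interfaces(priv->xpcs, priv->phylink_config.supported_interfaces);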
index 968dd43..a8db1a1 100644 (file)
@@ -533,9 +533,7 @@ static int aqcs109_config_init(struct phy_device *phydev)
         * PMA speed ability bits are the same for all members of the family,
         * AQCS109 however supports speeds up to 2.5G only.
         */
-       ret = phy_set_max_speed(phydev, SPEED_2500);
-       if (ret)
-               return ret;
+       phy_set_max_speed(phydev, SPEED_2500);
 
        return aqr107_set_downshift(phydev, MDIO_AN_VEND_PROV_DOWNSHIFT_DFLT);
 }
index 29aa811..7392600 100644 (file)
@@ -19,6 +19,8 @@
 #include <linux/regulator/of_regulator.h>
 #include <linux/regulator/driver.h>
 #include <linux/regulator/consumer.h>
+#include <linux/phylink.h>
+#include <linux/sfp.h>
 #include <dt-bindings/net/qca-ar803x.h>
 
 #define AT803X_SPECIFIC_FUNCTION_CONTROL       0x10
@@ -51,6 +53,8 @@
 #define AT803X_INTR_ENABLE_PAGE_RECEIVED       BIT(12)
 #define AT803X_INTR_ENABLE_LINK_FAIL           BIT(11)
 #define AT803X_INTR_ENABLE_LINK_SUCCESS                BIT(10)
+#define AT803X_INTR_ENABLE_LINK_FAIL_BX                BIT(8)
+#define AT803X_INTR_ENABLE_LINK_SUCCESS_BX     BIT(7)
 #define AT803X_INTR_ENABLE_WIRESPEED_DOWNGRADE BIT(5)
 #define AT803X_INTR_ENABLE_POLARITY_CHANGED    BIT(1)
 #define AT803X_INTR_ENABLE_WOL                 BIT(0)
 #define AT803X_DEBUG_DATA                      0x1E
 
 #define AT803X_MODE_CFG_MASK                   0x0F
-#define AT803X_MODE_CFG_SGMII                  0x01
+#define AT803X_MODE_CFG_BASET_RGMII            0x00
+#define AT803X_MODE_CFG_BASET_SGMII            0x01
+#define AT803X_MODE_CFG_BX1000_RGMII_50OHM     0x02
+#define AT803X_MODE_CFG_BX1000_RGMII_75OHM     0x03
+#define AT803X_MODE_CFG_BX1000_CONV_50OHM      0x04
+#define AT803X_MODE_CFG_BX1000_CONV_75OHM      0x05
+#define AT803X_MODE_CFG_FX100_RGMII_50OHM      0x06
+#define AT803X_MODE_CFG_FX100_CONV_50OHM       0x07
+#define AT803X_MODE_CFG_RGMII_AUTO_MDET                0x0B
+#define AT803X_MODE_CFG_FX100_RGMII_75OHM      0x0E
+#define AT803X_MODE_CFG_FX100_CONV_75OHM       0x0F
 
 #define AT803X_PSSR                            0x11    /*PHY-Specific Status Register*/
 #define AT803X_PSSR_MR_AN_COMPLETE             0x0200
@@ -283,6 +297,8 @@ struct at803x_priv {
        u16 clk_25m_mask;
        u8 smarteee_lpi_tw_1g;
        u8 smarteee_lpi_tw_100m;
+       bool is_fiber;
+       bool is_1000basex;
        struct regulator_dev *vddio_rdev;
        struct regulator_dev *vddh_rdev;
        struct regulator *vddio;
@@ -650,6 +666,55 @@ static int at8031_register_regulators(struct phy_device *phydev)
        return 0;
 }
 
+static int at803x_sfp_insert(void *upstream, const struct sfp_eeprom_id *id)
+{
+       struct phy_device *phydev = upstream;
+       __ETHTOOL_DECLARE_LINK_MODE_MASK(phy_support);
+       __ETHTOOL_DECLARE_LINK_MODE_MASK(sfp_support);
+       phy_interface_t iface;
+
+       linkmode_zero(phy_support);
+       phylink_set(phy_support, 1000baseX_Full);
+       phylink_set(phy_support, 1000baseT_Full);
+       phylink_set(phy_support, Autoneg);
+       phylink_set(phy_support, Pause);
+       phylink_set(phy_support, Asym_Pause);
+
+       linkmode_zero(sfp_support);
+       sfp_parse_support(phydev->sfp_bus, id, sfp_support);
+       /* Some modules support 10G modes as well as others we support.
+        * Mask out non-supported modes so the correct interface is picked.
+        */
+       linkmode_and(sfp_support, phy_support, sfp_support);
+
+       if (linkmode_empty(sfp_support)) {
+               dev_err(&phydev->mdio.dev, "incompatible SFP module inserted\n");
+               return -EINVAL;
+       }
+
+       iface = sfp_select_interface(phydev->sfp_bus, sfp_support);
+
+       /* Only 1000Base-X is supported by AR8031/8033 as the downstream SerDes
+        * interface for use with SFP modules.
+        * However, some copper modules detected as having a preferred SGMII
+        * interface do default to and function in 1000Base-X mode, so just
+        * print a warning and allow such modules, as they may have some chance
+        * of working.
+        */
+       if (iface == PHY_INTERFACE_MODE_SGMII)
+               dev_warn(&phydev->mdio.dev, "module may not function if 1000Base-X not supported\n");
+       else if (iface != PHY_INTERFACE_MODE_1000BASEX)
+               return -EINVAL;
+
+       return 0;
+}
+
+static const struct sfp_upstream_ops at803x_sfp_ops = {
+       .attach = phy_sfp_attach,
+       .detach = phy_sfp_detach,
+       .module_insert = at803x_sfp_insert,
+};
+
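The module_insert logic above reduces to a mask-and-test on link modes; a simplified sketch under the same assumptions:

    __ETHTOOL_DECLARE_LINK_MODE_MASK(common);

    /* Intersect what the PHY SerDes can do with the module EEPROM modes */
    linkmode_and(common, phy_support, sfp_support);
    if (linkmode_empty(common))
            return -EINVAL; /* PHY and module share no usable link mode */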
 static int at803x_parse_dt(struct phy_device *phydev)
 {
        struct device_node *node = phydev->mdio.dev.of_node;
@@ -757,6 +822,11 @@ static int at803x_parse_dt(struct phy_device *phydev)
                        phydev_err(phydev, "failed to get VDDIO regulator\n");
                        return PTR_ERR(priv->vddio);
                }
+
+               /* Only AR8031/8033 support 1000Base-X for SFP modules */
+               ret = phy_sfp_probe(phydev, &at803x_sfp_ops);
+               if (ret < 0)
+                       return ret;
        }
 
        return 0;
@@ -784,16 +854,24 @@ static int at803x_probe(struct phy_device *phydev)
                        return ret;
        }
 
-       /* Some bootloaders leave the fiber page selected.
-        * Switch to the copper page, as otherwise we read
-        * the PHY capabilities from the fiber side.
-        */
        if (phydev->drv->phy_id == ATH8031_PHY_ID) {
-               phy_lock_mdio_bus(phydev);
-               ret = at803x_write_page(phydev, AT803X_PAGE_COPPER);
-               phy_unlock_mdio_bus(phydev);
-               if (ret)
+               int ccr = phy_read(phydev, AT803X_REG_CHIP_CONFIG);
+               int mode_cfg;
+
+               if (ccr < 0)
                        goto err;
+               mode_cfg = ccr & AT803X_MODE_CFG_MASK;
+
+               switch (mode_cfg) {
+               case AT803X_MODE_CFG_BX1000_RGMII_50OHM:
+               case AT803X_MODE_CFG_BX1000_RGMII_75OHM:
+                       priv->is_1000basex = true;
+                       fallthrough;
+               case AT803X_MODE_CFG_FX100_RGMII_50OHM:
+               case AT803X_MODE_CFG_FX100_RGMII_75OHM:
+                       priv->is_fiber = true;
+                       break;
+               }
        }
 
        return 0;
@@ -815,6 +893,7 @@ static void at803x_remove(struct phy_device *phydev)
 
 static int at803x_get_features(struct phy_device *phydev)
 {
+       struct at803x_priv *priv = phydev->priv;
        int err;
 
        err = genphy_read_abilities(phydev);
@@ -841,12 +920,13 @@ static int at803x_get_features(struct phy_device *phydev)
         * As a result of that, ESTATUS_1000_XFULL is set
         * to 1 even when operating in copper TP mode.
         *
-        * Remove this mode from the supported link modes,
-        * as this driver currently only supports copper
-        * operation.
+        * Remove this mode from the supported link modes
+        * when not operating in 1000BaseX mode.
         */
-       linkmode_clear_bit(ETHTOOL_LINK_MODE_1000baseX_Full_BIT,
-                          phydev->supported);
+       if (!priv->is_1000basex)
+               linkmode_clear_bit(ETHTOOL_LINK_MODE_1000baseX_Full_BIT,
+                                  phydev->supported);
+
        return 0;
 }
 
@@ -910,8 +990,27 @@ static int at8031_pll_config(struct phy_device *phydev)
 
 static int at803x_config_init(struct phy_device *phydev)
 {
+       struct at803x_priv *priv = phydev->priv;
        int ret;
 
+       if (phydev->drv->phy_id == ATH8031_PHY_ID) {
+               /* Some bootloaders leave the fiber page selected.
+                * Switch to the appropriate page (fiber or copper), as otherwise we
+                * read the PHY capabilities from the wrong page.
+                */
+               phy_lock_mdio_bus(phydev);
+               ret = at803x_write_page(phydev,
+                                       priv->is_fiber ? AT803X_PAGE_FIBER :
+                                                        AT803X_PAGE_COPPER);
+               phy_unlock_mdio_bus(phydev);
+               if (ret)
+                       return ret;
+
+               ret = at8031_pll_config(phydev);
+               if (ret < 0)
+                       return ret;
+       }
+
        /* The RX and TX delay default is:
         *   after HW reset: RX delay enabled and TX delay disabled
         *   after SW reset: RX delay enabled, while TX delay retains the
@@ -941,12 +1040,6 @@ static int at803x_config_init(struct phy_device *phydev)
        if (ret < 0)
                return ret;
 
-       if (phydev->drv->phy_id == ATH8031_PHY_ID) {
-               ret = at8031_pll_config(phydev);
-               if (ret < 0)
-                       return ret;
-       }
-
        /* Ar803x extended next page bit is enabled by default. Cisco
         * multigig switches read this bit and attempt to negotiate 10Gbps
         * rates even if the next page bit is disabled. This is incorrect
@@ -967,6 +1060,7 @@ static int at803x_ack_interrupt(struct phy_device *phydev)
 
 static int at803x_config_intr(struct phy_device *phydev)
 {
+       struct at803x_priv *priv = phydev->priv;
        int err;
        int value;
 
@@ -983,6 +1077,10 @@ static int at803x_config_intr(struct phy_device *phydev)
                value |= AT803X_INTR_ENABLE_DUPLEX_CHANGED;
                value |= AT803X_INTR_ENABLE_LINK_FAIL;
                value |= AT803X_INTR_ENABLE_LINK_SUCCESS;
+               if (priv->is_fiber) {
+                       value |= AT803X_INTR_ENABLE_LINK_FAIL_BX;
+                       value |= AT803X_INTR_ENABLE_LINK_SUCCESS_BX;
+               }
 
                err = phy_write(phydev, AT803X_INTR_ENABLE, value);
        } else {
@@ -1115,8 +1213,12 @@ static int at803x_read_specific_status(struct phy_device *phydev)
 
 static int at803x_read_status(struct phy_device *phydev)
 {
+       struct at803x_priv *priv = phydev->priv;
        int err, old_link = phydev->link;
 
+       if (priv->is_1000basex)
+               return genphy_c37_read_status(phydev);
+
        /* Update the link, but return if there was an error */
        err = genphy_update_link(phydev);
        if (err)
@@ -1170,6 +1272,7 @@ static int at803x_config_mdix(struct phy_device *phydev, u8 ctrl)
 
 static int at803x_config_aneg(struct phy_device *phydev)
 {
+       struct at803x_priv *priv = phydev->priv;
        int ret;
 
        ret = at803x_config_mdix(phydev, phydev->mdix_ctrl);
@@ -1186,6 +1289,9 @@ static int at803x_config_aneg(struct phy_device *phydev)
                        return ret;
        }
 
+       if (priv->is_1000basex)
+               return genphy_c37_config_aneg(phydev);
+
        /* Do not restart auto-negotiation by default: leave ret at 0
         * when calling __genphy_config_aneg later.
         */
index 271fc01..2001f33 100644 (file)
@@ -243,7 +243,7 @@ size_t phy_speeds(unsigned int *speeds, size_t size,
        return count;
 }
 
-static int __set_linkmode_max_speed(u32 max_speed, unsigned long *addr)
+static void __set_linkmode_max_speed(u32 max_speed, unsigned long *addr)
 {
        const struct phy_setting *p;
        int i;
@@ -254,13 +254,11 @@ static int __set_linkmode_max_speed(u32 max_speed, unsigned long *addr)
                else
                        break;
        }
-
-       return 0;
 }
 
-static int __set_phy_supported(struct phy_device *phydev, u32 max_speed)
+static void __set_phy_supported(struct phy_device *phydev, u32 max_speed)
 {
-       return __set_linkmode_max_speed(max_speed, phydev->supported);
+       __set_linkmode_max_speed(max_speed, phydev->supported);
 }
 
 /**
@@ -273,17 +271,11 @@ static int __set_phy_supported(struct phy_device *phydev, u32 max_speed)
  * is connected to a 1G PHY. This function allows the MAC to indicate its
  * maximum speed, and so limit what the PHY will advertise.
  */
-int phy_set_max_speed(struct phy_device *phydev, u32 max_speed)
+void phy_set_max_speed(struct phy_device *phydev, u32 max_speed)
 {
-       int err;
-
-       err = __set_phy_supported(phydev, max_speed);
-       if (err)
-               return err;
+       __set_phy_supported(phydev, max_speed);
 
        phy_advertise_supported(phydev);
-
-       return 0;
 }
 EXPORT_SYMBOL(phy_set_max_speed);
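With the void return, callers drop their error handling entirely; e.g. (a hypothetical MAC driver limited to 100 Mb/s):

    /* Nothing to check now that phy_set_max_speed() cannot fail */
    phy_set_max_speed(phydev, SPEED_100);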
 
@@ -440,7 +432,9 @@ int phy_speed_down_core(struct phy_device *phydev)
        if (min_common_speed == SPEED_UNKNOWN)
                return -EINVAL;
 
-       return __set_linkmode_max_speed(min_common_speed, phydev->advertising);
+       __set_linkmode_max_speed(min_common_speed, phydev->advertising);
+
+       return 0;
 }
 
 static void mmd_phy_indirect(struct mii_bus *bus, int phy_addr, int devad,
index 4202018..5b53a3e 100644 (file)
@@ -132,17 +132,6 @@ void phylink_set_port_modes(unsigned long *mask)
 }
 EXPORT_SYMBOL_GPL(phylink_set_port_modes);
 
-void phylink_set_10g_modes(unsigned long *mask)
-{
-       phylink_set(mask, 10000baseT_Full);
-       phylink_set(mask, 10000baseCR_Full);
-       phylink_set(mask, 10000baseSR_Full);
-       phylink_set(mask, 10000baseLR_Full);
-       phylink_set(mask, 10000baseLRM_Full);
-       phylink_set(mask, 10000baseER_Full);
-}
-EXPORT_SYMBOL_GPL(phylink_set_10g_modes);
-
 static int phylink_is_empty_linkmode(const unsigned long *linkmode)
 {
        __ETHTOOL_DECLARE_LINK_MODE_MASK(tmp) = { 0, };
index b554054..e62fc4f 100644 (file)
@@ -358,6 +358,7 @@ config USB_NET_SMSC95XX
        select BITREVERSE
        select CRC16
        select CRC32
+       imply NET_SELFTESTS
        help
          This option adds support for SMSC LAN95XX based USB 2.0
          10/100 Ethernet adapters.
index 2a1e31d..4334aaf 100644 (file)
@@ -192,8 +192,8 @@ extern const struct driver_info ax88172a_info;
 /* ASIX specific flags */
 #define FLAG_EEPROM_MAC                (1UL << 0)  /* init device MAC from eeprom */
 
-int asix_read_cmd(struct usbnet *dev, u8 cmd, u16 value, u16 index,
-                 u16 size, void *data, int in_pm);
+int __must_check asix_read_cmd(struct usbnet *dev, u8 cmd, u16 value, u16 index,
+                              u16 size, void *data, int in_pm);
 
 int asix_write_cmd(struct usbnet *dev, u8 cmd, u16 value, u16 index,
                   u16 size, void *data, int in_pm);
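__must_check makes the compiler warn whenever the result is discarded, forcing the checked pattern applied throughout the asix hunks below (a sketch of that pattern):

    ret = asix_read_cmd(dev, AX_CMD_STATMNGSTS_REG, 0, 0, 1, &chipcode, 0);
    if (ret < 0)            /* short reads now surface as -ENODATA */
            return ret;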
index 7168297..5248052 100644 (file)
@@ -11,8 +11,8 @@
 
 #define AX_HOST_EN_RETRIES     30
 
-int asix_read_cmd(struct usbnet *dev, u8 cmd, u16 value, u16 index,
-                 u16 size, void *data, int in_pm)
+int __must_check asix_read_cmd(struct usbnet *dev, u8 cmd, u16 value, u16 index,
+                              u16 size, void *data, int in_pm)
 {
        int ret;
        int (*fn)(struct usbnet *, u8, u8, u16, u16, void *, u16);
@@ -27,9 +27,12 @@ int asix_read_cmd(struct usbnet *dev, u8 cmd, u16 value, u16 index,
        ret = fn(dev, cmd, USB_DIR_IN | USB_TYPE_VENDOR | USB_RECIP_DEVICE,
                 value, index, data, size);
 
-       if (unlikely(ret < 0))
+       if (unlikely(ret < size)) {
+               ret = ret < 0 ? ret : -ENODATA;
+
                netdev_warn(dev->net, "Failed to read reg index 0x%04x: %d\n",
                            index, ret);
+       }
 
        return ret;
 }
@@ -79,7 +82,7 @@ static int asix_check_host_enable(struct usbnet *dev, int in_pm)
                                    0, 0, 1, &smsr, in_pm);
                if (ret == -ENODEV)
                        break;
-               else if (ret < sizeof(smsr))
+               else if (ret < 0)
                        continue;
                else if (smsr & AX_HOST_EN)
                        break;
@@ -579,8 +582,12 @@ int asix_mdio_read_nopm(struct net_device *netdev, int phy_id, int loc)
                return ret;
        }
 
-       asix_read_cmd(dev, AX_CMD_READ_MII_REG, phy_id,
-                     (__u16)loc, 2, &res, 1);
+       ret = asix_read_cmd(dev, AX_CMD_READ_MII_REG, phy_id,
+                           (__u16)loc, 2, &res, 1);
+       if (ret < 0) {
+               mutex_unlock(&dev->phy_mutex);
+               return ret;
+       }
        asix_set_hw_mii(dev, 1);
        mutex_unlock(&dev->phy_mutex);
 
index 4514d35..6ea44e5 100644 (file)
@@ -755,7 +755,12 @@ static int ax88772_bind(struct usbnet *dev, struct usb_interface *intf)
        priv->phy_addr = ret;
        priv->embd_phy = ((priv->phy_addr & 0x1f) == 0x10);
 
-       asix_read_cmd(dev, AX_CMD_STATMNGSTS_REG, 0, 0, 1, &chipcode, 0);
+       ret = asix_read_cmd(dev, AX_CMD_STATMNGSTS_REG, 0, 0, 1, &chipcode, 0);
+       if (ret < 0) {
+               netdev_dbg(dev->net, "Failed to read STATMNGSTS_REG: %d\n", ret);
+               return ret;
+       }
+
        chipcode &= AX_CHIPCODE_MASK;
 
        ret = (chipcode == AX_AX88772_CHIPCODE) ? ax88772_hw_reset(dev, 0) :
@@ -858,7 +863,6 @@ static int marvell_phy_init(struct usbnet *dev)
                reg = asix_mdio_read(dev->net, dev->mii.phy_id,
                        MII_MARVELL_LED_CTRL);
                netdev_dbg(dev->net, "MII_MARVELL_LED_CTRL (2) = 0x%04x\n", reg);
-               reg &= 0xfc0f;
        }
 
        return 0;
@@ -920,11 +924,21 @@ static int ax88178_reset(struct usbnet *dev)
        int gpio0 = 0;
        u32 phyid;
 
-       asix_read_cmd(dev, AX_CMD_READ_GPIOS, 0, 0, 1, &status, 0);
+       ret = asix_read_cmd(dev, AX_CMD_READ_GPIOS, 0, 0, 1, &status, 0);
+       if (ret < 0) {
+               netdev_dbg(dev->net, "Failed to read GPIOS: %d\n", ret);
+               return ret;
+       }
+
        netdev_dbg(dev->net, "GPIO Status: 0x%04x\n", status);
 
        asix_write_cmd(dev, AX_CMD_WRITE_ENABLE, 0, 0, 0, NULL, 0);
-       asix_read_cmd(dev, AX_CMD_READ_EEPROM, 0x0017, 0, 2, &eeprom, 0);
+       ret = asix_read_cmd(dev, AX_CMD_READ_EEPROM, 0x0017, 0, 2, &eeprom, 0);
+       if (ret < 0) {
+               netdev_dbg(dev->net, "Failed to read EEPROM: %d\n", ret);
+               return ret;
+       }
+
        asix_write_cmd(dev, AX_CMD_WRITE_DISABLE, 0, 0, 0, NULL, 0);
 
        netdev_dbg(dev->net, "EEPROM index 0x17 is 0x%04x\n", eeprom);
index 82bb5ed..a7c1434 100644 (file)
@@ -21,6 +21,7 @@
 #include <net/ipv6.h>
 #include <net/addrconf.h>
 #include <net/ipv6_stubs.h>
+#include <net/ndisc.h>
 
 /* alternative VLAN for IP session 0 if not untagged */
 #define MBIM_IPS0_VID  4094
index bc1e3dd..5567220 100644 (file)
@@ -20,6 +20,8 @@
 #include <linux/of_net.h>
 #include <linux/mdio.h>
 #include <linux/phy.h>
+#include <net/selftests.h>
+
 #include "smsc95xx.h"
 
 #define SMSC_CHIPNAME                  "smsc95xx"
@@ -727,6 +729,26 @@ static u32 smsc95xx_get_link(struct net_device *net)
        return net->phydev->link;
 }
 
+static void smsc95xx_ethtool_get_strings(struct net_device *netdev, u32 sset,
+                                       u8 *data)
+{
+       switch (sset) {
+       case ETH_SS_TEST:
+               net_selftest_get_strings(data);
+               break;
+       }
+}
+
+static int smsc95xx_ethtool_get_sset_count(struct net_device *ndev, int sset)
+{
+       switch (sset) {
+       case ETH_SS_TEST:
+               return net_selftest_get_count();
+       default:
+               return -EOPNOTSUPP;
+       }
+}
+
 static const struct ethtool_ops smsc95xx_ethtool_ops = {
        .get_link       = smsc95xx_get_link,
        .nway_reset     = phy_ethtool_nway_reset,
@@ -743,6 +765,9 @@ static const struct ethtool_ops smsc95xx_ethtool_ops = {
        .get_link_ksettings     = phy_ethtool_get_link_ksettings,
        .set_link_ksettings     = phy_ethtool_set_link_ksettings,
        .get_ts_info    = ethtool_op_get_ts_info,
+       .self_test      = net_selftest,
+       .get_strings    = smsc95xx_ethtool_get_strings,
+       .get_sset_count = smsc95xx_ethtool_get_sset_count,
 };
 
 static int smsc95xx_ioctl(struct net_device *netdev, struct ifreq *rq, int cmd)
index 30d2912..6335d7a 100644 (file)
@@ -456,7 +456,7 @@ static const struct nfc_vendor_cmd st_nci_vendor_cmds[] = {
 
 int st_nci_vendor_cmds_init(struct nci_dev *ndev)
 {
-       return nfc_set_vendor_cmds(ndev->nfc_dev, st_nci_vendor_cmds,
+       return nci_set_vendor_cmds(ndev, st_nci_vendor_cmds,
                                   sizeof(st_nci_vendor_cmds));
 }
 EXPORT_SYMBOL(st_nci_vendor_cmds_init);
index 7488286..bfa418d 100644 (file)
@@ -358,7 +358,7 @@ int st21nfca_vendor_cmds_init(struct nfc_hci_dev *hdev)
        struct st21nfca_hci_info *info = nfc_hci_get_clientdata(hdev);
 
        init_completion(&info->vendor_info.req_completion);
-       return nfc_set_vendor_cmds(hdev->ndev, st21nfca_vendor_cmds,
-                                  sizeof(st21nfca_vendor_cmds));
+       return nfc_hci_set_vendor_cmds(hdev, st21nfca_vendor_cmds,
+                                      sizeof(st21nfca_vendor_cmds));
 }
 EXPORT_SYMBOL(st21nfca_vendor_cmds_init);
index 0e4bc8b..b6f2cfd 100644 (file)
@@ -317,11 +317,18 @@ no_memory:
 }
 EXPORT_SYMBOL(ptp_clock_register);
 
+static int unregister_vclock(struct device *dev, void *data)
+{
+       struct ptp_clock *ptp = dev_get_drvdata(dev);
+
+       ptp_vclock_unregister(info_to_vclock(ptp->info));
+       return 0;
+}
+
 int ptp_clock_unregister(struct ptp_clock *ptp)
 {
        if (ptp_vclock_in_use(ptp)) {
-               pr_err("ptp: virtual clock in use\n");
-               return -EBUSY;
+               device_for_each_child(&ptp->dev, NULL, unregister_vclock);
        }
 
        ptp->defunct = 1;
index 8070f3f..7d4da9e 100644 (file)
 
 #include <linux/device.h>
 #include <linux/err.h>
-#include <linux/init.h>
 #include <linux/interrupt.h>
 #include <linux/io.h>
+#include <linux/io-64-nonatomic-lo-hi.h>
+#include <linux/io-64-nonatomic-hi-lo.h>
 #include <linux/irq.h>
 #include <linux/kernel.h>
 #include <linux/module.h>
@@ -100,7 +101,6 @@ struct pch_ts_regs {
 #define PCH_ECS_ETH            (1 << 0)
 
 #define PCH_ECS_CAN            (1 << 1)
-#define PCH_STATION_BYTES      6
 
 #define PCH_IEEE1588_ETH       (1 << 0)
 #define PCH_IEEE1588_CAN       (1 << 1)
@@ -115,8 +115,6 @@ struct pch_dev {
        int exts0_enabled;
        int exts1_enabled;
 
-       u32 mem_base;
-       u32 mem_size;
        u32 irq;
        struct pci_dev *pdev;
        spinlock_t register_lock;
@@ -148,28 +146,15 @@ static inline void pch_eth_enable_set(struct pch_dev *chip)
 static u64 pch_systime_read(struct pch_ts_regs __iomem *regs)
 {
        u64 ns;
-       u32 lo, hi;
 
-       lo = ioread32(&regs->systime_lo);
-       hi = ioread32(&regs->systime_hi);
+       ns = ioread64_lo_hi(&regs->systime_lo);
 
-       ns = ((u64) hi) << 32;
-       ns |= lo;
-       ns <<= TICKS_NS_SHIFT;
-
-       return ns;
+       return ns << TICKS_NS_SHIFT;
 }
 
 static void pch_systime_write(struct pch_ts_regs __iomem *regs, u64 ns)
 {
-       u32 hi, lo;
-
-       ns >>= TICKS_NS_SHIFT;
-       hi = ns >> 32;
-       lo = ns & 0xffffffff;
-
-       iowrite32(lo, &regs->systime_lo);
-       iowrite32(hi, &regs->systime_hi);
+       iowrite64_lo_hi(ns >> TICKS_NS_SHIFT, &regs->systime_lo);
 }
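ioread64_lo_hi() keeps the old access order on hardware without native 64-bit MMIO; a sketch of the equivalence, assuming the same register layout:

    /* Old open-coded sequence ... */
    lo = ioread32(&regs->systime_lo);
    hi = ioread32(&regs->systime_hi);
    ns = ((u64)hi << 32) | lo;

    /* ... is equivalent to the lo-then-hi helper */
    ns = ioread64_lo_hi(&regs->systime_lo);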
 
 static inline void pch_block_reset(struct pch_dev *chip)
@@ -235,16 +220,10 @@ u64 pch_rx_snap_read(struct pci_dev *pdev)
 {
        struct pch_dev *chip = pci_get_drvdata(pdev);
        u64 ns;
-       u32 lo, hi;
 
-       lo = ioread32(&chip->regs->rx_snap_lo);
-       hi = ioread32(&chip->regs->rx_snap_hi);
+       ns = ioread64_lo_hi(&chip->regs->rx_snap_lo);
 
-       ns = ((u64) hi) << 32;
-       ns |= lo;
-       ns <<= TICKS_NS_SHIFT;
-
-       return ns;
+       return ns << TICKS_NS_SHIFT;
 }
 EXPORT_SYMBOL(pch_rx_snap_read);
 
@@ -252,16 +231,10 @@ u64 pch_tx_snap_read(struct pci_dev *pdev)
 {
        struct pch_dev *chip = pci_get_drvdata(pdev);
        u64 ns;
-       u32 lo, hi;
-
-       lo = ioread32(&chip->regs->tx_snap_lo);
-       hi = ioread32(&chip->regs->tx_snap_hi);
 
-       ns = ((u64) hi) << 32;
-       ns |= lo;
-       ns <<= TICKS_NS_SHIFT;
+       ns = ioread64_lo_hi(&chip->regs->tx_snap_lo);
 
-       return ns;
+       return ns << TICKS_NS_SHIFT;
 }
 EXPORT_SYMBOL(pch_tx_snap_read);
 
@@ -292,8 +265,9 @@ static void pch_reset(struct pch_dev *chip)
  */
 int pch_set_station_address(u8 *addr, struct pci_dev *pdev)
 {
-       s32 i;
        struct pch_dev *chip = pci_get_drvdata(pdev);
+       bool valid;
+       u64 mac;
 
        /* Verify the parameter */
        if ((chip->regs == NULL) || addr == (u8 *)NULL) {
@@ -301,37 +275,15 @@ int pch_set_station_address(u8 *addr, struct pci_dev *pdev)
                        "invalid params returning PCH_INVALIDPARAM\n");
                return PCH_INVALIDPARAM;
        }
-       /* For all station address bytes */
-       for (i = 0; i < PCH_STATION_BYTES; i++) {
-               u32 val;
-               s32 tmp;
-
-               tmp = hex_to_bin(addr[i * 3]);
-               if (tmp < 0) {
-                       dev_err(&pdev->dev,
-                               "invalid params returning PCH_INVALIDPARAM\n");
-                       return PCH_INVALIDPARAM;
-               }
-               val = tmp * 16;
-               tmp = hex_to_bin(addr[(i * 3) + 1]);
-               if (tmp < 0) {
-                       dev_err(&pdev->dev,
-                               "invalid params returning PCH_INVALIDPARAM\n");
-                       return PCH_INVALIDPARAM;
-               }
-               val += tmp;
-               /* Expects ':' separated addresses */
-               if ((i < 5) && (addr[(i * 3) + 2] != ':')) {
-                       dev_err(&pdev->dev,
-                               "invalid params returning PCH_INVALIDPARAM\n");
-                       return PCH_INVALIDPARAM;
-               }
 
-               /* Ideally we should set the address only after validating
-                                                        entire string */
-               dev_dbg(&pdev->dev, "invoking pch_station_set\n");
-               iowrite32(val, &chip->regs->ts_st[i]);
+       valid = mac_pton(addr, (u8 *)&mac);
+       if (!valid) {
+               dev_err(&pdev->dev, "invalid params returning PCH_INVALIDPARAM\n");
+               return PCH_INVALIDPARAM;
        }
+
+       dev_dbg(&pdev->dev, "invoking pch_station_set\n");
+       iowrite64_lo_hi(mac, &chip->regs->ts_st);
        return 0;
 }
 EXPORT_SYMBOL(pch_set_station_address);
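mac_pton() parses and validates the whole colon-separated string in one call, replacing the per-byte hex_to_bin() loop; a minimal usage sketch (the address literal is hypothetical):

    u8 buf[ETH_ALEN];

    if (!mac_pton("00:11:22:33:44:55", buf))
            return PCH_INVALIDPARAM;        /* malformed station address */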
@@ -344,19 +296,16 @@ static irqreturn_t isr(int irq, void *priv)
        struct pch_dev *pch_dev = priv;
        struct pch_ts_regs __iomem *regs = pch_dev->regs;
        struct ptp_clock_event event;
-       u32 ack = 0, lo, hi, val;
+       u32 ack = 0, val;
 
        val = ioread32(&regs->event);
 
        if (val & PCH_TSE_SNS) {
                ack |= PCH_TSE_SNS;
                if (pch_dev->exts0_enabled) {
-                       hi = ioread32(&regs->asms_hi);
-                       lo = ioread32(&regs->asms_lo);
                        event.type = PTP_CLOCK_EXTTS;
                        event.index = 0;
-                       event.timestamp = ((u64) hi) << 32;
-                       event.timestamp |= lo;
+                       event.timestamp = ioread64_hi_lo(&regs->asms_hi);
                        event.timestamp <<= TICKS_NS_SHIFT;
                        ptp_clock_event(pch_dev->ptp_clock, &event);
                }
@@ -365,12 +314,9 @@ static irqreturn_t isr(int irq, void *priv)
        if (val & PCH_TSE_SNM) {
                ack |= PCH_TSE_SNM;
                if (pch_dev->exts1_enabled) {
-                       hi = ioread32(&regs->amms_hi);
-                       lo = ioread32(&regs->amms_lo);
                        event.type = PTP_CLOCK_EXTTS;
                        event.index = 1;
-                       event.timestamp = ((u64) hi) << 32;
-                       event.timestamp |= lo;
+                       event.timestamp = ioread64_hi_lo(&regs->amms_hi);
                        event.timestamp <<= TICKS_NS_SHIFT;
                        ptp_clock_event(pch_dev->ptp_clock, &event);
                }
@@ -501,31 +447,12 @@ static const struct ptp_clock_info ptp_pch_caps = {
        .enable         = ptp_pch_enable,
 };
 
-#define pch_suspend NULL
-#define pch_resume NULL
-
 static void pch_remove(struct pci_dev *pdev)
 {
        struct pch_dev *chip = pci_get_drvdata(pdev);
 
+       free_irq(pdev->irq, chip);
        ptp_clock_unregister(chip->ptp_clock);
-       /* free the interrupt */
-       if (pdev->irq != 0)
-               free_irq(pdev->irq, chip);
-
-       /* unmap the virtual IO memory space */
-       if (chip->regs != NULL) {
-               iounmap(chip->regs);
-               chip->regs = NULL;
-       }
-       /* release the reserved IO memory space */
-       if (chip->mem_base != 0) {
-               release_mem_region(chip->mem_base, chip->mem_size);
-               chip->mem_base = 0;
-       }
-       pci_disable_device(pdev);
-       kfree(chip);
-       dev_info(&pdev->dev, "complete\n");
 }
 
 static s32
@@ -535,50 +462,29 @@ pch_probe(struct pci_dev *pdev, const struct pci_device_id *id)
        unsigned long flags;
        struct pch_dev *chip;
 
-       chip = kzalloc(sizeof(struct pch_dev), GFP_KERNEL);
+       chip = devm_kzalloc(&pdev->dev, sizeof(*chip), GFP_KERNEL);
        if (chip == NULL)
                return -ENOMEM;
 
        /* enable the 1588 pci device */
-       ret = pci_enable_device(pdev);
+       ret = pcim_enable_device(pdev);
        if (ret != 0) {
                dev_err(&pdev->dev, "could not enable the pci device\n");
-               goto err_pci_en;
+               return ret;
        }
 
-       chip->mem_base = pci_resource_start(pdev, IO_MEM_BAR);
-       if (!chip->mem_base) {
+       ret = pcim_iomap_regions(pdev, BIT(IO_MEM_BAR), "1588_regs");
+       if (ret) {
                dev_err(&pdev->dev, "could not locate IO memory address\n");
-               ret = -ENODEV;
-               goto err_pci_start;
-       }
-
-       /* retrieve the available length of the IO memory space */
-       chip->mem_size = pci_resource_len(pdev, IO_MEM_BAR);
-
-       /* allocate the memory for the device registers */
-       if (!request_mem_region(chip->mem_base, chip->mem_size, "1588_regs")) {
-               dev_err(&pdev->dev,
-                       "could not allocate register memory space\n");
-               ret = -EBUSY;
-               goto err_req_mem_region;
+               return ret;
        }
 
        /* get the virtual address to the 1588 registers */
-       chip->regs = ioremap(chip->mem_base, chip->mem_size);
-
-       if (!chip->regs) {
-               dev_err(&pdev->dev, "Could not get virtual address\n");
-               ret = -ENOMEM;
-               goto err_ioremap;
-       }
-
+       chip->regs = pcim_iomap_table(pdev)[IO_MEM_BAR];
        chip->caps = ptp_pch_caps;
        chip->ptp_clock = ptp_clock_register(&chip->caps, &pdev->dev);
-       if (IS_ERR(chip->ptp_clock)) {
-               ret = PTR_ERR(chip->ptp_clock);
-               goto err_ptp_clock_reg;
-       }
+       if (IS_ERR(chip->ptp_clock))
+               return PTR_ERR(chip->ptp_clock);
 
        spin_lock_init(&chip->register_lock);
 
@@ -598,8 +504,7 @@ pch_probe(struct pci_dev *pdev, const struct pci_device_id *id)
        pch_reset(chip);
 
        iowrite32(DEFAULT_ADDEND, &chip->regs->addend);
-       iowrite32(1, &chip->regs->trgt_lo);
-       iowrite32(0, &chip->regs->trgt_hi);
+       iowrite64_lo_hi(1, &chip->regs->trgt_lo);
        iowrite32(PCH_TSE_TTIPEND, &chip->regs->event);
 
        pch_eth_enable_set(chip);
@@ -617,21 +522,7 @@ pch_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 
 err_req_irq:
        ptp_clock_unregister(chip->ptp_clock);
-err_ptp_clock_reg:
-       iounmap(chip->regs);
-       chip->regs = NULL;
 
-err_ioremap:
-       release_mem_region(chip->mem_base, chip->mem_size);
-
-err_req_mem_region:
-       chip->mem_base = 0;
-
-err_pci_start:
-       pci_disable_device(pdev);
-
-err_pci_en:
-       kfree(chip);
        dev_err(&pdev->dev, "probe failed(ret=0x%x)\n", ret);
 
        return ret;
@@ -646,33 +537,13 @@ static const struct pci_device_id pch_ieee1588_pcidev_id[] = {
 };
 MODULE_DEVICE_TABLE(pci, pch_ieee1588_pcidev_id);
 
-static SIMPLE_DEV_PM_OPS(pch_pm_ops, pch_suspend, pch_resume);
-
 static struct pci_driver pch_driver = {
        .name = KBUILD_MODNAME,
        .id_table = pch_ieee1588_pcidev_id,
        .probe = pch_probe,
        .remove = pch_remove,
-       .driver.pm = &pch_pm_ops,
 };
-
-static void __exit ptp_pch_exit(void)
-{
-       pci_unregister_driver(&pch_driver);
-}
-
-static s32 __init ptp_pch_init(void)
-{
-       s32 ret;
-
-       /* register the driver with the pci core */
-       ret = pci_register_driver(&pch_driver);
-
-       return ret;
-}
-
-module_init(ptp_pch_init);
-module_exit(ptp_pch_exit);
+module_pci_driver(pch_driver);
 
 module_param_string(station,
                    pch_param.station, sizeof(pch_param.station), 0444);
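
The probe/remove rewrite above leans on the managed (devres) PCI helpers, which undo themselves automatically on driver unbind; that is what lets the hand-rolled error unwinding and most of pch_remove() disappear. As a hedged reference, a minimal managed probe looks roughly like this (the foo names are hypothetical, not this driver):

static int foo_probe(struct pci_dev *pdev, const struct pci_device_id *id)
{
	void __iomem *regs;
	int ret;

	ret = pcim_enable_device(pdev);		/* disabled again on unbind */
	if (ret)
		return ret;

	/* request_mem_region() + ioremap() of BAR 0, both undone on unbind */
	ret = pcim_iomap_regions(pdev, BIT(0), "foo");
	if (ret)
		return ret;

	regs = pcim_iomap_table(pdev)[0];
	iowrite32(0, regs);			/* device setup would go here */
	return 0;
}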
index 41b92dc..9233bfe 100644 (file)
@@ -14,7 +14,7 @@ static ssize_t clock_name_show(struct device *dev,
                               struct device_attribute *attr, char *page)
 {
        struct ptp_clock *ptp = dev_get_drvdata(dev);
-       return snprintf(page, PAGE_SIZE-1, "%s\n", ptp->info->name);
+       return sysfs_emit(page, "%s\n", ptp->info->name);
 }
 static DEVICE_ATTR_RO(clock_name);
 
@@ -387,7 +387,7 @@ static ssize_t ptp_pin_show(struct device *dev, struct device_attribute *attr,
 
        mutex_unlock(&ptp->pincfg_mux);
 
-       return snprintf(page, PAGE_SIZE, "%u %u\n", func, chan);
+       return sysfs_emit(page, "%u %u\n", func, chan);
 }
 
 static ssize_t ptp_pin_store(struct device *dev, struct device_attribute *attr,
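
sysfs_emit() is the preferred helper for sysfs show() callbacks: it asserts that the buffer really is the page-sized, page-aligned buffer sysfs hands in, which also removes the inconsistent PAGE_SIZE-1 bound in the old snprintf() call above. A minimal sketch with a hypothetical attribute:

static ssize_t foo_show(struct device *dev,
			struct device_attribute *attr, char *page)
{
	return sysfs_emit(page, "%d\n", 42);
}
static DEVICE_ATTR_RO(foo);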
index ab1d233..cb179a3 100644 (file)
@@ -57,6 +57,30 @@ static int ptp_vclock_gettime(struct ptp_clock_info *ptp,
        return 0;
 }
 
+static int ptp_vclock_gettimex(struct ptp_clock_info *ptp,
+                              struct timespec64 *ts,
+                              struct ptp_system_timestamp *sts)
+{
+       struct ptp_vclock *vclock = info_to_vclock(ptp);
+       struct ptp_clock *pptp = vclock->pclock;
+       struct timespec64 pts;
+       unsigned long flags;
+       int err;
+       u64 ns;
+
+       err = pptp->info->gettimex64(pptp->info, &pts, sts);
+       if (err)
+               return err;
+
+       spin_lock_irqsave(&vclock->lock, flags);
+       ns = timecounter_cyc2time(&vclock->tc, timespec64_to_ns(&pts));
+       spin_unlock_irqrestore(&vclock->lock, flags);
+
+       *ts = ns_to_timespec64(ns);
+
+       return 0;
+}
+
 static int ptp_vclock_settime(struct ptp_clock_info *ptp,
                              const struct timespec64 *ts)
 {
@@ -71,6 +95,28 @@ static int ptp_vclock_settime(struct ptp_clock_info *ptp,
        return 0;
 }
 
+static int ptp_vclock_getcrosststamp(struct ptp_clock_info *ptp,
+                                    struct system_device_crosststamp *xtstamp)
+{
+       struct ptp_vclock *vclock = info_to_vclock(ptp);
+       struct ptp_clock *pptp = vclock->pclock;
+       unsigned long flags;
+       int err;
+       u64 ns;
+
+       err = pptp->info->getcrosststamp(pptp->info, xtstamp);
+       if (err)
+               return err;
+
+       spin_lock_irqsave(&vclock->lock, flags);
+       ns = timecounter_cyc2time(&vclock->tc, ktime_to_ns(xtstamp->device));
+       spin_unlock_irqrestore(&vclock->lock, flags);
+
+       xtstamp->device = ns_to_ktime(ns);
+
+       return 0;
+}
+
 static long ptp_vclock_refresh(struct ptp_clock_info *ptp)
 {
        struct ptp_vclock *vclock = info_to_vclock(ptp);
@@ -84,11 +130,9 @@ static long ptp_vclock_refresh(struct ptp_clock_info *ptp)
 static const struct ptp_clock_info ptp_vclock_info = {
        .owner          = THIS_MODULE,
        .name           = "ptp virtual clock",
-       /* The maximum ppb value that long scaled_ppm can support */
-       .max_adj        = 32767999,
+       .max_adj        = 500000000,
        .adjfine        = ptp_vclock_adjfine,
        .adjtime        = ptp_vclock_adjtime,
-       .gettime64      = ptp_vclock_gettime,
        .settime64      = ptp_vclock_settime,
        .do_aux_work    = ptp_vclock_refresh,
 };
@@ -124,6 +168,12 @@ struct ptp_vclock *ptp_vclock_register(struct ptp_clock *pclock)
 
        vclock->pclock = pclock;
        vclock->info = ptp_vclock_info;
+       if (pclock->info->gettimex64)
+               vclock->info.gettimex64 = ptp_vclock_gettimex;
+       else
+               vclock->info.gettime64 = ptp_vclock_gettime;
+       if (pclock->info->getcrosststamp)
+               vclock->info.getcrosststamp = ptp_vclock_getcrosststamp;
        vclock->cc = ptp_vclock_cc;
 
        snprintf(vclock->info.name, PTP_CLOCK_NAME_LEN, "ptp%d_virt",
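
The vclock prefers gettimex64 because it lets the physical driver capture system timestamps immediately around the device read, tightening the cyclecounter conversion. A sketch of what a driver-side gettimex64 typically looks like (the foo_* names are hypothetical):

static int foo_gettimex64(struct ptp_clock_info *info,
			  struct timespec64 *ts,
			  struct ptp_system_timestamp *sts)
{
	struct foo_clock *clk = container_of(info, struct foo_clock, info);
	u64 ns;

	ptp_read_system_prets(sts);	/* system time just before the read */
	ns = foo_read_counter_ns(clk);	/* hypothetical device register read */
	ptp_read_system_postts(sts);	/* system time just after the read */

	*ts = ns_to_timespec64(ns);
	return 0;
}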
index 058b78f..0a3fb6c 100644 (file)
@@ -743,8 +743,8 @@ int qbman_swp_enqueue_multiple_mem_back(struct qbman_swp *s,
        full_mask = s->eqcr.pi_ci_mask;
        if (!s->eqcr.available) {
                eqcr_ci = s->eqcr.ci;
-               p = s->addr_cena + QBMAN_CENA_SWP_EQCR_CI_MEMBACK;
-               s->eqcr.ci = *p & full_mask;
+               s->eqcr.ci = qbman_read_register(s, QBMAN_CINH_SWP_EQCR_CI);
+               s->eqcr.ci &= full_mask;
                s->eqcr.available = qm_cyc_diff(s->eqcr.pi_ring_size,
                                        eqcr_ci, s->eqcr.ci);
                if (!s->eqcr.available) {
@@ -887,8 +887,8 @@ int qbman_swp_enqueue_multiple_desc_mem_back(struct qbman_swp *s,
        full_mask = s->eqcr.pi_ci_mask;
        if (!s->eqcr.available) {
                eqcr_ci = s->eqcr.ci;
-               p = s->addr_cena + QBMAN_CENA_SWP_EQCR_CI_MEMBACK;
-               s->eqcr.ci = *p & full_mask;
+               s->eqcr.ci = qbman_read_register(s, QBMAN_CINH_SWP_EQCR_CI);
+               s->eqcr.ci &= full_mask;
                s->eqcr.available = qm_cyc_diff(s->eqcr.pi_ring_size,
                                        eqcr_ci, s->eqcr.ci);
                if (!s->eqcr.available)
index b525d8c..88a51b2 100644 (file)
@@ -8,6 +8,7 @@
 #include <linux/jump_label.h>
 #include <linux/percpu.h>
 #include <linux/rbtree.h>
+#include <net/sock.h>
 #include <uapi/linux/bpf.h>
 
 struct sock;
@@ -165,11 +166,23 @@ int bpf_percpu_cgroup_storage_copy(struct bpf_map *map, void *key, void *value);
 int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key,
                                     void *value, u64 flags);
 
+/* Opportunistic check to see whether we have any BPF program attached */
+static inline bool cgroup_bpf_sock_enabled(struct sock *sk,
+                                          enum cgroup_bpf_attach_type type)
+{
+       struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
+       struct bpf_prog_array *array;
+
+       array = rcu_access_pointer(cgrp->bpf.effective[type]);
+       return array != &bpf_empty_prog_array.hdr;
+}
+
 /* Wrappers for __cgroup_bpf_run_filter_skb() guarded by cgroup_bpf_enabled. */
 #define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk, skb)                            \
 ({                                                                           \
        int __ret = 0;                                                        \
-       if (cgroup_bpf_enabled(CGROUP_INET_INGRESS))                  \
+       if (cgroup_bpf_enabled(CGROUP_INET_INGRESS) &&                        \
+           cgroup_bpf_sock_enabled(sk, CGROUP_INET_INGRESS))                 \
                __ret = __cgroup_bpf_run_filter_skb(sk, skb,                  \
                                                    CGROUP_INET_INGRESS); \
                                                                              \
@@ -181,7 +194,8 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key,
        int __ret = 0;                                                         \
        if (cgroup_bpf_enabled(CGROUP_INET_EGRESS) && sk && sk == skb->sk) { \
                typeof(sk) __sk = sk_to_full_sk(sk);                           \
-               if (sk_fullsock(__sk))                                         \
+               if (sk_fullsock(__sk) &&                                       \
+                   cgroup_bpf_sock_enabled(__sk, CGROUP_INET_EGRESS))         \
                        __ret = __cgroup_bpf_run_filter_skb(__sk, skb,         \
                                                      CGROUP_INET_EGRESS); \
        }                                                                      \
@@ -347,7 +361,8 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key,
                                       kernel_optval)                          \
 ({                                                                            \
        int __ret = 0;                                                         \
-       if (cgroup_bpf_enabled(CGROUP_SETSOCKOPT))                             \
+       if (cgroup_bpf_enabled(CGROUP_SETSOCKOPT) &&                           \
+           cgroup_bpf_sock_enabled(sock, CGROUP_SETSOCKOPT))                  \
                __ret = __cgroup_bpf_run_filter_setsockopt(sock, level,        \
                                                           optname, optval,    \
                                                           optlen,             \
@@ -367,7 +382,8 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key,
                                       max_optlen, retval)                     \
 ({                                                                            \
        int __ret = retval;                                                    \
-       if (cgroup_bpf_enabled(CGROUP_GETSOCKOPT))                             \
+       if (cgroup_bpf_enabled(CGROUP_GETSOCKOPT) &&                           \
+           cgroup_bpf_sock_enabled(sock, CGROUP_GETSOCKOPT))                  \
                if (!(sock)->sk_prot->bpf_bypass_getsockopt ||                 \
                    !INDIRECT_CALL_INET_1((sock)->sk_prot->bpf_bypass_getsockopt, \
                                        tcp_bpf_bypass_getsockopt,             \
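
Unrolled for readability, the ingress macro above now implements a two-level fast path: a static key that compiles the whole thing out while no program was ever attached, plus a per-socket check that skips sockets whose cgroup holds only the empty effective array. Sketch only; the real macro also threads the return value through:

	if (cgroup_bpf_enabled(CGROUP_INET_INGRESS) &&		/* static key */
	    cgroup_bpf_sock_enabled(sk, CGROUP_INET_INGRESS))	/* array check */
		ret = __cgroup_bpf_run_filter_skb(sk, skb,
						  CGROUP_INET_INGRESS);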
index fa517ae..2fc7e5c 100644 (file)
@@ -194,6 +194,17 @@ struct bpf_map {
        struct work_struct work;
        struct mutex freeze_mutex;
        atomic64_t writecnt;
+       /* 'Ownership' of program-containing map is claimed by the first program
+        * that is going to use this map or by the first program whose FD is
+        * stored in the map, to make sure that all callers and callees have
+        * the same prog type, JITed flag and xdp_has_frags flag.
+        */
+       struct {
+               spinlock_t lock;
+               enum bpf_prog_type type;
+               bool jited;
+               bool xdp_has_frags;
+       } owner;
 };
 
 static inline bool map_value_has_spin_lock(const struct bpf_map *map)
@@ -321,7 +332,10 @@ enum bpf_type_flag {
         */
        MEM_ALLOC               = BIT(2 + BPF_BASE_TYPE_BITS),
 
-       __BPF_TYPE_LAST_FLAG    = MEM_ALLOC,
+       /* MEM is in user address space. */
+       MEM_USER                = BIT(3 + BPF_BASE_TYPE_BITS),
+
+       __BPF_TYPE_LAST_FLAG    = MEM_USER,
 };
 
 /* Max number of base types. */
@@ -577,8 +591,7 @@ struct bpf_verifier_ops {
                                 const struct btf *btf,
                                 const struct btf_type *t, int off, int size,
                                 enum bpf_access_type atype,
-                                u32 *next_btf_id);
-       bool (*check_kfunc_call)(u32 kfunc_btf_id, struct module *owner);
+                                u32 *next_btf_id, enum bpf_type_flag *flag);
 };
 
 struct bpf_prog_offload_ops {
@@ -833,8 +846,8 @@ void bpf_image_ksym_add(void *data, struct bpf_ksym *ksym);
 void bpf_image_ksym_del(struct bpf_ksym *ksym);
 void bpf_ksym_add(struct bpf_ksym *ksym);
 void bpf_ksym_del(struct bpf_ksym *ksym);
-int bpf_jit_charge_modmem(u32 pages);
-void bpf_jit_uncharge_modmem(u32 pages);
+int bpf_jit_charge_modmem(u32 size);
+void bpf_jit_uncharge_modmem(u32 size);
 bool bpf_prog_has_trampoline(const struct bpf_prog *prog);
 #else
 static inline int bpf_trampoline_link_prog(struct bpf_prog *prog,
@@ -939,6 +952,8 @@ struct bpf_prog_aux {
        bool func_proto_unreliable;
        bool sleepable;
        bool tail_call_reachable;
+       bool xdp_has_frags;
+       bool use_bpf_prog_pack;
        struct hlist_node tramp_hlist;
        /* BTF_KIND_FUNC_PROTO for valid attach_btf_id */
        const struct btf_type *attach_func_proto;
@@ -999,16 +1014,6 @@ struct bpf_prog_aux {
 };
 
 struct bpf_array_aux {
-       /* 'Ownership' of prog array is claimed by the first program that
-        * is going to use this map or by the first program which FD is
-        * stored in the map to make sure that all callers and callees have
-        * the same prog type and JITed flag.
-        */
-       struct {
-               spinlock_t lock;
-               enum bpf_prog_type type;
-               bool jited;
-       } owner;
        /* Programs with direct jumps into programs part of this array. */
        struct list_head poke_progs;
        struct bpf_map *map;
@@ -1183,7 +1188,14 @@ struct bpf_event_entry {
        struct rcu_head rcu;
 };
 
-bool bpf_prog_array_compatible(struct bpf_array *array, const struct bpf_prog *fp);
+static inline bool map_type_contains_progs(struct bpf_map *map)
+{
+       return map->map_type == BPF_MAP_TYPE_PROG_ARRAY ||
+              map->map_type == BPF_MAP_TYPE_DEVMAP ||
+              map->map_type == BPF_MAP_TYPE_CPUMAP;
+}
+
+bool bpf_prog_map_compatible(struct bpf_map *map, const struct bpf_prog *fp);
 int bpf_prog_calc_tag(struct bpf_prog *fp);
 
 const struct bpf_func_proto *bpf_get_trace_printk_proto(void);
@@ -1225,6 +1237,19 @@ struct bpf_prog_array {
        struct bpf_prog_array_item items[];
 };
 
+struct bpf_empty_prog_array {
+       struct bpf_prog_array hdr;
+       struct bpf_prog *null_prog;
+};
+
+/* To avoid allocating an empty bpf_prog_array for cgroups that don't
+ * have any bpf program attached, use the single global
+ * 'bpf_empty_prog_array'. It will not be modified by the caller of
+ * bpf_prog_array_alloc() (since the caller requested prog_cnt == 0),
+ * and that pointer should still be 'freed' by bpf_prog_array_free().
+ */
+extern struct bpf_empty_prog_array bpf_empty_prog_array;
+
 struct bpf_prog_array *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags);
 void bpf_prog_array_free(struct bpf_prog_array *progs);
 int bpf_prog_array_length(struct bpf_prog_array *progs);
@@ -1251,6 +1276,7 @@ struct bpf_run_ctx {};
 struct bpf_cg_run_ctx {
        struct bpf_run_ctx run_ctx;
        const struct bpf_prog_array_item *prog_item;
+       int retval;
 };
 
 struct bpf_trace_run_ctx {
@@ -1283,19 +1309,19 @@ static inline void bpf_reset_run_ctx(struct bpf_run_ctx *old_ctx)
 
 typedef u32 (*bpf_prog_run_fn)(const struct bpf_prog *prog, const void *ctx);
 
-static __always_inline u32
+static __always_inline int
 BPF_PROG_RUN_ARRAY_CG_FLAGS(const struct bpf_prog_array __rcu *array_rcu,
                            const void *ctx, bpf_prog_run_fn run_prog,
-                           u32 *ret_flags)
+                           int retval, u32 *ret_flags)
 {
        const struct bpf_prog_array_item *item;
        const struct bpf_prog *prog;
        const struct bpf_prog_array *array;
        struct bpf_run_ctx *old_run_ctx;
        struct bpf_cg_run_ctx run_ctx;
-       u32 ret = 1;
        u32 func_ret;
 
+       run_ctx.retval = retval;
        migrate_disable();
        rcu_read_lock();
        array = rcu_dereference(array_rcu);
@@ -1304,27 +1330,29 @@ BPF_PROG_RUN_ARRAY_CG_FLAGS(const struct bpf_prog_array __rcu *array_rcu,
        while ((prog = READ_ONCE(item->prog))) {
                run_ctx.prog_item = item;
                func_ret = run_prog(prog, ctx);
-               ret &= (func_ret & 1);
+               if (!(func_ret & 1) && !IS_ERR_VALUE((long)run_ctx.retval))
+                       run_ctx.retval = -EPERM;
                *(ret_flags) |= (func_ret >> 1);
                item++;
        }
        bpf_reset_run_ctx(old_run_ctx);
        rcu_read_unlock();
        migrate_enable();
-       return ret;
+       return run_ctx.retval;
 }
 
-static __always_inline u32
+static __always_inline int
 BPF_PROG_RUN_ARRAY_CG(const struct bpf_prog_array __rcu *array_rcu,
-                     const void *ctx, bpf_prog_run_fn run_prog)
+                     const void *ctx, bpf_prog_run_fn run_prog,
+                     int retval)
 {
        const struct bpf_prog_array_item *item;
        const struct bpf_prog *prog;
        const struct bpf_prog_array *array;
        struct bpf_run_ctx *old_run_ctx;
        struct bpf_cg_run_ctx run_ctx;
-       u32 ret = 1;
 
+       run_ctx.retval = retval;
        migrate_disable();
        rcu_read_lock();
        array = rcu_dereference(array_rcu);
@@ -1332,13 +1360,14 @@ BPF_PROG_RUN_ARRAY_CG(const struct bpf_prog_array __rcu *array_rcu,
        old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx);
        while ((prog = READ_ONCE(item->prog))) {
                run_ctx.prog_item = item;
-               ret &= run_prog(prog, ctx);
+               if (!run_prog(prog, ctx) && !IS_ERR_VALUE((long)run_ctx.retval))
+                       run_ctx.retval = -EPERM;
                item++;
        }
        bpf_reset_run_ctx(old_run_ctx);
        rcu_read_unlock();
        migrate_enable();
-       return ret;
+       return run_ctx.retval;
 }
 
 static __always_inline u32
@@ -1391,19 +1420,21 @@ out:
  *   0: NET_XMIT_SUCCESS  skb should be transmitted
  *   1: NET_XMIT_DROP     skb should be dropped and cn
  *   2: NET_XMIT_CN       skb should be transmitted and cn
- *   3: -EPERM            skb should be dropped
+ *   3: -err              skb should be dropped
  */
 #define BPF_PROG_CGROUP_INET_EGRESS_RUN_ARRAY(array, ctx, func)                \
        ({                                              \
                u32 _flags = 0;                         \
                bool _cn;                               \
                u32 _ret;                               \
-               _ret = BPF_PROG_RUN_ARRAY_CG_FLAGS(array, ctx, func, &_flags); \
+               _ret = BPF_PROG_RUN_ARRAY_CG_FLAGS(array, ctx, func, 0, &_flags); \
                _cn = _flags & BPF_RET_SET_CN;          \
-               if (_ret)                               \
+               if (_ret && !IS_ERR_VALUE((long)_ret))  \
+                       _ret = -EFAULT;                 \
+               if (!_ret)                              \
                        _ret = (_cn ? NET_XMIT_CN : NET_XMIT_SUCCESS);  \
                else                                    \
-                       _ret = (_cn ? NET_XMIT_DROP : -EPERM);          \
+                       _ret = (_cn ? NET_XMIT_DROP : _ret);            \
                _ret;                                   \
        })
 
@@ -1724,7 +1755,6 @@ int bpf_prog_test_run_raw_tp(struct bpf_prog *prog,
 int bpf_prog_test_run_sk_lookup(struct bpf_prog *prog,
                                const union bpf_attr *kattr,
                                union bpf_attr __user *uattr);
-bool bpf_prog_test_check_kfunc_call(u32 kfunc_id, struct module *owner);
 bool btf_ctx_access(int off, int size, enum bpf_access_type type,
                    const struct bpf_prog *prog,
                    struct bpf_insn_access_aux *info);
@@ -1754,7 +1784,7 @@ static inline bool bpf_tracing_btf_ctx_access(int off, int size,
 int btf_struct_access(struct bpf_verifier_log *log, const struct btf *btf,
                      const struct btf_type *t, int off, int size,
                      enum bpf_access_type atype,
-                     u32 *next_btf_id);
+                     u32 *next_btf_id, enum bpf_type_flag *flag);
 bool btf_struct_ids_match(struct bpf_verifier_log *log,
                          const struct btf *btf, u32 id, int off,
                          const struct btf *need_btf, u32 need_type_id);
@@ -1862,11 +1892,6 @@ static inline int bpf_obj_get_user(const char __user *pathname, int flags)
        return -EOPNOTSUPP;
 }
 
-static inline bool dev_map_can_have_prog(struct bpf_map *map)
-{
-       return false;
-}
-
 static inline void __dev_flush(void)
 {
 }
@@ -1930,11 +1955,6 @@ static inline int cpu_map_generic_redirect(struct bpf_cpu_map_entry *rcpu,
        return -EOPNOTSUPP;
 }
 
-static inline bool cpu_map_prog_allowed(struct bpf_map *map)
-{
-       return false;
-}
-
 static inline struct bpf_prog *bpf_prog_get_type_path(const char *name,
                                enum bpf_prog_type type)
 {
@@ -1976,12 +1996,6 @@ static inline int bpf_prog_test_run_sk_lookup(struct bpf_prog *prog,
        return -ENOTSUPP;
 }
 
-static inline bool bpf_prog_test_check_kfunc_call(u32 kfunc_id,
-                                                 struct module *owner)
-{
-       return false;
-}
-
 static inline void bpf_map_put(struct bpf_map *map)
 {
 }
@@ -2076,6 +2090,9 @@ int bpf_prog_test_run_syscall(struct bpf_prog *prog,
 int sock_map_get_from_fd(const union bpf_attr *attr, struct bpf_prog *prog);
 int sock_map_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype);
 int sock_map_update_elem_sys(struct bpf_map *map, void *key, void *value, u64 flags);
+int sock_map_bpf_prog_query(const union bpf_attr *attr,
+                           union bpf_attr __user *uattr);
+
 void sock_map_unhash(struct sock *sk);
 void sock_map_close(struct sock *sk, long timeout);
 #else
@@ -2129,6 +2146,12 @@ static inline int sock_map_update_elem_sys(struct bpf_map *map, void *key, void
 {
        return -EOPNOTSUPP;
 }
+
+static inline int sock_map_bpf_prog_query(const union bpf_attr *attr,
+                                         union bpf_attr __user *uattr)
+{
+       return -EINVAL;
+}
 #endif /* CONFIG_BPF_SYSCALL */
 #endif /* CONFIG_NET && CONFIG_BPF_SYSCALL */
 
@@ -2227,6 +2250,7 @@ extern const struct bpf_func_proto bpf_kallsyms_lookup_name_proto;
 extern const struct bpf_func_proto bpf_find_vma_proto;
 extern const struct bpf_func_proto bpf_loop_proto;
 extern const struct bpf_func_proto bpf_strncmp_proto;
+extern const struct bpf_func_proto bpf_copy_from_user_task_proto;
 
 const struct bpf_func_proto *tracing_prog_func_proto(
   enum bpf_func_id func_id, const struct bpf_prog *prog);
@@ -2339,6 +2363,8 @@ enum bpf_text_poke_type {
 int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t,
                       void *addr1, void *addr2);
 
+void *bpf_arch_text_copy(void *dst, void *src, size_t len);
+
 struct btf_id_set;
 bool btf_id_set_contains(const struct btf_id_set *set, u32 id);
 
index e999317..7a7be8c 100644 (file)
@@ -521,6 +521,8 @@ bpf_prog_offload_remove_insns(struct bpf_verifier_env *env, u32 off, u32 cnt);
 
 int check_ptr_off_reg(struct bpf_verifier_env *env,
                      const struct bpf_reg_state *reg, int regno);
+int check_kfunc_mem_size_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
+                            u32 regno);
 int check_mem_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
                   u32 regno, u32 mem_size);
 
@@ -564,4 +566,9 @@ static inline u32 type_flag(u32 type)
        return type & ~BPF_BASE_TYPE_MASK;
 }
 
+static inline enum bpf_prog_type resolve_prog_type(struct bpf_prog *prog)
+{
+       return prog->aux->dst_prog ? prog->aux->dst_prog->type : prog->type;
+}
+
 #endif /* _LINUX_BPF_VERIFIER_H */
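
resolve_prog_type() matters mostly for BPF_PROG_TYPE_EXT: an extension program must be checked against the type of the program it extends (dst_prog), not against BPF_PROG_TYPE_EXT itself. A sketch of the intended call pattern inside a verifier-side check (allow_frags is a hypothetical local):

	enum bpf_prog_type type = resolve_prog_type(env->prog);
	bool allow_frags;

	if (type == BPF_PROG_TYPE_XDP)
		allow_frags = env->prog->aux->xdp_has_frags;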
index 0c74348..36bc09b 100644 (file)
 #define BTF_TYPE_EMIT(type) ((void)(type *)0)
 #define BTF_TYPE_EMIT_ENUM(enum_val) ((void)enum_val)
 
+enum btf_kfunc_type {
+       BTF_KFUNC_TYPE_CHECK,
+       BTF_KFUNC_TYPE_ACQUIRE,
+       BTF_KFUNC_TYPE_RELEASE,
+       BTF_KFUNC_TYPE_RET_NULL,
+       BTF_KFUNC_TYPE_MAX,
+};
+
 struct btf;
 struct btf_member;
 struct btf_type;
 union bpf_attr;
 struct btf_show;
+struct btf_id_set;
+
+struct btf_kfunc_id_set {
+       struct module *owner;
+       union {
+               struct {
+                       struct btf_id_set *check_set;
+                       struct btf_id_set *acquire_set;
+                       struct btf_id_set *release_set;
+                       struct btf_id_set *ret_null_set;
+               };
+               struct btf_id_set *sets[BTF_KFUNC_TYPE_MAX];
+       };
+};
 
 extern const struct file_operations btf_fops;
 
@@ -216,6 +238,11 @@ static inline bool btf_type_is_var(const struct btf_type *t)
        return BTF_INFO_KIND(t->info) == BTF_KIND_VAR;
 }
 
+static inline bool btf_type_is_type_tag(const struct btf_type *t)
+{
+       return BTF_INFO_KIND(t->info) == BTF_KIND_TYPE_TAG;
+}
+
 /* union is only a special case of struct:
  * all its offsetof(member) == 0
  */
@@ -300,6 +327,11 @@ static inline const struct btf_var_secinfo *btf_type_var_secinfo(
        return (const struct btf_var_secinfo *)(t + 1);
 }
 
+static inline struct btf_param *btf_params(const struct btf_type *t)
+{
+       return (struct btf_param *)(t + 1);
+}
+
 #ifdef CONFIG_BPF_SYSCALL
 struct bpf_prog;
 
@@ -307,6 +339,11 @@ const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id);
 const char *btf_name_by_offset(const struct btf *btf, u32 offset);
 struct btf *btf_parse_vmlinux(void);
 struct btf *bpf_prog_get_target_btf(const struct bpf_prog *prog);
+bool btf_kfunc_id_set_contains(const struct btf *btf,
+                              enum bpf_prog_type prog_type,
+                              enum btf_kfunc_type type, u32 kfunc_btf_id);
+int register_btf_kfunc_id_set(enum bpf_prog_type prog_type,
+                             const struct btf_kfunc_id_set *s);
 #else
 static inline const struct btf_type *btf_type_by_id(const struct btf *btf,
                                                    u32 type_id)
@@ -318,50 +355,18 @@ static inline const char *btf_name_by_offset(const struct btf *btf,
 {
        return NULL;
 }
-#endif
-
-struct kfunc_btf_id_set {
-       struct list_head list;
-       struct btf_id_set *set;
-       struct module *owner;
-};
-
-struct kfunc_btf_id_list {
-       struct list_head list;
-       struct mutex mutex;
-};
-
-#ifdef CONFIG_DEBUG_INFO_BTF_MODULES
-void register_kfunc_btf_id_set(struct kfunc_btf_id_list *l,
-                              struct kfunc_btf_id_set *s);
-void unregister_kfunc_btf_id_set(struct kfunc_btf_id_list *l,
-                                struct kfunc_btf_id_set *s);
-bool bpf_check_mod_kfunc_call(struct kfunc_btf_id_list *klist, u32 kfunc_id,
-                             struct module *owner);
-
-extern struct kfunc_btf_id_list bpf_tcp_ca_kfunc_list;
-extern struct kfunc_btf_id_list prog_test_kfunc_list;
-#else
-static inline void register_kfunc_btf_id_set(struct kfunc_btf_id_list *l,
-                                            struct kfunc_btf_id_set *s)
-{
-}
-static inline void unregister_kfunc_btf_id_set(struct kfunc_btf_id_list *l,
-                                              struct kfunc_btf_id_set *s)
+static inline bool btf_kfunc_id_set_contains(const struct btf *btf,
+                                            enum bpf_prog_type prog_type,
+                                            enum btf_kfunc_type type,
+                                            u32 kfunc_btf_id)
 {
+       return false;
 }
-static inline bool bpf_check_mod_kfunc_call(struct kfunc_btf_id_list *klist,
-                                           u32 kfunc_id, struct module *owner)
+static inline int register_btf_kfunc_id_set(enum bpf_prog_type prog_type,
+                                           const struct btf_kfunc_id_set *s)
 {
-       return false;
+       return 0;
 }
-
-static struct kfunc_btf_id_list bpf_tcp_ca_kfunc_list __maybe_unused;
-static struct kfunc_btf_id_list prog_test_kfunc_list __maybe_unused;
 #endif
 
-#define DEFINE_KFUNC_BTF_ID_SET(set, name)                                     \
-       struct kfunc_btf_id_set name = { LIST_HEAD_INIT(name.list), (set),     \
-                                        THIS_MODULE }
-
 #endif
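
Under the new scheme a subsystem declares its kfunc BTF ID sets once and registers them per program type. A usage sketch modeled on the in-tree conversions (the foo names are hypothetical):

BTF_SET_START(foo_kfunc_check_ids)
BTF_ID(func, foo_kfunc)
BTF_SET_END(foo_kfunc_check_ids)

static const struct btf_kfunc_id_set foo_kfunc_set = {
	.owner		= THIS_MODULE,
	.check_set	= &foo_kfunc_check_ids,
};

static int __init foo_init(void)
{
	return register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS,
					 &foo_kfunc_set);
}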
index 919c0fd..bc5d9cc 100644 (file)
@@ -11,6 +11,7 @@ struct btf_id_set {
 #ifdef CONFIG_DEBUG_INFO_BTF
 
 #include <linux/compiler.h> /* for __PASTE */
+#include <linux/compiler_attributes.h> /* for __maybe_unused */
 
 /*
  * Following macros help to define lists of BTF IDs placed
@@ -146,14 +147,14 @@ extern struct btf_id_set name;
 
 #else
 
-#define BTF_ID_LIST(name) static u32 name[5];
+#define BTF_ID_LIST(name) static u32 __maybe_unused name[5];
 #define BTF_ID(prefix, name)
 #define BTF_ID_UNUSED
-#define BTF_ID_LIST_GLOBAL(name, n) u32 name[n];
-#define BTF_ID_LIST_SINGLE(name, prefix, typename) static u32 name[1];
-#define BTF_ID_LIST_GLOBAL_SINGLE(name, prefix, typename) u32 name[1];
-#define BTF_SET_START(name) static struct btf_id_set name = { 0 };
-#define BTF_SET_START_GLOBAL(name) static struct btf_id_set name = { 0 };
+#define BTF_ID_LIST_GLOBAL(name, n) u32 __maybe_unused name[n];
+#define BTF_ID_LIST_SINGLE(name, prefix, typename) static u32 __maybe_unused name[1];
+#define BTF_ID_LIST_GLOBAL_SINGLE(name, prefix, typename) u32 __maybe_unused name[1];
+#define BTF_SET_START(name) static struct btf_id_set __maybe_unused name = { 0 };
+#define BTF_SET_START_GLOBAL(name) static struct btf_id_set __maybe_unused name = { 0 };
 #define BTF_SET_END(name)
 
 #endif /* CONFIG_DEBUG_INFO_BTF */
index 3c1795f..3f31ff4 100644 (file)
@@ -31,6 +31,9 @@ static inline void __chk_io_ptr(const volatile void __iomem *ptr) { }
 # define __kernel
 # ifdef STRUCTLEAK_PLUGIN
 #  define __user       __attribute__((user))
+# elif defined(CONFIG_DEBUG_INFO_BTF) && defined(CONFIG_PAHOLE_HAS_BTF_TAG) && \
+       __has_attribute(btf_type_tag)
+#  define __user       __attribute__((btf_type_tag("user")))
 # else
 #  define __user
 # endif
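
With CONFIG_DEBUG_INFO_BTF, a pahole that understands btf_type_tag, and compiler support, __user stops being erased at compile time and survives into BTF. Sketchily, and assuming the wiring added elsewhere in this series:

	/* a declaration such as */
	int __user *uptr;
	/* now records its pointee with a BTF_KIND_TYPE_TAG named "user",
	 * which the verifier can surface as the MEM_USER flag and restrict
	 * to the user-memory access helpers.
	 */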
diff --git a/include/linux/dsa/tag_qca.h b/include/linux/dsa/tag_qca.h
new file mode 100644 (file)
index 0000000..4359fb0
--- /dev/null
@@ -0,0 +1,82 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef __TAG_QCA_H
+#define __TAG_QCA_H
+
+#define QCA_HDR_LEN    2
+#define QCA_HDR_VERSION        0x2
+
+#define QCA_HDR_RECV_VERSION           GENMASK(15, 14)
+#define QCA_HDR_RECV_PRIORITY          GENMASK(13, 11)
+#define QCA_HDR_RECV_TYPE              GENMASK(10, 6)
+#define QCA_HDR_RECV_FRAME_IS_TAGGED   BIT(3)
+#define QCA_HDR_RECV_SOURCE_PORT       GENMASK(2, 0)
+
+/* Packet type for recv */
+#define QCA_HDR_RECV_TYPE_NORMAL       0x0
+#define QCA_HDR_RECV_TYPE_MIB          0x1
+#define QCA_HDR_RECV_TYPE_RW_REG_ACK   0x2
+
+#define QCA_HDR_XMIT_VERSION           GENMASK(15, 14)
+#define QCA_HDR_XMIT_PRIORITY          GENMASK(13, 11)
+#define QCA_HDR_XMIT_CONTROL           GENMASK(10, 8)
+#define QCA_HDR_XMIT_FROM_CPU          BIT(7)
+#define QCA_HDR_XMIT_DP_BIT            GENMASK(6, 0)
+
+/* Packet type for xmit */
+#define QCA_HDR_XMIT_TYPE_NORMAL       0x0
+#define QCA_HDR_XMIT_TYPE_RW_REG       0x1
+
+/* Check code for a valid mgmt packet. The switch will ignore packets
+ * with a wrong check code.
+ */
+#define QCA_HDR_MGMT_CHECK_CODE_VAL    0x5
+
+/* Specific defines for in-band MDIO read/write with Ethernet packets */
+#define QCA_HDR_MGMT_SEQ_LEN           4 /* 4 bytes for the seq */
+#define QCA_HDR_MGMT_COMMAND_LEN       4 /* 4 bytes for the command */
+#define QCA_HDR_MGMT_DATA1_LEN         4 /* First 4 bytes of mdio data */
+#define QCA_HDR_MGMT_HEADER_LEN                (QCA_HDR_MGMT_SEQ_LEN + \
+                                       QCA_HDR_MGMT_COMMAND_LEN + \
+                                       QCA_HDR_MGMT_DATA1_LEN)
+
+#define QCA_HDR_MGMT_DATA2_LEN         12 /* Remaining 12 bytes of mdio data */
+#define QCA_HDR_MGMT_PADDING_LEN       34 /* Padding to reach the min Ethernet frame size */
+
+#define QCA_HDR_MGMT_PKT_LEN           (QCA_HDR_MGMT_HEADER_LEN + \
+                                       QCA_HDR_LEN + \
+                                       QCA_HDR_MGMT_DATA2_LEN + \
+                                       QCA_HDR_MGMT_PADDING_LEN)
+
+#define QCA_HDR_MGMT_SEQ_NUM           GENMASK(31, 0)  /* 63, 32 */
+#define QCA_HDR_MGMT_CHECK_CODE                GENMASK(31, 29) /* 31, 29 */
+#define QCA_HDR_MGMT_CMD               BIT(28)         /* 28 */
+#define QCA_HDR_MGMT_LENGTH            GENMASK(23, 20) /* 23, 20 */
+#define QCA_HDR_MGMT_ADDR              GENMASK(18, 0)  /* 18, 0 */
+
+/* Special struct emulating an Ethernet header */
+struct qca_mgmt_ethhdr {
+       u32 command;            /* command bits 31:0 */
+       u32 seq;                /* seq bits 63:32 */
+       u32 mdio_data;          /* first 4 bytes of mdio data */
+       __be16 hdr;             /* qca hdr */
+} __packed;
+
+enum mdio_cmd {
+       MDIO_WRITE = 0x0,
+       MDIO_READ
+};
+
+struct mib_ethhdr {
+       u32 data[3];            /* first 3 MIB counters */
+       __be16 hdr;             /* qca hdr */
+} __packed;
+
+struct qca_tagger_data {
+       void (*rw_reg_ack_handler)(struct dsa_switch *ds,
+                                  struct sk_buff *skb);
+       void (*mib_autocast_handler)(struct dsa_switch *ds,
+                                    struct sk_buff *skb);
+};
+
+#endif /* __TAG_QCA_H */
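
The RECV field masks are laid out for FIELD_GET() from <linux/bitfield.h>. A sketch of extracting the source port and packet type from a received tag (hypothetical helper, not part of this header):

#include <linux/bitfield.h>

static void qca_tag_parse_example(__be16 tag, u8 *port, u8 *type)
{
	u16 hdr = be16_to_cpu(tag);

	*port = FIELD_GET(QCA_HDR_RECV_SOURCE_PORT, hdr);
	*type = FIELD_GET(QCA_HDR_RECV_TYPE, hdr);
}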
index 11efc45..e0853f4 100644 (file)
@@ -70,9 +70,11 @@ enum {
 /**
  * struct kernel_ethtool_ringparam - RX/TX ring configuration
  * @rx_buf_len: Current length of buffers on the rx ring.
+ * @tcp_data_split: Scatter packet headers and data to separate buffers
  */
 struct kernel_ethtool_ringparam {
        u32     rx_buf_len;
+       u8      tcp_data_split;
 };
 
 /**
index 71fa57b..1cb1af9 100644 (file)
@@ -548,7 +548,7 @@ struct sock_fprog_kern {
 #define BPF_IMAGE_ALIGNMENT 8
 
 struct bpf_binary_header {
-       u32 pages;
+       u32 size;
        u8 image[] __aligned(BPF_IMAGE_ALIGNMENT);
 };
 
@@ -886,17 +886,8 @@ static inline void bpf_prog_lock_ro(struct bpf_prog *fp)
 static inline void bpf_jit_binary_lock_ro(struct bpf_binary_header *hdr)
 {
        set_vm_flush_reset_perms(hdr);
-       set_memory_ro((unsigned long)hdr, hdr->pages);
-       set_memory_x((unsigned long)hdr, hdr->pages);
-}
-
-static inline struct bpf_binary_header *
-bpf_jit_binary_hdr(const struct bpf_prog *fp)
-{
-       unsigned long real_start = (unsigned long)fp->bpf_func;
-       unsigned long addr = real_start & PAGE_MASK;
-
-       return (void *)addr;
+       set_memory_ro((unsigned long)hdr, hdr->size >> PAGE_SHIFT);
+       set_memory_x((unsigned long)hdr, hdr->size >> PAGE_SHIFT);
 }
 
 int sk_filter_trim_cap(struct sock *sk, struct sk_buff *skb, unsigned int cap);
@@ -1068,6 +1059,18 @@ void *bpf_jit_alloc_exec(unsigned long size);
 void bpf_jit_free_exec(void *addr);
 void bpf_jit_free(struct bpf_prog *fp);
 
+struct bpf_binary_header *
+bpf_jit_binary_pack_alloc(unsigned int proglen, u8 **ro_image,
+                         unsigned int alignment,
+                         struct bpf_binary_header **rw_hdr,
+                         u8 **rw_image,
+                         bpf_jit_fill_hole_t bpf_fill_ill_insns);
+int bpf_jit_binary_pack_finalize(struct bpf_prog *prog,
+                                struct bpf_binary_header *ro_header,
+                                struct bpf_binary_header *rw_header);
+void bpf_jit_binary_pack_free(struct bpf_binary_header *ro_header,
+                             struct bpf_binary_header *rw_header);
+
 int bpf_jit_add_poke_descriptor(struct bpf_prog *prog,
                                struct bpf_jit_poke_descriptor *poke);
 
@@ -1356,7 +1359,10 @@ struct bpf_sockopt_kern {
        s32             level;
        s32             optname;
        s32             optlen;
-       s32             retval;
+       /* for retval in struct bpf_cg_run_ctx */
+       struct task_struct *current_task;
+       /* Temporary "register" for indirect stores to ppos. */
+       u64             tmp_reg;
 };
 
 int copy_bpf_fprog_from_user(struct sock_fprog *dst, sockptr_t src, int len);
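
The pack allocator splits a JIT image into a writable scratch copy and the final read-only region. A hedged sketch of the expected flow in a JIT, condensed from how the x86 side uses it (proglen, prog and jit_fill_hole stand in for the JIT's own state):

	u8 *ro_image, *rw_image;
	struct bpf_binary_header *ro_header, *rw_header;

	ro_header = bpf_jit_binary_pack_alloc(proglen, &ro_image, sizeof(long),
					      &rw_header, &rw_image,
					      jit_fill_hole);
	if (!ro_header)
		return NULL;

	/* emit instructions into rw_image, computing addresses
	 * relative to ro_image
	 */

	/* copies the scratch image into the RO region and frees the scratch */
	if (bpf_jit_binary_pack_finalize(prog, ro_header, rw_header))
		return NULL;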
index a59d25f..16870f8 100644 (file)
@@ -51,7 +51,7 @@ struct ipv6_devconf {
        __s32           use_optimistic;
 #endif
 #ifdef CONFIG_IPV6_MROUTE
-       __s32           mc_forwarding;
+       atomic_t        mc_forwarding;
 #endif
        __s32           disable_ipv6;
        __s32           drop_unicast_in_l2_multicast;
@@ -371,19 +371,12 @@ static inline struct ipv6_pinfo * inet6_sk(const struct sock *__sk)
        return NULL;
 }
 
-static inline struct inet6_request_sock *
-                       inet6_rsk(const struct request_sock *rsk)
-{
-       return NULL;
-}
-
 static inline struct raw6_sock *raw6_sk(const struct sock *sk)
 {
        return NULL;
 }
 
 #define inet6_rcv_saddr(__sk)  NULL
-#define tcp_twsk_ipv6only(__sk)                0
 #define inet_v6_ipv6only(__sk)         0
 #endif /* IS_ENABLED(CONFIG_IPV6) */
 #endif /* _IPV6_H */
index f8397f3..15e0e02 100644 (file)
@@ -66,11 +66,6 @@ static inline void linkmode_mod_bit(int nr, volatile unsigned long *addr,
                linkmode_clear_bit(nr, addr);
 }
 
-static inline void linkmode_change_bit(int nr, volatile unsigned long *addr)
-{
-       __change_bit(nr, addr);
-}
-
 static inline int linkmode_test_bit(int nr, const volatile unsigned long *addr)
 {
        return test_bit(nr, addr);
index 12ea29e..5ee1308 100644 (file)
@@ -355,56 +355,6 @@ static inline u32 mii_adv_to_ethtool_adv_x(u32 adv)
 }
 
 /**
- * mii_lpa_mod_linkmode_adv_sgmii
- * @lp_advertising: pointer to destination link mode.
- * @lpa: value of the MII_LPA register
- *
- * A small helper function that translates MII_LPA bits to
- * linkmode advertisement settings for SGMII.
- * Leaves other bits unchanged.
- */
-static inline void
-mii_lpa_mod_linkmode_lpa_sgmii(unsigned long *lp_advertising, u32 lpa)
-{
-       u32 speed_duplex = lpa & LPA_SGMII_DPX_SPD_MASK;
-
-       linkmode_mod_bit(ETHTOOL_LINK_MODE_1000baseT_Half_BIT, lp_advertising,
-                        speed_duplex == LPA_SGMII_1000HALF);
-
-       linkmode_mod_bit(ETHTOOL_LINK_MODE_1000baseT_Full_BIT, lp_advertising,
-                        speed_duplex == LPA_SGMII_1000FULL);
-
-       linkmode_mod_bit(ETHTOOL_LINK_MODE_100baseT_Half_BIT, lp_advertising,
-                        speed_duplex == LPA_SGMII_100HALF);
-
-       linkmode_mod_bit(ETHTOOL_LINK_MODE_100baseT_Full_BIT, lp_advertising,
-                        speed_duplex == LPA_SGMII_100FULL);
-
-       linkmode_mod_bit(ETHTOOL_LINK_MODE_10baseT_Half_BIT, lp_advertising,
-                        speed_duplex == LPA_SGMII_10HALF);
-
-       linkmode_mod_bit(ETHTOOL_LINK_MODE_10baseT_Full_BIT, lp_advertising,
-                        speed_duplex == LPA_SGMII_10FULL);
-}
-
-/**
- * mii_lpa_to_linkmode_adv_sgmii
- * @advertising: pointer to destination link mode.
- * @lpa: value of the MII_LPA register
- *
- * A small helper function that translates MII_ADVERTISE bits
- * to linkmode advertisement settings when in SGMII mode.
- * Clears the old value of advertising.
- */
-static inline void mii_lpa_to_linkmode_lpa_sgmii(unsigned long *lp_advertising,
-                                                u32 lpa)
-{
-       linkmode_zero(lp_advertising);
-
-       mii_lpa_mod_linkmode_lpa_sgmii(lp_advertising, lpa);
-}
-
-/**
  * mii_adv_mod_linkmode_adv_t
  * @advertising:pointer to destination link mode.
  * @adv: value of the MII_ADVERTISE register
index 598ac3b..27145c4 100644 (file)
@@ -64,13 +64,6 @@ enum {
 };
 
 enum {
-       MLX5_MODIFY_TIR_BITMASK_LRO                   = 0x0,
-       MLX5_MODIFY_TIR_BITMASK_INDIRECT_TABLE        = 0x1,
-       MLX5_MODIFY_TIR_BITMASK_HASH                  = 0x2,
-       MLX5_MODIFY_TIR_BITMASK_TUNNELED_OFFLOAD_EN   = 0x3
-};
-
-enum {
        MLX5_SET_HCA_CAP_OP_MOD_GENERAL_DEVICE        = 0x0,
        MLX5_SET_HCA_CAP_OP_MOD_ODP                   = 0x2,
        MLX5_SET_HCA_CAP_OP_MOD_ATOMIC                = 0x3,
index 6b3267b..ed42bd5 100644 (file)
@@ -26,11 +26,6 @@ struct i40e_client_version {
        u8 rsvd;
 };
 
-enum i40e_client_state {
-       __I40E_CLIENT_NULL,
-       __I40E_CLIENT_REGISTERED
-};
-
 enum i40e_client_instance_state {
        __I40E_CLIENT_INSTANCE_NONE,
        __I40E_CLIENT_INSTANCE_OPENED,
@@ -190,11 +185,6 @@ struct i40e_client {
        const struct i40e_client_ops *ops; /* client ops provided by the client */
 };
 
-static inline bool i40e_client_is_registered(struct i40e_client *client)
-{
-       return test_bit(__I40E_CLIENT_REGISTERED, &client->state);
-}
-
 void i40e_client_device_register(struct i40e_info *ldev, struct i40e_client *client);
 void i40e_client_device_unregister(struct i40e_info *ldev);
 
index 1289593..1c1332e 100644 (file)
@@ -32,6 +32,8 @@ enum iidc_rdma_protocol {
 };
 
 #define IIDC_MAX_USER_PRIORITY         8
+#define IIDC_MAX_DSCP_MAPPING          64
+#define IIDC_DSCP_PFC_MODE             0x1
 
 /* Struct to hold per RDMA Qset info */
 struct iidc_rdma_qset_params {
@@ -60,6 +62,8 @@ struct iidc_qos_params {
        u8 vport_relative_bw;
        u8 vport_priority_type;
        u8 num_tc;
+       u8 pfc_mode;
+       u8 dscp_map[IIDC_MAX_DSCP_MAPPING];
 };
 
 struct iidc_event {
index e490b84..5f6e2c0 100644 (file)
@@ -1948,6 +1948,8 @@ enum netdev_ml_priv_type {
  *     @dev_addr_shadow:       Copy of @dev_addr to catch direct writes.
  *     @linkwatch_dev_tracker: refcount tracker used by linkwatch.
  *     @watchdog_dev_tracker:  refcount tracker used by watchdog.
+ *     @dev_registered_tracker:        tracker for reference held while
+ *                                     registered
  *
  *     FIXME: cleanup struct net_device such that network protocol info
  *     moves out.
@@ -2282,6 +2284,7 @@ struct net_device {
        u8 dev_addr_shadow[MAX_ADDR_LEN];
        netdevice_tracker       linkwatch_dev_tracker;
        netdevice_tracker       watchdog_dev_tracker;
+       netdevice_tracker       dev_registered_tracker;
 };
 #define to_net_dev(d) container_of(d, struct net_device, dev)
 
@@ -3817,14 +3820,7 @@ extern unsigned int      netdev_budget_usecs;
 /* Called by rtnetlink.c:rtnl_unlock() */
 void netdev_run_todo(void);
 
-/**
- *     dev_put - release reference to device
- *     @dev: network device
- *
- * Release reference to device to allow it to be freed.
- * Try using dev_put_track() instead.
- */
-static inline void dev_put(struct net_device *dev)
+static inline void __dev_put(struct net_device *dev)
 {
        if (dev) {
 #ifdef CONFIG_PCPU_DEV_REFCNT
@@ -3835,14 +3831,7 @@ static inline void dev_put(struct net_device *dev)
        }
 }
 
-/**
- *     dev_hold - get reference to device
- *     @dev: network device
- *
- * Hold reference to device to keep it from being freed.
- * Try using dev_hold_track() instead.
- */
-static inline void dev_hold(struct net_device *dev)
+static inline void __dev_hold(struct net_device *dev)
 {
        if (dev) {
 #ifdef CONFIG_PCPU_DEV_REFCNT
@@ -3853,11 +3842,24 @@ static inline void dev_hold(struct net_device *dev)
        }
 }
 
+static inline void __netdev_tracker_alloc(struct net_device *dev,
+                                         netdevice_tracker *tracker,
+                                         gfp_t gfp)
+{
+#ifdef CONFIG_NET_DEV_REFCNT_TRACKER
+       ref_tracker_alloc(&dev->refcnt_tracker, tracker, gfp);
+#endif
+}
+
+/* netdev_tracker_alloc() can upgrade a prior untracked reference
+ * taken by dev_get_by_name()/dev_get_by_index() to a tracked one.
+ */
 static inline void netdev_tracker_alloc(struct net_device *dev,
                                        netdevice_tracker *tracker, gfp_t gfp)
 {
 #ifdef CONFIG_NET_DEV_REFCNT_TRACKER
-       ref_tracker_alloc(&dev->refcnt_tracker, tracker, gfp);
+       refcount_dec(&dev->refcnt_tracker.no_tracker);
+       __netdev_tracker_alloc(dev, tracker, gfp);
 #endif
 }
 
@@ -3873,8 +3875,8 @@ static inline void dev_hold_track(struct net_device *dev,
                                  netdevice_tracker *tracker, gfp_t gfp)
 {
        if (dev) {
-               dev_hold(dev);
-               netdev_tracker_alloc(dev, tracker, gfp);
+               __dev_hold(dev);
+               __netdev_tracker_alloc(dev, tracker, gfp);
        }
 }
 
@@ -3883,10 +3885,34 @@ static inline void dev_put_track(struct net_device *dev,
 {
        if (dev) {
                netdev_tracker_free(dev, tracker);
-               dev_put(dev);
+               __dev_put(dev);
        }
 }
 
+/**
+ *     dev_hold - get reference to device
+ *     @dev: network device
+ *
+ * Hold reference to device to keep it from being freed.
+ * Try using dev_hold_track() instead.
+ */
+static inline void dev_hold(struct net_device *dev)
+{
+       dev_hold_track(dev, NULL, GFP_ATOMIC);
+}
+
+/**
+ *     dev_put - release reference to device
+ *     @dev: network device
+ *
+ * Release reference to device to allow it to be freed.
+ * Try using dev_put_track() instead.
+ */
+static inline void dev_put(struct net_device *dev)
+{
+       dev_put_track(dev, NULL);
+}
+
 static inline void dev_replace_track(struct net_device *odev,
                                     struct net_device *ndev,
                                     netdevice_tracker *tracker,
@@ -3895,11 +3921,11 @@ static inline void dev_replace_track(struct net_device *odev,
        if (odev)
                netdev_tracker_free(odev, tracker);
 
-       dev_hold(ndev);
-       dev_put(odev);
+       __dev_hold(ndev);
+       __dev_put(odev);
 
        if (ndev)
-               netdev_tracker_alloc(ndev, tracker, gfp);
+               __netdev_tracker_alloc(ndev, tracker, gfp);
 }
 
 /* Carrier loss detection, dial on demand. The functions netif_carrier_on
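
The tracked variants are now the canonical entry points; dev_hold()/dev_put() are simply the untracked (NULL tracker) special case. Typical tracked usage, sketched with a hypothetical struct:

struct foo {
	struct net_device *dev;
	netdevice_tracker dev_tracker;	/* one tracker per held reference */
};

static void foo_attach(struct foo *f, struct net_device *dev)
{
	f->dev = dev;
	dev_hold_track(dev, &f->dev_tracker, GFP_KERNEL);
}

static void foo_detach(struct foo *f)
{
	dev_put_track(f->dev, &f->dev_tracker);	/* pairs with the hold */
	f->dev = NULL;
}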
index 15e71bf..c2c6f33 100644 (file)
@@ -379,6 +379,7 @@ struct nf_nat_hook {
        unsigned int (*manip_pkt)(struct sk_buff *skb, struct nf_conn *ct,
                                  enum nf_nat_manip_type mtype,
                                  enum ip_conntrack_dir dir);
+       void (*remove_nat_bysrc)(struct nf_conn *ct);
 };
 
 extern const struct nf_nat_hook __rcu *nf_nat_hook;
index a28aa28..c3bdb43 100644 (file)
@@ -300,26 +300,22 @@ union pptp_ctrl_union {
        struct PptpSetLinkInfo          setlink;
 };
 
-extern int
-(*nf_nat_pptp_hook_outbound)(struct sk_buff *skb,
-                            struct nf_conn *ct, enum ip_conntrack_info ctinfo,
-                            unsigned int protoff,
-                            struct PptpControlHeader *ctlh,
-                            union pptp_ctrl_union *pptpReq);
-
-extern int
-(*nf_nat_pptp_hook_inbound)(struct sk_buff *skb,
-                           struct nf_conn *ct, enum ip_conntrack_info ctinfo,
-                           unsigned int protoff,
-                           struct PptpControlHeader *ctlh,
-                           union pptp_ctrl_union *pptpReq);
-
-extern void
-(*nf_nat_pptp_hook_exp_gre)(struct nf_conntrack_expect *exp_orig,
-                           struct nf_conntrack_expect *exp_reply);
-
-extern void
-(*nf_nat_pptp_hook_expectfn)(struct nf_conn *ct,
-                            struct nf_conntrack_expect *exp);
+struct nf_nat_pptp_hook {
+       int (*outbound)(struct sk_buff *skb,
+                       struct nf_conn *ct, enum ip_conntrack_info ctinfo,
+                       unsigned int protoff,
+                       struct PptpControlHeader *ctlh,
+                       union pptp_ctrl_union *pptpReq);
+       int (*inbound)(struct sk_buff *skb,
+                      struct nf_conn *ct, enum ip_conntrack_info ctinfo,
+                      unsigned int protoff,
+                      struct PptpControlHeader *ctlh,
+                      union pptp_ctrl_union *pptpReq);
+       void (*exp_gre)(struct nf_conntrack_expect *exp_orig,
+                       struct nf_conntrack_expect *exp_reply);
+       void (*expectfn)(struct nf_conn *ct,
+                        struct nf_conntrack_expect *exp);
+};
 
+extern const struct nf_nat_pptp_hook __rcu *nf_nat_pptp_hook;
 #endif /* _NF_CONNTRACK_PPTP_H */
index 1ec6318..bda1c38 100644 (file)
@@ -135,15 +135,6 @@ static inline void nl_set_extack_cookie_u64(struct netlink_ext_ack *extack,
        extack->cookie_len = sizeof(cookie);
 }
 
-static inline void nl_set_extack_cookie_u32(struct netlink_ext_ack *extack,
-                                           u32 cookie)
-{
-       if (!extack)
-               return;
-       memcpy(extack->cookie, &cookie, sizeof(cookie));
-       extack->cookie_len = sizeof(cookie);
-}
-
 void netlink_kernel_release(struct sock *sk);
 int __netlink_change_ngroups(struct sock *sk, unsigned int groups);
 int netlink_change_ngroups(struct sock *sk, unsigned int groups);
index add077a..266eb26 100644 (file)
@@ -31,8 +31,7 @@ void xpcs_link_up(struct phylink_pcs *pcs, unsigned int mode,
                  phy_interface_t interface, int speed, int duplex);
 int xpcs_do_config(struct dw_xpcs *xpcs, phy_interface_t interface,
                   unsigned int mode);
-void xpcs_validate(struct dw_xpcs *xpcs, unsigned long *supported,
-                  struct phylink_link_state *state);
+void xpcs_get_interfaces(struct dw_xpcs *xpcs, unsigned long *interfaces);
 int xpcs_config_eee(struct dw_xpcs *xpcs, int mult_fact_100ns,
                    int enable);
 struct dw_xpcs *xpcs_create(struct mdio_device *mdiodev,
index 6de8d7a..cd08cf1 100644 (file)
@@ -1661,7 +1661,7 @@ int phy_disable_interrupts(struct phy_device *phydev);
 void phy_request_interrupt(struct phy_device *phydev);
 void phy_free_interrupt(struct phy_device *phydev);
 void phy_print_status(struct phy_device *phydev);
-int phy_set_max_speed(struct phy_device *phydev, u32 max_speed);
+void phy_set_max_speed(struct phy_device *phydev, u32 max_speed);
 void phy_remove_link_mode(struct phy_device *phydev, u32 link_mode);
 void phy_advertise_supported(struct phy_device *phydev);
 void phy_support_sym_pause(struct phy_device *phydev);
index 713a0c9..cca149f 100644 (file)
@@ -582,7 +582,6 @@ int phylink_speed_up(struct phylink *pl);
 #define phylink_test(bm, mode) __phylink_do_bit(test_bit, bm, mode)
 
 void phylink_set_port_modes(unsigned long *bits);
-void phylink_set_10g_modes(unsigned long *mask);
 void phylink_helper_basex_speed(struct phylink_link_state *state);
 
 void phylink_mii_c22_pcs_decode_state(struct phylink_link_state *state,
index 60f3453..9ca353a 100644 (file)
@@ -13,6 +13,8 @@ struct ref_tracker_dir {
        spinlock_t              lock;
        unsigned int            quarantine_avail;
        refcount_t              untracked;
+       refcount_t              no_tracker;
+       bool                    dead;
        struct list_head        list; /* List of active trackers */
        struct list_head        quarantine; /* List of dead trackers */
 #endif
@@ -26,7 +28,9 @@ static inline void ref_tracker_dir_init(struct ref_tracker_dir *dir,
        INIT_LIST_HEAD(&dir->quarantine);
        spin_lock_init(&dir->lock);
        dir->quarantine_avail = quarantine_count;
+       dir->dead = false;
        refcount_set(&dir->untracked, 1);
+       refcount_set(&dir->no_tracker, 1);
        stack_depot_init();
 }
 
index 8a636e6..a5adbf6 100644 (file)
@@ -314,12 +314,38 @@ struct sk_buff;
  * used to translate the reason to string.
  */
 enum skb_drop_reason {
-       SKB_DROP_REASON_NOT_SPECIFIED,
-       SKB_DROP_REASON_NO_SOCKET,
-       SKB_DROP_REASON_PKT_TOO_SMALL,
-       SKB_DROP_REASON_TCP_CSUM,
-       SKB_DROP_REASON_SOCKET_FILTER,
-       SKB_DROP_REASON_UDP_CSUM,
+       SKB_DROP_REASON_NOT_SPECIFIED,  /* drop reason is not specified */
+       SKB_DROP_REASON_NO_SOCKET,      /* socket not found */
+       SKB_DROP_REASON_PKT_TOO_SMALL,  /* packet size is too small */
+       SKB_DROP_REASON_TCP_CSUM,       /* TCP checksum error */
+       SKB_DROP_REASON_SOCKET_FILTER,  /* dropped by socket filter */
+       SKB_DROP_REASON_UDP_CSUM,       /* UDP checksum error */
+       SKB_DROP_REASON_NETFILTER_DROP, /* dropped by netfilter */
+       SKB_DROP_REASON_OTHERHOST,      /* packet doesn't belong to the
+                                        * current host (interface is in
+                                        * promiscuous mode)
+                                        */
+       SKB_DROP_REASON_IP_CSUM,        /* IP checksum error */
+       SKB_DROP_REASON_IP_INHDR,       /* there is something wrong with
+                                        * the IP header (see
+                                        * IPSTATS_MIB_INHDRERRORS)
+                                        */
+       SKB_DROP_REASON_IP_RPFILTER,    /* IP rpfilter validation failed;
+                                        * see the rp_filter documentation
+                                        * in ip-sysctl.rst for more
+                                        * information
+                                        */
+       SKB_DROP_REASON_UNICAST_IN_L2_MULTICAST, /* the L2 destination
+                                                 * address is multicast,
+                                                 * but the L3 address is
+                                                 * unicast
+                                                 */
+       SKB_DROP_REASON_XFRM_POLICY,    /* xfrm policy check failed */
+       SKB_DROP_REASON_IP_NOPROTO,     /* no support for IP protocol */
+       SKB_DROP_REASON_SOCKET_RCVBUFF, /* socket receive buffer is full */
+       SKB_DROP_REASON_PROTO_MEM,      /* protocol memory limitation,
+                                        * e.g. a UDP packet dropped when
+                                        * udp_memory_allocated is exceeded
+                                        */
        SKB_DROP_REASON_MAX,
 };
 
@@ -557,6 +583,7 @@ struct skb_shared_info {
         * Warning : all fields before dataref are cleared in __alloc_skb()
         */
        atomic_t        dataref;
+       unsigned int    xdp_frags_size;
 
        /* Intermediate layers must ensure that destructor_arg
         * remains valid until skb destructor */
@@ -3898,11 +3925,6 @@ static inline ktime_t net_timedelta(ktime_t t)
        return ktime_sub(ktime_get_real(), t);
 }
 
-static inline ktime_t net_invalid_timestamp(void)
-{
-       return 0;
-}
-
 static inline u8 skb_metadata_len(const struct sk_buff *skb)
 {
        return skb_shinfo(skb)->meta_len;
index 18a717f..fdb5375 100644 (file)
@@ -29,7 +29,7 @@ struct sk_msg_sg {
        u32                             end;
        u32                             size;
        u32                             copybreak;
-       unsigned long                   copy;
+       DECLARE_BITMAP(copy, MAX_MSG_FRAGS + 2);
        /* The extra two elements:
         * 1) used for chaining the front and sections when the list becomes
         *    partitioned (e.g. end < start). The crypto APIs require the
@@ -38,7 +38,6 @@ struct sk_msg_sg {
         */
        struct scatterlist              data[MAX_MSG_FRAGS + 2];
 };
-static_assert(BITS_PER_LONG >= NR_MSG_FRAG_IDS);
 
 /* UAPI in filter.c depends on struct sk_msg_sg being first element. */
 struct sk_msg {
@@ -171,11 +170,6 @@ static inline u32 sk_msg_iter_dist(u32 start, u32 end)
 #define sk_msg_iter_next(msg, which)                   \
        sk_msg_iter_var_next(msg->sg.which)
 
-static inline void sk_msg_clear_meta(struct sk_msg *msg)
-{
-       memset(&msg->sg, 0, offsetofend(struct sk_msg_sg, copy));
-}
-
 static inline void sk_msg_init(struct sk_msg *msg)
 {
        BUILD_BUG_ON(ARRAY_SIZE(msg->sg.data) - 1 != NR_MSG_FRAG_IDS);
@@ -234,7 +228,7 @@ static inline void sk_msg_compute_data_pointers(struct sk_msg *msg)
 {
        struct scatterlist *sge = sk_msg_elem(msg, msg->sg.start);
 
-       if (test_bit(msg->sg.start, &msg->sg.copy)) {
+       if (test_bit(msg->sg.start, msg->sg.copy)) {
                msg->data = NULL;
                msg->data_end = NULL;
        } else {
@@ -253,7 +247,7 @@ static inline void sk_msg_page_add(struct sk_msg *msg, struct page *page,
        sg_set_page(sge, page, len, offset);
        sg_unmark_end(sge);
 
-       __set_bit(msg->sg.end, &msg->sg.copy);
+       __set_bit(msg->sg.end, msg->sg.copy);
        msg->sg.size += len;
        sk_msg_iter_next(msg, end);
 }
@@ -262,9 +256,9 @@ static inline void sk_msg_sg_copy(struct sk_msg *msg, u32 i, bool copy_state)
 {
        do {
                if (copy_state)
-                       __set_bit(i, &msg->sg.copy);
+                       __set_bit(i, msg->sg.copy);
                else
-                       __clear_bit(i, &msg->sg.copy);
+                       __clear_bit(i, msg->sg.copy);
                sk_msg_iter_var_next(i);
                if (i == msg->sg.end)
                        break;
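The copy bitmap switch matters because MAX_MSG_FRAGS tracks MAX_SKB_FRAGS, so MAX_MSG_FRAGS + 2 bits can outgrow a single unsigned long; DECLARE_BITMAP() sizes an array instead, and the bit helpers then take the array directly rather than a pointer to the long. A hedged sketch of the same pattern, with an illustrative size assuming MAX_SKB_FRAGS == 17:

#include <linux/bitmap.h>

#define EXAMPLE_NR_BITS 19      /* stands in for MAX_MSG_FRAGS + 2 */

struct example_frag_state {
        DECLARE_BITMAP(copy, EXAMPLE_NR_BITS); /* unsigned long copy[BITS_TO_LONGS(19)] */
};

static bool example_needs_copy(const struct example_frag_state *s, unsigned int i)
{
        return test_bit(i, s->copy);    /* the array itself, not &s->copy as before */
}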
index 571f605..382af90 100644 (file)
@@ -88,6 +88,7 @@ struct svc_xprt {
        struct list_head        xpt_users;      /* callbacks on free */
 
        struct net              *xpt_net;
+       netns_tracker           ns_tracker;
        const struct cred       *xpt_cred;
        struct rpc_xprt         *xpt_bc_xprt;   /* NFSv4.1 backchannel */
        struct rpc_xprt_switch  *xpt_bc_xps;    /* NFSv4.1 backchannel */
index 955ea4d..3cdc8d8 100644 (file)
@@ -284,6 +284,7 @@ struct rpc_xprt {
        } stat;
 
        struct net              *xprt_net;
+       netns_tracker           ns_tracker;
        const char              *servername;
        const char              *address_strings[RPC_DISPLAY_MAX];
 #if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
index ae66dad..254a265 100644 (file)
@@ -23,11 +23,6 @@ static inline struct udphdr *udp_hdr(const struct sk_buff *skb)
        return (struct udphdr *)skb_transport_header(skb);
 }
 
-static inline struct udphdr *inner_udp_hdr(const struct sk_buff *skb)
-{
-       return (struct udphdr *)skb_inner_transport_header(skb);
-}
-
 #define UDP_HTABLE_SIZE_MIN            (CONFIG_BASE_SMALL ? 128 : 256)
 
 static inline u32 udp_hashfn(const struct net *net, u32 num, u32 mask)
index 1198a2b..739285f 100644 (file)
@@ -273,6 +273,23 @@ static inline void iov_iter_reexpand(struct iov_iter *i, size_t count)
        i->count = count;
 }
 
+static inline int
+iov_iter_npages_cap(struct iov_iter *i, int maxpages, size_t max_bytes)
+{
+       size_t shorted = 0;
+       int npages;
+
+       if (iov_iter_count(i) > max_bytes) {
+               shorted = iov_iter_count(i) - max_bytes;
+               iov_iter_truncate(i, max_bytes);
+       }
+       npages = iov_iter_npages(i, maxpages);
+       if (shorted)
+               iov_iter_reexpand(i, iov_iter_count(i) + shorted);
+
+       return npages;
+}
+
 struct csum_state {
        __wsum csum;
        size_t off;
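iov_iter_npages_cap() answers how many pages at most max_bytes of the iterator would span, without consuming it: truncate, count, reexpand. A hedged usage sketch; the function name and budget are illustrative:

#include <linux/kernel.h>
#include <linux/uio.h>

static int example_pages_for_send(struct iov_iter *from, size_t budget)
{
        /* count only the pages covered by the first 'budget' bytes;
         * 'from' is restored before returning
         */
        return iov_iter_npages_cap(from, INT_MAX, budget);
}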
index 8221af1..0f9790c 100644 (file)
@@ -187,18 +187,12 @@ typedef struct {
 
 typedef struct ax25_route {
        struct ax25_route       *next;
-       refcount_t              refcount;
        ax25_address            callsign;
        struct net_device       *dev;
        ax25_digi               *digipeat;
        char                    ip_mode;
 } ax25_route;
 
-static inline void ax25_hold_route(ax25_route *ax25_rt)
-{
-       refcount_inc(&ax25_rt->refcount);
-}
-
 void __ax25_put_route(ax25_route *ax25_rt);
 
 extern rwlock_t ax25_route_lock;
@@ -213,12 +207,6 @@ static inline void ax25_route_lock_unuse(void)
        read_unlock(&ax25_route_lock);
 }
 
-static inline void ax25_put_route(ax25_route *ax25_rt)
-{
-       if (refcount_dec_and_test(&ax25_rt->refcount))
-               __ax25_put_route(ax25_rt);
-}
-
 typedef struct {
        char                    slave;                  /* slave_mode?   */
        struct timer_list       slave_timer;            /* timeout timer */
index 586f69d..f5caff1 100644 (file)
@@ -258,6 +258,15 @@ struct adv_info {
 
 #define HCI_ADV_TX_POWER_NO_PREFERENCE 0x7F
 
+struct monitored_device {
+       struct list_head list;
+
+       bdaddr_t bdaddr;
+       __u8     addr_type;
+       __u16    handle;
+       bool     notified;
+};
+
 struct adv_pattern {
        struct list_head list;
        __u8 ad_type;
@@ -294,6 +303,9 @@ struct adv_monitor {
 
 #define HCI_MAX_SHORT_NAME_LENGTH      10
 
+#define HCI_CONN_HANDLE_UNSET          0xffff
+#define HCI_CONN_HANDLE_MAX            0x0eff
+
 /* Min encryption key size to match with SMP */
 #define HCI_MIN_ENC_KEY_SIZE           7
 
@@ -591,6 +603,9 @@ struct hci_dev {
 
        struct delayed_work     interleave_scan;
 
+       struct list_head        monitored_devices;
+       bool                    advmon_pend_notify;
+
 #if IS_ENABLED(CONFIG_BT_LEDS)
        struct led_trigger      *power_led;
 #endif
@@ -1847,6 +1862,8 @@ void mgmt_adv_monitor_removed(struct hci_dev *hdev, u16 handle);
 int mgmt_phy_configuration_changed(struct hci_dev *hdev, struct sock *skip);
 int mgmt_add_adv_patterns_monitor_complete(struct hci_dev *hdev, u8 status);
 int mgmt_remove_adv_monitor_complete(struct hci_dev *hdev, u8 status);
+void mgmt_adv_monitor_device_lost(struct hci_dev *hdev, u16 handle,
+                                 bdaddr_t *bdaddr, u8 addr_type);
 
 u8 hci_le_conn_update(struct hci_conn *conn, u16 min, u16 max, u16 latency,
                      u16 to_multiplier);
index 107b25d..99266f7 100644 (file)
@@ -1104,3 +1104,19 @@ struct mgmt_ev_controller_resume {
 #define MGMT_WAKE_REASON_NON_BT_WAKE           0x0
 #define MGMT_WAKE_REASON_UNEXPECTED            0x1
 #define MGMT_WAKE_REASON_REMOTE_WAKE           0x2
+
+#define MGMT_EV_ADV_MONITOR_DEVICE_FOUND       0x002f
+struct mgmt_ev_adv_monitor_device_found {
+       __le16 monitor_handle;
+       struct mgmt_addr_info addr;
+       __s8   rssi;
+       __le32 flags;
+       __le16 eir_len;
+       __u8   eir[0];
+} __packed;
+
+#define MGMT_EV_ADV_MONITOR_DEVICE_LOST                0x0030
+struct mgmt_ev_adv_monitor_device_lost {
+       __le16 monitor_handle;
+       struct mgmt_addr_info addr;
+} __packed;
index 83cfd2d..7dead85 100644 (file)
@@ -699,20 +699,6 @@ static inline struct slave *bond_slave_has_mac(struct bonding *bond,
 }
 
 /* Caller must hold rcu_read_lock() for read */
-static inline struct slave *bond_slave_has_mac_rcu(struct bonding *bond,
-                                              const u8 *mac)
-{
-       struct list_head *iter;
-       struct slave *tmp;
-
-       bond_for_each_slave_rcu(bond, tmp, iter)
-               if (ether_addr_equal_64bits(mac, tmp->dev->dev_addr))
-                       return tmp;
-
-       return NULL;
-}
-
-/* Caller must hold rcu_read_lock() for read */
 static inline bool bond_slave_has_mac_rx(struct bonding *bond, const u8 *mac)
 {
        struct list_head *iter;
index 6ed0784..833672d 100644 (file)
@@ -227,6 +227,16 @@ static inline void wpan_phy_net_set(struct wpan_phy *wpan_phy, struct net *net)
        write_pnet(&wpan_phy->_net, net);
 }
 
+/**
+ * struct ieee802154_addr - IEEE802.15.4 device address
+ * @mode: Address mode from frame header. Can be one of:
+ *        - @IEEE802154_ADDR_NONE
+ *        - @IEEE802154_ADDR_SHORT
+ *        - @IEEE802154_ADDR_LONG
+ * @pan_id: The PAN ID this address belongs to
+ * @short_addr: address if @mode is @IEEE802154_ADDR_SHORT
+ * @extended_addr: address if @mode is @IEEE802154_ADDR_LONG
+ */
 struct ieee802154_addr {
        u8 mode;
        __le16 pan_id;
index 57b3e4e..fd1f62a 100644 (file)
@@ -278,6 +278,10 @@ struct dsa_port {
 
        u8                      devlink_port_setup:1;
 
+       /* Master state bits, valid only on CPU ports */
+       u8                      master_admin_up:1;
+       u8                      master_oper_up:1;
+
        u8                      setup:1;
 
        struct device_node      *dn;
@@ -478,6 +482,12 @@ static inline bool dsa_port_is_unused(struct dsa_port *dp)
        return dp->type == DSA_PORT_TYPE_UNUSED;
 }
 
+static inline bool dsa_port_master_is_operational(struct dsa_port *dp)
+{
+       return dsa_port_is_cpu(dp) && dp->master_admin_up &&
+              dp->master_oper_up;
+}
+
 static inline bool dsa_is_unused_port(struct dsa_switch *ds, int p)
 {
        return dsa_to_port(ds, p)->type == DSA_PORT_TYPE_UNUSED;
@@ -581,6 +591,24 @@ static inline bool dsa_is_upstream_port(struct dsa_switch *ds, int port)
        return port == dsa_upstream_port(ds, port);
 }
 
+/* Return true if this is a DSA port leading away from the CPU */
+static inline bool dsa_is_downstream_port(struct dsa_switch *ds, int port)
+{
+       return dsa_is_dsa_port(ds, port) && !dsa_is_upstream_port(ds, port);
+}
+
+/* Return the local port used to reach the CPU port */
+static inline unsigned int dsa_switch_upstream_port(struct dsa_switch *ds)
+{
+       struct dsa_port *dp;
+
+       dsa_switch_for_each_available_port(dp, ds) {
+               return dsa_upstream_port(ds, dp->index);
+       }
+
+       return ds->num_ports;
+}
+
 /* Return true if @upstream_ds is an upstream switch of @downstream_ds, meaning
  * that the routing port from @downstream_ds to @upstream_ds is also the port
  * which @downstream_ds uses to reach its dedicated CPU.
@@ -1036,6 +1064,13 @@ struct dsa_switch_ops {
        int     (*tag_8021q_vlan_add)(struct dsa_switch *ds, int port, u16 vid,
                                      u16 flags);
        int     (*tag_8021q_vlan_del)(struct dsa_switch *ds, int port, u16 vid);
+
+       /*
+        * DSA master tracking operations
+        */
+       void    (*master_state_change)(struct dsa_switch *ds,
+                                      const struct net_device *master,
+                                      bool operational);
 };
 
 #define DSA_DEVLINK_PARAM_DRIVER(_id, _name, _type, _cmodes)           \
@@ -1246,7 +1281,7 @@ module_exit(dsa_tag_driver_module_exit)
 /**
  * module_dsa_tag_drivers() - Helper macro for registering DSA tag
  * drivers
- * @__ops_array: Array of tag driver strucutres
+ * @__ops_array: Array of tag driver structures
  *
  * Helper macro for DSA tag drivers which do not do anything special
  * in module init/exit. Each module may only use this macro once, and
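The master_state_change op lets a switch driver react when the DSA master (the host NIC behind the CPU port) changes state, and dsa_port_master_is_operational() folds the two tracked bits into one check. A hedged sketch of a driver-side implementation; the private struct and flood knob are illustrative:

#include <net/dsa.h>

struct example_priv {
        bool host_flood;
};

static void example_master_state_change(struct dsa_switch *ds,
                                        const struct net_device *master,
                                        bool operational)
{
        struct example_priv *priv = ds->priv;

        /* e.g. flood unknown traffic toward the CPU port only while
         * the master can actually deliver it to the stack
         */
        priv->host_flood = operational;
}

static const struct dsa_switch_ops example_ops = {
        /* ...mandatory ops elided... */
        .master_state_change    = example_master_state_change,
};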
index 8f75802..a765fed 100644 (file)
@@ -29,46 +29,50 @@ struct napi_gro_cb {
        /* Number of segments aggregated. */
        u16     count;
 
-       /* Start offset for remote checksum offload */
-       u16     gro_remcsum_start;
+       /* Used in ipv6_gro_receive() and foo-over-udp */
+       u16     proto;
 
        /* jiffies when first packet was created/queued */
        unsigned long age;
 
-       /* Used in ipv6_gro_receive() and foo-over-udp */
-       u16     proto;
+       /* portion of the cb set to zero at every gro iteration */
+       struct_group(zeroed,
+
+               /* Start offset for remote checksum offload */
+               u16     gro_remcsum_start;
 
-       /* This is non-zero if the packet may be of the same flow. */
-       u8      same_flow:1;
+               /* This is non-zero if the packet may be of the same flow. */
+               u8      same_flow:1;
 
-       /* Used in tunnel GRO receive */
-       u8      encap_mark:1;
+               /* Used in tunnel GRO receive */
+               u8      encap_mark:1;
 
-       /* GRO checksum is valid */
-       u8      csum_valid:1;
+               /* GRO checksum is valid */
+               u8      csum_valid:1;
 
-       /* Number of checksums via CHECKSUM_UNNECESSARY */
-       u8      csum_cnt:3;
+               /* Number of checksums via CHECKSUM_UNNECESSARY */
+               u8      csum_cnt:3;
 
-       /* Free the skb? */
-       u8      free:2;
+               /* Free the skb? */
+               u8      free:2;
 #define NAPI_GRO_FREE            1
 #define NAPI_GRO_FREE_STOLEN_HEAD 2
 
-       /* Used in foo-over-udp, set in udp[46]_gro_receive */
-       u8      is_ipv6:1;
+               /* Used in foo-over-udp, set in udp[46]_gro_receive */
+               u8      is_ipv6:1;
 
-       /* Used in GRE, set in fou/gue_gro_receive */
-       u8      is_fou:1;
+               /* Used in GRE, set in fou/gue_gro_receive */
+               u8      is_fou:1;
 
-       /* Used to determine if flush_id can be ignored */
-       u8      is_atomic:1;
+               /* Used to determine if flush_id can be ignored */
+               u8      is_atomic:1;
 
-       /* Number of gro_receive callbacks this packet already went through */
-       u8 recursion_counter:4;
+               /* Number of gro_receive callbacks this packet already went through */
+               u8 recursion_counter:4;
 
-       /* GRO is done by frag_list pointer chaining. */
-       u8      is_flist:1;
+               /* GRO is done by frag_list pointer chaining. */
+               u8      is_flist:1;
+       );
 
        /* used to support CHECKSUM_COMPLETE for tunneling protocols */
        __wsum  csum;
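struct_group() wraps the listed members in a tagged anonymous struct, so the whole middle of the cb can be addressed as one object. The payoff is a single memset per GRO iteration instead of clearing fields one by one; a hedged sketch, assuming the NAPI_GRO_CB() accessor defined alongside this struct:

#include <linux/string.h>

static void example_gro_reset(struct sk_buff *skb)
{
        /* clears gro_remcsum_start through is_flist in one store */
        memset(&NAPI_GRO_CB(skb)->zeroed, 0, sizeof(NAPI_GRO_CB(skb)->zeroed));
}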
index 4ad47d9..3908296 100644 (file)
@@ -285,6 +285,14 @@ static inline int inet_csk_reqsk_queue_is_full(const struct sock *sk)
 bool inet_csk_reqsk_queue_drop(struct sock *sk, struct request_sock *req);
 void inet_csk_reqsk_queue_drop_and_put(struct sock *sk, struct request_sock *req);
 
+static inline unsigned long
+reqsk_timeout(struct request_sock *req, unsigned long max_timeout)
+{
+       u64 timeout = (u64)req->timeout << req->num_timeout;
+
+       return (unsigned long)min_t(u64, timeout, max_timeout);
+}
+
 static inline void inet_csk_prepare_for_destroy_sock(struct sock *sk)
 {
        /* The below has to be done to allow calling inet_csk_destroy_sock */
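reqsk_timeout() is the classic exponential SYN-ACK backoff, base << retransmit count, done in 64 bits so the shift cannot wrap before the clamp. A hedged usage sketch; with req->timeout of 1*HZ and num_timeout of 3 it yields 8*HZ, never more than TCP_RTO_MAX:

#include <net/tcp.h>

static void example_rearm(struct request_sock *req, struct timer_list *timer)
{
        mod_timer(timer, jiffies + reqsk_timeout(req, TCP_RTO_MAX));
}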
diff --git a/include/net/inet_dscp.h b/include/net/inet_dscp.h
new file mode 100644 (file)
index 0000000..72f250d
--- /dev/null
@@ -0,0 +1,57 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * inet_dscp.h: helpers for handling differentiated services codepoints (DSCP)
+ *
+ * DSCP is defined in RFC 2474:
+ *
+ *        0   1   2   3   4   5   6   7
+ *      +---+---+---+---+---+---+---+---+
+ *      |         DSCP          |  CU   |
+ *      +---+---+---+---+---+---+---+---+
+ *
+ *        DSCP: differentiated services codepoint
+ *        CU:   currently unused
+ *
+ * The whole DSCP + CU bits form the DS field.
+ * The DS field is also commonly called TOS or Traffic Class (for IPv6).
+ *
+ * Note: the CU bits are now used for Explicit Congestion Notification
+ *       (RFC 3168).
+ */
+
+#ifndef _INET_DSCP_H
+#define _INET_DSCP_H
+
+#include <linux/types.h>
+
+/* Special type for storing DSCP values.
+ *
+ * A dscp_t variable stores a DS field with the CU (ECN) bits cleared.
+ * Using dscp_t allows strict separation of DSCP and ECN bits, thus avoiding
+ * bugs where ECN bits are erroneously taken into account during FIB lookups
+ * or policy routing.
+ *
+ * Note: to get the real DSCP value contained in a dscp_t variable one would
+ * have to do a bit shift after calling inet_dscp_to_dsfield(). We could have
+ * a helper for that, but there are currently no users.
+ */
+typedef u8 __bitwise dscp_t;
+
+#define INET_DSCP_MASK 0xfc
+
+static inline dscp_t inet_dsfield_to_dscp(__u8 dsfield)
+{
+       return (__force dscp_t)(dsfield & INET_DSCP_MASK);
+}
+
+static inline __u8 inet_dscp_to_dsfield(dscp_t dscp)
+{
+       return (__force __u8)dscp;
+}
+
+static inline bool inet_validate_dscp(__u8 val)
+{
+       return !(val & ~INET_DSCP_MASK);
+}
+
+#endif /* _INET_DSCP_H */
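The intended round trip: validate a user-supplied TOS (rejecting values with ECN bits set), store it as dscp_t, and recombine with ECN bits only when a header is written. A hedged sketch; the function names are illustrative:

#include <linux/errno.h>
#include <net/inet_dscp.h>

static int example_store_tos(__u8 user_tos, dscp_t *slot)
{
        if (!inet_validate_dscp(user_tos))
                return -EINVAL;         /* ECN bits must be clear */
        *slot = inet_dsfield_to_dscp(user_tos);
        return 0;
}

static __u8 example_build_dsfield(dscp_t dscp, __u8 ecn)
{
        return inet_dscp_to_dsfield(dscp) | ecn;
}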
index dfd919b..463ae5d 100644 (file)
@@ -65,13 +65,13 @@ struct inet_timewait_sock {
        /* these three are in inet_sock */
        __be16                  tw_sport;
        /* And these are ours. */
-       unsigned int            tw_kill         : 1,
-                               tw_transparent  : 1,
+       unsigned int            tw_transparent  : 1,
                                tw_flowlabel    : 20,
-                               tw_pad          : 2,    /* 2 bits hole */
+                               tw_pad          : 3,    /* 3 bits hole */
                                tw_tos          : 8;
        u32                     tw_txhash;
        u32                     tw_priority;
+       u32                     tw_bslot; /* bind bucket slot */
        struct timer_list       tw_timer;
        struct inet_bind_bucket *tw_tb;
 };
@@ -110,8 +110,6 @@ static inline void inet_twsk_reschedule(struct inet_timewait_sock *tw, int timeo
 
 void inet_twsk_deschedule_put(struct inet_timewait_sock *tw);
 
-void inet_twsk_purge(struct inet_hashinfo *hashinfo, int family);
-
 static inline
 struct net *twsk_net(const struct inet_timewait_sock *twsk)
 {
index b51bae4..3984f2c 100644 (file)
@@ -517,7 +517,6 @@ void ip_dst_metrics_put(struct dst_entry *dst)
                kfree(p);
 }
 
-u32 ip_idents_reserve(u32 hash, int segs);
 void __ip_select_ident(struct net *net, struct iphdr *iph, int segs);
 
 static inline void ip_select_ident_segs(struct net *net, struct sk_buff *skb,
@@ -712,7 +711,7 @@ int ip_forward(struct sk_buff *skb);
  */
 
 void ip_options_build(struct sk_buff *skb, struct ip_options *opt,
-                     __be32 daddr, struct rtable *rt, int is_frag);
+                     __be32 daddr, struct rtable *rt);
 
 int __ip_options_echo(struct net *net, struct ip_options *dopt,
                      struct sk_buff *skb, const struct ip_options *sopt);
index c429770..6a82bcb 100644 (file)
@@ -17,6 +17,7 @@
 #include <linux/rcupdate.h>
 #include <net/fib_notifier.h>
 #include <net/fib_rules.h>
+#include <net/inet_dscp.h>
 #include <net/inetpeer.h>
 #include <linux/percpu.h>
 #include <linux/notifier.h>
@@ -24,7 +25,7 @@
 
 struct fib_config {
        u8                      fc_dst_len;
-       u8                      fc_tos;
+       dscp_t                  fc_dscp;
        u8                      fc_protocol;
        u8                      fc_scope;
        u8                      fc_type;
index 3afcb12..f693784 100644 (file)
@@ -15,9 +15,9 @@
 #include <linux/refcount.h>
 #include <linux/jump_label_ratelimit.h>
 #include <net/if_inet6.h>
-#include <net/ndisc.h>
 #include <net/flow.h>
 #include <net/flow_dissector.h>
+#include <net/inet_dscp.h>
 #include <net/snmp.h>
 #include <net/netns/hash.h>
 
@@ -437,8 +437,16 @@ struct ipv6_txoptions *ipv6_renew_options(struct sock *sk,
                                          struct ipv6_txoptions *opt,
                                          int newtype,
                                          struct ipv6_opt_hdr *newopt);
-struct ipv6_txoptions *ipv6_fixup_options(struct ipv6_txoptions *opt_space,
-                                         struct ipv6_txoptions *opt);
+struct ipv6_txoptions *__ipv6_fixup_options(struct ipv6_txoptions *opt_space,
+                                           struct ipv6_txoptions *opt);
+
+static inline struct ipv6_txoptions *
+ipv6_fixup_options(struct ipv6_txoptions *opt_space, struct ipv6_txoptions *opt)
+{
+       if (!opt)
+               return NULL;
+       return __ipv6_fixup_options(opt_space, opt);
+}
 
 bool ipv6_opt_accepted(const struct sock *sk, const struct sk_buff *skb,
                       const struct inet6_skb_parm *opt);
@@ -967,6 +975,11 @@ static inline u8 ip6_tclass(__be32 flowinfo)
        return ntohl(flowinfo & IPV6_TCLASS_MASK) >> IPV6_TCLASS_SHIFT;
 }
 
+static inline dscp_t ip6_dscp(__be32 flowinfo)
+{
+       return inet_dsfield_to_dscp(ip6_tclass(flowinfo));
+}
+
 static inline __be32 ip6_make_flowinfo(unsigned int tclass, __be32 flowlabel)
 {
        return htonl(tclass << IPV6_TCLASS_SHIFT) | flowlabel;
@@ -1020,7 +1033,7 @@ struct sk_buff *ip6_make_skb(struct sock *sk,
                             int getfrag(void *from, char *to, int offset,
                                         int len, int odd, struct sk_buff *skb),
                             void *from, int length, int transhdrlen,
-                            struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
+                            struct ipcm6_cookie *ipc6,
                             struct rt6_info *rt, unsigned int flags,
                             struct inet_cork_full *cork);
 
index 0a47791..5052c66 100644 (file)
@@ -1,6 +1,7 @@
 /* SPDX-License-Identifier: GPL-2.0 */
 #ifndef _IPV6_FRAG_H
 #define _IPV6_FRAG_H
+#include <linux/icmpv6.h>
 #include <linux/kernel.h>
 #include <net/addrconf.h>
 #include <net/ipv6.h>
index d524ffb..2c3bbc6 100644 (file)
@@ -464,6 +464,12 @@ void ieee802154_rx_irqsafe(struct ieee802154_hw *hw, struct sk_buff *skb,
  * ieee802154_wake_queue - wake ieee802154 queue
  * @hw: pointer as obtained from ieee802154_alloc_hw().
  *
+ * Transceivers usually have either one transmit framebuffer or one framebuffer
+ * for both transmitting and receiving. Hence, the core currently only handles
+ * one frame at a time for each phy, which means we have to stop the queue to
+ * keep new skbs from arriving during the transmission. The queue then needs to
+ * be woken up after the operation.
+ *
  * Drivers should use this function instead of netif_wake_queue.
  */
 void ieee802154_wake_queue(struct ieee802154_hw *hw);
@@ -472,6 +478,12 @@ void ieee802154_wake_queue(struct ieee802154_hw *hw);
  * ieee802154_stop_queue - stop ieee802154 queue
  * @hw: pointer as obtained from ieee802154_alloc_hw().
  *
+ * Transceivers usually have either one transmit framebuffer or one framebuffer
+ * for both transmitting and receiving. Hence, the core currently only handles
+ * one frame at a time for each phy, which means we need to tell upper layers to
+ * stop giving us new skbs while we are busy with the transmitted one. The queue
+ * must then be stopped before transmitting.
+ *
  * Drivers should use this function instead of netif_stop_queue.
  */
 void ieee802154_stop_queue(struct ieee802154_hw *hw);
index 7e35ec7..e80a4ba 100644 (file)
@@ -45,6 +45,11 @@ static inline bool mctp_address_ok(mctp_eid_t eid)
        return eid >= 8 && eid < 255;
 }
 
+static inline bool mctp_address_matches(mctp_eid_t match, mctp_eid_t eid)
+{
+       return match == eid || match == MCTP_ADDR_ANY;
+}
+
 static inline struct mctp_hdr *mctp_hdr(struct sk_buff *skb)
 {
        return (struct mctp_hdr *)skb_network_header(skb);
@@ -121,7 +126,7 @@ struct mctp_sock {
  */
 struct mctp_sk_key {
        mctp_eid_t      peer_addr;
-       mctp_eid_t      local_addr;
+       mctp_eid_t      local_addr; /* MCTP_ADDR_ANY for locally owned tags */
        __u8            tag; /* incoming tag match; invert TO for local */
 
        /* we hold a ref to sk when set */
@@ -158,6 +163,12 @@ struct mctp_sk_key {
         */
        unsigned long   dev_flow_state;
        struct mctp_dev *dev;
+
+       /* a tag allocated with the SIOCMCTPALLOCTAG ioctl will not expire
+        * automatically on timeout or response; instead, SIOCMCTPDROPTAG
+        * is used to release it.
+        */
+       bool            manual_alloc;
 };
 
 struct mctp_skb_cb {
@@ -234,6 +245,9 @@ int mctp_local_output(struct sock *sk, struct mctp_route *rt,
                      struct sk_buff *skb, mctp_eid_t daddr, u8 req_tag);
 
 void mctp_key_unref(struct mctp_sk_key *key);
+struct mctp_sk_key *mctp_alloc_local_tag(struct mctp_sock *msk,
+                                        mctp_eid_t daddr, mctp_eid_t saddr,
+                                        bool manual, u8 *tagp);
 
 /* routing <--> device interface */
 unsigned int mctp_default_net(struct net *net);
index 5b61c46..c4f5601 100644 (file)
@@ -63,7 +63,7 @@ struct net {
                                                 */
        spinlock_t              rules_mod_lock;
 
-       unsigned int            dev_unreg_count;
+       atomic_t                dev_unreg_count;
 
        unsigned int            dev_base_seq;   /* protected by rtnl_mutex */
        int                     ifindex;
@@ -513,4 +513,10 @@ static inline void fnhe_genid_bump(struct net *net)
        atomic_inc(&net->fnhe_genid);
 }
 
+#ifdef CONFIG_NET
+void net_ns_init(void);
+#else
+static inline void net_ns_init(void) {}
+#endif
+
 #endif /* __NET_NET_NAMESPACE_H */
index 7f44a77..4b2b7f8 100644 (file)
@@ -78,7 +78,6 @@ static inline void nf_ct_acct_update(struct nf_conn *ct, u32 dir,
 
 void nf_conntrack_acct_pernet_init(struct net *net);
 
-int nf_conntrack_acct_init(void);
 void nf_conntrack_acct_fini(void);
 
 #endif /* _NF_CONNTRACK_ACCT_H */
diff --git a/include/net/netfilter/nf_conntrack_bpf.h b/include/net/netfilter/nf_conntrack_bpf.h
new file mode 100644 (file)
index 0000000..a473b56
--- /dev/null
@@ -0,0 +1,23 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef _NF_CONNTRACK_BPF_H
+#define _NF_CONNTRACK_BPF_H
+
+#include <linux/btf.h>
+#include <linux/kconfig.h>
+
+#if (IS_BUILTIN(CONFIG_NF_CONNTRACK) && IS_ENABLED(CONFIG_DEBUG_INFO_BTF)) || \
+    (IS_MODULE(CONFIG_NF_CONNTRACK) && IS_ENABLED(CONFIG_DEBUG_INFO_BTF_MODULES))
+
+extern int register_nf_conntrack_bpf(void);
+
+#else
+
+static inline int register_nf_conntrack_bpf(void)
+{
+       return 0;
+}
+
+#endif
+
+#endif /* _NF_CONNTRACK_BPF_H */
index d932e22..6c4c490 100644 (file)
@@ -21,10 +21,10 @@ enum nf_ct_ecache_state {
 
 struct nf_conntrack_ecache {
        unsigned long cache;            /* bitops want long */
-       u16 missed;                     /* missed events */
        u16 ctmask;                     /* bitmask of ct events to be delivered */
        u16 expmask;                    /* bitmask of expect events to be delivered */
        enum nf_ct_ecache_state state:8;/* ecache state */
+       u32 missed;                     /* missed events */
        u32 portid;                     /* netlink portid of destroyer */
 };
 
@@ -166,9 +166,6 @@ void nf_conntrack_ecache_work(struct net *net, enum nf_ct_ecache_state state);
 void nf_conntrack_ecache_pernet_init(struct net *net);
 void nf_conntrack_ecache_pernet_fini(struct net *net);
 
-int nf_conntrack_ecache_init(void);
-void nf_conntrack_ecache_fini(void);
-
 static inline bool nf_conntrack_ecache_dwork_pending(const struct net *net)
 {
        return net->ct.ecache_dwork_pending;
@@ -194,16 +191,6 @@ static inline void nf_conntrack_ecache_pernet_init(struct net *net)
 static inline void nf_conntrack_ecache_pernet_fini(struct net *net)
 {
 }
-
-static inline int nf_conntrack_ecache_init(void)
-{
-       return 0;
-}
-
-static inline void nf_conntrack_ecache_fini(void)
-{
-}
-
 static inline bool nf_conntrack_ecache_dwork_pending(const struct net *net) { return false; }
 #endif /* CONFIG_NF_CONNTRACK_EVENTS */
 #endif /*_NF_CONNTRACK_ECACHE_H*/
index c7515d8..96635ad 100644 (file)
@@ -49,7 +49,7 @@ enum nf_ct_ext_id {
 struct nf_ct_ext {
        u8 offset[NF_CT_EXT_NUM];
        u8 len;
-       char data[];
+       char data[] __aligned(8);
 };
 
 static inline bool __nf_ct_ext_exist(const struct nf_ct_ext *ext, u8 id)
@@ -72,23 +72,7 @@ static inline void *__nf_ct_ext_find(const struct nf_conn *ct, u8 id)
 #define nf_ct_ext_find(ext, id)        \
        ((id##_TYPE *)__nf_ct_ext_find((ext), (id)))
 
-/* Destroy all relationships */
-void nf_ct_ext_destroy(struct nf_conn *ct);
-
 /* Add this type, returns pointer to data or NULL. */
 void *nf_ct_ext_add(struct nf_conn *ct, enum nf_ct_ext_id id, gfp_t gfp);
 
-struct nf_ct_ext_type {
-       /* Destroys relationships (can be NULL). */
-       void (*destroy)(struct nf_conn *ct);
-
-       enum nf_ct_ext_id id;
-
-       /* Length and min alignment. */
-       u8 len;
-       u8 align;
-};
-
-int nf_ct_extend_register(const struct nf_ct_ext_type *type);
-void nf_ct_extend_unregister(const struct nf_ct_ext_type *type);
 #endif /* _NF_CONNTRACK_EXTEND_H */
index ba91641..3c23298 100644 (file)
@@ -45,12 +45,9 @@ int nf_connlabels_replace(struct nf_conn *ct,
 
 #ifdef CONFIG_NF_CONNTRACK_LABELS
 int nf_conntrack_labels_init(void);
-void nf_conntrack_labels_fini(void);
 int nf_connlabels_get(struct net *net, unsigned int bit);
 void nf_connlabels_put(struct net *net);
 #else
-static inline int nf_conntrack_labels_init(void) { return 0; }
-static inline void nf_conntrack_labels_fini(void) {}
 static inline int nf_connlabels_get(struct net *net, unsigned int bit) { return 0; }
 static inline void nf_connlabels_put(struct net *net) {}
 #endif
index 0a10b50..883c414 100644 (file)
@@ -42,7 +42,4 @@ int nf_ct_seq_adjust(struct sk_buff *skb, struct nf_conn *ct,
                     enum ip_conntrack_info ctinfo, unsigned int protoff);
 s32 nf_ct_seq_offset(const struct nf_conn *ct, enum ip_conntrack_dir, u32 seq);
 
-int nf_conntrack_seqadj_init(void);
-void nf_conntrack_seqadj_fini(void);
-
 #endif /* _NF_CONNTRACK_SEQADJ_H */
index 659b0ea..3ea94f6 100644 (file)
@@ -89,23 +89,11 @@ static inline unsigned int *nf_ct_timeout_lookup(const struct nf_conn *ct)
 }
 
 #ifdef CONFIG_NF_CONNTRACK_TIMEOUT
-int nf_conntrack_timeout_init(void);
-void nf_conntrack_timeout_fini(void);
 void nf_ct_untimeout(struct net *net, struct nf_ct_timeout *timeout);
 int nf_ct_set_timeout(struct net *net, struct nf_conn *ct, u8 l3num, u8 l4num,
                      const char *timeout_name);
 void nf_ct_destroy_timeout(struct nf_conn *ct);
 #else
-static inline int nf_conntrack_timeout_init(void)
-{
-        return 0;
-}
-
-static inline void nf_conntrack_timeout_fini(void)
-{
-        return;
-}
-
 static inline int nf_ct_set_timeout(struct net *net, struct nf_conn *ct,
                                    u8 l3num, u8 l4num,
                                    const char *timeout_name)
@@ -120,8 +108,12 @@ static inline void nf_ct_destroy_timeout(struct nf_conn *ct)
 #endif /* CONFIG_NF_CONNTRACK_TIMEOUT */
 
 #ifdef CONFIG_NF_CONNTRACK_TIMEOUT
-extern struct nf_ct_timeout *(*nf_ct_timeout_find_get_hook)(struct net *net, const char *name);
-extern void (*nf_ct_timeout_put_hook)(struct nf_ct_timeout *timeout);
+struct nf_ct_timeout_hooks {
+       struct nf_ct_timeout *(*timeout_find_get)(struct net *net, const char *name);
+       void (*timeout_put)(struct nf_ct_timeout *timeout);
+};
+
+extern const struct nf_ct_timeout_hooks *nf_ct_timeout_hook;
 #endif
 
 #endif /* _NF_CONNTRACK_TIMEOUT_H */
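Folding the two exported pointers into one struct lets the nfnetlink_cttimeout module publish or retract both hooks with a single pointer update. A hedged sketch of the shape only; the lookup logic is elided and the real code guards the pointer (e.g. with RCU):

#include <net/netfilter/nf_conntrack_timeout.h>

static struct nf_ct_timeout *example_find_get(struct net *net, const char *name)
{
        return NULL;    /* lookup elided in this sketch */
}

static void example_put(struct nf_ct_timeout *timeout)
{
}

static const struct nf_ct_timeout_hooks example_hooks = {
        .timeout_find_get       = example_find_get,
        .timeout_put            = example_put,
};

/* module init/exit would then set and clear nf_ct_timeout_hook */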
index 820ea34..57138d9 100644 (file)
@@ -40,21 +40,8 @@ struct nf_conn_tstamp *nf_ct_tstamp_ext_add(struct nf_conn *ct, gfp_t gfp)
 
 #ifdef CONFIG_NF_CONNTRACK_TIMESTAMP
 void nf_conntrack_tstamp_pernet_init(struct net *net);
-
-int nf_conntrack_tstamp_init(void);
-void nf_conntrack_tstamp_fini(void);
 #else
 static inline void nf_conntrack_tstamp_pernet_init(struct net *net) {}
-
-static inline int nf_conntrack_tstamp_init(void)
-{
-       return 0;
-}
-
-static inline void nf_conntrack_tstamp_fini(void)
-{
-       return;
-}
 #endif /* CONFIG_NF_CONNTRACK_TIMESTAMP */
 
 #endif /* _NF_CONNTRACK_TSTAMP_H */
index b6fb1fd..0ea7c55 100644 (file)
@@ -42,6 +42,14 @@ struct nft_cmp_fast_expr {
        bool                    inv;
 };
 
+struct nft_cmp16_fast_expr {
+       struct nft_data         data;
+       struct nft_data         mask;
+       u8                      sreg;
+       u8                      len;
+       bool                    inv;
+};
+
 struct nft_immediate_expr {
        struct nft_data         data;
        u8                      dreg;
@@ -59,6 +67,7 @@ static inline u32 nft_cmp_fast_mask(unsigned int len)
 }
 
 extern const struct nft_expr_ops nft_cmp_fast_ops;
+extern const struct nft_expr_ops nft_cmp16_fast_ops;
 
 struct nft_payload {
        enum nft_payload_bases  base:8;
index 552bc25..388244e 100644 (file)
@@ -10,6 +10,7 @@ struct netns_core {
        struct ctl_table_header *sysctl_hdr;
 
        int     sysctl_somaxconn;
+       u8      sysctl_txrehash;
 
 #ifdef CONFIG_PROC_FS
        struct prot_inuse __percpu *prot_inuse;
index 7855764..f068786 100644 (file)
@@ -31,18 +31,16 @@ struct ping_group_range {
 struct inet_hashinfo;
 
 struct inet_timewait_death_row {
-       atomic_t                tw_count;
-       char                    tw_pad[L1_CACHE_BYTES - sizeof(atomic_t)];
+       refcount_t              tw_refcount;
 
-       struct inet_hashinfo    *hashinfo;
+       struct inet_hashinfo    *hashinfo ____cacheline_aligned_in_smp;
        int                     sysctl_max_tw_buckets;
 };
 
 struct tcp_fastopen_context;
 
 struct netns_ipv4 {
-       /* Please keep tcp_death_row at first field in netns_ipv4 */
-       struct inet_timewait_death_row tcp_death_row ____cacheline_aligned_in_smp;
+       struct inet_timewait_death_row *tcp_death_row;
 
 #ifdef CONFIG_SYSCTL
        struct ctl_table_header *forw_hdr;
@@ -70,11 +68,9 @@ struct netns_ipv4 {
        struct hlist_head       *fib_table_hash;
        struct sock             *fibnl;
 
-       struct sock  * __percpu *icmp_sk;
        struct sock             *mc_autojoin_sk;
 
        struct inet_peer_base   *peers;
-       struct sock  * __percpu *tcp_sk;
        struct fqdir            *fqdir;
 
        u8 sysctl_icmp_echo_ignore_all;
@@ -87,6 +83,7 @@ struct netns_ipv4 {
 
        u32 ip_rt_min_pmtu;
        int ip_rt_mtu_expires;
+       int ip_rt_min_advmss;
 
        struct local_ports ip_local_ports;
 
index a4b5503..d145f19 100644 (file)
@@ -88,11 +88,15 @@ struct netns_ipv6 {
        struct fib6_table       *fib6_local_tbl;
        struct fib_rules_ops    *fib6_rules_ops;
 #endif
-       struct sock * __percpu  *icmp_sk;
        struct sock             *ndisc_sk;
        struct sock             *tcp_sk;
        struct sock             *igmp_sk;
        struct sock             *mc_autojoin_sk;
+
+       struct hlist_head       *inet6_addr_lst;
+       spinlock_t              addrconf_hash_lock;
+       struct delayed_work     addr_chk_work;
+
 #ifdef CONFIG_IPV6_MROUTE
 #ifndef CONFIG_IPV6_MROUTE_MULTIPLE_TABLES
        struct mr_table         *mrt6;
index 79a8055..97c3c19 100644 (file)
@@ -201,21 +201,67 @@ static inline void page_pool_put_page_bulk(struct page_pool *pool, void **data,
 }
 #endif
 
-void page_pool_put_page(struct page_pool *pool, struct page *page,
-                       unsigned int dma_sync_size, bool allow_direct);
+void page_pool_put_defragged_page(struct page_pool *pool, struct page *page,
+                                 unsigned int dma_sync_size,
+                                 bool allow_direct);
 
-/* Same as above but will try to sync the entire area pool->max_len */
-static inline void page_pool_put_full_page(struct page_pool *pool,
-                                          struct page *page, bool allow_direct)
+static inline void page_pool_fragment_page(struct page *page, long nr)
+{
+       atomic_long_set(&page->pp_frag_count, nr);
+}
+
+static inline long page_pool_defrag_page(struct page *page, long nr)
+{
+       long ret;
+
+       /* If nr == pp_frag_count then we have cleared all remaining
+        * references to the page. No need to actually overwrite it, instead
+        * we can leave this to be overwritten by the calling function.
+        *
+        * The main advantage to doing this is that an atomic_read is
+        * generally a much cheaper operation than an atomic update,
+        * especially when dealing with a page that may be partitioned
+        * into only 2 or 3 pieces.
+        */
+       if (atomic_long_read(&page->pp_frag_count) == nr)
+               return 0;
+
+       ret = atomic_long_sub_return(nr, &page->pp_frag_count);
+       WARN_ON(ret < 0);
+       return ret;
+}
+
+static inline bool page_pool_is_last_frag(struct page_pool *pool,
+                                         struct page *page)
+{
+       /* If fragments aren't enabled or count is 0 we were the last user */
+       return !(pool->p.flags & PP_FLAG_PAGE_FRAG) ||
+              (page_pool_defrag_page(page, 1) == 0);
+}
+
+static inline void page_pool_put_page(struct page_pool *pool,
+                                     struct page *page,
+                                     unsigned int dma_sync_size,
+                                     bool allow_direct)
 {
        /* When page_pool isn't compiled-in, net/core/xdp.c doesn't
         * allow registering MEM_TYPE_PAGE_POOL, but shield linker.
         */
 #ifdef CONFIG_PAGE_POOL
-       page_pool_put_page(pool, page, -1, allow_direct);
+       if (!page_pool_is_last_frag(pool, page))
+               return;
+
+       page_pool_put_defragged_page(pool, page, dma_sync_size, allow_direct);
 #endif
 }
 
+/* Same as above but will try to sync the entire area pool->max_len */
+static inline void page_pool_put_full_page(struct page_pool *pool,
+                                          struct page *page, bool allow_direct)
+{
+       page_pool_put_page(pool, page, -1, allow_direct);
+}
+
 /* Same as above but the caller must guarantee safe context. e.g NAPI */
 static inline void page_pool_recycle_direct(struct page_pool *pool,
                                            struct page *page)
@@ -243,30 +289,6 @@ static inline void page_pool_set_dma_addr(struct page *page, dma_addr_t addr)
                page->dma_addr_upper = upper_32_bits(addr);
 }
 
-static inline void page_pool_set_frag_count(struct page *page, long nr)
-{
-       atomic_long_set(&page->pp_frag_count, nr);
-}
-
-static inline long page_pool_atomic_sub_frag_count_return(struct page *page,
-                                                         long nr)
-{
-       long ret;
-
-       /* As suggested by Alexander, atomic_long_read() may cover up the
-        * reference count errors, so avoid calling atomic_long_read() in
-        * the cases of freeing or draining the page_frags, where we would
-        * not expect it to match or that are slowpath anyway.
-        */
-       if (__builtin_constant_p(nr) &&
-           atomic_long_read(&page->pp_frag_count) == nr)
-               return 0;
-
-       ret = atomic_long_sub_return(nr, &page->pp_frag_count);
-       WARN_ON(ret < 0);
-       return ret;
-}
-
 static inline bool is_page_pool_compiled_in(void)
 {
 #ifdef CONFIG_PAGE_POOL
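With the rename, page_pool_put_page() first drops one fragment reference and only hands the page back when the caller was the last user. A hedged sketch of a driver splitting one pool page; it assumes the pool was created with PP_FLAG_PAGE_FRAG set:

#include <net/page_pool.h>

static void example_split_page(struct page *page)
{
        /* hand out two fragments backed by the same pool page */
        page_pool_fragment_page(page, 2);
}

static void example_release_frag(struct page_pool *pool, struct page *page)
{
        /* the page returns to the pool only when the second
         * fragment reference is dropped
         */
        page_pool_put_full_page(pool, page, false);
}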
index 676cb8e..a3b57a9 100644 (file)
@@ -1028,4 +1028,15 @@ struct tc_fifo_qopt_offload {
        };
 };
 
+#ifdef CONFIG_NET_CLS_ACT
+DECLARE_STATIC_KEY_FALSE(tc_skb_ext_tc);
+void tc_skb_ext_tc_enable(void);
+void tc_skb_ext_tc_disable(void);
+#define tc_skb_ext_tc_enabled() static_branch_unlikely(&tc_skb_ext_tc)
+#else /* CONFIG_NET_CLS_ACT */
+static inline void tc_skb_ext_tc_enable(void) { }
+static inline void tc_skb_ext_tc_disable(void) { }
+#define tc_skb_ext_tc_enabled() false
+#endif
+
 #endif
index 9e7b21c..44a3553 100644 (file)
@@ -63,12 +63,6 @@ static inline psched_time_t psched_get_time(void)
        return PSCHED_NS2TICKS(ktime_get_ns());
 }
 
-static inline psched_tdiff_t
-psched_tdiff_bounded(psched_time_t tv1, psched_time_t tv2, psched_time_t bound)
-{
-       return min(tv1 - tv2, bound);
-}
-
 struct qdisc_watchdog {
        u64             last_expires;
        struct hrtimer  timer;
index 29e41ff..144c39d 100644 (file)
@@ -70,6 +70,7 @@ struct request_sock {
        struct saved_syn                *saved_syn;
        u32                             secid;
        u32                             peer_secid;
+       u32                             timeout;
 };
 
 static inline struct request_sock *inet_reqsk(const struct sock *sk)
@@ -104,6 +105,7 @@ reqsk_alloc(const struct request_sock_ops *ops, struct sock *sk_listener,
        sk_node_init(&req_to_sk(req)->sk_node);
        sk_tx_queue_clear(req_to_sk(req));
        req->saved_syn = NULL;
+       req->timeout = 0;
        req->num_timeout = 0;
        req->num_retrans = 0;
        req->sk = NULL;
index 472843e..9bab396 100644 (file)
@@ -518,11 +518,6 @@ static inline void qdisc_cb_private_validate(const struct sk_buff *skb, int sz)
        BUILD_BUG_ON(sizeof(qcb->data) < sz);
 }
 
-static inline int qdisc_qlen_cpu(const struct Qdisc *q)
-{
-       return this_cpu_ptr(q->cpu_qstats)->qlen;
-}
-
 static inline int qdisc_qlen(const struct Qdisc *q)
 {
        return q->q.qlen;
index ff9b508..d6c13f0 100644 (file)
@@ -316,6 +316,7 @@ struct sk_filter;
   *    @sk_rcvtimeo: %SO_RCVTIMEO setting
   *    @sk_sndtimeo: %SO_SNDTIMEO setting
   *    @sk_txhash: computed flow hash for use on transmit
+  *    @sk_txrehash: enable TX hash rethink
   *    @sk_filter: socket filtering instructions
   *    @sk_timer: sock cleanup timer
   *    @sk_stamp: time stamp of last packet received
@@ -491,6 +492,7 @@ struct sock {
        u32                     sk_ack_backlog;
        u32                     sk_max_ack_backlog;
        kuid_t                  sk_uid;
+       u8                      sk_txrehash;
 #ifdef CONFIG_NET_RX_BUSY_POLL
        u8                      sk_prefer_busy_poll;
        u16                     sk_busy_poll_budget;
@@ -587,6 +589,18 @@ static inline bool sk_user_data_is_nocopy(const struct sock *sk)
                           __tmp | SK_USER_DATA_NOCOPY);                \
 })
 
+static inline
+struct net *sock_net(const struct sock *sk)
+{
+       return read_pnet(&sk->sk_net);
+}
+
+static inline
+void sock_net_set(struct sock *sk, struct net *net)
+{
+       write_pnet(&sk->sk_net, net);
+}
+
 /*
  * SK_CAN_REUSE and SK_NO_REUSE on a socket mean that the socket is OK
  * or not whether his port will be reused by someone else. SK_FORCE_REUSE
@@ -2054,7 +2068,7 @@ static inline void sk_set_txhash(struct sock *sk)
 
 static inline bool sk_rethink_txhash(struct sock *sk)
 {
-       if (sk->sk_txhash) {
+       if (sk->sk_txhash && sk->sk_txrehash == SOCK_TXREHASH_ENABLED) {
                sk_set_txhash(sk);
                return true;
        }
@@ -2704,18 +2718,6 @@ static inline void sk_eat_skb(struct sock *sk, struct sk_buff *skb)
        __kfree_skb(skb);
 }
 
-static inline
-struct net *sock_net(const struct sock *sk)
-{
-       return read_pnet(&sk->sk_net);
-}
-
-static inline
-void sock_net_set(struct sock *sk, struct net *net)
-{
-       write_pnet(&sk->sk_net, net);
-}
-
 static inline bool
 skb_sk_is_prefetched(struct sk_buff *skb)
 {
index b9fc978..eff2487 100644 (file)
@@ -2358,7 +2358,7 @@ static inline u32 tcp_timeout_init(struct sock *sk)
 
        if (timeout <= 0)
                timeout = TCP_TIMEOUT_INIT;
-       return timeout;
+       return min_t(int, timeout, TCP_RTO_MAX);
 }
 
 static inline u32 tcp_rwnd_init_bpf(struct sock *sk)
index 9185e45..a3c5311 100644 (file)
@@ -70,49 +70,6 @@ static inline int udplite_checksum_init(struct sk_buff *skb, struct udphdr *uh)
        return 0;
 }
 
-/* Slow-path computation of checksum. Socket is locked. */
-static inline __wsum udplite_csum_outgoing(struct sock *sk, struct sk_buff *skb)
-{
-       const struct udp_sock *up = udp_sk(skb->sk);
-       int cscov = up->len;
-       __wsum csum = 0;
-
-       if (up->pcflag & UDPLITE_SEND_CC) {
-               /*
-                * Sender has set `partial coverage' option on UDP-Lite socket.
-                * The special case "up->pcslen == 0" signifies full coverage.
-                */
-               if (up->pcslen < up->len) {
-                       if (0 < up->pcslen)
-                               cscov = up->pcslen;
-                       udp_hdr(skb)->len = htons(up->pcslen);
-               }
-               /*
-                * NOTE: Causes for the error case  `up->pcslen > up->len':
-                *        (i)  Application error (will not be penalized).
-                *       (ii)  Payload too big for send buffer: data is split
-                *             into several packets, each with its own header.
-                *             In this case (e.g. last segment), coverage may
-                *             exceed packet length.
-                *       Since packets with coverage length > packet length are
-                *       illegal, we fall back to the defaults here.
-                */
-       }
-
-       skb->ip_summed = CHECKSUM_NONE;     /* no HW support for checksumming */
-
-       skb_queue_walk(&sk->sk_write_queue, skb) {
-               const int off = skb_transport_offset(skb);
-               const int len = skb->len - off;
-
-               csum = skb_checksum(skb, off, (cscov > len)? len : cscov, csum);
-
-               if ((cscov -= len) <= 0)
-                       break;
-       }
-       return csum;
-}
-
 /* Fast-path computation of checksum. Socket may not be locked. */
 static inline __wsum udplite_csum(struct sk_buff *skb)
 {
index 8f0812e..b7721c3 100644 (file)
@@ -60,12 +60,20 @@ struct xdp_rxq_info {
        u32 reg_state;
        struct xdp_mem_info mem;
        unsigned int napi_id;
+       u32 frag_size;
 } ____cacheline_aligned; /* perf critical, avoid false-sharing */
 
 struct xdp_txq_info {
        struct net_device *dev;
 };
 
+enum xdp_buff_flags {
+       XDP_FLAGS_HAS_FRAGS             = BIT(0), /* non-linear xdp buff */
+       XDP_FLAGS_FRAGS_PF_MEMALLOC     = BIT(1), /* xdp paged memory is under
+                                                  * pressure
+                                                  */
+};
+
 struct xdp_buff {
        void *data;
        void *data_end;
@@ -74,13 +82,40 @@ struct xdp_buff {
        struct xdp_rxq_info *rxq;
        struct xdp_txq_info *txq;
        u32 frame_sz; /* frame size to deduce data_hard_end/reserved tailroom*/
+       u32 flags; /* supported values defined in xdp_buff_flags */
 };
 
+static __always_inline bool xdp_buff_has_frags(struct xdp_buff *xdp)
+{
+       return !!(xdp->flags & XDP_FLAGS_HAS_FRAGS);
+}
+
+static __always_inline void xdp_buff_set_frags_flag(struct xdp_buff *xdp)
+{
+       xdp->flags |= XDP_FLAGS_HAS_FRAGS;
+}
+
+static __always_inline void xdp_buff_clear_frags_flag(struct xdp_buff *xdp)
+{
+       xdp->flags &= ~XDP_FLAGS_HAS_FRAGS;
+}
+
+static __always_inline bool xdp_buff_is_frag_pfmemalloc(struct xdp_buff *xdp)
+{
+       return !!(xdp->flags & XDP_FLAGS_FRAGS_PF_MEMALLOC);
+}
+
+static __always_inline void xdp_buff_set_frag_pfmemalloc(struct xdp_buff *xdp)
+{
+       xdp->flags |= XDP_FLAGS_FRAGS_PF_MEMALLOC;
+}
+
 static __always_inline void
 xdp_init_buff(struct xdp_buff *xdp, u32 frame_sz, struct xdp_rxq_info *rxq)
 {
        xdp->frame_sz = frame_sz;
        xdp->rxq = rxq;
+       xdp->flags = 0;
 }
 
 static __always_inline void
@@ -111,6 +146,20 @@ xdp_get_shared_info_from_buff(struct xdp_buff *xdp)
        return (struct skb_shared_info *)xdp_data_hard_end(xdp);
 }
 
+static __always_inline unsigned int xdp_get_buff_len(struct xdp_buff *xdp)
+{
+       unsigned int len = xdp->data_end - xdp->data;
+       struct skb_shared_info *sinfo;
+
+       if (likely(!xdp_buff_has_frags(xdp)))
+               goto out;
+
+       sinfo = xdp_get_shared_info_from_buff(xdp);
+       len += sinfo->xdp_frags_size;
+out:
+       return len;
+}
+
 struct xdp_frame {
        void *data;
        u16 len;
@@ -122,8 +171,19 @@ struct xdp_frame {
         */
        struct xdp_mem_info mem;
        struct net_device *dev_rx; /* used by cpumap */
+       u32 flags; /* supported values defined in xdp_buff_flags */
 };
 
+static __always_inline bool xdp_frame_has_frags(struct xdp_frame *frame)
+{
+       return !!(frame->flags & XDP_FLAGS_HAS_FRAGS);
+}
+
+static __always_inline bool xdp_frame_is_frag_pfmemalloc(struct xdp_frame *frame)
+{
+       return !!(frame->flags & XDP_FLAGS_FRAGS_PF_MEMALLOC);
+}
+
 #define XDP_BULK_QUEUE_SIZE    16
 struct xdp_frame_bulk {
        int count;
@@ -159,6 +219,19 @@ static inline void xdp_scrub_frame(struct xdp_frame *frame)
        frame->dev_rx = NULL;
 }
 
+static inline void
+xdp_update_skb_shared_info(struct sk_buff *skb, u8 nr_frags,
+                          unsigned int size, unsigned int truesize,
+                          bool pfmemalloc)
+{
+       skb_shinfo(skb)->nr_frags = nr_frags;
+
+       skb->len += size;
+       skb->data_len += size;
+       skb->truesize += truesize;
+       skb->pfmemalloc |= pfmemalloc;
+}
+
 /* Avoids inlining WARN macro in fast-path */
 void xdp_warn(const char *msg, const char *func, const int line);
 #define XDP_WARN(msg) xdp_warn(msg, __func__, __LINE__)
@@ -180,6 +253,7 @@ void xdp_convert_frame_to_buff(struct xdp_frame *frame, struct xdp_buff *xdp)
        xdp->data_end = frame->data + frame->len;
        xdp->data_meta = frame->data - frame->metasize;
        xdp->frame_sz = frame->frame_sz;
+       xdp->flags = frame->flags;
 }
 
 static inline
@@ -206,6 +280,7 @@ int xdp_update_frame_from_buff(struct xdp_buff *xdp,
        xdp_frame->headroom = headroom - sizeof(*xdp_frame);
        xdp_frame->metasize = metasize;
        xdp_frame->frame_sz = xdp->frame_sz;
+       xdp_frame->flags = xdp->flags;
 
        return 0;
 }
@@ -230,6 +305,8 @@ struct xdp_frame *xdp_convert_buff_to_frame(struct xdp_buff *xdp)
        return xdp_frame;
 }
 
+void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct,
+                 struct xdp_buff *xdp);
 void xdp_return_frame(struct xdp_frame *xdpf);
 void xdp_return_frame_rx_napi(struct xdp_frame *xdpf);
 void xdp_return_buff(struct xdp_buff *xdp);
@@ -246,14 +323,37 @@ void __xdp_release_frame(void *data, struct xdp_mem_info *mem);
 static inline void xdp_release_frame(struct xdp_frame *xdpf)
 {
        struct xdp_mem_info *mem = &xdpf->mem;
+       struct skb_shared_info *sinfo;
+       int i;
 
        /* Curr only page_pool needs this */
-       if (mem->type == MEM_TYPE_PAGE_POOL)
-               __xdp_release_frame(xdpf->data, mem);
+       if (mem->type != MEM_TYPE_PAGE_POOL)
+               return;
+
+       if (likely(!xdp_frame_has_frags(xdpf)))
+               goto out;
+
+       sinfo = xdp_get_shared_info_from_frame(xdpf);
+       for (i = 0; i < sinfo->nr_frags; i++) {
+               struct page *page = skb_frag_page(&sinfo->frags[i]);
+
+               __xdp_release_frame(page_address(page), mem);
+       }
+out:
+       __xdp_release_frame(xdpf->data, mem);
+}
+
+int __xdp_rxq_info_reg(struct xdp_rxq_info *xdp_rxq,
+                      struct net_device *dev, u32 queue_index,
+                      unsigned int napi_id, u32 frag_size);
+static inline int
+xdp_rxq_info_reg(struct xdp_rxq_info *xdp_rxq,
+                struct net_device *dev, u32 queue_index,
+                unsigned int napi_id)
+{
+       return __xdp_rxq_info_reg(xdp_rxq, dev, queue_index, napi_id, 0);
 }
 
-int xdp_rxq_info_reg(struct xdp_rxq_info *xdp_rxq,
-                    struct net_device *dev, u32 queue_index, unsigned int napi_id);
 void xdp_rxq_info_unreg(struct xdp_rxq_info *xdp_rxq);
 void xdp_rxq_info_unused(struct xdp_rxq_info *xdp_rxq);
 bool xdp_rxq_info_is_reg(struct xdp_rxq_info *xdp_rxq);
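A multi-buffer-aware driver sets the frags flag once it attaches paged fragments and accounts their bytes in xdp_frags_size, which is what lets xdp_get_buff_len() report linear plus paged length. A hedged receive-side sketch; the function name is illustrative:

#include <linux/mm.h>
#include <linux/skbuff.h>
#include <net/xdp.h>

static void example_add_frag(struct xdp_buff *xdp, struct page *page,
                             unsigned int off, unsigned int len)
{
        struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(xdp);
        skb_frag_t *frag;

        if (!xdp_buff_has_frags(xdp)) {
                sinfo->nr_frags = 0;
                sinfo->xdp_frags_size = 0;
                xdp_buff_set_frags_flag(xdp);
        }

        frag = &sinfo->frags[sinfo->nr_frags++];
        __skb_frag_set_page(frag, page);
        skb_frag_off_set(frag, off);
        skb_frag_size_set(frag, len);
        sinfo->xdp_frags_size += len;

        if (page_is_pfmemalloc(page))
                xdp_buff_set_frag_pfmemalloc(xdp);

        /* xdp_get_buff_len(xdp) now covers linear + paged bytes */
}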
index 443d459..4aa0318 100644 (file)
@@ -13,7 +13,7 @@
 
 void xsk_tx_completed(struct xsk_buff_pool *pool, u32 nb_entries);
 bool xsk_tx_peek_desc(struct xsk_buff_pool *pool, struct xdp_desc *desc);
-u32 xsk_tx_peek_release_desc_batch(struct xsk_buff_pool *pool, struct xdp_desc *desc, u32 max);
+u32 xsk_tx_peek_release_desc_batch(struct xsk_buff_pool *pool, u32 max);
 void xsk_tx_release(struct xsk_buff_pool *pool);
 struct xsk_buff_pool *xsk_get_pool_from_qid(struct net_device *dev,
                                            u16 queue_id);
@@ -142,8 +142,7 @@ static inline bool xsk_tx_peek_desc(struct xsk_buff_pool *pool,
        return false;
 }
 
-static inline u32 xsk_tx_peek_release_desc_batch(struct xsk_buff_pool *pool, struct xdp_desc *desc,
-                                                u32 max)
+static inline u32 xsk_tx_peek_release_desc_batch(struct xsk_buff_pool *pool, u32 max)
 {
        return 0;
 }
index ddeefc4..5554ee7 100644 (file)
@@ -60,6 +60,7 @@ struct xsk_buff_pool {
         */
        dma_addr_t *dma_pages;
        struct xdp_buff_xsk *heads;
+       struct xdp_desc *tx_descs;
        u64 chunk_mask;
        u64 addrs_cnt;
        u32 free_list_cnt;
index 175b057..165cf25 100644 (file)
@@ -15,6 +15,7 @@ enum {
        MCTP_TRACE_KEY_REPLIED,
        MCTP_TRACE_KEY_INVALIDATED,
        MCTP_TRACE_KEY_CLOSED,
+       MCTP_TRACE_KEY_DROPPED,
 };
 #endif /* __TRACE_MCTP_ENUMS */
 
@@ -22,6 +23,7 @@ TRACE_DEFINE_ENUM(MCTP_TRACE_KEY_TIMEOUT);
 TRACE_DEFINE_ENUM(MCTP_TRACE_KEY_REPLIED);
 TRACE_DEFINE_ENUM(MCTP_TRACE_KEY_INVALIDATED);
 TRACE_DEFINE_ENUM(MCTP_TRACE_KEY_CLOSED);
+TRACE_DEFINE_ENUM(MCTP_TRACE_KEY_DROPPED);
 
 TRACE_EVENT(mctp_key_acquire,
        TP_PROTO(const struct mctp_sk_key *key),
@@ -66,7 +68,8 @@ TRACE_EVENT(mctp_key_release,
                                 { MCTP_TRACE_KEY_TIMEOUT, "timeout" },
                                 { MCTP_TRACE_KEY_REPLIED, "replied" },
                                 { MCTP_TRACE_KEY_INVALIDATED, "invalidated" },
-                                { MCTP_TRACE_KEY_CLOSED, "closed" })
+                                { MCTP_TRACE_KEY_CLOSED, "closed" },
+                                { MCTP_TRACE_KEY_DROPPED, "dropped" })
        )
 );
 
index a8a64b9..cfcfd26 100644 (file)
        EM(SKB_DROP_REASON_TCP_CSUM, TCP_CSUM)                  \
        EM(SKB_DROP_REASON_SOCKET_FILTER, SOCKET_FILTER)        \
        EM(SKB_DROP_REASON_UDP_CSUM, UDP_CSUM)                  \
+       EM(SKB_DROP_REASON_NETFILTER_DROP, NETFILTER_DROP)      \
+       EM(SKB_DROP_REASON_OTHERHOST, OTHERHOST)                \
+       EM(SKB_DROP_REASON_IP_CSUM, IP_CSUM)                    \
+       EM(SKB_DROP_REASON_IP_INHDR, IP_INHDR)                  \
+       EM(SKB_DROP_REASON_IP_RPFILTER, IP_RPFILTER)            \
+       EM(SKB_DROP_REASON_UNICAST_IN_L2_MULTICAST,             \
+          UNICAST_IN_L2_MULTICAST)                             \
+       EM(SKB_DROP_REASON_XFRM_POLICY, XFRM_POLICY)            \
+       EM(SKB_DROP_REASON_IP_NOPROTO, IP_NOPROTO)              \
+       EM(SKB_DROP_REASON_SOCKET_RCVBUFF, SOCKET_RCVBUFF)      \
+       EM(SKB_DROP_REASON_PROTO_MEM, PROTO_MEM)                \
        EMe(SKB_DROP_REASON_MAX, MAX)
 
 #undef EM
index c77a131..467ca2f 100644 (file)
 
 #define SO_RESERVE_MEM         73
 
+#define SO_TXREHASH            74
+
 #if !defined(__KERNEL__)
 
 #if __BITS_PER_LONG == 64 || (defined(__x86_64__) && defined(__ILP32__))
index b0383d3..afe3d0d 100644 (file)
@@ -330,6 +330,8 @@ union bpf_iter_link_info {
  *                     *ctx_out*, *data_in* and *data_out* must be NULL.
  *                     *repeat* must be zero.
  *
+ *             BPF_PROG_RUN is an alias for BPF_PROG_TEST_RUN.
+ *
  *     Return
  *             Returns zero on success. On error, -1 is returned and *errno*
  *             is set appropriately.
@@ -1111,6 +1113,11 @@ enum bpf_link_type {
  */
 #define BPF_F_SLEEPABLE                (1U << 4)
 
+/* If BPF_F_XDP_HAS_FRAGS is used in the BPF_PROG_LOAD command, the loaded
+ * program fully supports xdp frags.
+ */
+#define BPF_F_XDP_HAS_FRAGS    (1U << 5)
+
 /* When BPF ldimm64's insn[0].src_reg != 0 then this can have
  * the following extensions:
  *
@@ -1775,6 +1782,8 @@ union bpf_attr {
  *             0 on success, or a negative error in case of failure.
  *
  * u64 bpf_get_current_pid_tgid(void)
+ *     Description
+ *             Get the current pid and tgid.
  *     Return
  *             A 64-bit integer containing the current tgid and pid, and
  *             created as such:
@@ -1782,6 +1791,8 @@ union bpf_attr {
  *             *current_task*\ **->pid**.
  *
  * u64 bpf_get_current_uid_gid(void)
+ *     Description
+ *             Get the current uid and gid.
  *     Return
  *             A 64-bit integer containing the current GID and UID, and
  *             created as such: *current_gid* **<< 32 \|** *current_uid*.
@@ -2256,6 +2267,8 @@ union bpf_attr {
  *             The 32-bit hash.
  *
  * u64 bpf_get_current_task(void)
+ *     Description
+ *             Get the current task.
  *     Return
  *             A pointer to the current task struct.
  *
@@ -2369,6 +2382,8 @@ union bpf_attr {
  *             indicate that the hash is outdated and to trigger a
  *             recalculation the next time the kernel tries to access this
  *             hash or when the **bpf_get_hash_recalc**\ () helper is called.
+ *     Return
+ *             void.
  *
  * long bpf_get_numa_node_id(void)
  *     Description
@@ -2466,6 +2481,8 @@ union bpf_attr {
  *             A 8-byte long unique number or 0 if *sk* is NULL.
  *
  * u32 bpf_get_socket_uid(struct sk_buff *skb)
+ *     Description
+ *             Get the owner UID of the socket associated with *skb*.
  *     Return
  *             The owner UID of the socket associated to *skb*. If the socket
  *             is **NULL**, or if it is not a full socket (i.e. if it is a
@@ -3240,6 +3257,9 @@ union bpf_attr {
  *             The id is returned or 0 in case the id could not be retrieved.
  *
  * u64 bpf_get_current_cgroup_id(void)
+ *     Description
+ *             Get the current cgroup id based on the cgroup within which
+ *             the current task is running.
  *     Return
  *             A 64-bit integer containing the current cgroup id based
  *             on the cgroup within which the current task is running.
@@ -5018,6 +5038,54 @@ union bpf_attr {
  *
  *     Return
  *             The number of arguments of the traced function.
+ *
+ * int bpf_get_retval(void)
+ *     Description
+ *             Get the syscall's return value that will be returned to userspace.
+ *
+ *             This helper is currently supported by cgroup programs only.
+ *     Return
+ *             The syscall's return value.
+ *
+ * int bpf_set_retval(int retval)
+ *     Description
+ *             Set the syscall's return value that will be returned to userspace.
+ *
+ *             This helper is currently supported by cgroup programs only.
+ *     Return
+ *             0 on success, or a negative error in case of failure.
+ *
+ * u64 bpf_xdp_get_buff_len(struct xdp_buff *xdp_md)
+ *     Description
+ *             Get the total size of a given xdp buff (linear and paged area).
+ *     Return
+ *             The total size of a given xdp buffer.
+ *
+ * long bpf_xdp_load_bytes(struct xdp_buff *xdp_md, u32 offset, void *buf, u32 len)
+ *     Description
+ *             This helper is provided as an easy way to load data from an
+ *             xdp buffer. It can be used to load *len* bytes from *offset* of
+ *             the frame associated with *xdp_md* into the buffer pointed to
+ *             by *buf*.
+ *     Return
+ *             0 on success, or a negative error in case of failure.
+ *
+ * long bpf_xdp_store_bytes(struct xdp_buff *xdp_md, u32 offset, void *buf, u32 len)
+ *     Description
+ *             Store *len* bytes from buffer *buf* into the frame
+ *             associated with *xdp_md*, at *offset*.
+ *     Return
+ *             0 on success, or a negative error in case of failure.
+ *
+ * long bpf_copy_from_user_task(void *dst, u32 size, const void *user_ptr, struct task_struct *tsk, u64 flags)
+ *     Description
+ *             Read *size* bytes from user space address *user_ptr* in *tsk*'s
+ *             address space, and store the data in *dst*. *flags* is not
+ *             used yet and is provided for future extensibility. This helper
+ *             can only be used by sleepable programs.
+ *     Return
+ *             0 on success, or a negative error in case of failure. On error
+ *             *dst* buffer is zeroed out.
  */
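A minimal sketch of the new XDP multi-buffer helpers in use, assuming a libbpf recent enough to map the xdp.frags section name to BPF_F_XDP_HAS_FRAGS and to declare the new helpers:

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

SEC("xdp.frags")
int xdp_peek_eth(struct xdp_md *ctx)
{
	__u8 eth[14];   /* Ethernet header */

	/* total length across the linear and paged areas */
	if (bpf_xdp_get_buff_len(ctx) < sizeof(eth))
		return XDP_PASS;

	/* unlike direct packet access, this copy may span into frags */
	if (bpf_xdp_load_bytes(ctx, 0, eth, sizeof(eth)))
		return XDP_DROP;

	return XDP_PASS;
}

char _license[] SEC("license") = "GPL";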
 #define __BPF_FUNC_MAPPER(FN)          \
        FN(unspec),                     \
@@ -5206,6 +5274,12 @@ union bpf_attr {
        FN(get_func_arg),               \
        FN(get_func_ret),               \
        FN(get_func_arg_cnt),           \
+       FN(get_retval),                 \
+       FN(set_retval),                 \
+       FN(xdp_get_buff_len),           \
+       FN(xdp_load_bytes),             \
+       FN(xdp_store_bytes),            \
+       FN(copy_from_user_task),        \
        /* */
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
@@ -5500,7 +5574,8 @@ struct bpf_sock {
        __u32 src_ip4;
        __u32 src_ip6[4];
        __u32 src_port;         /* host byte order */
-       __u32 dst_port;         /* network byte order */
+       __be16 dst_port;        /* network byte order */
+       __u16 :16;              /* zero padding */
        __u32 dst_ip4;
        __u32 dst_ip6[4];
        __u32 state;
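Since dst_port is now a __be16 followed by explicit zero padding, narrow 2-byte loads of the field pass the verifier; a sketch of what that enables (fragment only, with bpf_htons from bpf/bpf_endian.h):

/* fragment of a program holding a struct bpf_sock pointer */
static __always_inline bool dst_is_https(const struct bpf_sock *sk)
{
	/* 2-byte network-order load; the next two bytes are
	 * guaranteed-zero padding */
	return sk->dst_port == bpf_htons(443);
}

The bpf_sk_lookup change below follows the same pattern for remote_port.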
@@ -6378,7 +6453,8 @@ struct bpf_sk_lookup {
        __u32 protocol;         /* IP protocol (IPPROTO_TCP, IPPROTO_UDP) */
        __u32 remote_ip4;       /* Network byte order */
        __u32 remote_ip6[4];    /* Network byte order */
-       __u32 remote_port;      /* Network byte order */
+       __be16 remote_port;     /* Network byte order */
+       __u16 :16;              /* Zero padding */
        __u32 local_ip4;        /* Network byte order */
        __u32 local_ip6[4];     /* Network byte order */
        __u32 local_port;       /* Host byte order */
index cca6e47..417d428 100644 (file)
@@ -319,6 +319,12 @@ enum {
 /* RINGS */
 
 enum {
+       ETHTOOL_TCP_DATA_SPLIT_UNKNOWN = 0,
+       ETHTOOL_TCP_DATA_SPLIT_DISABLED,
+       ETHTOOL_TCP_DATA_SPLIT_ENABLED,
+};
+
+enum {
        ETHTOOL_A_RINGS_UNSPEC,
        ETHTOOL_A_RINGS_HEADER,                         /* nest - _A_HEADER_* */
        ETHTOOL_A_RINGS_RX_MAX,                         /* u32 */
@@ -330,6 +336,7 @@ enum {
        ETHTOOL_A_RINGS_RX_JUMBO,                       /* u32 */
        ETHTOOL_A_RINGS_TX,                             /* u32 */
        ETHTOOL_A_RINGS_RX_BUF_LEN,                     /* u32 */
+       ETHTOOL_A_RINGS_TCP_DATA_SPLIT,                 /* u8 */
 
        /* add new constants above here */
        __ETHTOOL_A_RINGS_CNT,
index 829ffdf..38f6a8f 100644 (file)
@@ -41,6 +41,15 @@ enum {
        /* IOAM Trace Header */
        IOAM6_IPTUNNEL_TRACE,           /* struct ioam6_trace_hdr */
 
+       /* Insertion frequency:
+        * "k over n" packets (0 < k <= n)
+        * [0.0001% ... 100%]
+        */
+#define IOAM6_IPTUNNEL_FREQ_MIN 1
+#define IOAM6_IPTUNNEL_FREQ_MAX 1000000
+       IOAM6_IPTUNNEL_FREQ_K,          /* u32 */
+       IOAM6_IPTUNNEL_FREQ_N,          /* u32 */
+
        __IOAM6_IPTUNNEL_MAX,
 };
 
index 07b0318..154ab56 100644 (file)
@@ -44,7 +44,25 @@ struct sockaddr_mctp_ext {
 
 #define MCTP_TAG_MASK          0x07
 #define MCTP_TAG_OWNER         0x08
+#define MCTP_TAG_PREALLOC      0x10
 
 #define MCTP_OPT_ADDR_EXT      1
 
+#define SIOCMCTPALLOCTAG       (SIOCPROTOPRIVATE + 0)
+#define SIOCMCTPDROPTAG                (SIOCPROTOPRIVATE + 1)
+
+struct mctp_ioc_tag_ctl {
+       mctp_eid_t      peer_addr;
+
+       /* For SIOCMCTPALLOCTAG: must be passed as zero; the kernel will
+        * populate it with the allocated tag value. The returned tag value
+        * will always have TO and PREALLOC set.
+        *
+        * For SIOCMCTPDROPTAG: userspace provides tag value to drop, from
+        * a prior SIOCMCTPALLOCTAG call (and so must have TO and PREALLOC set).
+        */
+       __u8            tag;
+       __u16           flags;
+};
+
 #endif /* __UAPI_MCTP_H */
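A sketch of driving the new ioctl from userspace, assuming an AF_MCTP socket sd and the uapi header shown in this hunk:

#include <sys/ioctl.h>
#include <linux/mctp.h>         /* struct mctp_ioc_tag_ctl, SIOCMCTP* */

int alloc_tag(int sd, mctp_eid_t peer, __u8 *tag)
{
	struct mctp_ioc_tag_ctl ctl = { .peer_addr = peer };

	if (ioctl(sd, SIOCMCTPALLOCTAG, &ctl) < 0)
		return -1;
	/* ctl.tag now has MCTP_TAG_OWNER and MCTP_TAG_PREALLOC set */
	*tag = ctl.tag;
	return 0;
}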
index 66048cc..1bbea8f 100644 (file)
@@ -93,6 +93,7 @@ enum net_dm_attr {
        NET_DM_ATTR_SW_DROPS,                   /* flag */
        NET_DM_ATTR_HW_DROPS,                   /* flag */
        NET_DM_ATTR_FLOW_ACTION_COOKIE,         /* binary */
+       NET_DM_ATTR_REASON,                     /* string */
 
        __NET_DM_ATTR_MAX,
        NET_DM_ATTR_MAX = __NET_DM_ATTR_MAX - 1
index aed90c4..ef7c97f 100644 (file)
@@ -61,6 +61,7 @@ enum nfqnl_attr_type {
        NFQA_SECCTX,                    /* security context string */
        NFQA_VLAN,                      /* nested attribute: packet vlan info */
        NFQA_L2HDR,                     /* full L2 header */
+       NFQA_PRIORITY,                  /* skb->priority */
 
        __NFQA_MAX
 };
index eb0a9a5..51d6bb2 100644 (file)
@@ -31,4 +31,8 @@ struct __kernel_sockaddr_storage {
 
 #define SOCK_BUF_LOCK_MASK (SOCK_SNDBUF_LOCK | SOCK_RCVBUF_LOCK)
 
+#define SOCK_TXREHASH_DEFAULT  ((u8)-1)
+#define SOCK_TXREHASH_DISABLED 0
+#define SOCK_TXREHASH_ENABLED  1
+
 #endif /* _UAPI_LINUX_SOCKET_H */
index e9119bf..7328d4f 100644 (file)
@@ -86,6 +86,10 @@ config CC_HAS_ASM_INLINE
 config CC_HAS_NO_PROFILE_FN_ATTR
        def_bool $(success,echo '__attribute__((no_profile_instrument_function)) int x();' | $(CC) -x c - -c -o /dev/null -Werror)
 
+config PAHOLE_VERSION
+       int
+       default $(shell,$(srctree)/scripts/pahole-version.sh $(PAHOLE))
+
 config CONSTRUCTORS
        bool
 
index 65fa2e4..ada50f5 100644 (file)
@@ -99,6 +99,7 @@
 #include <linux/kcsan.h>
 #include <linux/init_syscalls.h>
 #include <linux/stackdepot.h>
+#include <net/net_namespace.h>
 
 #include <asm/io.h>
 #include <asm/bugs.h>
@@ -1116,6 +1117,7 @@ asmlinkage __visible void __init __no_sanitize_address start_kernel(void)
        key_init();
        security_init();
        dbg_late_init();
+       net_ns_init();
        vfs_caches_init();
        pagecache_init();
        signals_init();
index c7a5be3..7f145ae 100644 (file)
@@ -837,13 +837,12 @@ static int fd_array_map_delete_elem(struct bpf_map *map, void *key)
 static void *prog_fd_array_get_ptr(struct bpf_map *map,
                                   struct file *map_file, int fd)
 {
-       struct bpf_array *array = container_of(map, struct bpf_array, map);
        struct bpf_prog *prog = bpf_prog_get(fd);
 
        if (IS_ERR(prog))
                return prog;
 
-       if (!bpf_prog_array_compatible(array, prog)) {
+       if (!bpf_prog_map_compatible(map, prog)) {
                bpf_prog_put(prog);
                return ERR_PTR(-EINVAL);
        }
@@ -1071,7 +1070,6 @@ static struct bpf_map *prog_array_map_alloc(union bpf_attr *attr)
        INIT_WORK(&aux->work, prog_array_map_clear_deferred);
        INIT_LIST_HEAD(&aux->poke_progs);
        mutex_init(&aux->poke_mutex);
-       spin_lock_init(&aux->owner.lock);
 
        map = array_map_alloc(attr);
        if (IS_ERR(map)) {
index b7aef5b..110029e 100644 (file)
@@ -5,6 +5,7 @@
 #include <linux/anon_inodes.h>
 #include <linux/filter.h>
 #include <linux/bpf.h>
+#include <linux/rcupdate_trace.h>
 
 struct bpf_iter_target_info {
        struct list_head list;
@@ -684,11 +685,20 @@ int bpf_iter_run_prog(struct bpf_prog *prog, void *ctx)
 {
        int ret;
 
-       rcu_read_lock();
-       migrate_disable();
-       ret = bpf_prog_run(prog, ctx);
-       migrate_enable();
-       rcu_read_unlock();
+       if (prog->aux->sleepable) {
+               rcu_read_lock_trace();
+               migrate_disable();
+               might_fault();
+               ret = bpf_prog_run(prog, ctx);
+               migrate_enable();
+               rcu_read_unlock_trace();
+       } else {
+               rcu_read_lock();
+               migrate_disable();
+               ret = bpf_prog_run(prog, ctx);
+               migrate_enable();
+               rcu_read_unlock();
+       }
 
        /* bpf program can only return 0 or 1:
         *  0 : okay
index e16dafe..11740b3 100644 (file)
 DEFINE_IDR(btf_idr);
 DEFINE_SPINLOCK(btf_idr_lock);
 
+enum btf_kfunc_hook {
+       BTF_KFUNC_HOOK_XDP,
+       BTF_KFUNC_HOOK_TC,
+       BTF_KFUNC_HOOK_STRUCT_OPS,
+       BTF_KFUNC_HOOK_MAX,
+};
+
+enum {
+       BTF_KFUNC_SET_MAX_CNT = 32,
+};
+
+struct btf_kfunc_set_tab {
+       struct btf_id_set *sets[BTF_KFUNC_HOOK_MAX][BTF_KFUNC_TYPE_MAX];
+};
+
 struct btf {
        void *data;
        struct btf_type **types;
@@ -212,6 +227,7 @@ struct btf {
        refcount_t refcnt;
        u32 id;
        struct rcu_head rcu;
+       struct btf_kfunc_set_tab *kfunc_set_tab;
 
        /* split BTF support */
        struct btf *base_btf;
@@ -403,6 +419,9 @@ static struct btf_type btf_void;
 static int btf_resolve(struct btf_verifier_env *env,
                       const struct btf_type *t, u32 type_id);
 
+static int btf_func_check(struct btf_verifier_env *env,
+                         const struct btf_type *t);
+
 static bool btf_type_is_modifier(const struct btf_type *t)
 {
        /* Some of them is not strictly a C modifier
@@ -579,6 +598,7 @@ static bool btf_type_needs_resolve(const struct btf_type *t)
               btf_type_is_struct(t) ||
               btf_type_is_array(t) ||
               btf_type_is_var(t) ||
+              btf_type_is_func(t) ||
               btf_type_is_decl_tag(t) ||
               btf_type_is_datasec(t);
 }
@@ -1531,8 +1551,30 @@ static void btf_free_id(struct btf *btf)
        spin_unlock_irqrestore(&btf_idr_lock, flags);
 }
 
+static void btf_free_kfunc_set_tab(struct btf *btf)
+{
+       struct btf_kfunc_set_tab *tab = btf->kfunc_set_tab;
+       int hook, type;
+
+       if (!tab)
+               return;
+       /* For module BTF, we directly assign the sets being registered, so
+        * there is nothing to free except kfunc_set_tab.
+        */
+       if (btf_is_module(btf))
+               goto free_tab;
+       for (hook = 0; hook < ARRAY_SIZE(tab->sets); hook++) {
+               for (type = 0; type < ARRAY_SIZE(tab->sets[0]); type++)
+                       kfree(tab->sets[hook][type]);
+       }
+free_tab:
+       kfree(tab);
+       btf->kfunc_set_tab = NULL;
+}
+
 static void btf_free(struct btf *btf)
 {
+       btf_free_kfunc_set_tab(btf);
        kvfree(btf->types);
        kvfree(btf->resolved_sizes);
        kvfree(btf->resolved_ids);
@@ -3533,9 +3575,24 @@ static s32 btf_func_check_meta(struct btf_verifier_env *env,
        return 0;
 }
 
+static int btf_func_resolve(struct btf_verifier_env *env,
+                           const struct resolve_vertex *v)
+{
+       const struct btf_type *t = v->t;
+       u32 next_type_id = t->type;
+       int err;
+
+       err = btf_func_check(env, t);
+       if (err)
+               return err;
+
+       env_stack_pop_resolved(env, next_type_id, 0);
+       return 0;
+}
+
 static struct btf_kind_operations func_ops = {
        .check_meta = btf_func_check_meta,
-       .resolve = btf_df_resolve,
+       .resolve = btf_func_resolve,
        .check_member = btf_df_check_member,
        .check_kflag_member = btf_df_check_kflag_member,
        .log_details = btf_ref_type_log,
@@ -4156,7 +4213,7 @@ static bool btf_resolve_valid(struct btf_verifier_env *env,
                return !btf_resolved_type_id(btf, type_id) &&
                       !btf_resolved_type_size(btf, type_id);
 
-       if (btf_type_is_decl_tag(t))
+       if (btf_type_is_decl_tag(t) || btf_type_is_func(t))
                return btf_resolved_type_id(btf, type_id) &&
                       !btf_resolved_type_size(btf, type_id);
 
@@ -4246,12 +4303,6 @@ static int btf_check_all_types(struct btf_verifier_env *env)
                        if (err)
                                return err;
                }
-
-               if (btf_type_is_func(t)) {
-                       err = btf_func_check(env, t);
-                       if (err)
-                               return err;
-               }
        }
 
        return 0;
@@ -4848,6 +4899,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
        const char *tname = prog->aux->attach_func_name;
        struct bpf_verifier_log *log = info->log;
        const struct btf_param *args;
+       const char *tag_value;
        u32 nr_args, arg;
        int i, ret;
 
@@ -5000,6 +5052,13 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
        info->btf = btf;
        info->btf_id = t->type;
        t = btf_type_by_id(btf, t->type);
+
+       if (btf_type_is_type_tag(t)) {
+               tag_value = __btf_name_by_offset(btf, t->name_off);
+               if (strcmp(tag_value, "user") == 0)
+                       info->reg_type |= MEM_USER;
+       }
+
        /* skip modifiers */
        while (btf_type_is_modifier(t)) {
                info->btf_id = t->type;
@@ -5026,12 +5085,12 @@ enum bpf_struct_walk_result {
 
 static int btf_struct_walk(struct bpf_verifier_log *log, const struct btf *btf,
                           const struct btf_type *t, int off, int size,
-                          u32 *next_btf_id)
+                          u32 *next_btf_id, enum bpf_type_flag *flag)
 {
        u32 i, moff, mtrue_end, msize = 0, total_nelems = 0;
        const struct btf_type *mtype, *elem_type = NULL;
        const struct btf_member *member;
-       const char *tname, *mname;
+       const char *tname, *mname, *tag_value;
        u32 vlen, elem_id, mid;
 
 again:
@@ -5215,7 +5274,8 @@ error:
                }
 
                if (btf_type_is_ptr(mtype)) {
-                       const struct btf_type *stype;
+                       const struct btf_type *stype, *t;
+                       enum bpf_type_flag tmp_flag = 0;
                        u32 id;
 
                        if (msize != size || off != moff) {
@@ -5224,9 +5284,19 @@ error:
                                        mname, moff, tname, off, size);
                                return -EACCES;
                        }
+
+                       /* check __user tag */
+                       t = btf_type_by_id(btf, mtype->type);
+                       if (btf_type_is_type_tag(t)) {
+                               tag_value = __btf_name_by_offset(btf, t->name_off);
+                               if (strcmp(tag_value, "user") == 0)
+                                       tmp_flag = MEM_USER;
+                       }
+
                        stype = btf_type_skip_modifiers(btf, mtype->type, &id);
                        if (btf_type_is_struct(stype)) {
                                *next_btf_id = id;
+                               *flag = tmp_flag;
                                return WALK_PTR;
                        }
                }
@@ -5253,13 +5323,14 @@ error:
 int btf_struct_access(struct bpf_verifier_log *log, const struct btf *btf,
                      const struct btf_type *t, int off, int size,
                      enum bpf_access_type atype __maybe_unused,
-                     u32 *next_btf_id)
+                     u32 *next_btf_id, enum bpf_type_flag *flag)
 {
+       enum bpf_type_flag tmp_flag = 0;
        int err;
        u32 id;
 
        do {
-               err = btf_struct_walk(log, btf, t, off, size, &id);
+               err = btf_struct_walk(log, btf, t, off, size, &id, &tmp_flag);
 
                switch (err) {
                case WALK_PTR:
@@ -5267,6 +5338,7 @@ int btf_struct_access(struct bpf_verifier_log *log, const struct btf *btf,
                         * we're done.
                         */
                        *next_btf_id = id;
+                       *flag = tmp_flag;
                        return PTR_TO_BTF_ID;
                case WALK_SCALAR:
                        return SCALAR_VALUE;
@@ -5311,6 +5383,7 @@ bool btf_struct_ids_match(struct bpf_verifier_log *log,
                          const struct btf *need_btf, u32 need_type_id)
 {
        const struct btf_type *type;
+       enum bpf_type_flag flag;
        int err;
 
        /* Are we already done? */
@@ -5321,7 +5394,7 @@ again:
        type = btf_type_by_id(btf, id);
        if (!type)
                return false;
-       err = btf_struct_walk(log, btf, type, off, 1, &id);
+       err = btf_struct_walk(log, btf, type, off, 1, &id, &flag);
        if (err != WALK_STRUCT)
                return false;
 
@@ -5616,17 +5689,45 @@ static bool __btf_type_is_scalar_struct(struct bpf_verifier_log *log,
        return true;
 }
 
+static bool is_kfunc_arg_mem_size(const struct btf *btf,
+                                 const struct btf_param *arg,
+                                 const struct bpf_reg_state *reg)
+{
+       int len, sfx_len = sizeof("__sz") - 1;
+       const struct btf_type *t;
+       const char *param_name;
+
+       t = btf_type_skip_modifiers(btf, arg->type, NULL);
+       if (!btf_type_is_scalar(t) || reg->type != SCALAR_VALUE)
+               return false;
+
+       /* In the future, this can be ported to use BTF tagging */
+       param_name = btf_name_by_offset(btf, arg->name_off);
+       if (str_is_empty(param_name))
+               return false;
+       len = strlen(param_name);
+       if (len < sfx_len)
+               return false;
+       param_name += len - sfx_len;
+       if (strncmp(param_name, "__sz", sfx_len))
+               return false;
+
+       return true;
+}
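For illustration, a hypothetical kernel-side kfunc following the naming convention this helper checks: a scalar argument whose name ends in __sz is taken as the byte length of the preceding pointer, which may then be void *:

/* hypothetical module kfunc: data may be void * because data__sz
 * names its size (fragment; assumes linux/types.h for __u32) */
noinline void my_kfunc_dump(const void *data, __u32 data__sz)
{
}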
+
 static int btf_check_func_arg_match(struct bpf_verifier_env *env,
                                    const struct btf *btf, u32 func_id,
                                    struct bpf_reg_state *regs,
                                    bool ptr_to_mem_ok)
 {
        struct bpf_verifier_log *log = &env->log;
+       u32 i, nargs, ref_id, ref_obj_id = 0;
        bool is_kfunc = btf_is_kernel(btf);
        const char *func_name, *ref_tname;
        const struct btf_type *t, *ref_t;
        const struct btf_param *args;
-       u32 i, nargs, ref_id;
+       int ref_regno = 0;
+       bool rel = false;
 
        t = btf_type_by_id(btf, func_id);
        if (!t || !btf_type_is_func(t)) {
@@ -5704,6 +5805,16 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env,
                        if (reg->type == PTR_TO_BTF_ID) {
                                reg_btf = reg->btf;
                                reg_ref_id = reg->btf_id;
+                               /* Ensure only one argument is a referenced PTR_TO_BTF_ID */
+                               if (reg->ref_obj_id) {
+                                       if (ref_obj_id) {
+                                               bpf_log(log, "verifier internal error: more than one arg with ref_obj_id R%d %u %u\n",
+                                                       regno, reg->ref_obj_id, ref_obj_id);
+                                               return -EFAULT;
+                                       }
+                                       ref_regno = regno;
+                                       ref_obj_id = reg->ref_obj_id;
+                               }
                        } else {
                                reg_btf = btf_vmlinux;
                                reg_ref_id = *reg2btf_ids[reg->type];
@@ -5727,17 +5838,33 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env,
                        u32 type_size;
 
                        if (is_kfunc) {
+                               bool arg_mem_size = i + 1 < nargs && is_kfunc_arg_mem_size(btf, &args[i + 1], &regs[regno + 1]);
+
                                /* Permit pointer to mem, but only when argument
                                 * type is pointer to scalar, or struct composed
                                 * (recursively) of scalars.
+                                * When arg_mem_size is true, the pointer can be
+                                * void *.
                                 */
                                if (!btf_type_is_scalar(ref_t) &&
-                                   !__btf_type_is_scalar_struct(log, btf, ref_t, 0)) {
+                                   !__btf_type_is_scalar_struct(log, btf, ref_t, 0) &&
+                                   (arg_mem_size ? !btf_type_is_void(ref_t) : 1)) {
                                        bpf_log(log,
-                                               "arg#%d pointer type %s %s must point to scalar or struct with scalar\n",
-                                               i, btf_type_str(ref_t), ref_tname);
+                                               "arg#%d pointer type %s %s must point to %sscalar, or struct with scalar\n",
+                                               i, btf_type_str(ref_t), ref_tname, arg_mem_size ? "void, " : "");
                                        return -EINVAL;
                                }
+
+                               /* Check for mem, len pair */
+                               if (arg_mem_size) {
+                                       if (check_kfunc_mem_size_reg(env, &regs[regno + 1], regno + 1)) {
+                                               bpf_log(log, "arg#%d arg#%d memory, len pair leads to invalid memory access\n",
+                                                       i, i + 1);
+                                               return -EINVAL;
+                                       }
+                                       i++;
+                                       continue;
+                               }
                        }
 
                        resolve_ret = btf_resolve_size(btf, ref_t, &type_size);
@@ -5758,7 +5885,23 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env,
                }
        }
 
-       return 0;
+       /* Either both are set, or neither */
+       WARN_ON_ONCE((ref_obj_id && !ref_regno) || (!ref_obj_id && ref_regno));
+       if (is_kfunc) {
+               rel = btf_kfunc_id_set_contains(btf, resolve_prog_type(env->prog),
+                                               BTF_KFUNC_TYPE_RELEASE, func_id);
+               /* We already made sure ref_obj_id is set only for one argument */
+               if (rel && !ref_obj_id) {
+                       bpf_log(log, "release kernel function %s expects refcounted PTR_TO_BTF_ID\n",
+                               func_name);
+                       return -EINVAL;
+               }
+               /* Allow (!rel && ref_obj_id), so that passing such referenced PTR_TO_BTF_ID to
+                * other kfuncs works
+                */
+       }
+       /* returns the argument register number (> 0) for a reference-releasing kfunc */
+       return rel ? ref_regno : 0;
 }
 
 /* Compare BTF of a function with given bpf_reg_state.
@@ -6200,12 +6343,17 @@ bool btf_id_set_contains(const struct btf_id_set *set, u32 id)
        return bsearch(&id, set->ids, set->cnt, sizeof(u32), btf_id_cmp_func) != NULL;
 }
 
+enum {
+       BTF_MODULE_F_LIVE = (1 << 0),
+};
+
 #ifdef CONFIG_DEBUG_INFO_BTF_MODULES
 struct btf_module {
        struct list_head list;
        struct module *module;
        struct btf *btf;
        struct bin_attribute *sysfs_attr;
+       int flags;
 };
 
 static LIST_HEAD(btf_modules);
@@ -6233,7 +6381,8 @@ static int btf_module_notify(struct notifier_block *nb, unsigned long op,
        int err = 0;
 
        if (mod->btf_data_size == 0 ||
-           (op != MODULE_STATE_COMING && op != MODULE_STATE_GOING))
+           (op != MODULE_STATE_COMING && op != MODULE_STATE_LIVE &&
+            op != MODULE_STATE_GOING))
                goto out;
 
        switch (op) {
@@ -6292,6 +6441,17 @@ static int btf_module_notify(struct notifier_block *nb, unsigned long op,
                }
 
                break;
+       case MODULE_STATE_LIVE:
+               mutex_lock(&btf_module_mutex);
+               list_for_each_entry_safe(btf_mod, tmp, &btf_modules, list) {
+                       if (btf_mod->module != module)
+                               continue;
+
+                       btf_mod->flags |= BTF_MODULE_F_LIVE;
+                       break;
+               }
+               mutex_unlock(&btf_module_mutex);
+               break;
        case MODULE_STATE_GOING:
                mutex_lock(&btf_module_mutex);
                list_for_each_entry_safe(btf_mod, tmp, &btf_modules, list) {
@@ -6338,7 +6498,12 @@ struct module *btf_try_get_module(const struct btf *btf)
                if (btf_mod->btf != btf)
                        continue;
 
-               if (try_module_get(btf_mod->module))
+               /* We must only consider modules whose __init routine has
+                * finished, hence we must check for BTF_MODULE_F_LIVE flag,
+                * which is set from the notifier callback for
+                * MODULE_STATE_LIVE.
+                */
+               if ((btf_mod->flags & BTF_MODULE_F_LIVE) && try_module_get(btf_mod->module))
                        res = btf_mod->module;
 
                break;
@@ -6349,6 +6514,36 @@ struct module *btf_try_get_module(const struct btf *btf)
        return res;
 }
 
+/* Returns struct btf corresponding to the struct module
+ *
+ * This function can return NULL or ERR_PTR. Note that the caller must
+ * release the reference on the struct btf iff btf_is_module() is true.
+ */
+static struct btf *btf_get_module_btf(const struct module *module)
+{
+       struct btf *btf = NULL;
+#ifdef CONFIG_DEBUG_INFO_BTF_MODULES
+       struct btf_module *btf_mod, *tmp;
+#endif
+
+       if (!module)
+               return bpf_get_btf_vmlinux();
+#ifdef CONFIG_DEBUG_INFO_BTF_MODULES
+       mutex_lock(&btf_module_mutex);
+       list_for_each_entry_safe(btf_mod, tmp, &btf_modules, list) {
+               if (btf_mod->module != module)
+                       continue;
+
+               btf_get(btf_mod->btf);
+               btf = btf_mod->btf;
+               break;
+       }
+       mutex_unlock(&btf_module_mutex);
+#endif
+
+       return btf;
+}
+
 BPF_CALL_4(bpf_btf_find_by_name_kind, char *, name, int, name_sz, u32, kind, int, flags)
 {
        struct btf *btf;
@@ -6416,58 +6611,300 @@ BTF_ID_LIST_GLOBAL(btf_tracing_ids, MAX_BTF_TRACING_TYPE)
 BTF_TRACING_TYPE_xxx
 #undef BTF_TRACING_TYPE
 
-/* BTF ID set registration API for modules */
+/* Kernel Function (kfunc) BTF ID set registration API */
 
-#ifdef CONFIG_DEBUG_INFO_BTF_MODULES
+static int __btf_populate_kfunc_set(struct btf *btf, enum btf_kfunc_hook hook,
+                                   enum btf_kfunc_type type,
+                                   struct btf_id_set *add_set, bool vmlinux_set)
+{
+       struct btf_kfunc_set_tab *tab;
+       struct btf_id_set *set;
+       u32 set_cnt;
+       int ret;
+
+       if (hook >= BTF_KFUNC_HOOK_MAX || type >= BTF_KFUNC_TYPE_MAX) {
+               ret = -EINVAL;
+               goto end;
+       }
+
+       if (!add_set->cnt)
+               return 0;
+
+       tab = btf->kfunc_set_tab;
+       if (!tab) {
+               tab = kzalloc(sizeof(*tab), GFP_KERNEL | __GFP_NOWARN);
+               if (!tab)
+                       return -ENOMEM;
+               btf->kfunc_set_tab = tab;
+       }
+
+       set = tab->sets[hook][type];
+       /* Warn when register_btf_kfunc_id_set is called twice for the same hook
+        * for module sets.
+        */
+       if (WARN_ON_ONCE(set && !vmlinux_set)) {
+               ret = -EINVAL;
+               goto end;
+       }
+
+       /* We don't need to allocate, concatenate, and sort module sets, because
+        * only one is allowed per hook. Hence, we can directly assign the
+        * pointer and return.
+        */
+       if (!vmlinux_set) {
+               tab->sets[hook][type] = add_set;
+               return 0;
+       }
+
+       /* In case of vmlinux sets, there may be more than one set being
+        * registered per hook. To create a unified set, we allocate a new set
+        * and concatenate all individual sets being registered. While each set
+        * is individually sorted, they may become unsorted when concatenated,
+        * hence re-sorting the final set is required so that binary
+        * searching it via btf_id_set_contains() works.
+        */
+       set_cnt = set ? set->cnt : 0;
+
+       if (set_cnt > U32_MAX - add_set->cnt) {
+               ret = -EOVERFLOW;
+               goto end;
+       }
+
+       if (set_cnt + add_set->cnt > BTF_KFUNC_SET_MAX_CNT) {
+               ret = -E2BIG;
+               goto end;
+       }
+
+       /* Grow set */
+       set = krealloc(tab->sets[hook][type],
+                      offsetof(struct btf_id_set, ids[set_cnt + add_set->cnt]),
+                      GFP_KERNEL | __GFP_NOWARN);
+       if (!set) {
+               ret = -ENOMEM;
+               goto end;
+       }
+
+       /* For newly allocated set, initialize set->cnt to 0 */
+       if (!tab->sets[hook][type])
+               set->cnt = 0;
+       tab->sets[hook][type] = set;
+
+       /* Concatenate the two sets */
+       memcpy(set->ids + set->cnt, add_set->ids, add_set->cnt * sizeof(set->ids[0]));
+       set->cnt += add_set->cnt;
+
+       sort(set->ids, set->cnt, sizeof(set->ids[0]), btf_id_cmp_func, NULL);
+
+       return 0;
+end:
+       btf_free_kfunc_set_tab(btf);
+       return ret;
+}
+
+static int btf_populate_kfunc_set(struct btf *btf, enum btf_kfunc_hook hook,
+                                 const struct btf_kfunc_id_set *kset)
+{
+       bool vmlinux_set = !btf_is_module(btf);
+       int type, ret;
+
+       for (type = 0; type < ARRAY_SIZE(kset->sets); type++) {
+               if (!kset->sets[type])
+                       continue;
+
+               ret = __btf_populate_kfunc_set(btf, hook, type, kset->sets[type], vmlinux_set);
+               if (ret)
+                       break;
+       }
+       return ret;
+}
+
+static bool __btf_kfunc_id_set_contains(const struct btf *btf,
+                                       enum btf_kfunc_hook hook,
+                                       enum btf_kfunc_type type,
+                                       u32 kfunc_btf_id)
+{
+       struct btf_id_set *set;
+
+       if (hook >= BTF_KFUNC_HOOK_MAX || type >= BTF_KFUNC_TYPE_MAX)
+               return false;
+       if (!btf->kfunc_set_tab)
+               return false;
+       set = btf->kfunc_set_tab->sets[hook][type];
+       if (!set)
+               return false;
+       return btf_id_set_contains(set, kfunc_btf_id);
+}
 
-void register_kfunc_btf_id_set(struct kfunc_btf_id_list *l,
-                              struct kfunc_btf_id_set *s)
+static int bpf_prog_type_to_kfunc_hook(enum bpf_prog_type prog_type)
 {
-       mutex_lock(&l->mutex);
-       list_add(&s->list, &l->list);
-       mutex_unlock(&l->mutex);
+       switch (prog_type) {
+       case BPF_PROG_TYPE_XDP:
+               return BTF_KFUNC_HOOK_XDP;
+       case BPF_PROG_TYPE_SCHED_CLS:
+               return BTF_KFUNC_HOOK_TC;
+       case BPF_PROG_TYPE_STRUCT_OPS:
+               return BTF_KFUNC_HOOK_STRUCT_OPS;
+       default:
+               return BTF_KFUNC_HOOK_MAX;
+       }
 }
-EXPORT_SYMBOL_GPL(register_kfunc_btf_id_set);
 
-void unregister_kfunc_btf_id_set(struct kfunc_btf_id_list *l,
-                                struct kfunc_btf_id_set *s)
+/* Caution:
+ * Reference to the module (obtained using btf_try_get_module) corresponding to
+ * the struct btf *MUST* be held when calling this function from verifier
+ * context. This is usually true as we stash references in prog's kfunc_btf_tab;
+ * keeping the reference for the duration of the call provides the necessary
+ * protection for looking up a well-formed btf->kfunc_set_tab.
+ */
+bool btf_kfunc_id_set_contains(const struct btf *btf,
+                              enum bpf_prog_type prog_type,
+                              enum btf_kfunc_type type, u32 kfunc_btf_id)
 {
-       mutex_lock(&l->mutex);
-       list_del_init(&s->list);
-       mutex_unlock(&l->mutex);
+       enum btf_kfunc_hook hook;
+
+       hook = bpf_prog_type_to_kfunc_hook(prog_type);
+       return __btf_kfunc_id_set_contains(btf, hook, type, kfunc_btf_id);
 }
-EXPORT_SYMBOL_GPL(unregister_kfunc_btf_id_set);
 
-bool bpf_check_mod_kfunc_call(struct kfunc_btf_id_list *klist, u32 kfunc_id,
-                             struct module *owner)
+/* This function must be invoked only from initcalls/module init functions */
+int register_btf_kfunc_id_set(enum bpf_prog_type prog_type,
+                             const struct btf_kfunc_id_set *kset)
 {
-       struct kfunc_btf_id_set *s;
+       enum btf_kfunc_hook hook;
+       struct btf *btf;
+       int ret;
 
-       mutex_lock(&klist->mutex);
-       list_for_each_entry(s, &klist->list, list) {
-               if (s->owner == owner && btf_id_set_contains(s->set, kfunc_id)) {
-                       mutex_unlock(&klist->mutex);
-                       return true;
+       btf = btf_get_module_btf(kset->owner);
+       if (!btf) {
+               if (!kset->owner && IS_ENABLED(CONFIG_DEBUG_INFO_BTF)) {
+                       pr_err("missing vmlinux BTF, cannot register kfuncs\n");
+                       return -ENOENT;
                }
+               if (kset->owner && IS_ENABLED(CONFIG_DEBUG_INFO_BTF_MODULES)) {
+                       pr_err("missing module BTF, cannot register kfuncs\n");
+                       return -ENOENT;
+               }
+               return 0;
        }
-       mutex_unlock(&klist->mutex);
-       return false;
+       if (IS_ERR(btf))
+               return PTR_ERR(btf);
+
+       hook = bpf_prog_type_to_kfunc_hook(prog_type);
+       ret = btf_populate_kfunc_set(btf, hook, kset);
+       /* reference is only taken for module BTF */
+       if (btf_is_module(btf))
+               btf_put(btf);
+       return ret;
 }
+EXPORT_SYMBOL_GPL(register_btf_kfunc_id_set);
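A sketch of module-side registration against this API, assuming the btf_ids.h set macros and the .check_set member (BTF_KFUNC_TYPE_CHECK) from this series; my_kfunc_test is a placeholder name:

#include <linux/module.h>
#include <linux/btf.h>
#include <linux/btf_ids.h>

BTF_SET_START(my_check_ids)
BTF_ID(func, my_kfunc_test)
BTF_SET_END(my_check_ids)

static const struct btf_kfunc_id_set my_kfunc_set = {
	.owner     = THIS_MODULE,
	.check_set = &my_check_ids,
};

static int __init my_mod_init(void)
{
	/* ties the set to the TC hook resolved from the prog type */
	return register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS,
					 &my_kfunc_set);
}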
 
-#define DEFINE_KFUNC_BTF_ID_LIST(name)                                         \
-       struct kfunc_btf_id_list name = { LIST_HEAD_INIT(name.list),           \
-                                         __MUTEX_INITIALIZER(name.mutex) };   \
-       EXPORT_SYMBOL_GPL(name)
+#define MAX_TYPES_ARE_COMPAT_DEPTH 2
 
-DEFINE_KFUNC_BTF_ID_LIST(bpf_tcp_ca_kfunc_list);
-DEFINE_KFUNC_BTF_ID_LIST(prog_test_kfunc_list);
+static
+int __bpf_core_types_are_compat(const struct btf *local_btf, __u32 local_id,
+                               const struct btf *targ_btf, __u32 targ_id,
+                               int level)
+{
+       const struct btf_type *local_type, *targ_type;
+       int depth = 32; /* max recursion depth */
 
-#endif
+       /* caller made sure that names match (ignoring flavor suffix) */
+       local_type = btf_type_by_id(local_btf, local_id);
+       targ_type = btf_type_by_id(targ_btf, targ_id);
+       if (btf_kind(local_type) != btf_kind(targ_type))
+               return 0;
 
+recur:
+       depth--;
+       if (depth < 0)
+               return -EINVAL;
+
+       local_type = btf_type_skip_modifiers(local_btf, local_id, &local_id);
+       targ_type = btf_type_skip_modifiers(targ_btf, targ_id, &targ_id);
+       if (!local_type || !targ_type)
+               return -EINVAL;
+
+       if (btf_kind(local_type) != btf_kind(targ_type))
+               return 0;
+
+       switch (btf_kind(local_type)) {
+       case BTF_KIND_UNKN:
+       case BTF_KIND_STRUCT:
+       case BTF_KIND_UNION:
+       case BTF_KIND_ENUM:
+       case BTF_KIND_FWD:
+               return 1;
+       case BTF_KIND_INT:
+               /* just reject deprecated bitfield-like integers; all other
+                * integers are compatible with each other by default
+                */
+               return btf_int_offset(local_type) == 0 && btf_int_offset(targ_type) == 0;
+       case BTF_KIND_PTR:
+               local_id = local_type->type;
+               targ_id = targ_type->type;
+               goto recur;
+       case BTF_KIND_ARRAY:
+               local_id = btf_array(local_type)->type;
+               targ_id = btf_array(targ_type)->type;
+               goto recur;
+       case BTF_KIND_FUNC_PROTO: {
+               struct btf_param *local_p = btf_params(local_type);
+               struct btf_param *targ_p = btf_params(targ_type);
+               __u16 local_vlen = btf_vlen(local_type);
+               __u16 targ_vlen = btf_vlen(targ_type);
+               int i, err;
+
+               if (local_vlen != targ_vlen)
+                       return 0;
+
+               for (i = 0; i < local_vlen; i++, local_p++, targ_p++) {
+                       if (level <= 0)
+                               return -EINVAL;
+
+                       btf_type_skip_modifiers(local_btf, local_p->type, &local_id);
+                       btf_type_skip_modifiers(targ_btf, targ_p->type, &targ_id);
+                       err = __bpf_core_types_are_compat(local_btf, local_id,
+                                                         targ_btf, targ_id,
+                                                         level - 1);
+                       if (err <= 0)
+                               return err;
+               }
+
+               /* tail recurse for return type check */
+               btf_type_skip_modifiers(local_btf, local_type->type, &local_id);
+               btf_type_skip_modifiers(targ_btf, targ_type->type, &targ_id);
+               goto recur;
+       }
+       default:
+               return 0;
+       }
+}
+
+/* Check local and target types for compatibility. This check is used for
+ * type-based CO-RE relocations and follow slightly different rules than
+ * field-based relocations. This function assumes that root types were already
+ * checked for name match. Beyond that initial root-level name check, names
+ * are completely ignored. Compatibility rules are as follows:
+ *   - any two STRUCTs/UNIONs/FWDs/ENUMs/INTs are considered compatible, but
+ *     kind should match for local and target types (i.e., STRUCT is not
+ *     compatible with UNION);
+ *   - for ENUMs, the size is ignored;
+ *   - for INT, size and signedness are ignored;
+ *   - for ARRAY, dimensionality is ignored, element types are checked for
+ *     compatibility recursively;
+ *   - CONST/VOLATILE/RESTRICT modifiers are ignored;
+ *   - TYPEDEFs/PTRs are compatible if the types they point to are compatible;
+ *   - FUNC_PROTOs are compatible if they have compatible signature: same
+ *     number of input args and compatible return and argument types.
+ * These rules are not set in stone and probably will be adjusted as we get
+ * more experience with using BPF CO-RE relocations.
+ */
 int bpf_core_types_are_compat(const struct btf *local_btf, __u32 local_id,
                              const struct btf *targ_btf, __u32 targ_id)
 {
-       return -EOPNOTSUPP;
+       return __bpf_core_types_are_compat(local_btf, local_id,
+                                          targ_btf, targ_id,
+                                          MAX_TYPES_ARE_COMPAT_DEPTH);
 }
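Illustrative pairs under the rules above; imagine the first of each pair in the program's (local) BTF and the second in the target BTF under a matching root name:

/* INT: size and signedness are ignored */
typedef int           local_counter_t;
typedef unsigned long targ_counter_t;           /* compatible */

/* ARRAY: dimensionality ignored, element types checked recursively */
typedef local_counter_t local_hist_t[4];
typedef targ_counter_t  targ_hist_t[16];        /* compatible */

/* FUNC_PROTO: same arg count plus compatible arg/return types */
typedef int  (*local_cb_t)(void *ctx);
typedef long (*targ_cb_t)(void *ctx);           /* compatible */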
 
 static bool bpf_core_is_flavor_sep(const char *s)
index 514b468..098632f 100644 (file)
@@ -1044,7 +1044,7 @@ int cgroup_bpf_prog_query(const union bpf_attr *attr,
  *   NET_XMIT_DROP       (1)   - drop packet and notify TCP to call cwr
  *   NET_XMIT_CN         (2)   - continue with packet output and notify TCP
  *                               to call cwr
- *   -EPERM                    - drop packet
+ *   -err                      - drop packet
  *
  * For ingress packets, this function will return -EPERM if any
  * attached program was found and if it returned != 1 during execution.
@@ -1079,8 +1079,9 @@ int __cgroup_bpf_run_filter_skb(struct sock *sk,
                        cgrp->bpf.effective[atype], skb, __bpf_prog_run_save_cb);
        } else {
                ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], skb,
-                                           __bpf_prog_run_save_cb);
-               ret = (ret == 1 ? 0 : -EPERM);
+                                           __bpf_prog_run_save_cb, 0);
+               if (ret && !IS_ERR_VALUE((long)ret))
+                       ret = -EFAULT;
        }
        bpf_restore_data_end(skb, saved_data_end);
        __skb_pull(skb, offset);
@@ -1107,10 +1108,9 @@ int __cgroup_bpf_run_filter_sk(struct sock *sk,
                               enum cgroup_bpf_attach_type atype)
 {
        struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
-       int ret;
 
-       ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], sk, bpf_prog_run);
-       return ret == 1 ? 0 : -EPERM;
+       return BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], sk,
+                                    bpf_prog_run, 0);
 }
 EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk);
 
@@ -1142,7 +1142,6 @@ int __cgroup_bpf_run_filter_sock_addr(struct sock *sk,
        };
        struct sockaddr_storage unspec;
        struct cgroup *cgrp;
-       int ret;
 
        /* Check socket family since not all sockets represent network
         * endpoint (e.g. AF_UNIX).
@@ -1156,10 +1155,8 @@ int __cgroup_bpf_run_filter_sock_addr(struct sock *sk,
        }
 
        cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
-       ret = BPF_PROG_RUN_ARRAY_CG_FLAGS(cgrp->bpf.effective[atype], &ctx,
-                                         bpf_prog_run, flags);
-
-       return ret == 1 ? 0 : -EPERM;
+       return BPF_PROG_RUN_ARRAY_CG_FLAGS(cgrp->bpf.effective[atype], &ctx,
+                                          bpf_prog_run, 0, flags);
 }
 EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_addr);
 
@@ -1184,11 +1181,9 @@ int __cgroup_bpf_run_filter_sock_ops(struct sock *sk,
                                     enum cgroup_bpf_attach_type atype)
 {
        struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
-       int ret;
 
-       ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], sock_ops,
-                                   bpf_prog_run);
-       return ret == 1 ? 0 : -EPERM;
+       return BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], sock_ops,
+                                    bpf_prog_run, 0);
 }
 EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_ops);
 
@@ -1201,17 +1196,47 @@ int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor,
                .major = major,
                .minor = minor,
        };
-       int allow;
+       int ret;
 
        rcu_read_lock();
        cgrp = task_dfl_cgroup(current);
-       allow = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], &ctx,
-                                     bpf_prog_run);
+       ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], &ctx,
+                                   bpf_prog_run, 0);
        rcu_read_unlock();
 
-       return !allow;
+       return ret;
+}
+
+BPF_CALL_0(bpf_get_retval)
+{
+       struct bpf_cg_run_ctx *ctx =
+               container_of(current->bpf_ctx, struct bpf_cg_run_ctx, run_ctx);
+
+       return ctx->retval;
+}
+
+static const struct bpf_func_proto bpf_get_retval_proto = {
+       .func           = bpf_get_retval,
+       .gpl_only       = false,
+       .ret_type       = RET_INTEGER,
+};
+
+BPF_CALL_1(bpf_set_retval, int, retval)
+{
+       struct bpf_cg_run_ctx *ctx =
+               container_of(current->bpf_ctx, struct bpf_cg_run_ctx, run_ctx);
+
+       ctx->retval = retval;
+       return 0;
 }
 
+static const struct bpf_func_proto bpf_set_retval_proto = {
+       .func           = bpf_set_retval,
+       .gpl_only       = false,
+       .ret_type       = RET_INTEGER,
+       .arg1_type      = ARG_ANYTHING,
+};
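A sketch of the intended use from a cgroup program, assuming this series' semantics that returning 0 rejects the syscall and the errno stored via bpf_set_retval() is what userspace then sees; the constants are defined locally to keep the sketch self-contained:

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

#define EACCES 13
#define SOL_IP 0

SEC("cgroup/setsockopt")
int sockopt_custom_errno(struct bpf_sockopt *ctx)
{
	if (ctx->level == SOL_IP) {
		/* reject with -EACCES instead of the default -EPERM */
		bpf_set_retval(-EACCES);
		return 0;       /* 0 = reject the setsockopt() */
	}
	return 1;               /* 1 = allow */
}

char _license[] SEC("license") = "GPL";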
+
 static const struct bpf_func_proto *
 cgroup_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 {
@@ -1224,6 +1249,10 @@ cgroup_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
                return &bpf_get_current_cgroup_id_proto;
        case BPF_FUNC_perf_event_output:
                return &bpf_event_output_data_proto;
+       case BPF_FUNC_get_retval:
+               return &bpf_get_retval_proto;
+       case BPF_FUNC_set_retval:
+               return &bpf_set_retval_proto;
        default:
                return bpf_base_func_proto(func_id);
        }
@@ -1337,7 +1366,8 @@ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head,
 
        rcu_read_lock();
        cgrp = task_dfl_cgroup(current);
-       ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], &ctx, bpf_prog_run);
+       ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], &ctx,
+                                   bpf_prog_run, 0);
        rcu_read_unlock();
 
        kfree(ctx.cur_val);
@@ -1350,24 +1380,10 @@ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head,
                kfree(ctx.new_val);
        }
 
-       return ret == 1 ? 0 : -EPERM;
+       return ret;
 }
 
 #ifdef CONFIG_NET
-static bool __cgroup_bpf_prog_array_is_empty(struct cgroup *cgrp,
-                                            enum cgroup_bpf_attach_type attach_type)
-{
-       struct bpf_prog_array *prog_array;
-       bool empty;
-
-       rcu_read_lock();
-       prog_array = rcu_dereference(cgrp->bpf.effective[attach_type]);
-       empty = bpf_prog_array_is_empty(prog_array);
-       rcu_read_unlock();
-
-       return empty;
-}
-
 static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen,
                             struct bpf_sockopt_buf *buf)
 {
@@ -1426,19 +1442,11 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level,
        };
        int ret, max_optlen;
 
-       /* Opportunistic check to see whether we have any BPF program
-        * attached to the hook so we don't waste time allocating
-        * memory and locking the socket.
-        */
-       if (__cgroup_bpf_prog_array_is_empty(cgrp, CGROUP_SETSOCKOPT))
-               return 0;
-
        /* Allocate a bit more than the initial user buffer for
         * BPF program. The canonical use case is overriding
         * TCP_CONGESTION(nv) to TCP_CONGESTION(cubic).
         */
        max_optlen = max_t(int, 16, *optlen);
-
        max_optlen = sockopt_alloc_buf(&ctx, max_optlen, &buf);
        if (max_optlen < 0)
                return max_optlen;
@@ -1452,13 +1460,11 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level,
 
        lock_sock(sk);
        ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[CGROUP_SETSOCKOPT],
-                                   &ctx, bpf_prog_run);
+                                   &ctx, bpf_prog_run, 0);
        release_sock(sk);
 
-       if (!ret) {
-               ret = -EPERM;
+       if (ret)
                goto out;
-       }
 
        if (ctx.optlen == -1) {
                /* optlen set to -1, bypass kernel */
@@ -1518,19 +1524,11 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level,
                .sk = sk,
                .level = level,
                .optname = optname,
-               .retval = retval,
+               .current_task = current,
        };
        int ret;
 
-       /* Opportunistic check to see whether we have any BPF program
-        * attached to the hook so we don't waste time allocating
-        * memory and locking the socket.
-        */
-       if (__cgroup_bpf_prog_array_is_empty(cgrp, CGROUP_GETSOCKOPT))
-               return retval;
-
        ctx.optlen = max_optlen;
-
        max_optlen = sockopt_alloc_buf(&ctx, max_optlen, &buf);
        if (max_optlen < 0)
                return max_optlen;
@@ -1562,27 +1560,17 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level,
 
        lock_sock(sk);
        ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[CGROUP_GETSOCKOPT],
-                                   &ctx, bpf_prog_run);
+                                   &ctx, bpf_prog_run, retval);
        release_sock(sk);
 
-       if (!ret) {
-               ret = -EPERM;
+       if (ret < 0)
                goto out;
-       }
 
        if (ctx.optlen > max_optlen || ctx.optlen < 0) {
                ret = -EFAULT;
                goto out;
        }
 
-       /* BPF programs only allowed to set retval to 0, not some
-        * arbitrary value.
-        */
-       if (ctx.retval != 0 && ctx.retval != retval) {
-               ret = -EFAULT;
-               goto out;
-       }
-
        if (ctx.optlen != 0) {
                if (copy_to_user(optval, ctx.optval, ctx.optlen) ||
                    put_user(ctx.optlen, optlen)) {
@@ -1591,8 +1579,6 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level,
                }
        }
 
-       ret = ctx.retval;
-
 out:
        sockopt_free_buf(&ctx, &buf);
        return ret;
@@ -1607,10 +1593,10 @@ int __cgroup_bpf_run_filter_getsockopt_kern(struct sock *sk, int level,
                .sk = sk,
                .level = level,
                .optname = optname,
-               .retval = retval,
                .optlen = *optlen,
                .optval = optval,
                .optval_end = optval + *optlen,
+               .current_task = current,
        };
        int ret;
 
@@ -1623,25 +1609,19 @@ int __cgroup_bpf_run_filter_getsockopt_kern(struct sock *sk, int level,
         */
 
        ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[CGROUP_GETSOCKOPT],
-                                   &ctx, bpf_prog_run);
-       if (!ret)
-               return -EPERM;
+                                   &ctx, bpf_prog_run, retval);
+       if (ret < 0)
+               return ret;
 
        if (ctx.optlen > *optlen)
                return -EFAULT;
 
-       /* BPF programs only allowed to set retval to 0, not some
-        * arbitrary value.
-        */
-       if (ctx.retval != 0 && ctx.retval != retval)
-               return -EFAULT;
-
        /* BPF programs can shrink the buffer, export the modifications.
         */
        if (ctx.optlen != 0)
                *optlen = ctx.optlen;
 
-       return ctx.retval;
+       return ret;
 }
 #endif
 
@@ -2057,10 +2037,39 @@ static u32 cg_sockopt_convert_ctx_access(enum bpf_access_type type,
                        *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optlen);
                break;
        case offsetof(struct bpf_sockopt, retval):
-               if (type == BPF_WRITE)
-                       *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_STX_MEM, retval);
-               else
-                       *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, retval);
+               BUILD_BUG_ON(offsetof(struct bpf_cg_run_ctx, run_ctx) != 0);
+
+               if (type == BPF_WRITE) {
+                       int treg = BPF_REG_9;
+
+                       if (si->src_reg == treg || si->dst_reg == treg)
+                               --treg;
+                       if (si->src_reg == treg || si->dst_reg == treg)
+                               --treg;
+                       *insn++ = BPF_STX_MEM(BPF_DW, si->dst_reg, treg,
+                                             offsetof(struct bpf_sockopt_kern, tmp_reg));
+                       *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sockopt_kern, current_task),
+                                             treg, si->dst_reg,
+                                             offsetof(struct bpf_sockopt_kern, current_task));
+                       *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct task_struct, bpf_ctx),
+                                             treg, treg,
+                                             offsetof(struct task_struct, bpf_ctx));
+                       *insn++ = BPF_STX_MEM(BPF_FIELD_SIZEOF(struct bpf_cg_run_ctx, retval),
+                                             treg, si->src_reg,
+                                             offsetof(struct bpf_cg_run_ctx, retval));
+                       *insn++ = BPF_LDX_MEM(BPF_DW, treg, si->dst_reg,
+                                             offsetof(struct bpf_sockopt_kern, tmp_reg));
+               } else {
+                       *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sockopt_kern, current_task),
+                                             si->dst_reg, si->src_reg,
+                                             offsetof(struct bpf_sockopt_kern, current_task));
+                       *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct task_struct, bpf_ctx),
+                                             si->dst_reg, si->dst_reg,
+                                             offsetof(struct task_struct, bpf_ctx));
+                       *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_cg_run_ctx, retval),
+                                             si->dst_reg, si->dst_reg,
+                                             offsetof(struct bpf_cg_run_ctx, retval));
+               }
                break;
        case offsetof(struct bpf_sockopt, optval):
                *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optval);
index de3e5bc..42d9654 100644 (file)
@@ -537,13 +537,10 @@ long bpf_jit_limit_max __read_mostly;
 static void
 bpf_prog_ksym_set_addr(struct bpf_prog *prog)
 {
-       const struct bpf_binary_header *hdr = bpf_jit_binary_hdr(prog);
-       unsigned long addr = (unsigned long)hdr;
-
        WARN_ON_ONCE(!bpf_prog_ebpf_jited(prog));
 
        prog->aux->ksym.start = (unsigned long) prog->bpf_func;
-       prog->aux->ksym.end   = addr + hdr->pages * PAGE_SIZE;
+       prog->aux->ksym.end   = prog->aux->ksym.start + prog->jited_len;
 }
 
 static void
@@ -808,6 +805,137 @@ int bpf_jit_add_poke_descriptor(struct bpf_prog *prog,
        return slot;
 }
 
+/*
+ * BPF program pack allocator.
+ *
+ * Most BPF programs are pretty small. Allocating a whole page for each
+ * program is sometimes a waste. Many small bpf programs also add pressure
+ * to the instruction TLB. To solve this issue, we introduce a BPF program pack
+ * allocator. The prog_pack allocator uses HPAGE_PMD_SIZE pages (2MB on x86)
+ * to host BPF programs.
+ */
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+#define BPF_PROG_PACK_SIZE     HPAGE_PMD_SIZE
+#else
+#define BPF_PROG_PACK_SIZE     PAGE_SIZE
+#endif
+#define BPF_PROG_CHUNK_SHIFT   6
+#define BPF_PROG_CHUNK_SIZE    (1 << BPF_PROG_CHUNK_SHIFT)
+#define BPF_PROG_CHUNK_MASK    (~(BPF_PROG_CHUNK_SIZE - 1))
+#define BPF_PROG_CHUNK_COUNT   (BPF_PROG_PACK_SIZE / BPF_PROG_CHUNK_SIZE)
+
+struct bpf_prog_pack {
+       struct list_head list;
+       void *ptr;
+       unsigned long bitmap[BITS_TO_LONGS(BPF_PROG_CHUNK_COUNT)];
+};
+
+#define BPF_PROG_MAX_PACK_PROG_SIZE    BPF_PROG_PACK_SIZE
+#define BPF_PROG_SIZE_TO_NBITS(size)   (round_up(size, BPF_PROG_CHUNK_SIZE) / BPF_PROG_CHUNK_SIZE)
+
+static DEFINE_MUTEX(pack_mutex);
+static LIST_HEAD(pack_list);
+
+static struct bpf_prog_pack *alloc_new_pack(void)
+{
+       struct bpf_prog_pack *pack;
+
+       pack = kzalloc(sizeof(*pack), GFP_KERNEL);
+       if (!pack)
+               return NULL;
+       pack->ptr = module_alloc(BPF_PROG_PACK_SIZE);
+       if (!pack->ptr) {
+               kfree(pack);
+               return NULL;
+       }
+       bitmap_zero(pack->bitmap, BPF_PROG_PACK_SIZE / BPF_PROG_CHUNK_SIZE);
+       list_add_tail(&pack->list, &pack_list);
+
+       set_vm_flush_reset_perms(pack->ptr);
+       set_memory_ro((unsigned long)pack->ptr, BPF_PROG_PACK_SIZE / PAGE_SIZE);
+       set_memory_x((unsigned long)pack->ptr, BPF_PROG_PACK_SIZE / PAGE_SIZE);
+       return pack;
+}
+
+static void *bpf_prog_pack_alloc(u32 size)
+{
+       unsigned int nbits = BPF_PROG_SIZE_TO_NBITS(size);
+       struct bpf_prog_pack *pack;
+       unsigned long pos;
+       void *ptr = NULL;
+
+       if (size > BPF_PROG_MAX_PACK_PROG_SIZE) {
+               size = round_up(size, PAGE_SIZE);
+               ptr = module_alloc(size);
+               if (ptr) {
+                       set_vm_flush_reset_perms(ptr);
+                       set_memory_ro((unsigned long)ptr, size / PAGE_SIZE);
+                       set_memory_x((unsigned long)ptr, size / PAGE_SIZE);
+               }
+               return ptr;
+       }
+       mutex_lock(&pack_mutex);
+       list_for_each_entry(pack, &pack_list, list) {
+               pos = bitmap_find_next_zero_area(pack->bitmap, BPF_PROG_CHUNK_COUNT, 0,
+                                                nbits, 0);
+               if (pos < BPF_PROG_CHUNK_COUNT)
+                       goto found_free_area;
+       }
+
+       pack = alloc_new_pack();
+       if (!pack)
+               goto out;
+
+       pos = 0;
+
+found_free_area:
+       bitmap_set(pack->bitmap, pos, nbits);
+       ptr = (void *)(pack->ptr) + (pos << BPF_PROG_CHUNK_SHIFT);
+
+out:
+       mutex_unlock(&pack_mutex);
+       return ptr;
+}
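
To see the chunk math in isolation, here is a minimal user-space sketch (not kernel code): it reproduces what BPF_PROG_SIZE_TO_NBITS() computes and substitutes a linear scan for bitmap_find_next_zero_area(), with a 4 KiB pack so the numbers stay small. All names and sizes in it are illustrative stand-ins.

	#include <stdio.h>
	#include <stdbool.h>

	#define PACK_SIZE   4096	/* stand-in for BPF_PROG_PACK_SIZE */
	#define CHUNK_SHIFT 6
	#define CHUNK_SIZE  (1 << CHUNK_SHIFT)
	#define CHUNK_COUNT (PACK_SIZE / CHUNK_SIZE)

	static bool used[CHUNK_COUNT];

	/* round a program up to whole 64-byte chunks, as
	 * BPF_PROG_SIZE_TO_NBITS() does */
	static unsigned int size_to_nbits(unsigned int size)
	{
		return (size + CHUNK_SIZE - 1) / CHUNK_SIZE;
	}

	/* linear-scan stand-in for bitmap_find_next_zero_area() */
	static int find_free_area(unsigned int nbits)
	{
		for (unsigned int pos = 0; pos + nbits <= CHUNK_COUNT; pos++) {
			bool all_free = true;

			for (unsigned int i = 0; i < nbits; i++)
				all_free &= !used[pos + i];
			if (all_free)
				return pos;
		}
		return -1;	/* caller would open a new pack */
	}

	int main(void)
	{
		unsigned int sizes[] = { 100, 500, 64 };

		for (unsigned int i = 0; i < 3; i++) {
			unsigned int nbits = size_to_nbits(sizes[i]);
			int pos = find_free_area(nbits);

			if (pos < 0)
				break;
			for (unsigned int j = 0; j < nbits; j++)
				used[pos + j] = true;
			printf("size %4u -> %u chunk(s) at offset %d\n",
			       sizes[i], nbits, pos << CHUNK_SHIFT);
		}
		return 0;	/* offsets printed: 0, 128, 640 */
	}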
+
+static void bpf_prog_pack_free(struct bpf_binary_header *hdr)
+{
+       struct bpf_prog_pack *pack = NULL, *tmp;
+       unsigned int nbits;
+       unsigned long pos;
+       void *pack_ptr;
+
+       if (hdr->size > BPF_PROG_MAX_PACK_PROG_SIZE) {
+               module_memfree(hdr);
+               return;
+       }
+
+       pack_ptr = (void *)((unsigned long)hdr & ~(BPF_PROG_PACK_SIZE - 1));
+       mutex_lock(&pack_mutex);
+
+       list_for_each_entry(tmp, &pack_list, list) {
+               if (tmp->ptr == pack_ptr) {
+                       pack = tmp;
+                       break;
+               }
+       }
+
+       if (WARN_ONCE(!pack, "bpf_prog_pack bug\n"))
+               goto out;
+
+       nbits = BPF_PROG_SIZE_TO_NBITS(hdr->size);
+       pos = ((unsigned long)hdr - (unsigned long)pack_ptr) >> BPF_PROG_CHUNK_SHIFT;
+
+       bitmap_clear(pack->bitmap, pos, nbits);
+       if (bitmap_find_next_zero_area(pack->bitmap, BPF_PROG_CHUNK_COUNT, 0,
+                                      BPF_PROG_CHUNK_COUNT, 0) == 0) {
+               list_del(&pack->list);
+               module_memfree(pack->ptr);
+               kfree(pack);
+       }
+out:
+       mutex_unlock(&pack_mutex);
+}
+
 static atomic_long_t bpf_jit_current;
 
 /* Can be overridden by an arch's JIT compiler if it has a custom,
@@ -833,12 +961,11 @@ static int __init bpf_jit_charge_init(void)
 }
 pure_initcall(bpf_jit_charge_init);
 
-int bpf_jit_charge_modmem(u32 pages)
+int bpf_jit_charge_modmem(u32 size)
 {
-       if (atomic_long_add_return(pages, &bpf_jit_current) >
-           (bpf_jit_limit >> PAGE_SHIFT)) {
+       if (atomic_long_add_return(size, &bpf_jit_current) > bpf_jit_limit) {
                if (!bpf_capable()) {
-                       atomic_long_sub(pages, &bpf_jit_current);
+                       atomic_long_sub(size, &bpf_jit_current);
                        return -EPERM;
                }
        }
@@ -846,9 +973,9 @@ int bpf_jit_charge_modmem(u32 pages)
        return 0;
 }
 
-void bpf_jit_uncharge_modmem(u32 pages)
+void bpf_jit_uncharge_modmem(u32 size)
 {
-       atomic_long_sub(pages, &bpf_jit_current);
+       atomic_long_sub(size, &bpf_jit_current);
 }
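
Note the unit change here: the JIT limit is now accounted in bytes rather than pages. That only starts to matter once programs can share a page, e.g. (illustrative numbers):

	/* 200-byte program, 64-byte pack chunks:
	 *   page-based:  1 page = 4096 bytes charged against bpf_jit_limit
	 *   byte-based:  round_up(200, 64) = 256 bytes charged
	 * so roughly 16 such packed programs now cost what one used to.
	 */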
 
 void *__weak bpf_jit_alloc_exec(unsigned long size)
@@ -867,7 +994,7 @@ bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr,
                     bpf_jit_fill_hole_t bpf_fill_ill_insns)
 {
        struct bpf_binary_header *hdr;
-       u32 size, hole, start, pages;
+       u32 size, hole, start;
 
        WARN_ON_ONCE(!is_power_of_2(alignment) ||
                     alignment > BPF_IMAGE_ALIGNMENT);
@@ -877,20 +1004,19 @@ bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr,
         * random section of illegal instructions.
         */
        size = round_up(proglen + sizeof(*hdr) + 128, PAGE_SIZE);
-       pages = size / PAGE_SIZE;
 
-       if (bpf_jit_charge_modmem(pages))
+       if (bpf_jit_charge_modmem(size))
                return NULL;
        hdr = bpf_jit_alloc_exec(size);
        if (!hdr) {
-               bpf_jit_uncharge_modmem(pages);
+               bpf_jit_uncharge_modmem(size);
                return NULL;
        }
 
        /* Fill space with illegal/arch-dep instructions. */
        bpf_fill_ill_insns(hdr, size);
 
-       hdr->pages = pages;
+       hdr->size = size;
        hole = min_t(unsigned int, size - (proglen + sizeof(*hdr)),
                     PAGE_SIZE - sizeof(*hdr));
        start = (get_random_int() % hole) & ~(alignment - 1);
@@ -903,10 +1029,113 @@ bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr,
 
 void bpf_jit_binary_free(struct bpf_binary_header *hdr)
 {
-       u32 pages = hdr->pages;
+       u32 size = hdr->size;
 
        bpf_jit_free_exec(hdr);
-       bpf_jit_uncharge_modmem(pages);
+       bpf_jit_uncharge_modmem(size);
+}
+
+/* Allocate jit binary from bpf_prog_pack allocator.
+ * Since the allocated memory is RO+X, the JIT engine cannot write directly
+ * to the memory. To solve this problem, a RW buffer is also allocated
+ * at the same time. The JIT engine should calculate offsets based on
+ * the RO memory address, but write the JITed program to the RW buffer.
+ * Once the JIT engine finishes, it calls bpf_jit_binary_pack_finalize,
+ * which copies the JITed program to the RO memory.
+ */
+struct bpf_binary_header *
+bpf_jit_binary_pack_alloc(unsigned int proglen, u8 **image_ptr,
+                         unsigned int alignment,
+                         struct bpf_binary_header **rw_header,
+                         u8 **rw_image,
+                         bpf_jit_fill_hole_t bpf_fill_ill_insns)
+{
+       struct bpf_binary_header *ro_header;
+       u32 size, hole, start;
+
+       WARN_ON_ONCE(!is_power_of_2(alignment) ||
+                    alignment > BPF_IMAGE_ALIGNMENT);
+
+       /* add 16 bytes for a random section of illegal instructions */
+       size = round_up(proglen + sizeof(*ro_header) + 16, BPF_PROG_CHUNK_SIZE);
+
+       if (bpf_jit_charge_modmem(size))
+               return NULL;
+       ro_header = bpf_prog_pack_alloc(size);
+       if (!ro_header) {
+               bpf_jit_uncharge_modmem(size);
+               return NULL;
+       }
+
+       *rw_header = kvmalloc(size, GFP_KERNEL);
+       if (!*rw_header) {
+               bpf_prog_pack_free(ro_header);
+               bpf_jit_uncharge_modmem(size);
+               return NULL;
+       }
+
+       /* Fill space with illegal/arch-dep instructions. */
+       bpf_fill_ill_insns(*rw_header, size);
+       (*rw_header)->size = size;
+
+       hole = min_t(unsigned int, size - (proglen + sizeof(*ro_header)),
+                    BPF_PROG_CHUNK_SIZE - sizeof(*ro_header));
+       start = (get_random_int() % hole) & ~(alignment - 1);
+
+       *image_ptr = &ro_header->image[start];
+       *rw_image = &(*rw_header)->image[start];
+
+       return ro_header;
+}
+
+/* Copy JITed text from rw_header to its final location, the ro_header. */
+int bpf_jit_binary_pack_finalize(struct bpf_prog *prog,
+                                struct bpf_binary_header *ro_header,
+                                struct bpf_binary_header *rw_header)
+{
+       void *ptr;
+
+       ptr = bpf_arch_text_copy(ro_header, rw_header, rw_header->size);
+
+       kvfree(rw_header);
+
+       if (IS_ERR(ptr)) {
+               bpf_prog_pack_free(ro_header);
+               return PTR_ERR(ptr);
+       }
+       prog->aux->use_bpf_prog_pack = true;
+       return 0;
+}
+
+/* bpf_jit_binary_pack_free is called in two different scenarios:
+ *   1) when the program is freed after a successful JIT (the normal
+ *      free path, in which case rw_header is NULL);
+ *   2) when the JIT engine fails (before bpf_jit_binary_pack_finalize).
+ * For case 2), we need to free both the RO memory and the RW buffer.
+ * Also, ro_header->size in 2) is not properly set yet, so rw_header->size
+ * is used for uncharge.
+ */
+void bpf_jit_binary_pack_free(struct bpf_binary_header *ro_header,
+                             struct bpf_binary_header *rw_header)
+{
+       u32 size = rw_header ? rw_header->size : ro_header->size;
+
+       bpf_prog_pack_free(ro_header);
+       kvfree(rw_header);
+       bpf_jit_uncharge_modmem(size);
+}
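
Taken together, the expected calling sequence from an arch JIT looks roughly like the sketch below. jit_emit() is a hypothetical stand-in for the arch-specific emitter, and jit_fill_hole stands for the arch's fill callback; the x86 JIT is the intended first user of this API per this series.

	static struct bpf_prog *jit_with_prog_pack(struct bpf_prog *prog,
						   unsigned int proglen)
	{
		struct bpf_binary_header *ro_header, *rw_header;
		u8 *image, *rw_image;

		ro_header = bpf_jit_binary_pack_alloc(proglen, &image, 8,
						      &rw_header, &rw_image,
						      jit_fill_hole);
		if (!ro_header)
			return prog;

		/* Emit into the RW buffer, but compute every address
		 * (calls, jumps, ksym start) against the final RO image. */
		if (jit_emit(rw_image, image, prog) < 0) {
			/* failed before finalize: free RO chunk + RW buffer */
			bpf_jit_binary_pack_free(ro_header, rw_header);
			return prog;
		}

		/* copies text into the RO+X pack via bpf_arch_text_copy()
		 * and kvfree()s the RW buffer */
		if (bpf_jit_binary_pack_finalize(prog, ro_header, rw_header))
			return prog;

		prog->bpf_func = (void *)image;
		prog->jited = 1;
		prog->jited_len = proglen;
		return prog;
	}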
+
+static inline struct bpf_binary_header *
+bpf_jit_binary_hdr(const struct bpf_prog *fp)
+{
+       unsigned long real_start = (unsigned long)fp->bpf_func;
+       unsigned long addr;
+
+       if (fp->aux->use_bpf_prog_pack)
+               addr = real_start & BPF_PROG_CHUNK_MASK;
+       else
+               addr = real_start & PAGE_MASK;
+
+       return (void *)addr;
 }
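
Because packed images are only chunk-aligned, the header lookup masks with BPF_PROG_CHUNK_MASK instead of PAGE_MASK. The random pad chosen in bpf_jit_binary_pack_alloc() is smaller than one chunk, so bpf_func always stays inside the chunk that holds the header. A worked example with illustrative addresses:

	/* BPF_PROG_CHUNK_SIZE = 64, BPF_PROG_CHUNK_MASK = ~0x3f:
	 *   fp->bpf_func           = 0xffffffffc0201e38
	 *   & BPF_PROG_CHUNK_MASK  = 0xffffffffc0201e00  (ro_header)
	 */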
 
 /* This symbol is only overridden by archs that have different
@@ -918,7 +1147,10 @@ void __weak bpf_jit_free(struct bpf_prog *fp)
        if (fp->jited) {
                struct bpf_binary_header *hdr = bpf_jit_binary_hdr(fp);
 
-               bpf_jit_binary_free(hdr);
+               if (fp->aux->use_bpf_prog_pack)
+                       bpf_jit_binary_pack_free(hdr, NULL /* rw_buffer */);
+               else
+                       bpf_jit_binary_free(hdr);
 
                WARN_ON_ONCE(!bpf_prog_kallsyms_verify_off(fp));
        }
@@ -1829,28 +2061,30 @@ static unsigned int __bpf_prog_ret0_warn(const void *ctx,
 }
 #endif
 
-bool bpf_prog_array_compatible(struct bpf_array *array,
-                              const struct bpf_prog *fp)
+bool bpf_prog_map_compatible(struct bpf_map *map,
+                            const struct bpf_prog *fp)
 {
        bool ret;
 
        if (fp->kprobe_override)
                return false;
 
-       spin_lock(&array->aux->owner.lock);
-
-       if (!array->aux->owner.type) {
+       spin_lock(&map->owner.lock);
+       if (!map->owner.type) {
                /* There's no owner yet where we could check for
                 * compatibility.
                 */
-               array->aux->owner.type  = fp->type;
-               array->aux->owner.jited = fp->jited;
+               map->owner.type  = fp->type;
+               map->owner.jited = fp->jited;
+               map->owner.xdp_has_frags = fp->aux->xdp_has_frags;
                ret = true;
        } else {
-               ret = array->aux->owner.type  == fp->type &&
-                     array->aux->owner.jited == fp->jited;
+               ret = map->owner.type  == fp->type &&
+                     map->owner.jited == fp->jited &&
+                     map->owner.xdp_has_frags == fp->aux->xdp_has_frags;
        }
-       spin_unlock(&array->aux->owner.lock);
+       spin_unlock(&map->owner.lock);
+
        return ret;
 }
 
@@ -1862,13 +2096,11 @@ static int bpf_check_tail_call(const struct bpf_prog *fp)
        mutex_lock(&aux->used_maps_mutex);
        for (i = 0; i < aux->used_map_cnt; i++) {
                struct bpf_map *map = aux->used_maps[i];
-               struct bpf_array *array;
 
-               if (map->map_type != BPF_MAP_TYPE_PROG_ARRAY)
+               if (!map_type_contains_progs(map))
                        continue;
 
-               array = container_of(map, struct bpf_array, map);
-               if (!bpf_prog_array_compatible(array, fp)) {
+               if (!bpf_prog_map_compatible(map, fp)) {
                        ret = -EINVAL;
                        goto out;
                }
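
map_type_contains_progs() is added to include/linux/bpf.h by the same series; reconstructed from memory (the exact list of map types may differ), it is roughly:

	static inline bool map_type_contains_progs(struct bpf_map *map)
	{
		return map->map_type == BPF_MAP_TYPE_PROG_ARRAY ||
		       map->map_type == BPF_MAP_TYPE_DEVMAP ||
		       map->map_type == BPF_MAP_TYPE_CPUMAP;
	}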
@@ -1968,18 +2200,10 @@ static struct bpf_prog_dummy {
        },
 };
 
-/* to avoid allocating empty bpf_prog_array for cgroups that
- * don't have bpf program attached use one global 'empty_prog_array'
- * It will not be modified the caller of bpf_prog_array_alloc()
- * (since caller requested prog_cnt == 0)
- * that pointer should be 'freed' by bpf_prog_array_free()
- */
-static struct {
-       struct bpf_prog_array hdr;
-       struct bpf_prog *null_prog;
-} empty_prog_array = {
+struct bpf_empty_prog_array bpf_empty_prog_array = {
        .null_prog = NULL,
 };
+EXPORT_SYMBOL(bpf_empty_prog_array);
 
 struct bpf_prog_array *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags)
 {
@@ -1989,12 +2213,12 @@ struct bpf_prog_array *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags)
                               (prog_cnt + 1),
                               flags);
 
-       return &empty_prog_array.hdr;
+       return &bpf_empty_prog_array.hdr;
 }
 
 void bpf_prog_array_free(struct bpf_prog_array *progs)
 {
-       if (!progs || progs == &empty_prog_array.hdr)
+       if (!progs || progs == &bpf_empty_prog_array.hdr)
                return;
        kfree_rcu(progs, rcu);
 }
@@ -2453,6 +2677,11 @@ int __weak bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t,
        return -ENOTSUPP;
 }
 
+void * __weak bpf_arch_text_copy(void *dst, void *src, size_t len)
+{
+       return ERR_PTR(-ENOTSUPP);
+}
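
The weak stub keeps bpf_jit_binary_pack_finalize() failing cleanly on arches without support. x86's override (added by the same series; reconstructed from memory, so treat as a sketch) wraps text_poke_copy():

	void *bpf_arch_text_copy(void *dst, void *src, size_t len)
	{
		/* patch the RO+X destination through a temporary writable
		 * mapping; text_poke_copy() returns NULL on failure */
		if (text_poke_copy(dst, src, len) == NULL)
			return ERR_PTR(-EINVAL);
		return dst;
	}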
+
 DEFINE_STATIC_KEY_FALSE(bpf_stats_enabled_key);
 EXPORT_SYMBOL(bpf_stats_enabled_key);
 
index b3e6b94..650e5d2 100644 (file)
@@ -397,7 +397,8 @@ static int cpu_map_kthread_run(void *data)
        return 0;
 }
 
-static int __cpu_map_load_bpf_program(struct bpf_cpu_map_entry *rcpu, int fd)
+static int __cpu_map_load_bpf_program(struct bpf_cpu_map_entry *rcpu,
+                                     struct bpf_map *map, int fd)
 {
        struct bpf_prog *prog;
 
@@ -405,7 +406,8 @@ static int __cpu_map_load_bpf_program(struct bpf_cpu_map_entry *rcpu, int fd)
        if (IS_ERR(prog))
                return PTR_ERR(prog);
 
-       if (prog->expected_attach_type != BPF_XDP_CPUMAP) {
+       if (prog->expected_attach_type != BPF_XDP_CPUMAP ||
+           !bpf_prog_map_compatible(map, prog)) {
                bpf_prog_put(prog);
                return -EINVAL;
        }
@@ -457,7 +459,7 @@ __cpu_map_entry_alloc(struct bpf_map *map, struct bpf_cpumap_val *value,
        rcpu->map_id = map->id;
        rcpu->value.qsize  = value->qsize;
 
-       if (fd > 0 && __cpu_map_load_bpf_program(rcpu, fd))
+       if (fd > 0 && __cpu_map_load_bpf_program(rcpu, map, fd))
                goto free_ptr_ring;
 
        /* Setup kthread */
index fe019db..038f6d7 100644 (file)
@@ -858,7 +858,8 @@ static struct bpf_dtab_netdev *__dev_map_alloc_node(struct net *net,
                                             BPF_PROG_TYPE_XDP, false);
                if (IS_ERR(prog))
                        goto err_put_dev;
-               if (prog->expected_attach_type != BPF_XDP_DEVMAP)
+               if (prog->expected_attach_type != BPF_XDP_DEVMAP ||
+                   !bpf_prog_map_compatible(&dtab->map, prog))
                        goto err_put_prog;
        }
 
index 01cfdf4..4e5969f 100644 (file)
@@ -16,6 +16,7 @@
 #include <linux/pid_namespace.h>
 #include <linux/proc_ns.h>
 #include <linux/security.h>
+#include <linux/btf_ids.h>
 
 #include "../../lib/kstrtox.h"
 
@@ -671,6 +672,39 @@ const struct bpf_func_proto bpf_copy_from_user_proto = {
        .arg3_type      = ARG_ANYTHING,
 };
 
+BPF_CALL_5(bpf_copy_from_user_task, void *, dst, u32, size,
+          const void __user *, user_ptr, struct task_struct *, tsk, u64, flags)
+{
+       int ret;
+
+       /* flags is not used yet */
+       if (unlikely(flags))
+               return -EINVAL;
+
+       if (unlikely(!size))
+               return 0;
+
+       ret = access_process_vm(tsk, (unsigned long)user_ptr, dst, size, 0);
+       if (ret == size)
+               return 0;
+
+       memset(dst, 0, size);
+       /* Return -EFAULT for partial read */
+       return ret < 0 ? ret : -EFAULT;
+}
+
+const struct bpf_func_proto bpf_copy_from_user_task_proto = {
+       .func           = bpf_copy_from_user_task,
+       .gpl_only       = true,
+       .ret_type       = RET_INTEGER,
+       .arg1_type      = ARG_PTR_TO_UNINIT_MEM,
+       .arg2_type      = ARG_CONST_SIZE_OR_ZERO,
+       .arg3_type      = ARG_ANYTHING,
+       .arg4_type      = ARG_PTR_TO_BTF_ID,
+       .arg4_btf_id    = &btf_tracing_ids[BTF_TRACING_TYPE_TASK],
+       .arg5_type      = ARG_ANYTHING
+};
+
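
From the BPF side, the helper is meant for sleepable programs, since it goes through access_process_vm(). A sketch of typical usage in a sleepable task iterator follows; the section name, fields, and output format are illustrative, not taken from this patch:

	SEC("iter.s/task")	/* sleepable task iterator */
	int dump_cmdline(struct bpf_iter__task *ctx)
	{
		struct task_struct *task = ctx->task;
		char buf[32] = {};

		if (!task || !task->mm)
			return 0;
		/* read the start of the target task's command line from
		 * *its* user address space; flags must currently be 0 */
		if (bpf_copy_from_user_task(buf, sizeof(buf),
					    (void *)task->mm->arg_start,
					    task, 0))
			return 0;
		BPF_SEQ_PRINTF(ctx->meta->seq, "%d: %s\n", task->pid, buf);
		return 0;
	}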
 BPF_CALL_2(bpf_per_cpu_ptr, const void *, ptr, u32, cpu)
 {
        if (cpu >= nr_cpu_ids)
index 1400ac5..baf47d9 100644 (file)
@@ -1,40 +1,16 @@
 # SPDX-License-Identifier: GPL-2.0
 
 LIBBPF_SRCS = $(srctree)/tools/lib/bpf/
-LIBBPF_OUT = $(abspath $(obj))/libbpf
-LIBBPF_A = $(LIBBPF_OUT)/libbpf.a
-LIBBPF_DESTDIR = $(LIBBPF_OUT)
-LIBBPF_INCLUDE = $(LIBBPF_DESTDIR)/include
-
-# Although not in use by libbpf's Makefile, set $(O) so that the "dummy" test
-# in tools/scripts/Makefile.include always succeeds when building the kernel
-# with $(O) pointing to a relative path, as in "make O=build bindeb-pkg".
-$(LIBBPF_A): | $(LIBBPF_OUT)
-       $(Q)$(MAKE) -C $(LIBBPF_SRCS) O=$(LIBBPF_OUT)/ OUTPUT=$(LIBBPF_OUT)/   \
-               DESTDIR=$(LIBBPF_DESTDIR) prefix=                              \
-               $(LIBBPF_OUT)/libbpf.a install_headers
-
-libbpf_hdrs: $(LIBBPF_A)
-
-.PHONY: libbpf_hdrs
-
-$(LIBBPF_OUT):
-       $(call msg,MKDIR,$@)
-       $(Q)mkdir -p $@
+LIBBPF_INCLUDE = $(LIBBPF_SRCS)/..
 
 userccflags += -I $(srctree)/tools/include/ -I $(srctree)/tools/include/uapi \
        -I $(LIBBPF_INCLUDE) -Wno-unused-result
 
 userprogs := bpf_preload_umd
 
-clean-files := libbpf/
-
-$(obj)/iterators/iterators.o: | libbpf_hdrs
-
 bpf_preload_umd-objs := iterators/iterators.o
-bpf_preload_umd-userldlibs := $(LIBBPF_A) -lelf -lz
 
-$(obj)/bpf_preload_umd: $(LIBBPF_A)
+$(obj)/bpf_preload_umd:
 
 $(obj)/bpf_preload_umd_blob.o: $(obj)/bpf_preload_umd
 
index b8bd605..bfe24f8 100644 (file)
@@ -35,15 +35,15 @@ endif
 
 .PHONY: all clean
 
-all: iterators.skel.h
+all: iterators.lskel.h
 
 clean:
        $(call msg,CLEAN)
        $(Q)rm -rf $(OUTPUT) iterators
 
-iterators.skel.h: $(OUTPUT)/iterators.bpf.o | $(BPFTOOL)
+iterators.lskel.h: $(OUTPUT)/iterators.bpf.o | $(BPFTOOL)
        $(call msg,GEN-SKEL,$@)
-       $(Q)$(BPFTOOL) gen skeleton $< > $@
+       $(Q)$(BPFTOOL) gen skeleton -L $< > $@
 
 
 $(OUTPUT)/iterators.bpf.o: iterators.bpf.c $(BPFOBJ) | $(OUTPUT)
index 5d872a7..4dafe0f 100644 (file)
 #include <bpf/libbpf.h>
 #include <bpf/bpf.h>
 #include <sys/mount.h>
-#include "iterators.skel.h"
+#include "iterators.lskel.h"
 #include "bpf_preload_common.h"
 
 int to_kernel = -1;
 int from_kernel = 0;
 
-static int send_link_to_kernel(struct bpf_link *link, const char *link_name)
+static int __bpf_obj_get_info_by_fd(int bpf_fd, void *info, __u32 *info_len)
+{
+       union bpf_attr attr;
+       int err;
+
+       memset(&attr, 0, sizeof(attr));
+       attr.info.bpf_fd = bpf_fd;
+       attr.info.info_len = *info_len;
+       attr.info.info = (long) info;
+
+       err = skel_sys_bpf(BPF_OBJ_GET_INFO_BY_FD, &attr, sizeof(attr));
+       if (!err)
+               *info_len = attr.info.info_len;
+       return err;
+}
+
+static int send_link_to_kernel(int link_fd, const char *link_name)
 {
        struct bpf_preload_info obj = {};
        struct bpf_link_info info = {};
        __u32 info_len = sizeof(info);
        int err;
 
-       err = bpf_obj_get_info_by_fd(bpf_link__fd(link), &info, &info_len);
+       err = __bpf_obj_get_info_by_fd(link_fd, &info, &info_len);
        if (err)
                return err;
        obj.link_id = info.id;
@@ -37,7 +53,6 @@ static int send_link_to_kernel(struct bpf_link *link, const char *link_name)
 
 int main(int argc, char **argv)
 {
-       struct rlimit rlim = { RLIM_INFINITY, RLIM_INFINITY };
        struct iterators_bpf *skel;
        int err, magic;
        int debug_fd;
@@ -55,7 +70,6 @@ int main(int argc, char **argv)
                printf("bad start magic %d\n", magic);
                return 1;
        }
-       setrlimit(RLIMIT_MEMLOCK, &rlim);
        /* libbpf opens BPF object and loads it into the kernel */
        skel = iterators_bpf__open_and_load();
        if (!skel) {
@@ -72,10 +86,10 @@ int main(int argc, char **argv)
                goto cleanup;
 
        /* send two bpf_link IDs with names to the kernel */
-       err = send_link_to_kernel(skel->links.dump_bpf_map, "maps.debug");
+       err = send_link_to_kernel(skel->links.dump_bpf_map_fd, "maps.debug");
        if (err)
                goto cleanup;
-       err = send_link_to_kernel(skel->links.dump_bpf_prog, "progs.debug");
+       err = send_link_to_kernel(skel->links.dump_bpf_prog_fd, "progs.debug");
        if (err)
                goto cleanup;
 
diff --git a/kernel/bpf/preload/iterators/iterators.lskel.h b/kernel/bpf/preload/iterators/iterators.lskel.h
new file mode 100644 (file)
index 0000000..d90562d
--- /dev/null
@@ -0,0 +1,428 @@
+/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */
+/* THIS FILE IS AUTOGENERATED! */
+#ifndef __ITERATORS_BPF_SKEL_H__
+#define __ITERATORS_BPF_SKEL_H__
+
+#include <stdlib.h>
+#include <bpf/bpf.h>
+#include <bpf/skel_internal.h>
+
+struct iterators_bpf {
+       struct bpf_loader_ctx ctx;
+       struct {
+               struct bpf_map_desc rodata;
+       } maps;
+       struct {
+               struct bpf_prog_desc dump_bpf_map;
+               struct bpf_prog_desc dump_bpf_prog;
+       } progs;
+       struct {
+               int dump_bpf_map_fd;
+               int dump_bpf_prog_fd;
+       } links;
+       struct iterators_bpf__rodata {
+       } *rodata;
+};
+
+static inline int
+iterators_bpf__dump_bpf_map__attach(struct iterators_bpf *skel)
+{
+       int prog_fd = skel->progs.dump_bpf_map.prog_fd;
+       int fd = skel_link_create(prog_fd, 0, BPF_TRACE_ITER);
+
+       if (fd > 0)
+               skel->links.dump_bpf_map_fd = fd;
+       return fd;
+}
+
+static inline int
+iterators_bpf__dump_bpf_prog__attach(struct iterators_bpf *skel)
+{
+       int prog_fd = skel->progs.dump_bpf_prog.prog_fd;
+       int fd = skel_link_create(prog_fd, 0, BPF_TRACE_ITER);
+
+       if (fd > 0)
+               skel->links.dump_bpf_prog_fd = fd;
+       return fd;
+}
+
+static inline int
+iterators_bpf__attach(struct iterators_bpf *skel)
+{
+       int ret = 0;
+
+       ret = ret < 0 ? ret : iterators_bpf__dump_bpf_map__attach(skel);
+       ret = ret < 0 ? ret : iterators_bpf__dump_bpf_prog__attach(skel);
+       return ret < 0 ? ret : 0;
+}
+
+static inline void
+iterators_bpf__detach(struct iterators_bpf *skel)
+{
+       skel_closenz(skel->links.dump_bpf_map_fd);
+       skel_closenz(skel->links.dump_bpf_prog_fd);
+}
+static void
+iterators_bpf__destroy(struct iterators_bpf *skel)
+{
+       if (!skel)
+               return;
+       iterators_bpf__detach(skel);
+       skel_closenz(skel->progs.dump_bpf_map.prog_fd);
+       skel_closenz(skel->progs.dump_bpf_prog.prog_fd);
+       munmap(skel->rodata, 4096);
+       skel_closenz(skel->maps.rodata.map_fd);
+       free(skel);
+}
+static inline struct iterators_bpf *
+iterators_bpf__open(void)
+{
+       struct iterators_bpf *skel;
+
+       skel = calloc(sizeof(*skel), 1);
+       if (!skel)
+               goto cleanup;
+       skel->ctx.sz = (void *)&skel->links - (void *)skel;
+       skel->rodata =
+               mmap(NULL, 4096, PROT_READ | PROT_WRITE,
+                    MAP_SHARED | MAP_ANONYMOUS, -1, 0);
+       if (skel->rodata == (void *) -1)
+               goto cleanup;
+       memcpy(skel->rodata, (void *)"\
+\x20\x20\x69\x64\x20\x6e\x61\x6d\x65\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\
+\x20\x20\x20\x6d\x61\x78\x5f\x65\x6e\x74\x72\x69\x65\x73\x0a\0\x25\x34\x75\x20\
+\x25\x2d\x31\x36\x73\x25\x36\x64\x0a\0\x20\x20\x69\x64\x20\x6e\x61\x6d\x65\x20\
+\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x61\x74\x74\x61\x63\x68\x65\
+\x64\x0a\0\x25\x34\x75\x20\x25\x2d\x31\x36\x73\x20\x25\x73\x20\x25\x73\x0a\0", 98);
+       skel->maps.rodata.initial_value = (__u64)(long)skel->rodata;
+       return skel;
+cleanup:
+       iterators_bpf__destroy(skel);
+       return NULL;
+}
+
+static inline int
+iterators_bpf__load(struct iterators_bpf *skel)
+{
+       struct bpf_load_and_run_opts opts = {};
+       int err;
+
+       opts.ctx = (struct bpf_loader_ctx *)skel;
+       opts.data_sz = 6056;
+       opts.data = (void *)"\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x9f\xeb\x01\0\
+\x18\0\0\0\0\0\0\0\x1c\x04\0\0\x1c\x04\0\0\xf9\x04\0\0\0\0\0\0\0\0\0\x02\x02\0\
+\0\0\x01\0\0\0\x02\0\0\x04\x10\0\0\0\x13\0\0\0\x03\0\0\0\0\0\0\0\x18\0\0\0\x04\
+\0\0\0\x40\0\0\0\0\0\0\0\0\0\0\x02\x08\0\0\0\0\0\0\0\0\0\0\x02\x0d\0\0\0\0\0\0\
+\0\x01\0\0\x0d\x06\0\0\0\x1c\0\0\0\x01\0\0\0\x20\0\0\0\0\0\0\x01\x04\0\0\0\x20\
+\0\0\x01\x24\0\0\0\x01\0\0\x0c\x05\0\0\0\xa3\0\0\0\x03\0\0\x04\x18\0\0\0\xb1\0\
+\0\0\x09\0\0\0\0\0\0\0\xb5\0\0\0\x0b\0\0\0\x40\0\0\0\xc0\0\0\0\x0b\0\0\0\x80\0\
+\0\0\0\0\0\0\0\0\0\x02\x0a\0\0\0\xc8\0\0\0\0\0\0\x07\0\0\0\0\xd1\0\0\0\0\0\0\
+\x08\x0c\0\0\0\xd7\0\0\0\0\0\0\x01\x08\0\0\0\x40\0\0\0\x94\x01\0\0\x03\0\0\x04\
+\x18\0\0\0\x9c\x01\0\0\x0e\0\0\0\0\0\0\0\x9f\x01\0\0\x11\0\0\0\x20\0\0\0\xa4\
+\x01\0\0\x0e\0\0\0\xa0\0\0\0\xb0\x01\0\0\0\0\0\x08\x0f\0\0\0\xb6\x01\0\0\0\0\0\
+\x01\x04\0\0\0\x20\0\0\0\xc3\x01\0\0\0\0\0\x01\x01\0\0\0\x08\0\0\x01\0\0\0\0\0\
+\0\0\x03\0\0\0\0\x10\0\0\0\x12\0\0\0\x10\0\0\0\xc8\x01\0\0\0\0\0\x01\x04\0\0\0\
+\x20\0\0\0\0\0\0\0\0\0\0\x02\x14\0\0\0\x2c\x02\0\0\x02\0\0\x04\x10\0\0\0\x13\0\
+\0\0\x03\0\0\0\0\0\0\0\x3f\x02\0\0\x15\0\0\0\x40\0\0\0\0\0\0\0\0\0\0\x02\x18\0\
+\0\0\0\0\0\0\x01\0\0\x0d\x06\0\0\0\x1c\0\0\0\x13\0\0\0\x44\x02\0\0\x01\0\0\x0c\
+\x16\0\0\0\x90\x02\0\0\x01\0\0\x04\x08\0\0\0\x99\x02\0\0\x19\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\x02\x1a\0\0\0\xea\x02\0\0\x06\0\0\x04\x38\0\0\0\x9c\x01\0\0\x0e\0\0\
+\0\0\0\0\0\x9f\x01\0\0\x11\0\0\0\x20\0\0\0\xf7\x02\0\0\x1b\0\0\0\xc0\0\0\0\x08\
+\x03\0\0\x15\0\0\0\0\x01\0\0\x11\x03\0\0\x1d\0\0\0\x40\x01\0\0\x1b\x03\0\0\x1e\
+\0\0\0\x80\x01\0\0\0\0\0\0\0\0\0\x02\x1c\0\0\0\0\0\0\0\0\0\0\x0a\x10\0\0\0\0\0\
+\0\0\0\0\0\x02\x1f\0\0\0\0\0\0\0\0\0\0\x02\x20\0\0\0\x65\x03\0\0\x02\0\0\x04\
+\x08\0\0\0\x73\x03\0\0\x0e\0\0\0\0\0\0\0\x7c\x03\0\0\x0e\0\0\0\x20\0\0\0\x1b\
+\x03\0\0\x03\0\0\x04\x18\0\0\0\x86\x03\0\0\x1b\0\0\0\0\0\0\0\x8e\x03\0\0\x21\0\
+\0\0\x40\0\0\0\x94\x03\0\0\x23\0\0\0\x80\0\0\0\0\0\0\0\0\0\0\x02\x22\0\0\0\0\0\
+\0\0\0\0\0\x02\x24\0\0\0\x98\x03\0\0\x01\0\0\x04\x04\0\0\0\xa3\x03\0\0\x0e\0\0\
+\0\0\0\0\0\x0c\x04\0\0\x01\0\0\x04\x04\0\0\0\x15\x04\0\0\x0e\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\x03\0\0\0\0\x1c\0\0\0\x12\0\0\0\x23\0\0\0\x8b\x04\0\0\0\0\0\x0e\x25\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x03\0\0\0\0\x1c\0\0\0\x12\0\0\0\x0e\0\0\0\x9f\x04\
+\0\0\0\0\0\x0e\x27\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x03\0\0\0\0\x1c\0\0\0\x12\0\0\0\
+\x20\0\0\0\xb5\x04\0\0\0\0\0\x0e\x29\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x03\0\0\0\0\
+\x1c\0\0\0\x12\0\0\0\x11\0\0\0\xca\x04\0\0\0\0\0\x0e\x2b\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\x03\0\0\0\0\x10\0\0\0\x12\0\0\0\x04\0\0\0\xe1\x04\0\0\0\0\0\x0e\x2d\0\0\
+\0\x01\0\0\0\xe9\x04\0\0\x04\0\0\x0f\x62\0\0\0\x26\0\0\0\0\0\0\0\x23\0\0\0\x28\
+\0\0\0\x23\0\0\0\x0e\0\0\0\x2a\0\0\0\x31\0\0\0\x20\0\0\0\x2c\0\0\0\x51\0\0\0\
+\x11\0\0\0\xf1\x04\0\0\x01\0\0\x0f\x04\0\0\0\x2e\0\0\0\0\0\0\0\x04\0\0\0\0\x62\
+\x70\x66\x5f\x69\x74\x65\x72\x5f\x5f\x62\x70\x66\x5f\x6d\x61\x70\0\x6d\x65\x74\
+\x61\0\x6d\x61\x70\0\x63\x74\x78\0\x69\x6e\x74\0\x64\x75\x6d\x70\x5f\x62\x70\
+\x66\x5f\x6d\x61\x70\0\x69\x74\x65\x72\x2f\x62\x70\x66\x5f\x6d\x61\x70\0\x30\
+\x3a\x30\0\x2f\x77\x2f\x6e\x65\x74\x2d\x6e\x65\x78\x74\x2f\x6b\x65\x72\x6e\x65\
+\x6c\x2f\x62\x70\x66\x2f\x70\x72\x65\x6c\x6f\x61\x64\x2f\x69\x74\x65\x72\x61\
+\x74\x6f\x72\x73\x2f\x69\x74\x65\x72\x61\x74\x6f\x72\x73\x2e\x62\x70\x66\x2e\
+\x63\0\x09\x73\x74\x72\x75\x63\x74\x20\x73\x65\x71\x5f\x66\x69\x6c\x65\x20\x2a\
+\x73\x65\x71\x20\x3d\x20\x63\x74\x78\x2d\x3e\x6d\x65\x74\x61\x2d\x3e\x73\x65\
+\x71\x3b\0\x62\x70\x66\x5f\x69\x74\x65\x72\x5f\x6d\x65\x74\x61\0\x73\x65\x71\0\
+\x73\x65\x73\x73\x69\x6f\x6e\x5f\x69\x64\0\x73\x65\x71\x5f\x6e\x75\x6d\0\x73\
+\x65\x71\x5f\x66\x69\x6c\x65\0\x5f\x5f\x75\x36\x34\0\x75\x6e\x73\x69\x67\x6e\
+\x65\x64\x20\x6c\x6f\x6e\x67\x20\x6c\x6f\x6e\x67\0\x30\x3a\x31\0\x09\x73\x74\
+\x72\x75\x63\x74\x20\x62\x70\x66\x5f\x6d\x61\x70\x20\x2a\x6d\x61\x70\x20\x3d\
+\x20\x63\x74\x78\x2d\x3e\x6d\x61\x70\x3b\0\x09\x69\x66\x20\x28\x21\x6d\x61\x70\
+\x29\0\x09\x5f\x5f\x75\x36\x34\x20\x73\x65\x71\x5f\x6e\x75\x6d\x20\x3d\x20\x63\
+\x74\x78\x2d\x3e\x6d\x65\x74\x61\x2d\x3e\x73\x65\x71\x5f\x6e\x75\x6d\x3b\0\x30\
+\x3a\x32\0\x09\x69\x66\x20\x28\x73\x65\x71\x5f\x6e\x75\x6d\x20\x3d\x3d\x20\x30\
+\x29\0\x09\x09\x42\x50\x46\x5f\x53\x45\x51\x5f\x50\x52\x49\x4e\x54\x46\x28\x73\
+\x65\x71\x2c\x20\x22\x20\x20\x69\x64\x20\x6e\x61\x6d\x65\x20\x20\x20\x20\x20\
+\x20\x20\x20\x20\x20\x20\x20\x20\x6d\x61\x78\x5f\x65\x6e\x74\x72\x69\x65\x73\
+\x5c\x6e\x22\x29\x3b\0\x62\x70\x66\x5f\x6d\x61\x70\0\x69\x64\0\x6e\x61\x6d\x65\
+\0\x6d\x61\x78\x5f\x65\x6e\x74\x72\x69\x65\x73\0\x5f\x5f\x75\x33\x32\0\x75\x6e\
+\x73\x69\x67\x6e\x65\x64\x20\x69\x6e\x74\0\x63\x68\x61\x72\0\x5f\x5f\x41\x52\
+\x52\x41\x59\x5f\x53\x49\x5a\x45\x5f\x54\x59\x50\x45\x5f\x5f\0\x09\x42\x50\x46\
+\x5f\x53\x45\x51\x5f\x50\x52\x49\x4e\x54\x46\x28\x73\x65\x71\x2c\x20\x22\x25\
+\x34\x75\x20\x25\x2d\x31\x36\x73\x25\x36\x64\x5c\x6e\x22\x2c\x20\x6d\x61\x70\
+\x2d\x3e\x69\x64\x2c\x20\x6d\x61\x70\x2d\x3e\x6e\x61\x6d\x65\x2c\x20\x6d\x61\
+\x70\x2d\x3e\x6d\x61\x78\x5f\x65\x6e\x74\x72\x69\x65\x73\x29\x3b\0\x7d\0\x62\
+\x70\x66\x5f\x69\x74\x65\x72\x5f\x5f\x62\x70\x66\x5f\x70\x72\x6f\x67\0\x70\x72\
+\x6f\x67\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x70\x72\x6f\x67\0\x69\x74\x65\
+\x72\x2f\x62\x70\x66\x5f\x70\x72\x6f\x67\0\x09\x73\x74\x72\x75\x63\x74\x20\x62\
+\x70\x66\x5f\x70\x72\x6f\x67\x20\x2a\x70\x72\x6f\x67\x20\x3d\x20\x63\x74\x78\
+\x2d\x3e\x70\x72\x6f\x67\x3b\0\x09\x69\x66\x20\x28\x21\x70\x72\x6f\x67\x29\0\
+\x62\x70\x66\x5f\x70\x72\x6f\x67\0\x61\x75\x78\0\x09\x61\x75\x78\x20\x3d\x20\
+\x70\x72\x6f\x67\x2d\x3e\x61\x75\x78\x3b\0\x09\x09\x42\x50\x46\x5f\x53\x45\x51\
+\x5f\x50\x52\x49\x4e\x54\x46\x28\x73\x65\x71\x2c\x20\x22\x20\x20\x69\x64\x20\
+\x6e\x61\x6d\x65\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x61\x74\
+\x74\x61\x63\x68\x65\x64\x5c\x6e\x22\x29\x3b\0\x62\x70\x66\x5f\x70\x72\x6f\x67\
+\x5f\x61\x75\x78\0\x61\x74\x74\x61\x63\x68\x5f\x66\x75\x6e\x63\x5f\x6e\x61\x6d\
+\x65\0\x64\x73\x74\x5f\x70\x72\x6f\x67\0\x66\x75\x6e\x63\x5f\x69\x6e\x66\x6f\0\
+\x62\x74\x66\0\x09\x42\x50\x46\x5f\x53\x45\x51\x5f\x50\x52\x49\x4e\x54\x46\x28\
+\x73\x65\x71\x2c\x20\x22\x25\x34\x75\x20\x25\x2d\x31\x36\x73\x20\x25\x73\x20\
+\x25\x73\x5c\x6e\x22\x2c\x20\x61\x75\x78\x2d\x3e\x69\x64\x2c\0\x30\x3a\x34\0\
+\x30\x3a\x35\0\x09\x69\x66\x20\x28\x21\x62\x74\x66\x29\0\x62\x70\x66\x5f\x66\
+\x75\x6e\x63\x5f\x69\x6e\x66\x6f\0\x69\x6e\x73\x6e\x5f\x6f\x66\x66\0\x74\x79\
+\x70\x65\x5f\x69\x64\0\x30\0\x73\x74\x72\x69\x6e\x67\x73\0\x74\x79\x70\x65\x73\
+\0\x68\x64\x72\0\x62\x74\x66\x5f\x68\x65\x61\x64\x65\x72\0\x73\x74\x72\x5f\x6c\
+\x65\x6e\0\x09\x74\x79\x70\x65\x73\x20\x3d\x20\x62\x74\x66\x2d\x3e\x74\x79\x70\
+\x65\x73\x3b\0\x09\x62\x70\x66\x5f\x70\x72\x6f\x62\x65\x5f\x72\x65\x61\x64\x5f\
+\x6b\x65\x72\x6e\x65\x6c\x28\x26\x74\x2c\x20\x73\x69\x7a\x65\x6f\x66\x28\x74\
+\x29\x2c\x20\x74\x79\x70\x65\x73\x20\x2b\x20\x62\x74\x66\x5f\x69\x64\x29\x3b\0\
+\x09\x73\x74\x72\x20\x3d\x20\x62\x74\x66\x2d\x3e\x73\x74\x72\x69\x6e\x67\x73\
+\x3b\0\x62\x74\x66\x5f\x74\x79\x70\x65\0\x6e\x61\x6d\x65\x5f\x6f\x66\x66\0\x09\
+\x6e\x61\x6d\x65\x5f\x6f\x66\x66\x20\x3d\x20\x42\x50\x46\x5f\x43\x4f\x52\x45\
+\x5f\x52\x45\x41\x44\x28\x74\x2c\x20\x6e\x61\x6d\x65\x5f\x6f\x66\x66\x29\x3b\0\
+\x30\x3a\x32\x3a\x30\0\x09\x69\x66\x20\x28\x6e\x61\x6d\x65\x5f\x6f\x66\x66\x20\
+\x3e\x3d\x20\x62\x74\x66\x2d\x3e\x68\x64\x72\x2e\x73\x74\x72\x5f\x6c\x65\x6e\
+\x29\0\x09\x72\x65\x74\x75\x72\x6e\x20\x73\x74\x72\x20\x2b\x20\x6e\x61\x6d\x65\
+\x5f\x6f\x66\x66\x3b\0\x30\x3a\x33\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x6d\
+\x61\x70\x2e\x5f\x5f\x5f\x66\x6d\x74\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x6d\
+\x61\x70\x2e\x5f\x5f\x5f\x66\x6d\x74\x2e\x31\0\x64\x75\x6d\x70\x5f\x62\x70\x66\
+\x5f\x70\x72\x6f\x67\x2e\x5f\x5f\x5f\x66\x6d\x74\0\x64\x75\x6d\x70\x5f\x62\x70\
+\x66\x5f\x70\x72\x6f\x67\x2e\x5f\x5f\x5f\x66\x6d\x74\x2e\x32\0\x4c\x49\x43\x45\
+\x4e\x53\x45\0\x2e\x72\x6f\x64\x61\x74\x61\0\x6c\x69\x63\x65\x6e\x73\x65\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x2d\x09\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x02\0\0\
+\0\x04\0\0\0\x62\0\0\0\x01\0\0\0\x80\x04\0\0\0\0\0\0\0\0\0\0\x69\x74\x65\x72\
+\x61\x74\x6f\x72\x2e\x72\x6f\x64\x61\x74\x61\0\0\0\0\0\0\0\0\0\0\0\0\0\x2f\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\x20\x20\x69\x64\x20\x6e\x61\x6d\x65\x20\x20\x20\x20\
+\x20\x20\x20\x20\x20\x20\x20\x20\x20\x6d\x61\x78\x5f\x65\x6e\x74\x72\x69\x65\
+\x73\x0a\0\x25\x34\x75\x20\x25\x2d\x31\x36\x73\x25\x36\x64\x0a\0\x20\x20\x69\
+\x64\x20\x6e\x61\x6d\x65\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\
+\x61\x74\x74\x61\x63\x68\x65\x64\x0a\0\x25\x34\x75\x20\x25\x2d\x31\x36\x73\x20\
+\x25\x73\x20\x25\x73\x0a\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x47\x50\x4c\0\0\0\0\0\
+\x79\x12\0\0\0\0\0\0\x79\x26\0\0\0\0\0\0\x79\x17\x08\0\0\0\0\0\x15\x07\x1b\0\0\
+\0\0\0\x79\x11\0\0\0\0\0\0\x79\x11\x10\0\0\0\0\0\x55\x01\x08\0\0\0\0\0\xbf\xa4\
+\0\0\0\0\0\0\x07\x04\0\0\xe8\xff\xff\xff\xbf\x61\0\0\0\0\0\0\x18\x62\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\xb7\x03\0\0\x23\0\0\0\xb7\x05\0\0\0\0\0\0\x85\0\0\0\x7e\0\0\
+\0\x61\x71\0\0\0\0\0\0\x7b\x1a\xe8\xff\0\0\0\0\xb7\x01\0\0\x04\0\0\0\xbf\x72\0\
+\0\0\0\0\0\x0f\x12\0\0\0\0\0\0\x7b\x2a\xf0\xff\0\0\0\0\x61\x71\x14\0\0\0\0\0\
+\x7b\x1a\xf8\xff\0\0\0\0\xbf\xa4\0\0\0\0\0\0\x07\x04\0\0\xe8\xff\xff\xff\xbf\
+\x61\0\0\0\0\0\0\x18\x62\0\0\0\0\0\0\0\0\0\0\x23\0\0\0\xb7\x03\0\0\x0e\0\0\0\
+\xb7\x05\0\0\x18\0\0\0\x85\0\0\0\x7e\0\0\0\xb7\0\0\0\0\0\0\0\x95\0\0\0\0\0\0\0\
+\0\0\0\0\x07\0\0\0\0\0\0\0\x42\0\0\0\x7b\0\0\0\x1e\x3c\x01\0\x01\0\0\0\x42\0\0\
+\0\x7b\0\0\0\x24\x3c\x01\0\x02\0\0\0\x42\0\0\0\xee\0\0\0\x1d\x44\x01\0\x03\0\0\
+\0\x42\0\0\0\x0f\x01\0\0\x06\x4c\x01\0\x04\0\0\0\x42\0\0\0\x1a\x01\0\0\x17\x40\
+\x01\0\x05\0\0\0\x42\0\0\0\x1a\x01\0\0\x1d\x40\x01\0\x06\0\0\0\x42\0\0\0\x43\
+\x01\0\0\x06\x58\x01\0\x08\0\0\0\x42\0\0\0\x56\x01\0\0\x03\x5c\x01\0\x0f\0\0\0\
+\x42\0\0\0\xdc\x01\0\0\x02\x64\x01\0\x1f\0\0\0\x42\0\0\0\x2a\x02\0\0\x01\x6c\
+\x01\0\0\0\0\0\x02\0\0\0\x3e\0\0\0\0\0\0\0\x08\0\0\0\x08\0\0\0\x3e\0\0\0\0\0\0\
+\0\x10\0\0\0\x02\0\0\0\xea\0\0\0\0\0\0\0\x20\0\0\0\x02\0\0\0\x3e\0\0\0\0\0\0\0\
+\x28\0\0\0\x08\0\0\0\x3f\x01\0\0\0\0\0\0\x78\0\0\0\x0d\0\0\0\x3e\0\0\0\0\0\0\0\
+\x88\0\0\0\x0d\0\0\0\xea\0\0\0\0\0\0\0\xa8\0\0\0\x0d\0\0\0\x3f\x01\0\0\0\0\0\0\
+\x1a\0\0\0\x21\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x6d\x61\x70\0\0\0\0\
+\0\0\0\0\x1c\0\0\0\0\0\0\0\x08\0\0\0\0\0\0\0\0\0\0\0\x01\0\0\0\x10\0\0\0\0\0\0\
+\0\0\0\0\0\x0a\0\0\0\x01\0\0\0\0\0\0\0\x08\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\x10\0\0\0\0\0\0\0\x62\x70\x66\x5f\x69\x74\x65\x72\x5f\x62\x70\x66\x5f\x6d\
+\x61\x70\0\0\0\0\0\0\0\0\x47\x50\x4c\0\0\0\0\0\x79\x12\0\0\0\0\0\0\x79\x26\0\0\
+\0\0\0\0\x79\x12\x08\0\0\0\0\0\x15\x02\x3c\0\0\0\0\0\x79\x11\0\0\0\0\0\0\x79\
+\x27\0\0\0\0\0\0\x79\x11\x10\0\0\0\0\0\x55\x01\x08\0\0\0\0\0\xbf\xa4\0\0\0\0\0\
+\0\x07\x04\0\0\xd0\xff\xff\xff\xbf\x61\0\0\0\0\0\0\x18\x62\0\0\0\0\0\0\0\0\0\0\
+\x31\0\0\0\xb7\x03\0\0\x20\0\0\0\xb7\x05\0\0\0\0\0\0\x85\0\0\0\x7e\0\0\0\x7b\
+\x6a\xc8\xff\0\0\0\0\x61\x71\0\0\0\0\0\0\x7b\x1a\xd0\xff\0\0\0\0\xb7\x03\0\0\
+\x04\0\0\0\xbf\x79\0\0\0\0\0\0\x0f\x39\0\0\0\0\0\0\x79\x71\x28\0\0\0\0\0\x79\
+\x78\x30\0\0\0\0\0\x15\x08\x18\0\0\0\0\0\xb7\x02\0\0\0\0\0\0\x0f\x21\0\0\0\0\0\
+\0\x61\x11\x04\0\0\0\0\0\x79\x83\x08\0\0\0\0\0\x67\x01\0\0\x03\0\0\0\x0f\x13\0\
+\0\0\0\0\0\x79\x86\0\0\0\0\0\0\xbf\xa1\0\0\0\0\0\0\x07\x01\0\0\xf8\xff\xff\xff\
+\xb7\x02\0\0\x08\0\0\0\x85\0\0\0\x71\0\0\0\xb7\x01\0\0\0\0\0\0\x79\xa3\xf8\xff\
+\0\0\0\0\x0f\x13\0\0\0\0\0\0\xbf\xa1\0\0\0\0\0\0\x07\x01\0\0\xf4\xff\xff\xff\
+\xb7\x02\0\0\x04\0\0\0\x85\0\0\0\x71\0\0\0\xb7\x03\0\0\x04\0\0\0\x61\xa1\xf4\
+\xff\0\0\0\0\x61\x82\x10\0\0\0\0\0\x3d\x21\x02\0\0\0\0\0\x0f\x16\0\0\0\0\0\0\
+\xbf\x69\0\0\0\0\0\0\x7b\x9a\xd8\xff\0\0\0\0\x79\x71\x18\0\0\0\0\0\x7b\x1a\xe0\
+\xff\0\0\0\0\x79\x71\x20\0\0\0\0\0\x79\x11\0\0\0\0\0\0\x0f\x31\0\0\0\0\0\0\x7b\
+\x1a\xe8\xff\0\0\0\0\xbf\xa4\0\0\0\0\0\0\x07\x04\0\0\xd0\xff\xff\xff\x79\xa1\
+\xc8\xff\0\0\0\0\x18\x62\0\0\0\0\0\0\0\0\0\0\x51\0\0\0\xb7\x03\0\0\x11\0\0\0\
+\xb7\x05\0\0\x20\0\0\0\x85\0\0\0\x7e\0\0\0\xb7\0\0\0\0\0\0\0\x95\0\0\0\0\0\0\0\
+\0\0\0\0\x17\0\0\0\0\0\0\0\x42\0\0\0\x7b\0\0\0\x1e\x80\x01\0\x01\0\0\0\x42\0\0\
+\0\x7b\0\0\0\x24\x80\x01\0\x02\0\0\0\x42\0\0\0\x60\x02\0\0\x1f\x88\x01\0\x03\0\
+\0\0\x42\0\0\0\x84\x02\0\0\x06\x94\x01\0\x04\0\0\0\x42\0\0\0\x1a\x01\0\0\x17\
+\x84\x01\0\x05\0\0\0\x42\0\0\0\x9d\x02\0\0\x0e\xa0\x01\0\x06\0\0\0\x42\0\0\0\
+\x1a\x01\0\0\x1d\x84\x01\0\x07\0\0\0\x42\0\0\0\x43\x01\0\0\x06\xa4\x01\0\x09\0\
+\0\0\x42\0\0\0\xaf\x02\0\0\x03\xa8\x01\0\x11\0\0\0\x42\0\0\0\x1f\x03\0\0\x02\
+\xb0\x01\0\x18\0\0\0\x42\0\0\0\x5a\x03\0\0\x06\x04\x01\0\x1b\0\0\0\x42\0\0\0\0\
+\0\0\0\0\0\0\0\x1c\0\0\0\x42\0\0\0\xab\x03\0\0\x0f\x10\x01\0\x1d\0\0\0\x42\0\0\
+\0\xc0\x03\0\0\x2d\x14\x01\0\x1f\0\0\0\x42\0\0\0\xf7\x03\0\0\x0d\x0c\x01\0\x21\
+\0\0\0\x42\0\0\0\0\0\0\0\0\0\0\0\x22\0\0\0\x42\0\0\0\xc0\x03\0\0\x02\x14\x01\0\
+\x25\0\0\0\x42\0\0\0\x1e\x04\0\0\x0d\x18\x01\0\x28\0\0\0\x42\0\0\0\0\0\0\0\0\0\
+\0\0\x29\0\0\0\x42\0\0\0\x1e\x04\0\0\x0d\x18\x01\0\x2c\0\0\0\x42\0\0\0\x1e\x04\
+\0\0\x0d\x18\x01\0\x2d\0\0\0\x42\0\0\0\x4c\x04\0\0\x1b\x1c\x01\0\x2e\0\0\0\x42\
+\0\0\0\x4c\x04\0\0\x06\x1c\x01\0\x2f\0\0\0\x42\0\0\0\x6f\x04\0\0\x0d\x24\x01\0\
+\x31\0\0\0\x42\0\0\0\x1f\x03\0\0\x02\xb0\x01\0\x40\0\0\0\x42\0\0\0\x2a\x02\0\0\
+\x01\xc0\x01\0\0\0\0\0\x14\0\0\0\x3e\0\0\0\0\0\0\0\x08\0\0\0\x08\0\0\0\x3e\0\0\
+\0\0\0\0\0\x10\0\0\0\x14\0\0\0\xea\0\0\0\0\0\0\0\x20\0\0\0\x14\0\0\0\x3e\0\0\0\
+\0\0\0\0\x28\0\0\0\x18\0\0\0\x3e\0\0\0\0\0\0\0\x30\0\0\0\x08\0\0\0\x3f\x01\0\0\
+\0\0\0\0\x88\0\0\0\x1a\0\0\0\x3e\0\0\0\0\0\0\0\x98\0\0\0\x1a\0\0\0\xea\0\0\0\0\
+\0\0\0\xb0\0\0\0\x1a\0\0\0\x52\x03\0\0\0\0\0\0\xb8\0\0\0\x1a\0\0\0\x56\x03\0\0\
+\0\0\0\0\xc8\0\0\0\x1f\0\0\0\x84\x03\0\0\0\0\0\0\xe0\0\0\0\x20\0\0\0\xea\0\0\0\
+\0\0\0\0\xf8\0\0\0\x20\0\0\0\x3e\0\0\0\0\0\0\0\x20\x01\0\0\x24\0\0\0\x3e\0\0\0\
+\0\0\0\0\x58\x01\0\0\x1a\0\0\0\xea\0\0\0\0\0\0\0\x68\x01\0\0\x20\0\0\0\x46\x04\
+\0\0\0\0\0\0\x90\x01\0\0\x1a\0\0\0\x3f\x01\0\0\0\0\0\0\xa0\x01\0\0\x1a\0\0\0\
+\x87\x04\0\0\0\0\0\0\xa8\x01\0\0\x18\0\0\0\x3e\0\0\0\0\0\0\0\x1a\0\0\0\x42\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x70\x72\x6f\x67\0\0\0\0\0\0\0\x1c\0\0\
+\0\0\0\0\0\x08\0\0\0\0\0\0\0\0\0\0\0\x01\0\0\0\x10\0\0\0\0\0\0\0\0\0\0\0\x1a\0\
+\0\0\x01\0\0\0\0\0\0\0\x13\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x10\0\0\0\0\0\
+\0\0\x62\x70\x66\x5f\x69\x74\x65\x72\x5f\x62\x70\x66\x5f\x70\x72\x6f\x67\0\0\0\
+\0\0\0\0";
+       opts.insns_sz = 2184;
+       opts.insns = (void *)"\
+\xbf\x16\0\0\0\0\0\0\xbf\xa1\0\0\0\0\0\0\x07\x01\0\0\x78\xff\xff\xff\xb7\x02\0\
+\0\x88\0\0\0\xb7\x03\0\0\0\0\0\0\x85\0\0\0\x71\0\0\0\x05\0\x14\0\0\0\0\0\x61\
+\xa1\x78\xff\0\0\0\0\xd5\x01\x01\0\0\0\0\0\x85\0\0\0\xa8\0\0\0\x61\xa1\x7c\xff\
+\0\0\0\0\xd5\x01\x01\0\0\0\0\0\x85\0\0\0\xa8\0\0\0\x61\xa1\x80\xff\0\0\0\0\xd5\
+\x01\x01\0\0\0\0\0\x85\0\0\0\xa8\0\0\0\x61\xa1\x84\xff\0\0\0\0\xd5\x01\x01\0\0\
+\0\0\0\x85\0\0\0\xa8\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x61\x01\0\0\0\0\
+\0\0\xd5\x01\x02\0\0\0\0\0\xbf\x19\0\0\0\0\0\0\x85\0\0\0\xa8\0\0\0\xbf\x70\0\0\
+\0\0\0\0\x95\0\0\0\0\0\0\0\x61\x60\x08\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\
+\x48\x0e\0\0\x63\x01\0\0\0\0\0\0\x61\x60\x0c\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\
+\0\0\x44\x0e\0\0\x63\x01\0\0\0\0\0\0\x79\x60\x10\0\0\0\0\0\x18\x61\0\0\0\0\0\0\
+\0\0\0\0\x38\x0e\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\0\x05\0\0\
+\x18\x61\0\0\0\0\0\0\0\0\0\0\x30\x0e\0\0\x7b\x01\0\0\0\0\0\0\xb7\x01\0\0\x12\0\
+\0\0\x18\x62\0\0\0\0\0\0\0\0\0\0\x30\x0e\0\0\xb7\x03\0\0\x1c\0\0\0\x85\0\0\0\
+\xa6\0\0\0\xbf\x07\0\0\0\0\0\0\xc5\x07\xd4\xff\0\0\0\0\x63\x7a\x78\xff\0\0\0\0\
+\x61\xa0\x78\xff\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x80\x0e\0\0\x63\x01\0\0\0\
+\0\0\0\x61\x60\x20\0\0\0\0\0\x15\0\x03\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\
+\x5c\x0e\0\0\x63\x01\0\0\0\0\0\0\xb7\x01\0\0\0\0\0\0\x18\x62\0\0\0\0\0\0\0\0\0\
+\0\x50\x0e\0\0\xb7\x03\0\0\x48\0\0\0\x85\0\0\0\xa6\0\0\0\xbf\x07\0\0\0\0\0\0\
+\xc5\x07\xc3\xff\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x63\x71\0\0\0\0\0\
+\0\x79\x63\x18\0\0\0\0\0\x15\x03\x04\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x98\
+\x0e\0\0\xb7\x02\0\0\x62\0\0\0\x85\0\0\0\x94\0\0\0\x18\x62\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\x61\x20\0\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x08\x0f\0\0\x63\x01\0\
+\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\0\x0f\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\
+\x10\x0f\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\x98\x0e\0\0\x18\
+\x61\0\0\0\0\0\0\0\0\0\0\x18\x0f\0\0\x7b\x01\0\0\0\0\0\0\xb7\x01\0\0\x02\0\0\0\
+\x18\x62\0\0\0\0\0\0\0\0\0\0\x08\x0f\0\0\xb7\x03\0\0\x20\0\0\0\x85\0\0\0\xa6\0\
+\0\0\xbf\x07\0\0\0\0\0\0\xc5\x07\xa3\xff\0\0\0\0\x18\x62\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\x61\x20\0\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x28\x0f\0\0\x63\x01\0\0\
+\0\0\0\0\xb7\x01\0\0\x16\0\0\0\x18\x62\0\0\0\0\0\0\0\0\0\0\x28\x0f\0\0\xb7\x03\
+\0\0\x04\0\0\0\x85\0\0\0\xa6\0\0\0\xbf\x07\0\0\0\0\0\0\xc5\x07\x96\xff\0\0\0\0\
+\x18\x60\0\0\0\0\0\0\0\0\0\0\x30\x0f\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x78\x11\0\
+\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\x38\x0f\0\0\x18\x61\0\0\0\0\
+\0\0\0\0\0\0\x70\x11\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\x40\
+\x10\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\xb8\x11\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\
+\0\0\0\0\0\0\0\0\0\x48\x10\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\xc8\x11\0\0\x7b\x01\
+\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\xe8\x10\0\0\x18\x61\0\0\0\0\0\0\0\0\0\
+\0\xe8\x11\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x18\x61\
+\0\0\0\0\0\0\0\0\0\0\xe0\x11\0\0\x7b\x01\0\0\0\0\0\0\x61\x60\x08\0\0\0\0\0\x18\
+\x61\0\0\0\0\0\0\0\0\0\0\x80\x11\0\0\x63\x01\0\0\0\0\0\0\x61\x60\x0c\0\0\0\0\0\
+\x18\x61\0\0\0\0\0\0\0\0\0\0\x84\x11\0\0\x63\x01\0\0\0\0\0\0\x79\x60\x10\0\0\0\
+\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x88\x11\0\0\x7b\x01\0\0\0\0\0\0\x61\xa0\x78\
+\xff\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\xb0\x11\0\0\x63\x01\0\0\0\0\0\0\x18\
+\x61\0\0\0\0\0\0\0\0\0\0\xf8\x11\0\0\xb7\x02\0\0\x11\0\0\0\xb7\x03\0\0\x0c\0\0\
+\0\xb7\x04\0\0\0\0\0\0\x85\0\0\0\xa7\0\0\0\xbf\x07\0\0\0\0\0\0\xc5\x07\x60\xff\
+\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\x68\x11\0\0\x63\x70\x6c\0\0\0\0\0\x77\x07\
+\0\0\x20\0\0\0\x63\x70\x70\0\0\0\0\0\xb7\x01\0\0\x05\0\0\0\x18\x62\0\0\0\0\0\0\
+\0\0\0\0\x68\x11\0\0\xb7\x03\0\0\x8c\0\0\0\x85\0\0\0\xa6\0\0\0\xbf\x07\0\0\0\0\
+\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\xd8\x11\0\0\x61\x01\0\0\0\0\0\0\xd5\x01\x02\0\
+\0\0\0\0\xbf\x19\0\0\0\0\0\0\x85\0\0\0\xa8\0\0\0\xc5\x07\x4e\xff\0\0\0\0\x63\
+\x7a\x80\xff\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\x10\x12\0\0\x18\x61\0\0\0\0\0\
+\0\0\0\0\0\x10\x17\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\x18\x12\
+\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x08\x17\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\
+\0\0\0\0\0\0\0\x28\x14\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x50\x17\0\0\x7b\x01\0\0\
+\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\x30\x14\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\
+\x60\x17\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\xd0\x15\0\0\x18\
+\x61\0\0\0\0\0\0\0\0\0\0\x80\x17\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x78\x17\0\0\x7b\x01\0\0\0\0\0\0\x61\
+\x60\x08\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x18\x17\0\0\x63\x01\0\0\0\0\0\0\
+\x61\x60\x0c\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x1c\x17\0\0\x63\x01\0\0\0\0\
+\0\0\x79\x60\x10\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x20\x17\0\0\x7b\x01\0\0\
+\0\0\0\0\x61\xa0\x78\xff\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x48\x17\0\0\x63\
+\x01\0\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x90\x17\0\0\xb7\x02\0\0\x12\0\0\0\
+\xb7\x03\0\0\x0c\0\0\0\xb7\x04\0\0\0\0\0\0\x85\0\0\0\xa7\0\0\0\xbf\x07\0\0\0\0\
+\0\0\xc5\x07\x17\xff\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\0\x17\0\0\x63\x70\x6c\
+\0\0\0\0\0\x77\x07\0\0\x20\0\0\0\x63\x70\x70\0\0\0\0\0\xb7\x01\0\0\x05\0\0\0\
+\x18\x62\0\0\0\0\0\0\0\0\0\0\0\x17\0\0\xb7\x03\0\0\x8c\0\0\0\x85\0\0\0\xa6\0\0\
+\0\xbf\x07\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\x70\x17\0\0\x61\x01\0\0\0\0\
+\0\0\xd5\x01\x02\0\0\0\0\0\xbf\x19\0\0\0\0\0\0\x85\0\0\0\xa8\0\0\0\xc5\x07\x05\
+\xff\0\0\0\0\x63\x7a\x84\xff\0\0\0\0\x61\xa1\x78\xff\0\0\0\0\xd5\x01\x02\0\0\0\
+\0\0\xbf\x19\0\0\0\0\0\0\x85\0\0\0\xa8\0\0\0\x61\xa0\x80\xff\0\0\0\0\x63\x06\
+\x28\0\0\0\0\0\x61\xa0\x84\xff\0\0\0\0\x63\x06\x2c\0\0\0\0\0\x18\x61\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\x61\x10\0\0\0\0\0\0\x63\x06\x18\0\0\0\0\0\xb7\0\0\0\0\0\0\0\
+\x95\0\0\0\0\0\0\0";
+       err = bpf_load_and_run(&opts);
+       if (err < 0)
+               return err;
+       skel->rodata =
+               mmap(skel->rodata, 4096, PROT_READ, MAP_SHARED | MAP_FIXED,
+                       skel->maps.rodata.map_fd, 0);
+       return 0;
+}
+
+static inline struct iterators_bpf *
+iterators_bpf__open_and_load(void)
+{
+       struct iterators_bpf *skel;
+
+       skel = iterators_bpf__open();
+       if (!skel)
+               return NULL;
+       if (iterators_bpf__load(skel)) {
+               iterators_bpf__destroy(skel);
+               return NULL;
+       }
+       return skel;
+}
+
+#endif /* __ITERATORS_BPF_SKEL_H__ */
diff --git a/kernel/bpf/preload/iterators/iterators.skel.h b/kernel/bpf/preload/iterators/iterators.skel.h
deleted file mode 100644 (file)
index cf9a6a9..0000000
+++ /dev/null
@@ -1,412 +0,0 @@
-/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */
-
-/* THIS FILE IS AUTOGENERATED! */
-#ifndef __ITERATORS_BPF_SKEL_H__
-#define __ITERATORS_BPF_SKEL_H__
-
-#include <stdlib.h>
-#include <bpf/libbpf.h>
-
-struct iterators_bpf {
-       struct bpf_object_skeleton *skeleton;
-       struct bpf_object *obj;
-       struct {
-               struct bpf_map *rodata;
-       } maps;
-       struct {
-               struct bpf_program *dump_bpf_map;
-               struct bpf_program *dump_bpf_prog;
-       } progs;
-       struct {
-               struct bpf_link *dump_bpf_map;
-               struct bpf_link *dump_bpf_prog;
-       } links;
-       struct iterators_bpf__rodata {
-               char dump_bpf_map____fmt[35];
-               char dump_bpf_map____fmt_1[14];
-               char dump_bpf_prog____fmt[32];
-               char dump_bpf_prog____fmt_2[17];
-       } *rodata;
-};
-
-static void
-iterators_bpf__destroy(struct iterators_bpf *obj)
-{
-       if (!obj)
-               return;
-       if (obj->skeleton)
-               bpf_object__destroy_skeleton(obj->skeleton);
-       free(obj);
-}
-
-static inline int
-iterators_bpf__create_skeleton(struct iterators_bpf *obj);
-
-static inline struct iterators_bpf *
-iterators_bpf__open_opts(const struct bpf_object_open_opts *opts)
-{
-       struct iterators_bpf *obj;
-
-       obj = (struct iterators_bpf *)calloc(1, sizeof(*obj));
-       if (!obj)
-               return NULL;
-       if (iterators_bpf__create_skeleton(obj))
-               goto err;
-       if (bpf_object__open_skeleton(obj->skeleton, opts))
-               goto err;
-
-       return obj;
-err:
-       iterators_bpf__destroy(obj);
-       return NULL;
-}
-
-static inline struct iterators_bpf *
-iterators_bpf__open(void)
-{
-       return iterators_bpf__open_opts(NULL);
-}
-
-static inline int
-iterators_bpf__load(struct iterators_bpf *obj)
-{
-       return bpf_object__load_skeleton(obj->skeleton);
-}
-
-static inline struct iterators_bpf *
-iterators_bpf__open_and_load(void)
-{
-       struct iterators_bpf *obj;
-
-       obj = iterators_bpf__open();
-       if (!obj)
-               return NULL;
-       if (iterators_bpf__load(obj)) {
-               iterators_bpf__destroy(obj);
-               return NULL;
-       }
-       return obj;
-}
-
-static inline int
-iterators_bpf__attach(struct iterators_bpf *obj)
-{
-       return bpf_object__attach_skeleton(obj->skeleton);
-}
-
-static inline void
-iterators_bpf__detach(struct iterators_bpf *obj)
-{
-       return bpf_object__detach_skeleton(obj->skeleton);
-}
-
-static inline int
-iterators_bpf__create_skeleton(struct iterators_bpf *obj)
-{
-       struct bpf_object_skeleton *s;
-
-       s = (struct bpf_object_skeleton *)calloc(1, sizeof(*s));
-       if (!s)
-               return -1;
-       obj->skeleton = s;
-
-       s->sz = sizeof(*s);
-       s->name = "iterators_bpf";
-       s->obj = &obj->obj;
-
-       /* maps */
-       s->map_cnt = 1;
-       s->map_skel_sz = sizeof(*s->maps);
-       s->maps = (struct bpf_map_skeleton *)calloc(s->map_cnt, s->map_skel_sz);
-       if (!s->maps)
-               goto err;
-
-       s->maps[0].name = "iterator.rodata";
-       s->maps[0].map = &obj->maps.rodata;
-       s->maps[0].mmaped = (void **)&obj->rodata;
-
-       /* programs */
-       s->prog_cnt = 2;
-       s->prog_skel_sz = sizeof(*s->progs);
-       s->progs = (struct bpf_prog_skeleton *)calloc(s->prog_cnt, s->prog_skel_sz);
-       if (!s->progs)
-               goto err;
-
-       s->progs[0].name = "dump_bpf_map";
-       s->progs[0].prog = &obj->progs.dump_bpf_map;
-       s->progs[0].link = &obj->links.dump_bpf_map;
-
-       s->progs[1].name = "dump_bpf_prog";
-       s->progs[1].prog = &obj->progs.dump_bpf_prog;
-       s->progs[1].link = &obj->links.dump_bpf_prog;
-
-       s->data_sz = 7176;
-       s->data = (void *)"\
-\x7f\x45\x4c\x46\x02\x01\x01\0\0\0\0\0\0\0\0\0\x01\0\xf7\0\x01\0\0\0\0\0\0\0\0\
-\0\0\0\0\0\0\0\0\0\0\0\x48\x18\0\0\0\0\0\0\0\0\0\0\x40\0\0\0\0\0\x40\0\x0f\0\
-\x0e\0\x79\x12\0\0\0\0\0\0\x79\x26\0\0\0\0\0\0\x79\x17\x08\0\0\0\0\0\x15\x07\
-\x1a\0\0\0\0\0\x79\x21\x10\0\0\0\0\0\x55\x01\x08\0\0\0\0\0\xbf\xa4\0\0\0\0\0\0\
-\x07\x04\0\0\xe8\xff\xff\xff\xbf\x61\0\0\0\0\0\0\x18\x02\0\0\0\0\0\0\0\0\0\0\0\
-\0\0\0\xb7\x03\0\0\x23\0\0\0\xb7\x05\0\0\0\0\0\0\x85\0\0\0\x7e\0\0\0\x61\x71\0\
-\0\0\0\0\0\x7b\x1a\xe8\xff\0\0\0\0\xb7\x01\0\0\x04\0\0\0\xbf\x72\0\0\0\0\0\0\
-\x0f\x12\0\0\0\0\0\0\x7b\x2a\xf0\xff\0\0\0\0\x61\x71\x14\0\0\0\0\0\x7b\x1a\xf8\
-\xff\0\0\0\0\xbf\xa4\0\0\0\0\0\0\x07\x04\0\0\xe8\xff\xff\xff\xbf\x61\0\0\0\0\0\
-\0\x18\x02\0\0\x23\0\0\0\0\0\0\0\0\0\0\0\xb7\x03\0\0\x0e\0\0\0\xb7\x05\0\0\x18\
-\0\0\0\x85\0\0\0\x7e\0\0\0\xb7\0\0\0\0\0\0\0\x95\0\0\0\0\0\0\0\x79\x12\0\0\0\0\
-\0\0\x79\x26\0\0\0\0\0\0\x79\x11\x08\0\0\0\0\0\x15\x01\x3b\0\0\0\0\0\x79\x17\0\
-\0\0\0\0\0\x79\x21\x10\0\0\0\0\0\x55\x01\x08\0\0\0\0\0\xbf\xa4\0\0\0\0\0\0\x07\
-\x04\0\0\xd0\xff\xff\xff\xbf\x61\0\0\0\0\0\0\x18\x02\0\0\x31\0\0\0\0\0\0\0\0\0\
-\0\0\xb7\x03\0\0\x20\0\0\0\xb7\x05\0\0\0\0\0\0\x85\0\0\0\x7e\0\0\0\x7b\x6a\xc8\
-\xff\0\0\0\0\x61\x71\0\0\0\0\0\0\x7b\x1a\xd0\xff\0\0\0\0\xb7\x03\0\0\x04\0\0\0\
-\xbf\x79\0\0\0\0\0\0\x0f\x39\0\0\0\0\0\0\x79\x71\x28\0\0\0\0\0\x79\x78\x30\0\0\
-\0\0\0\x15\x08\x18\0\0\0\0\0\xb7\x02\0\0\0\0\0\0\x0f\x21\0\0\0\0\0\0\x61\x11\
-\x04\0\0\0\0\0\x79\x83\x08\0\0\0\0\0\x67\x01\0\0\x03\0\0\0\x0f\x13\0\0\0\0\0\0\
-\x79\x86\0\0\0\0\0\0\xbf\xa1\0\0\0\0\0\0\x07\x01\0\0\xf8\xff\xff\xff\xb7\x02\0\
-\0\x08\0\0\0\x85\0\0\0\x71\0\0\0\xb7\x01\0\0\0\0\0\0\x79\xa3\xf8\xff\0\0\0\0\
-\x0f\x13\0\0\0\0\0\0\xbf\xa1\0\0\0\0\0\0\x07\x01\0\0\xf4\xff\xff\xff\xb7\x02\0\
-\0\x04\0\0\0\x85\0\0\0\x71\0\0\0\xb7\x03\0\0\x04\0\0\0\x61\xa1\xf4\xff\0\0\0\0\
-\x61\x82\x10\0\0\0\0\0\x3d\x21\x02\0\0\0\0\0\x0f\x16\0\0\0\0\0\0\xbf\x69\0\0\0\
-\0\0\0\x7b\x9a\xd8\xff\0\0\0\0\x79\x71\x18\0\0\0\0\0\x7b\x1a\xe0\xff\0\0\0\0\
-\x79\x71\x20\0\0\0\0\0\x79\x11\0\0\0\0\0\0\x0f\x31\0\0\0\0\0\0\x7b\x1a\xe8\xff\
-\0\0\0\0\xbf\xa4\0\0\0\0\0\0\x07\x04\0\0\xd0\xff\xff\xff\x79\xa1\xc8\xff\0\0\0\
-\0\x18\x02\0\0\x51\0\0\0\0\0\0\0\0\0\0\0\xb7\x03\0\0\x11\0\0\0\xb7\x05\0\0\x20\
-\0\0\0\x85\0\0\0\x7e\0\0\0\xb7\0\0\0\0\0\0\0\x95\0\0\0\0\0\0\0\x20\x20\x69\x64\
-\x20\x6e\x61\x6d\x65\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x6d\
-\x61\x78\x5f\x65\x6e\x74\x72\x69\x65\x73\x0a\0\x25\x34\x75\x20\x25\x2d\x31\x36\
-\x73\x25\x36\x64\x0a\0\x20\x20\x69\x64\x20\x6e\x61\x6d\x65\x20\x20\x20\x20\x20\
-\x20\x20\x20\x20\x20\x20\x20\x20\x61\x74\x74\x61\x63\x68\x65\x64\x0a\0\x25\x34\
-\x75\x20\x25\x2d\x31\x36\x73\x20\x25\x73\x20\x25\x73\x0a\0\x47\x50\x4c\0\x9f\
-\xeb\x01\0\x18\0\0\0\0\0\0\0\x1c\x04\0\0\x1c\x04\0\0\x09\x05\0\0\0\0\0\0\0\0\0\
-\x02\x02\0\0\0\x01\0\0\0\x02\0\0\x04\x10\0\0\0\x13\0\0\0\x03\0\0\0\0\0\0\0\x18\
-\0\0\0\x04\0\0\0\x40\0\0\0\0\0\0\0\0\0\0\x02\x08\0\0\0\0\0\0\0\0\0\0\x02\x0d\0\
-\0\0\0\0\0\0\x01\0\0\x0d\x06\0\0\0\x1c\0\0\0\x01\0\0\0\x20\0\0\0\0\0\0\x01\x04\
-\0\0\0\x20\0\0\x01\x24\0\0\0\x01\0\0\x0c\x05\0\0\0\xaf\0\0\0\x03\0\0\x04\x18\0\
-\0\0\xbd\0\0\0\x09\0\0\0\0\0\0\0\xc1\0\0\0\x0b\0\0\0\x40\0\0\0\xcc\0\0\0\x0b\0\
-\0\0\x80\0\0\0\0\0\0\0\0\0\0\x02\x0a\0\0\0\xd4\0\0\0\0\0\0\x07\0\0\0\0\xdd\0\0\
-\0\0\0\0\x08\x0c\0\0\0\xe3\0\0\0\0\0\0\x01\x08\0\0\0\x40\0\0\0\xa4\x01\0\0\x03\
-\0\0\x04\x18\0\0\0\xac\x01\0\0\x0e\0\0\0\0\0\0\0\xaf\x01\0\0\x11\0\0\0\x20\0\0\
-\0\xb4\x01\0\0\x0e\0\0\0\xa0\0\0\0\xc0\x01\0\0\0\0\0\x08\x0f\0\0\0\xc6\x01\0\0\
-\0\0\0\x01\x04\0\0\0\x20\0\0\0\xd3\x01\0\0\0\0\0\x01\x01\0\0\0\x08\0\0\x01\0\0\
-\0\0\0\0\0\x03\0\0\0\0\x10\0\0\0\x12\0\0\0\x10\0\0\0\xd8\x01\0\0\0\0\0\x01\x04\
-\0\0\0\x20\0\0\0\0\0\0\0\0\0\0\x02\x14\0\0\0\x3c\x02\0\0\x02\0\0\x04\x10\0\0\0\
-\x13\0\0\0\x03\0\0\0\0\0\0\0\x4f\x02\0\0\x15\0\0\0\x40\0\0\0\0\0\0\0\0\0\0\x02\
-\x18\0\0\0\0\0\0\0\x01\0\0\x0d\x06\0\0\0\x1c\0\0\0\x13\0\0\0\x54\x02\0\0\x01\0\
-\0\x0c\x16\0\0\0\xa0\x02\0\0\x01\0\0\x04\x08\0\0\0\xa9\x02\0\0\x19\0\0\0\0\0\0\
-\0\0\0\0\0\0\0\0\x02\x1a\0\0\0\xfa\x02\0\0\x06\0\0\x04\x38\0\0\0\xac\x01\0\0\
-\x0e\0\0\0\0\0\0\0\xaf\x01\0\0\x11\0\0\0\x20\0\0\0\x07\x03\0\0\x1b\0\0\0\xc0\0\
-\0\0\x18\x03\0\0\x15\0\0\0\0\x01\0\0\x21\x03\0\0\x1d\0\0\0\x40\x01\0\0\x2b\x03\
-\0\0\x1e\0\0\0\x80\x01\0\0\0\0\0\0\0\0\0\x02\x1c\0\0\0\0\0\0\0\0\0\0\x0a\x10\0\
-\0\0\0\0\0\0\0\0\0\x02\x1f\0\0\0\0\0\0\0\0\0\0\x02\x20\0\0\0\x75\x03\0\0\x02\0\
-\0\x04\x08\0\0\0\x83\x03\0\0\x0e\0\0\0\0\0\0\0\x8c\x03\0\0\x0e\0\0\0\x20\0\0\0\
-\x2b\x03\0\0\x03\0\0\x04\x18\0\0\0\x96\x03\0\0\x1b\0\0\0\0\0\0\0\x9e\x03\0\0\
-\x21\0\0\0\x40\0\0\0\xa4\x03\0\0\x23\0\0\0\x80\0\0\0\0\0\0\0\0\0\0\x02\x22\0\0\
-\0\0\0\0\0\0\0\0\x02\x24\0\0\0\xa8\x03\0\0\x01\0\0\x04\x04\0\0\0\xb3\x03\0\0\
-\x0e\0\0\0\0\0\0\0\x1c\x04\0\0\x01\0\0\x04\x04\0\0\0\x25\x04\0\0\x0e\0\0\0\0\0\
-\0\0\0\0\0\0\0\0\0\x03\0\0\0\0\x1c\0\0\0\x12\0\0\0\x23\0\0\0\x9b\x04\0\0\0\0\0\
-\x0e\x25\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x03\0\0\0\0\x1c\0\0\0\x12\0\0\0\x0e\0\0\0\
-\xaf\x04\0\0\0\0\0\x0e\x27\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x03\0\0\0\0\x1c\0\0\0\
-\x12\0\0\0\x20\0\0\0\xc5\x04\0\0\0\0\0\x0e\x29\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x03\
-\0\0\0\0\x1c\0\0\0\x12\0\0\0\x11\0\0\0\xda\x04\0\0\0\0\0\x0e\x2b\0\0\0\0\0\0\0\
-\0\0\0\0\0\0\0\x03\0\0\0\0\x10\0\0\0\x12\0\0\0\x04\0\0\0\xf1\x04\0\0\0\0\0\x0e\
-\x2d\0\0\0\x01\0\0\0\xf9\x04\0\0\x04\0\0\x0f\0\0\0\0\x26\0\0\0\0\0\0\0\x23\0\0\
-\0\x28\0\0\0\x23\0\0\0\x0e\0\0\0\x2a\0\0\0\x31\0\0\0\x20\0\0\0\x2c\0\0\0\x51\0\
-\0\0\x11\0\0\0\x01\x05\0\0\x01\0\0\x0f\0\0\0\0\x2e\0\0\0\0\0\0\0\x04\0\0\0\0\
-\x62\x70\x66\x5f\x69\x74\x65\x72\x5f\x5f\x62\x70\x66\x5f\x6d\x61\x70\0\x6d\x65\
-\x74\x61\0\x6d\x61\x70\0\x63\x74\x78\0\x69\x6e\x74\0\x64\x75\x6d\x70\x5f\x62\
-\x70\x66\x5f\x6d\x61\x70\0\x69\x74\x65\x72\x2f\x62\x70\x66\x5f\x6d\x61\x70\0\
-\x30\x3a\x30\0\x2f\x68\x6f\x6d\x65\x2f\x61\x6c\x72\x75\x61\x2f\x62\x75\x69\x6c\
-\x64\x2f\x6c\x69\x6e\x75\x78\x2f\x6b\x65\x72\x6e\x65\x6c\x2f\x62\x70\x66\x2f\
-\x70\x72\x65\x6c\x6f\x61\x64\x2f\x69\x74\x65\x72\x61\x74\x6f\x72\x73\x2f\x69\
-\x74\x65\x72\x61\x74\x6f\x72\x73\x2e\x62\x70\x66\x2e\x63\0\x09\x73\x74\x72\x75\
-\x63\x74\x20\x73\x65\x71\x5f\x66\x69\x6c\x65\x20\x2a\x73\x65\x71\x20\x3d\x20\
-\x63\x74\x78\x2d\x3e\x6d\x65\x74\x61\x2d\x3e\x73\x65\x71\x3b\0\x62\x70\x66\x5f\
-\x69\x74\x65\x72\x5f\x6d\x65\x74\x61\0\x73\x65\x71\0\x73\x65\x73\x73\x69\x6f\
-\x6e\x5f\x69\x64\0\x73\x65\x71\x5f\x6e\x75\x6d\0\x73\x65\x71\x5f\x66\x69\x6c\
-\x65\0\x5f\x5f\x75\x36\x34\0\x6c\x6f\x6e\x67\x20\x6c\x6f\x6e\x67\x20\x75\x6e\
-\x73\x69\x67\x6e\x65\x64\x20\x69\x6e\x74\0\x30\x3a\x31\0\x09\x73\x74\x72\x75\
-\x63\x74\x20\x62\x70\x66\x5f\x6d\x61\x70\x20\x2a\x6d\x61\x70\x20\x3d\x20\x63\
-\x74\x78\x2d\x3e\x6d\x61\x70\x3b\0\x09\x69\x66\x20\x28\x21\x6d\x61\x70\x29\0\
-\x30\x3a\x32\0\x09\x5f\x5f\x75\x36\x34\x20\x73\x65\x71\x5f\x6e\x75\x6d\x20\x3d\
-\x20\x63\x74\x78\x2d\x3e\x6d\x65\x74\x61\x2d\x3e\x73\x65\x71\x5f\x6e\x75\x6d\
-\x3b\0\x09\x69\x66\x20\x28\x73\x65\x71\x5f\x6e\x75\x6d\x20\x3d\x3d\x20\x30\x29\
-\0\x09\x09\x42\x50\x46\x5f\x53\x45\x51\x5f\x50\x52\x49\x4e\x54\x46\x28\x73\x65\
-\x71\x2c\x20\x22\x20\x20\x69\x64\x20\x6e\x61\x6d\x65\x20\x20\x20\x20\x20\x20\
-\x20\x20\x20\x20\x20\x20\x20\x6d\x61\x78\x5f\x65\x6e\x74\x72\x69\x65\x73\x5c\
-\x6e\x22\x29\x3b\0\x62\x70\x66\x5f\x6d\x61\x70\0\x69\x64\0\x6e\x61\x6d\x65\0\
-\x6d\x61\x78\x5f\x65\x6e\x74\x72\x69\x65\x73\0\x5f\x5f\x75\x33\x32\0\x75\x6e\
-\x73\x69\x67\x6e\x65\x64\x20\x69\x6e\x74\0\x63\x68\x61\x72\0\x5f\x5f\x41\x52\
-\x52\x41\x59\x5f\x53\x49\x5a\x45\x5f\x54\x59\x50\x45\x5f\x5f\0\x09\x42\x50\x46\
-\x5f\x53\x45\x51\x5f\x50\x52\x49\x4e\x54\x46\x28\x73\x65\x71\x2c\x20\x22\x25\
-\x34\x75\x20\x25\x2d\x31\x36\x73\x25\x36\x64\x5c\x6e\x22\x2c\x20\x6d\x61\x70\
-\x2d\x3e\x69\x64\x2c\x20\x6d\x61\x70\x2d\x3e\x6e\x61\x6d\x65\x2c\x20\x6d\x61\
-\x70\x2d\x3e\x6d\x61\x78\x5f\x65\x6e\x74\x72\x69\x65\x73\x29\x3b\0\x7d\0\x62\
-\x70\x66\x5f\x69\x74\x65\x72\x5f\x5f\x62\x70\x66\x5f\x70\x72\x6f\x67\0\x70\x72\
-\x6f\x67\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x70\x72\x6f\x67\0\x69\x74\x65\
-\x72\x2f\x62\x70\x66\x5f\x70\x72\x6f\x67\0\x09\x73\x74\x72\x75\x63\x74\x20\x62\
-\x70\x66\x5f\x70\x72\x6f\x67\x20\x2a\x70\x72\x6f\x67\x20\x3d\x20\x63\x74\x78\
-\x2d\x3e\x70\x72\x6f\x67\x3b\0\x09\x69\x66\x20\x28\x21\x70\x72\x6f\x67\x29\0\
-\x62\x70\x66\x5f\x70\x72\x6f\x67\0\x61\x75\x78\0\x09\x61\x75\x78\x20\x3d\x20\
-\x70\x72\x6f\x67\x2d\x3e\x61\x75\x78\x3b\0\x09\x09\x42\x50\x46\x5f\x53\x45\x51\
-\x5f\x50\x52\x49\x4e\x54\x46\x28\x73\x65\x71\x2c\x20\x22\x20\x20\x69\x64\x20\
-\x6e\x61\x6d\x65\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x61\x74\
-\x74\x61\x63\x68\x65\x64\x5c\x6e\x22\x29\x3b\0\x62\x70\x66\x5f\x70\x72\x6f\x67\
-\x5f\x61\x75\x78\0\x61\x74\x74\x61\x63\x68\x5f\x66\x75\x6e\x63\x5f\x6e\x61\x6d\
-\x65\0\x64\x73\x74\x5f\x70\x72\x6f\x67\0\x66\x75\x6e\x63\x5f\x69\x6e\x66\x6f\0\
-\x62\x74\x66\0\x09\x42\x50\x46\x5f\x53\x45\x51\x5f\x50\x52\x49\x4e\x54\x46\x28\
-\x73\x65\x71\x2c\x20\x22\x25\x34\x75\x20\x25\x2d\x31\x36\x73\x20\x25\x73\x20\
-\x25\x73\x5c\x6e\x22\x2c\x20\x61\x75\x78\x2d\x3e\x69\x64\x2c\0\x30\x3a\x34\0\
-\x30\x3a\x35\0\x09\x69\x66\x20\x28\x21\x62\x74\x66\x29\0\x62\x70\x66\x5f\x66\
-\x75\x6e\x63\x5f\x69\x6e\x66\x6f\0\x69\x6e\x73\x6e\x5f\x6f\x66\x66\0\x74\x79\
-\x70\x65\x5f\x69\x64\0\x30\0\x73\x74\x72\x69\x6e\x67\x73\0\x74\x79\x70\x65\x73\
-\0\x68\x64\x72\0\x62\x74\x66\x5f\x68\x65\x61\x64\x65\x72\0\x73\x74\x72\x5f\x6c\
-\x65\x6e\0\x09\x74\x79\x70\x65\x73\x20\x3d\x20\x62\x74\x66\x2d\x3e\x74\x79\x70\
-\x65\x73\x3b\0\x09\x62\x70\x66\x5f\x70\x72\x6f\x62\x65\x5f\x72\x65\x61\x64\x5f\
-\x6b\x65\x72\x6e\x65\x6c\x28\x26\x74\x2c\x20\x73\x69\x7a\x65\x6f\x66\x28\x74\
-\x29\x2c\x20\x74\x79\x70\x65\x73\x20\x2b\x20\x62\x74\x66\x5f\x69\x64\x29\x3b\0\
-\x09\x73\x74\x72\x20\x3d\x20\x62\x74\x66\x2d\x3e\x73\x74\x72\x69\x6e\x67\x73\
-\x3b\0\x62\x74\x66\x5f\x74\x79\x70\x65\0\x6e\x61\x6d\x65\x5f\x6f\x66\x66\0\x09\
-\x6e\x61\x6d\x65\x5f\x6f\x66\x66\x20\x3d\x20\x42\x50\x46\x5f\x43\x4f\x52\x45\
-\x5f\x52\x45\x41\x44\x28\x74\x2c\x20\x6e\x61\x6d\x65\x5f\x6f\x66\x66\x29\x3b\0\
-\x30\x3a\x32\x3a\x30\0\x09\x69\x66\x20\x28\x6e\x61\x6d\x65\x5f\x6f\x66\x66\x20\
-\x3e\x3d\x20\x62\x74\x66\x2d\x3e\x68\x64\x72\x2e\x73\x74\x72\x5f\x6c\x65\x6e\
-\x29\0\x09\x72\x65\x74\x75\x72\x6e\x20\x73\x74\x72\x20\x2b\x20\x6e\x61\x6d\x65\
-\x5f\x6f\x66\x66\x3b\0\x30\x3a\x33\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x6d\
-\x61\x70\x2e\x5f\x5f\x5f\x66\x6d\x74\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x6d\
-\x61\x70\x2e\x5f\x5f\x5f\x66\x6d\x74\x2e\x31\0\x64\x75\x6d\x70\x5f\x62\x70\x66\
-\x5f\x70\x72\x6f\x67\x2e\x5f\x5f\x5f\x66\x6d\x74\0\x64\x75\x6d\x70\x5f\x62\x70\
-\x66\x5f\x70\x72\x6f\x67\x2e\x5f\x5f\x5f\x66\x6d\x74\x2e\x32\0\x4c\x49\x43\x45\
-\x4e\x53\x45\0\x2e\x72\x6f\x64\x61\x74\x61\0\x6c\x69\x63\x65\x6e\x73\x65\0\x9f\
-\xeb\x01\0\x20\0\0\0\0\0\0\0\x24\0\0\0\x24\0\0\0\x44\x02\0\0\x68\x02\0\0\xa4\
-\x01\0\0\x08\0\0\0\x31\0\0\0\x01\0\0\0\0\0\0\0\x07\0\0\0\x62\x02\0\0\x01\0\0\0\
-\0\0\0\0\x17\0\0\0\x10\0\0\0\x31\0\0\0\x09\0\0\0\0\0\0\0\x42\0\0\0\x87\0\0\0\
-\x1e\x40\x01\0\x08\0\0\0\x42\0\0\0\x87\0\0\0\x24\x40\x01\0\x10\0\0\0\x42\0\0\0\
-\xfe\0\0\0\x1d\x48\x01\0\x18\0\0\0\x42\0\0\0\x1f\x01\0\0\x06\x50\x01\0\x20\0\0\
-\0\x42\0\0\0\x2e\x01\0\0\x1d\x44\x01\0\x28\0\0\0\x42\0\0\0\x53\x01\0\0\x06\x5c\
-\x01\0\x38\0\0\0\x42\0\0\0\x66\x01\0\0\x03\x60\x01\0\x70\0\0\0\x42\0\0\0\xec\
-\x01\0\0\x02\x68\x01\0\xf0\0\0\0\x42\0\0\0\x3a\x02\0\0\x01\x70\x01\0\x62\x02\0\
-\0\x1a\0\0\0\0\0\0\0\x42\0\0\0\x87\0\0\0\x1e\x84\x01\0\x08\0\0\0\x42\0\0\0\x87\
-\0\0\0\x24\x84\x01\0\x10\0\0\0\x42\0\0\0\x70\x02\0\0\x1f\x8c\x01\0\x18\0\0\0\
-\x42\0\0\0\x94\x02\0\0\x06\x98\x01\0\x20\0\0\0\x42\0\0\0\xad\x02\0\0\x0e\xa4\
-\x01\0\x28\0\0\0\x42\0\0\0\x2e\x01\0\0\x1d\x88\x01\0\x30\0\0\0\x42\0\0\0\x53\
-\x01\0\0\x06\xa8\x01\0\x40\0\0\0\x42\0\0\0\xbf\x02\0\0\x03\xac\x01\0\x80\0\0\0\
-\x42\0\0\0\x2f\x03\0\0\x02\xb4\x01\0\xb8\0\0\0\x42\0\0\0\x6a\x03\0\0\x06\x08\
-\x01\0\xd0\0\0\0\x42\0\0\0\0\0\0\0\0\0\0\0\xd8\0\0\0\x42\0\0\0\xbb\x03\0\0\x0f\
-\x14\x01\0\xe0\0\0\0\x42\0\0\0\xd0\x03\0\0\x2d\x18\x01\0\xf0\0\0\0\x42\0\0\0\
-\x07\x04\0\0\x0d\x10\x01\0\0\x01\0\0\x42\0\0\0\0\0\0\0\0\0\0\0\x08\x01\0\0\x42\
-\0\0\0\xd0\x03\0\0\x02\x18\x01\0\x20\x01\0\0\x42\0\0\0\x2e\x04\0\0\x0d\x1c\x01\
-\0\x38\x01\0\0\x42\0\0\0\0\0\0\0\0\0\0\0\x40\x01\0\0\x42\0\0\0\x2e\x04\0\0\x0d\
-\x1c\x01\0\x58\x01\0\0\x42\0\0\0\x2e\x04\0\0\x0d\x1c\x01\0\x60\x01\0\0\x42\0\0\
-\0\x5c\x04\0\0\x1b\x20\x01\0\x68\x01\0\0\x42\0\0\0\x5c\x04\0\0\x06\x20\x01\0\
-\x70\x01\0\0\x42\0\0\0\x7f\x04\0\0\x0d\x28\x01\0\x78\x01\0\0\x42\0\0\0\0\0\0\0\
-\0\0\0\0\x80\x01\0\0\x42\0\0\0\x2f\x03\0\0\x02\xb4\x01\0\xf8\x01\0\0\x42\0\0\0\
-\x3a\x02\0\0\x01\xc4\x01\0\x10\0\0\0\x31\0\0\0\x07\0\0\0\0\0\0\0\x02\0\0\0\x3e\
-\0\0\0\0\0\0\0\x08\0\0\0\x08\0\0\0\x3e\0\0\0\0\0\0\0\x10\0\0\0\x02\0\0\0\xfa\0\
-\0\0\0\0\0\0\x20\0\0\0\x08\0\0\0\x2a\x01\0\0\0\0\0\0\x70\0\0\0\x0d\0\0\0\x3e\0\
-\0\0\0\0\0\0\x80\0\0\0\x0d\0\0\0\xfa\0\0\0\0\0\0\0\xa0\0\0\0\x0d\0\0\0\x2a\x01\
-\0\0\0\0\0\0\x62\x02\0\0\x12\0\0\0\0\0\0\0\x14\0\0\0\x3e\0\0\0\0\0\0\0\x08\0\0\
-\0\x08\0\0\0\x3e\0\0\0\0\0\0\0\x10\0\0\0\x14\0\0\0\xfa\0\0\0\0\0\0\0\x20\0\0\0\
-\x18\0\0\0\x3e\0\0\0\0\0\0\0\x28\0\0\0\x08\0\0\0\x2a\x01\0\0\0\0\0\0\x80\0\0\0\
-\x1a\0\0\0\x3e\0\0\0\0\0\0\0\x90\0\0\0\x1a\0\0\0\xfa\0\0\0\0\0\0\0\xa8\0\0\0\
-\x1a\0\0\0\x62\x03\0\0\0\0\0\0\xb0\0\0\0\x1a\0\0\0\x66\x03\0\0\0\0\0\0\xc0\0\0\
-\0\x1f\0\0\0\x94\x03\0\0\0\0\0\0\xd8\0\0\0\x20\0\0\0\xfa\0\0\0\0\0\0\0\xf0\0\0\
-\0\x20\0\0\0\x3e\0\0\0\0\0\0\0\x18\x01\0\0\x24\0\0\0\x3e\0\0\0\0\0\0\0\x50\x01\
-\0\0\x1a\0\0\0\xfa\0\0\0\0\0\0\0\x60\x01\0\0\x20\0\0\0\x56\x04\0\0\0\0\0\0\x88\
-\x01\0\0\x1a\0\0\0\x2a\x01\0\0\0\0\0\0\x98\x01\0\0\x1a\0\0\0\x97\x04\0\0\0\0\0\
-\0\xa0\x01\0\0\x18\0\0\0\x3e\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
-\0\0\0\0\0\0\0\x91\0\0\0\x04\0\xf1\xff\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xe6\0\0\
-\0\0\0\x02\0\x70\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xd8\0\0\0\0\0\x02\0\xf0\0\0\0\0\
-\0\0\0\0\0\0\0\0\0\0\0\xdf\0\0\0\0\0\x03\0\x78\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
-\xd1\0\0\0\0\0\x03\0\x80\x01\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xca\0\0\0\0\0\x03\0\
-\xf8\x01\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x14\0\0\0\x01\0\x04\0\0\0\0\0\0\0\0\0\x23\
-\0\0\0\0\0\0\0\x04\x01\0\0\x01\0\x04\0\x23\0\0\0\0\0\0\0\x0e\0\0\0\0\0\0\0\x28\
-\0\0\0\x01\0\x04\0\x31\0\0\0\0\0\0\0\x20\0\0\0\0\0\0\0\xed\0\0\0\x01\0\x04\0\
-\x51\0\0\0\0\0\0\0\x11\0\0\0\0\0\0\0\0\0\0\0\x03\0\x02\0\0\0\0\0\0\0\0\0\0\0\0\
-\0\0\0\0\0\0\0\0\0\x03\0\x03\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x03\0\
-\x04\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xc2\0\0\0\x11\0\x05\0\0\0\0\0\0\0\0\0\
-\x04\0\0\0\0\0\0\0\x3d\0\0\0\x12\0\x02\0\0\0\0\0\0\0\0\0\0\x01\0\0\0\0\0\0\x5b\
-\0\0\0\x12\0\x03\0\0\0\0\0\0\0\0\0\x08\x02\0\0\0\0\0\0\x48\0\0\0\0\0\0\0\x01\0\
-\0\0\x0d\0\0\0\xc8\0\0\0\0\0\0\0\x01\0\0\0\x0d\0\0\0\x50\0\0\0\0\0\0\0\x01\0\0\
-\0\x0d\0\0\0\xd0\x01\0\0\0\0\0\0\x01\0\0\0\x0d\0\0\0\xf0\x03\0\0\0\0\0\0\x0a\0\
-\0\0\x0d\0\0\0\xfc\x03\0\0\0\0\0\0\x0a\0\0\0\x0d\0\0\0\x08\x04\0\0\0\0\0\0\x0a\
-\0\0\0\x0d\0\0\0\x14\x04\0\0\0\0\0\0\x0a\0\0\0\x0d\0\0\0\x2c\x04\0\0\0\0\0\0\0\
-\0\0\0\x0e\0\0\0\x2c\0\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\x3c\0\0\0\0\0\0\0\0\0\0\0\
-\x0c\0\0\0\x50\0\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\x60\0\0\0\0\0\0\0\0\0\0\0\x0b\0\
-\0\0\x70\0\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\x80\0\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\
-\x90\0\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\xa0\0\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\xb0\0\
-\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\xc0\0\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\xd0\0\0\0\0\
-\0\0\0\0\0\0\0\x0b\0\0\0\xe8\0\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\xf8\0\0\0\0\0\0\0\
-\0\0\0\0\x0c\0\0\0\x08\x01\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x18\x01\0\0\0\0\0\0\0\
-\0\0\0\x0c\0\0\0\x28\x01\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x38\x01\0\0\0\0\0\0\0\0\
-\0\0\x0c\0\0\0\x48\x01\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x58\x01\0\0\0\0\0\0\0\0\0\
-\0\x0c\0\0\0\x68\x01\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x78\x01\0\0\0\0\0\0\0\0\0\0\
-\x0c\0\0\0\x88\x01\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x98\x01\0\0\0\0\0\0\0\0\0\0\
-\x0c\0\0\0\xa8\x01\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\xb8\x01\0\0\0\0\0\0\0\0\0\0\
-\x0c\0\0\0\xc8\x01\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\xd8\x01\0\0\0\0\0\0\0\0\0\0\
-\x0c\0\0\0\xe8\x01\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\xf8\x01\0\0\0\0\0\0\0\0\0\0\
-\x0c\0\0\0\x08\x02\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x18\x02\0\0\0\0\0\0\0\0\0\0\
-\x0c\0\0\0\x28\x02\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x38\x02\0\0\0\0\0\0\0\0\0\0\
-\x0c\0\0\0\x48\x02\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x58\x02\0\0\0\0\0\0\0\0\0\0\
-\x0c\0\0\0\x68\x02\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x78\x02\0\0\0\0\0\0\0\0\0\0\
-\x0c\0\0\0\x94\x02\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\xa4\x02\0\0\0\0\0\0\0\0\0\0\
-\x0b\0\0\0\xb4\x02\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\xc4\x02\0\0\0\0\0\0\0\0\0\0\
-\x0b\0\0\0\xd4\x02\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\xe4\x02\0\0\0\0\0\0\0\0\0\0\
-\x0b\0\0\0\xf4\x02\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\x0c\x03\0\0\0\0\0\0\0\0\0\0\
-\x0c\0\0\0\x1c\x03\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x2c\x03\0\0\0\0\0\0\0\0\0\0\
-\x0c\0\0\0\x3c\x03\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x4c\x03\0\0\0\0\0\0\0\0\0\0\
-\x0c\0\0\0\x5c\x03\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x6c\x03\0\0\0\0\0\0\0\0\0\0\
-\x0c\0\0\0\x7c\x03\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x8c\x03\0\0\0\0\0\0\0\0\0\0\
-\x0c\0\0\0\x9c\x03\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\xac\x03\0\0\0\0\0\0\0\0\0\0\
-\x0c\0\0\0\xbc\x03\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\xcc\x03\0\0\0\0\0\0\0\0\0\0\
-\x0c\0\0\0\xdc\x03\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\xec\x03\0\0\0\0\0\0\0\0\0\0\
-\x0c\0\0\0\xfc\x03\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x0c\x04\0\0\0\0\0\0\0\0\0\0\
-\x0c\0\0\0\x1c\x04\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x4d\x4e\x40\x41\x42\x43\x4c\0\
-\x2e\x74\x65\x78\x74\0\x2e\x72\x65\x6c\x2e\x42\x54\x46\x2e\x65\x78\x74\0\x64\
-\x75\x6d\x70\x5f\x62\x70\x66\x5f\x6d\x61\x70\x2e\x5f\x5f\x5f\x66\x6d\x74\0\x64\
-\x75\x6d\x70\x5f\x62\x70\x66\x5f\x70\x72\x6f\x67\x2e\x5f\x5f\x5f\x66\x6d\x74\0\
-\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x6d\x61\x70\0\x2e\x72\x65\x6c\x69\x74\x65\
-\x72\x2f\x62\x70\x66\x5f\x6d\x61\x70\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x70\
-\x72\x6f\x67\0\x2e\x72\x65\x6c\x69\x74\x65\x72\x2f\x62\x70\x66\x5f\x70\x72\x6f\
-\x67\0\x2e\x6c\x6c\x76\x6d\x5f\x61\x64\x64\x72\x73\x69\x67\0\x6c\x69\x63\x65\
-\x6e\x73\x65\0\x69\x74\x65\x72\x61\x74\x6f\x72\x73\x2e\x62\x70\x66\x2e\x63\0\
-\x2e\x73\x74\x72\x74\x61\x62\0\x2e\x73\x79\x6d\x74\x61\x62\0\x2e\x72\x6f\x64\
-\x61\x74\x61\0\x2e\x72\x65\x6c\x2e\x42\x54\x46\0\x4c\x49\x43\x45\x4e\x53\x45\0\
-\x4c\x42\x42\x31\x5f\x37\0\x4c\x42\x42\x31\x5f\x36\0\x4c\x42\x42\x30\x5f\x34\0\
-\x4c\x42\x42\x31\x5f\x33\0\x4c\x42\x42\x30\x5f\x33\0\x64\x75\x6d\x70\x5f\x62\
-\x70\x66\x5f\x70\x72\x6f\x67\x2e\x5f\x5f\x5f\x66\x6d\x74\x2e\x32\0\x64\x75\x6d\
-\x70\x5f\x62\x70\x66\x5f\x6d\x61\x70\x2e\x5f\x5f\x5f\x66\x6d\x74\x2e\x31\0\0\0\
-\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
-\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x01\0\0\0\x01\0\0\
-\0\x06\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x40\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
-\0\0\0\0\x04\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x4e\0\0\0\x01\0\0\0\x06\0\0\0\0\0\0\
-\0\0\0\0\0\0\0\0\0\x40\0\0\0\0\0\0\0\0\x01\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x08\0\0\
-\0\0\0\0\0\0\0\0\0\0\0\0\0\x6d\0\0\0\x01\0\0\0\x06\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
-\0\x40\x01\0\0\0\0\0\0\x08\x02\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x08\0\0\0\0\0\0\0\0\
-\0\0\0\0\0\0\0\xb1\0\0\0\x01\0\0\0\x02\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x48\x03\0\
-\0\0\0\0\0\x62\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x01\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
-\x89\0\0\0\x01\0\0\0\x03\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xaa\x03\0\0\0\0\0\0\x04\
-\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x01\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xbd\0\0\0\x01\
-\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xae\x03\0\0\0\0\0\0\x3d\x09\0\0\0\0\0\0\
-\0\0\0\0\0\0\0\0\x01\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\x01\0\0\0\0\0\0\0\
-\0\0\0\0\0\0\0\0\0\0\0\0\xeb\x0c\0\0\0\0\0\0\x2c\x04\0\0\0\0\0\0\0\0\0\0\0\0\0\
-\0\x01\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xa9\0\0\0\x02\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
-\0\0\0\0\0\x18\x11\0\0\0\0\0\0\x98\x01\0\0\0\0\0\0\x0e\0\0\0\x0e\0\0\0\x08\0\0\
-\0\0\0\0\0\x18\0\0\0\0\0\0\0\x4a\0\0\0\x09\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
-\0\xb0\x12\0\0\0\0\0\0\x20\0\0\0\0\0\0\0\x08\0\0\0\x02\0\0\0\x08\0\0\0\0\0\0\0\
-\x10\0\0\0\0\0\0\0\x69\0\0\0\x09\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xd0\x12\
-\0\0\0\0\0\0\x20\0\0\0\0\0\0\0\x08\0\0\0\x03\0\0\0\x08\0\0\0\0\0\0\0\x10\0\0\0\
-\0\0\0\0\xb9\0\0\0\x09\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xf0\x12\0\0\0\0\0\
-\0\x50\0\0\0\0\0\0\0\x08\0\0\0\x06\0\0\0\x08\0\0\0\0\0\0\0\x10\0\0\0\0\0\0\0\
-\x07\0\0\0\x09\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x40\x13\0\0\0\0\0\0\xe0\
-\x03\0\0\0\0\0\0\x08\0\0\0\x07\0\0\0\x08\0\0\0\0\0\0\0\x10\0\0\0\0\0\0\0\x7b\0\
-\0\0\x03\x4c\xff\x6f\0\0\0\x80\0\0\0\0\0\0\0\0\0\0\0\0\x20\x17\0\0\0\0\0\0\x07\
-\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x01\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xa1\0\0\0\x03\
-\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x27\x17\0\0\0\0\0\0\x1a\x01\0\0\0\0\0\0\
-\0\0\0\0\0\0\0\0\x01\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0";
-
-       return 0;
-err:
-       bpf_object__destroy_skeleton(s);
-       return -1;
-}
-
-#endif /* __ITERATORS_BPF_SKEL_H__ */
index fa4505f..72ce1ed 100644
@@ -556,16 +556,14 @@ static unsigned long bpf_map_memory_footprint(const struct bpf_map *map)
 
 static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp)
 {
-       const struct bpf_map *map = filp->private_data;
-       const struct bpf_array *array;
+       struct bpf_map *map = filp->private_data;
        u32 type = 0, jited = 0;
 
-       if (map->map_type == BPF_MAP_TYPE_PROG_ARRAY) {
-               array = container_of(map, struct bpf_array, map);
-               spin_lock(&array->aux->owner.lock);
-               type  = array->aux->owner.type;
-               jited = array->aux->owner.jited;
-               spin_unlock(&array->aux->owner.lock);
+       if (map_type_contains_progs(map)) {
+               spin_lock(&map->owner.lock);
+               type  = map->owner.type;
+               jited = map->owner.jited;
+               spin_unlock(&map->owner.lock);
        }
 
        seq_printf(m,
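For context on the fdinfo change above: userspace can observe the owner state now kept in map->owner by reading the map fd's fdinfo. A minimal sketch, assuming map_fd came from an earlier bpf(BPF_MAP_CREATE, ...) call and that the owner_prog_type/owner_jited lines appear once the map has held a program:

#include <stdio.h>

/* Print /proc/self/fdinfo/<fd> for a BPF map fd; the owner_prog_type
 * and owner_jited lines reflect the map->owner fields read above. */
static void dump_map_fdinfo(int map_fd)
{
	char path[64], line[256];
	FILE *f;

	snprintf(path, sizeof(path), "/proc/self/fdinfo/%d", map_fd);
	f = fopen(path, "r");
	if (!f)
		return;
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);
	fclose(f);
}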
@@ -874,6 +872,7 @@ static int map_create(union bpf_attr *attr)
        atomic64_set(&map->refcnt, 1);
        atomic64_set(&map->usercnt, 1);
        mutex_init(&map->freeze_mutex);
+       spin_lock_init(&map->owner.lock);
 
        map->spin_lock_off = -EINVAL;
        map->timer_off = -EINVAL;
@@ -2217,7 +2216,8 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr)
                                 BPF_F_ANY_ALIGNMENT |
                                 BPF_F_TEST_STATE_FREQ |
                                 BPF_F_SLEEPABLE |
-                                BPF_F_TEST_RND_HI32))
+                                BPF_F_TEST_RND_HI32 |
+                                BPF_F_XDP_HAS_FRAGS))
                return -EINVAL;
 
        if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) &&
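The new BPF_F_XDP_HAS_FRAGS bit lets a loader declare that an XDP program understands multi-buffer (fragmented) packets. A minimal raw-syscall sketch, assuming insns/insn_cnt hold a valid XDP program; error handling is elided:

#include <linux/bpf.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

static int load_xdp_frags_prog(const struct bpf_insn *insns, __u32 insn_cnt)
{
	union bpf_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.prog_type  = BPF_PROG_TYPE_XDP;
	attr.insns      = (__u64)(unsigned long)insns;
	attr.insn_cnt   = insn_cnt;
	attr.license    = (__u64)(unsigned long)"GPL";
	attr.prog_flags = BPF_F_XDP_HAS_FRAGS; /* declare frags awareness */

	return syscall(__NR_bpf, BPF_PROG_LOAD, &attr, sizeof(attr));
}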
@@ -2303,6 +2303,7 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr)
        prog->aux->dst_prog = dst_prog;
        prog->aux->offload_requested = !!attr->prog_ifindex;
        prog->aux->sleepable = attr->prog_flags & BPF_F_SLEEPABLE;
+       prog->aux->xdp_has_frags = attr->prog_flags & BPF_F_XDP_HAS_FRAGS;
 
        err = security_bpf_prog_alloc(prog->aux);
        if (err)
@@ -3318,6 +3319,11 @@ static int bpf_prog_query(const union bpf_attr *attr,
        case BPF_FLOW_DISSECTOR:
        case BPF_SK_LOOKUP:
                return netns_bpf_prog_query(attr, uattr);
+       case BPF_SK_SKB_STREAM_PARSER:
+       case BPF_SK_SKB_STREAM_VERDICT:
+       case BPF_SK_MSG_VERDICT:
+       case BPF_SK_SKB_VERDICT:
+               return sock_map_bpf_prog_query(attr, uattr);
        default:
                return -EINVAL;
        }
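With sock_map_bpf_prog_query() wired up, BPF_PROG_QUERY can now target a sockmap/sockhash fd directly. A hedged sketch of the userspace side, assuming map_fd is such a map; the attr.query field names are the existing UAPI ones:

#include <linux/bpf.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

static int query_sockmap_progs(int map_fd, __u32 *prog_ids, __u32 *cnt)
{
	union bpf_attr attr;
	int err;

	memset(&attr, 0, sizeof(attr));
	attr.query.target_fd   = map_fd;
	attr.query.attach_type = BPF_SK_SKB_STREAM_VERDICT;
	attr.query.prog_ids    = (__u64)(unsigned long)prog_ids;
	attr.query.prog_cnt    = *cnt;

	err = syscall(__NR_bpf, BPF_PROG_QUERY, &attr, sizeof(attr));
	if (!err)
		*cnt = attr.query.prog_cnt;
	return err;
}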
index 5e7edf9..7224691 100644
@@ -213,7 +213,7 @@ static void __bpf_tramp_image_put_deferred(struct work_struct *work)
        im = container_of(work, struct bpf_tramp_image, work);
        bpf_image_ksym_del(&im->ksym);
        bpf_jit_free_exec(im->image);
-       bpf_jit_uncharge_modmem(1);
+       bpf_jit_uncharge_modmem(PAGE_SIZE);
        percpu_ref_exit(&im->pcref);
        kfree_rcu(im, rcu);
 }
@@ -310,7 +310,7 @@ static struct bpf_tramp_image *bpf_tramp_image_alloc(u64 key, u32 idx)
        if (!im)
                goto out;
 
-       err = bpf_jit_charge_modmem(1);
+       err = bpf_jit_charge_modmem(PAGE_SIZE);
        if (err)
                goto out_free_im;
 
@@ -332,7 +332,7 @@ static struct bpf_tramp_image *bpf_tramp_image_alloc(u64 key, u32 idx)
 out_free_image:
        bpf_jit_free_exec(im->image);
 out_uncharge:
-       bpf_jit_uncharge_modmem(1);
+       bpf_jit_uncharge_modmem(PAGE_SIZE);
 out_free_im:
        kfree(im);
 out:
index a39eede..bbef86c 100644
@@ -452,7 +452,8 @@ static bool reg_type_may_be_refcounted_or_null(enum bpf_reg_type type)
 {
        return base_type(type) == PTR_TO_SOCKET ||
                base_type(type) == PTR_TO_TCP_SOCK ||
-               base_type(type) == PTR_TO_MEM;
+               base_type(type) == PTR_TO_MEM ||
+               base_type(type) == PTR_TO_BTF_ID;
 }
 
 static bool type_is_rdonly_mem(u32 type)
@@ -535,7 +536,7 @@ static bool is_cmpxchg_insn(const struct bpf_insn *insn)
 static const char *reg_type_str(struct bpf_verifier_env *env,
                                enum bpf_reg_type type)
 {
-       char postfix[16] = {0}, prefix[16] = {0};
+       char postfix[16] = {0}, prefix[32] = {0};
        static const char * const str[] = {
                [NOT_INIT]              = "?",
                [SCALAR_VALUE]          = "inv",
@@ -569,9 +570,11 @@ static const char *reg_type_str(struct bpf_verifier_env *env,
        }
 
        if (type & MEM_RDONLY)
-               strncpy(prefix, "rdonly_", 16);
+               strncpy(prefix, "rdonly_", 32);
        if (type & MEM_ALLOC)
-               strncpy(prefix, "alloc_", 16);
+               strncpy(prefix, "alloc_", 32);
+       if (type & MEM_USER)
+               strncpy(prefix, "user_", 32);
 
        snprintf(env->type_str_buf, TYPE_STR_BUF_LEN, "%s%s%s",
                 prefix, str[base_type(type)], postfix);
@@ -1546,14 +1549,15 @@ static void mark_reg_not_init(struct bpf_verifier_env *env,
 static void mark_btf_ld_reg(struct bpf_verifier_env *env,
                            struct bpf_reg_state *regs, u32 regno,
                            enum bpf_reg_type reg_type,
-                           struct btf *btf, u32 btf_id)
+                           struct btf *btf, u32 btf_id,
+                           enum bpf_type_flag flag)
 {
        if (reg_type == SCALAR_VALUE) {
                mark_reg_unknown(env, regs, regno);
                return;
        }
        mark_reg_known_zero(env, regs, regno);
-       regs[regno].type = PTR_TO_BTF_ID;
+       regs[regno].type = PTR_TO_BTF_ID | flag;
        regs[regno].btf = btf;
        regs[regno].btf_id = btf_id;
 }
@@ -1743,7 +1747,7 @@ find_kfunc_desc(const struct bpf_prog *prog, u32 func_id, u16 offset)
 }
 
 static struct btf *__find_kfunc_desc_btf(struct bpf_verifier_env *env,
-                                        s16 offset, struct module **btf_modp)
+                                        s16 offset)
 {
        struct bpf_kfunc_btf kf_btf = { .offset = offset };
        struct bpf_kfunc_btf_tab *tab;
@@ -1797,8 +1801,6 @@ static struct btf *__find_kfunc_desc_btf(struct bpf_verifier_env *env,
                sort(tab->descs, tab->nr_descs, sizeof(tab->descs[0]),
                     kfunc_btf_cmp_by_off, NULL);
        }
-       if (btf_modp)
-               *btf_modp = b->module;
        return b->btf;
 }
 
@@ -1815,8 +1817,7 @@ void bpf_free_kfunc_btf_tab(struct bpf_kfunc_btf_tab *tab)
 }
 
 static struct btf *find_kfunc_desc_btf(struct bpf_verifier_env *env,
-                                      u32 func_id, s16 offset,
-                                      struct module **btf_modp)
+                                      u32 func_id, s16 offset)
 {
        if (offset) {
                if (offset < 0) {
@@ -1827,7 +1828,7 @@ static struct btf *find_kfunc_desc_btf(struct bpf_verifier_env *env,
                        return ERR_PTR(-EINVAL);
                }
 
-               return __find_kfunc_desc_btf(env, offset, btf_modp);
+               return __find_kfunc_desc_btf(env, offset);
        }
        return btf_vmlinux ?: ERR_PTR(-ENOENT);
 }
@@ -1890,7 +1891,7 @@ static int add_kfunc_call(struct bpf_verifier_env *env, u32 func_id, s16 offset)
                prog_aux->kfunc_btf_tab = btf_tab;
        }
 
-       desc_btf = find_kfunc_desc_btf(env, func_id, offset, NULL);
+       desc_btf = find_kfunc_desc_btf(env, func_id, offset);
        if (IS_ERR(desc_btf)) {
                verbose(env, "failed to find BTF for kernel function\n");
                return PTR_ERR(desc_btf);
@@ -2351,7 +2352,7 @@ static const char *disasm_kfunc_name(void *data, const struct bpf_insn *insn)
        if (insn->src_reg != BPF_PSEUDO_KFUNC_CALL)
                return NULL;
 
-       desc_btf = find_kfunc_desc_btf(data, insn->imm, insn->off, NULL);
+       desc_btf = find_kfunc_desc_btf(data, insn->imm, insn->off);
        if (IS_ERR(desc_btf))
                return "<error>";
 
@@ -3498,11 +3499,6 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno,
 
 #define MAX_PACKET_OFF 0xffff
 
-static enum bpf_prog_type resolve_prog_type(struct bpf_prog *prog)
-{
-       return prog->aux->dst_prog ? prog->aux->dst_prog->type : prog->type;
-}
-
 static bool may_access_direct_pkt_data(struct bpf_verifier_env *env,
                                       const struct bpf_call_arg_meta *meta,
                                       enum bpf_access_type t)
@@ -4159,6 +4155,7 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env,
        struct bpf_reg_state *reg = regs + regno;
        const struct btf_type *t = btf_type_by_id(reg->btf, reg->btf_id);
        const char *tname = btf_name_by_offset(reg->btf, t->name_off);
+       enum bpf_type_flag flag = 0;
        u32 btf_id;
        int ret;
 
@@ -4178,9 +4175,16 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env,
                return -EACCES;
        }
 
+       if (reg->type & MEM_USER) {
+               verbose(env,
+                       "R%d is ptr_%s access user memory: off=%d\n",
+                       regno, tname, off);
+               return -EACCES;
+       }
+
        if (env->ops->btf_struct_access) {
                ret = env->ops->btf_struct_access(&env->log, reg->btf, t,
-                                                 off, size, atype, &btf_id);
+                                                 off, size, atype, &btf_id, &flag);
        } else {
                if (atype != BPF_READ) {
                        verbose(env, "only read is supported\n");
@@ -4188,14 +4192,14 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env,
                }
 
                ret = btf_struct_access(&env->log, reg->btf, t, off, size,
-                                       atype, &btf_id);
+                                       atype, &btf_id, &flag);
        }
 
        if (ret < 0)
                return ret;
 
        if (atype == BPF_READ && value_regno >= 0)
-               mark_btf_ld_reg(env, regs, value_regno, ret, reg->btf, btf_id);
+               mark_btf_ld_reg(env, regs, value_regno, ret, reg->btf, btf_id, flag);
 
        return 0;
 }
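The MEM_USER branch above means a PTR_TO_BTF_ID pointer whose target type carries the "user" BTF type tag can no longer be dereferenced directly; the program must copy through a helper. An illustrative BPF-side fragment (the 'args' parameter is hypothetical and stands for any user-tagged pointer):

/* 'args' is assumed to point at memory annotated
 * __attribute__((btf_type_tag("user"))) in the kernel's BTF. */
static __always_inline __u64 read_first_arg(__u64 *args)
{
	__u64 v = 0;

	/* v = args[0];   <- a direct load is now rejected with
	 *                   "R1 is ptr_... access user memory: off=0" */
	bpf_probe_read_user(&v, sizeof(v), &args[0]);
	return v;
}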
@@ -4208,6 +4212,7 @@ static int check_ptr_to_map_access(struct bpf_verifier_env *env,
 {
        struct bpf_reg_state *reg = regs + regno;
        struct bpf_map *map = reg->map_ptr;
+       enum bpf_type_flag flag = 0;
        const struct btf_type *t;
        const char *tname;
        u32 btf_id;
@@ -4245,12 +4250,12 @@ static int check_ptr_to_map_access(struct bpf_verifier_env *env,
                return -EACCES;
        }
 
-       ret = btf_struct_access(&env->log, btf_vmlinux, t, off, size, atype, &btf_id);
+       ret = btf_struct_access(&env->log, btf_vmlinux, t, off, size, atype, &btf_id, &flag);
        if (ret < 0)
                return ret;
 
        if (value_regno >= 0)
-               mark_btf_ld_reg(env, regs, value_regno, ret, btf_vmlinux, btf_id);
+               mark_btf_ld_reg(env, regs, value_regno, ret, btf_vmlinux, btf_id, flag);
 
        return 0;
 }
@@ -4451,7 +4456,8 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
                if (err < 0)
                        return err;
 
-               err = check_ctx_access(env, insn_idx, off, size, t, &reg_type, &btf, &btf_id);
+               err = check_ctx_access(env, insn_idx, off, size, t, &reg_type, &btf,
+                                      &btf_id);
                if (err)
                        verbose_linfo(env, insn_idx, "; ");
                if (!err && t == BPF_READ && value_regno >= 0) {
@@ -4877,6 +4883,62 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno,
        }
 }
 
+static int check_mem_size_reg(struct bpf_verifier_env *env,
+                             struct bpf_reg_state *reg, u32 regno,
+                             bool zero_size_allowed,
+                             struct bpf_call_arg_meta *meta)
+{
+       int err;
+
+       /* This is used to refine r0 return value bounds for helpers
+        * that enforce this value as an upper bound on return values.
+        * See do_refine_retval_range() for helpers that can refine
+        * the return value. The C type of the helper is u32, so we pull
+        * the register bound from umax_value; if it is negative, the
+        * verifier errors out. Only upper bounds can be learned because
+        * the retval is an int type and negative retvals are allowed.
+        */
+       if (meta)
+               meta->msize_max_value = reg->umax_value;
+
+       /* The register is SCALAR_VALUE; the access check
+        * happens using its boundaries.
+        */
+       if (!tnum_is_const(reg->var_off))
+               /* For unprivileged variable accesses, disable raw
+                * mode so that the program is required to
+                * initialize all the memory that the helper could
+                * just partially fill up.
+                */
+               meta = NULL;
+
+       if (reg->smin_value < 0) {
+               verbose(env, "R%d min value is negative, either use unsigned or 'var &= const'\n",
+                       regno);
+               return -EACCES;
+       }
+
+       if (reg->umin_value == 0) {
+               err = check_helper_mem_access(env, regno - 1, 0,
+                                             zero_size_allowed,
+                                             meta);
+               if (err)
+                       return err;
+       }
+
+       if (reg->umax_value >= BPF_MAX_VAR_SIZ) {
+               verbose(env, "R%d unbounded memory access, use 'var &= const' or 'if (var < const)'\n",
+                       regno);
+               return -EACCES;
+       }
+       err = check_helper_mem_access(env, regno - 1,
+                                     reg->umax_value,
+                                     zero_size_allowed, meta);
+       if (!err)
+               err = mark_chain_precision(env, regno);
+       return err;
+}
+
 int check_mem_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
                   u32 regno, u32 mem_size)
 {
@@ -4900,6 +4962,28 @@ int check_mem_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
        return check_helper_mem_access(env, regno, mem_size, true, NULL);
 }
 
+int check_kfunc_mem_size_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
+                            u32 regno)
+{
+       struct bpf_reg_state *mem_reg = &cur_regs(env)[regno - 1];
+       bool may_be_null = type_may_be_null(mem_reg->type);
+       struct bpf_reg_state saved_reg;
+       int err;
+
+       WARN_ON_ONCE(regno < BPF_REG_2 || regno > BPF_REG_5);
+
+       if (may_be_null) {
+               saved_reg = *mem_reg;
+               mark_ptr_not_null_reg(mem_reg);
+       }
+
+       err = check_mem_size_reg(env, reg, regno, true, NULL);
+
+       if (may_be_null)
+               *mem_reg = saved_reg;
+       return err;
+}
+
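check_kfunc_mem_size_reg() lets a kfunc accept a (pointer, size) pair where the pointer may be NULL: the size register is checked as though the pointer were valid, then the saved register state is restored. A hypothetical kernel-side kfunc using the __sz suffix convention that pairs a size argument with the preceding memory argument (the function name is invented for illustration):

/* Hypothetical kfunc: 'data__sz' is matched to 'data' by the "__sz"
 * suffix; 'data' may be a maybe-NULL PTR_TO_BTF_ID returned by an
 * earlier RET_NULL kfunc, which is exactly the case handled above. */
noinline int bpf_demo_consume_buf(void *data, u32 data__sz)
{
	if (!data)
		return -EINVAL;
	/* ... consume data[0 .. data__sz) ... */
	return 0;
}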
 /* Implementation details:
  * bpf_map_lookup returns PTR_TO_MAP_VALUE_OR_NULL
  * Two bpf_map_lookups (even with the same key) will have different reg->id.
@@ -5439,51 +5523,7 @@ skip_type_check:
        } else if (arg_type_is_mem_size(arg_type)) {
                bool zero_size_allowed = (arg_type == ARG_CONST_SIZE_OR_ZERO);
 
-               /* This is used to refine r0 return value bounds for helpers
-                * that enforce this value as an upper bound on return values.
-                * See do_refine_retval_range() for helpers that can refine
-                * the return value. C type of helper is u32 so we pull register
-                * bound from umax_value however, if negative verifier errors
-                * out. Only upper bounds can be learned because retval is an
-                * int type and negative retvals are allowed.
-                */
-               meta->msize_max_value = reg->umax_value;
-
-               /* The register is SCALAR_VALUE; the access check
-                * happens using its boundaries.
-                */
-               if (!tnum_is_const(reg->var_off))
-                       /* For unprivileged variable accesses, disable raw
-                        * mode so that the program is required to
-                        * initialize all the memory that the helper could
-                        * just partially fill up.
-                        */
-                       meta = NULL;
-
-               if (reg->smin_value < 0) {
-                       verbose(env, "R%d min value is negative, either use unsigned or 'var &= const'\n",
-                               regno);
-                       return -EACCES;
-               }
-
-               if (reg->umin_value == 0) {
-                       err = check_helper_mem_access(env, regno - 1, 0,
-                                                     zero_size_allowed,
-                                                     meta);
-                       if (err)
-                               return err;
-               }
-
-               if (reg->umax_value >= BPF_MAX_VAR_SIZ) {
-                       verbose(env, "R%d unbounded memory access, use 'var &= const' or 'if (var < const)'\n",
-                               regno);
-                       return -EACCES;
-               }
-               err = check_helper_mem_access(env, regno - 1,
-                                             reg->umax_value,
-                                             zero_size_allowed, meta);
-               if (!err)
-                       err = mark_chain_precision(env, regno);
+               err = check_mem_size_reg(env, reg, regno, zero_size_allowed, meta);
        } else if (arg_type_is_alloc_size(arg_type)) {
                if (!tnum_is_const(reg->var_off)) {
                        verbose(env, "R%d is not a known constant'\n",
@@ -6842,22 +6882,23 @@ static void mark_btf_func_reg_size(struct bpf_verifier_env *env, u32 regno,
        }
 }
 
-static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn)
+static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
+                           int *insn_idx_p)
 {
        const struct btf_type *t, *func, *func_proto, *ptr_type;
        struct bpf_reg_state *regs = cur_regs(env);
        const char *func_name, *ptr_type_name;
        u32 i, nargs, func_id, ptr_type_id;
-       struct module *btf_mod = NULL;
+       int err, insn_idx = *insn_idx_p;
        const struct btf_param *args;
        struct btf *desc_btf;
-       int err;
+       bool acq;
 
        /* skip for now, but return error when we find this in fixup_kfunc_call */
        if (!insn->imm)
                return 0;
 
-       desc_btf = find_kfunc_desc_btf(env, insn->imm, insn->off, &btf_mod);
+       desc_btf = find_kfunc_desc_btf(env, insn->imm, insn->off);
        if (IS_ERR(desc_btf))
                return PTR_ERR(desc_btf);
 
@@ -6866,23 +6907,43 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn)
        func_name = btf_name_by_offset(desc_btf, func->name_off);
        func_proto = btf_type_by_id(desc_btf, func->type);
 
-       if (!env->ops->check_kfunc_call ||
-           !env->ops->check_kfunc_call(func_id, btf_mod)) {
+       if (!btf_kfunc_id_set_contains(desc_btf, resolve_prog_type(env->prog),
+                                     BTF_KFUNC_TYPE_CHECK, func_id)) {
                verbose(env, "calling kernel function %s is not allowed\n",
                        func_name);
                return -EACCES;
        }
 
+       acq = btf_kfunc_id_set_contains(desc_btf, resolve_prog_type(env->prog),
+                                       BTF_KFUNC_TYPE_ACQUIRE, func_id);
+
        /* Check the arguments */
        err = btf_check_kfunc_arg_match(env, desc_btf, func_id, regs);
-       if (err)
+       if (err < 0)
                return err;
+       /* In the case of a release function, btf_check_kfunc_arg_match
+        * returns the register number of the refcounted PTR_TO_BTF_ID;
+        * do the release now. */
+       if (err) {
+               err = release_reference(env, regs[err].ref_obj_id);
+               if (err) {
+                       verbose(env, "kfunc %s#%d reference has not been acquired before\n",
+                               func_name, func_id);
+                       return err;
+               }
+       }
 
        for (i = 0; i < CALLER_SAVED_REGS; i++)
                mark_reg_not_init(env, regs, caller_saved[i]);
 
        /* Check return type */
        t = btf_type_skip_modifiers(desc_btf, func_proto->type, NULL);
+
+       if (acq && !btf_type_is_ptr(t)) {
+               verbose(env, "acquire kernel function does not return PTR_TO_BTF_ID\n");
+               return -EINVAL;
+       }
+
        if (btf_type_is_scalar(t)) {
                mark_reg_unknown(env, regs, BPF_REG_0);
                mark_btf_func_reg_size(env, BPF_REG_0, t->size);
@@ -6901,7 +6962,21 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn)
                regs[BPF_REG_0].btf = desc_btf;
                regs[BPF_REG_0].type = PTR_TO_BTF_ID;
                regs[BPF_REG_0].btf_id = ptr_type_id;
+               if (btf_kfunc_id_set_contains(desc_btf, resolve_prog_type(env->prog),
+                                             BTF_KFUNC_TYPE_RET_NULL, func_id)) {
+                       regs[BPF_REG_0].type |= PTR_MAYBE_NULL;
+                       /* For mark_ptr_or_null_reg, see 93c230e3f5bd6 */
+                       regs[BPF_REG_0].id = ++env->id_gen;
+               }
                mark_btf_func_reg_size(env, BPF_REG_0, sizeof(void *));
+               if (acq) {
+                       int id = acquire_reference_state(env, insn_idx);
+
+                       if (id < 0)
+                               return id;
+                       regs[BPF_REG_0].id = id;
+                       regs[BPF_REG_0].ref_obj_id = id;
+               }
        } /* else { add_kfunc_call() ensures it is btf_type_is_void(t) } */
 
        nargs = btf_type_vlen(func_proto);
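The btf_kfunc_id_set_contains() calls above replace the old per-ops check_kfunc_call callback: allowed, acquire, release and ret-null kfuncs are now looked up in BTF ID sets registered per program type. A hedged sketch of a module-side registration under that scheme (the bpf_demo_* names are invented; the struct fields mirror the BTF_KFUNC_TYPE_* kinds used above):

BTF_SET_START(demo_check_kfunc_ids)
BTF_ID(func, bpf_demo_acquire)
BTF_ID(func, bpf_demo_release)
BTF_SET_END(demo_check_kfunc_ids)

BTF_SET_START(demo_acquire_kfunc_ids)
BTF_ID(func, bpf_demo_acquire)
BTF_SET_END(demo_acquire_kfunc_ids)

BTF_SET_START(demo_release_kfunc_ids)
BTF_ID(func, bpf_demo_release)
BTF_SET_END(demo_release_kfunc_ids)

static const struct btf_kfunc_id_set demo_kfunc_set = {
	.owner       = THIS_MODULE,
	.check_set   = &demo_check_kfunc_ids,
	.acquire_set = &demo_acquire_kfunc_ids,
	.release_set = &demo_release_kfunc_ids,
};

/* at module/subsystem init: */
register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, &demo_kfunc_set);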
@@ -11549,7 +11624,7 @@ static int do_check(struct bpf_verifier_env *env)
                                if (insn->src_reg == BPF_PSEUDO_CALL)
                                        err = check_func_call(env, insn, &env->insn_idx);
                                else if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL)
-                                       err = check_kfunc_call(env, insn);
+                                       err = check_kfunc_call(env, insn, &env->insn_idx);
                                else
                                        err = check_helper_call(env, insn, &env->insn_idx);
                                if (err)
@@ -12992,6 +13067,7 @@ static int jit_subprogs(struct bpf_verifier_env *env)
 
        prog->jited = 1;
        prog->bpf_func = func[0]->bpf_func;
+       prog->jited_len = func[0]->jited_len;
        prog->aux->func = func;
        prog->aux->func_cnt = env->subprog_cnt;
        bpf_prog_jit_attempt_done(prog);
index 21aa306..a2024ba 100644
@@ -1235,6 +1235,8 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
                return &bpf_get_task_stack_proto;
        case BPF_FUNC_copy_from_user:
                return prog->aux->sleepable ? &bpf_copy_from_user_proto : NULL;
+       case BPF_FUNC_copy_from_user_task:
+               return prog->aux->sleepable ? &bpf_copy_from_user_task_proto : NULL;
        case BPF_FUNC_snprintf_btf:
                return &bpf_snprintf_btf_proto;
        case BPF_FUNC_per_cpu_ptr:
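Like bpf_copy_from_user, the new bpf_copy_from_user_task helper is gated on sleepable programs because it may fault in pages. A hedged sketch from a sleepable task iterator, assuming the usual vmlinux.h/libbpf setup and the (dst, size, user_ptr, task, flags) signature with flags required to be 0:

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

SEC("iter.s/task")
int dump_task_arg(struct bpf_iter__task *ctx)
{
	struct task_struct *task = ctx->task;
	char buf[16] = {};

	if (!task || !task->mm)
		return 0;
	/* read the start of another task's argv from its user memory */
	bpf_copy_from_user_task(buf, sizeof(buf),
				(const void *)task->mm->arg_start, task, 0);
	BPF_SEQ_PRINTF(ctx->meta->seq, "%d: %s\n", task->pid, buf);
	return 0;
}

char LICENSE[] SEC("license") = "GPL";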
@@ -1562,6 +1564,7 @@ static const struct bpf_func_proto bpf_perf_event_output_proto_raw_tp = {
 
 extern const struct bpf_func_proto bpf_skb_output_proto;
 extern const struct bpf_func_proto bpf_xdp_output_proto;
+extern const struct bpf_func_proto bpf_xdp_get_buff_len_trace_proto;
 
 BPF_CALL_3(bpf_get_stackid_raw_tp, struct bpf_raw_tracepoint_args *, args,
           struct bpf_map *, map, u64, flags)
@@ -1661,6 +1664,8 @@ tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
                return &bpf_sock_from_file_proto;
        case BPF_FUNC_get_socket_cookie:
                return &bpf_get_socket_ptr_cookie_proto;
+       case BPF_FUNC_xdp_get_buff_len:
+               return &bpf_xdp_get_buff_len_trace_proto;
 #endif
        case BPF_FUNC_seq_printf:
                return prog->expected_attach_type == BPF_TRACE_ITER ?
index 14b89aa..1555da6 100644
@@ -296,7 +296,7 @@ config DEBUG_INFO_DWARF4
 config DEBUG_INFO_DWARF5
        bool "Generate DWARF Version 5 debuginfo"
        depends on !CC_IS_CLANG || (CC_IS_CLANG && (AS_IS_LLVM || (AS_IS_GNU && AS_VERSION >= 23502)))
-       depends on !DEBUG_INFO_BTF
+       depends on !DEBUG_INFO_BTF || PAHOLE_VERSION >= 121
        help
          Generate DWARF v5 debug info. Requires binutils 2.35.2, gcc 5.0+ (gcc
          5.0+ accepts the -gdwarf-5 flag but only had partial support for some
@@ -323,7 +323,15 @@ config DEBUG_INFO_BTF
          DWARF type info into equivalent deduplicated BTF type info.
 
 config PAHOLE_HAS_SPLIT_BTF
-       def_bool $(success, test `$(PAHOLE) --version | sed -E 's/v([0-9]+)\.([0-9]+)/\1\2/'` -ge "119")
+       def_bool PAHOLE_VERSION >= 119
+
+config PAHOLE_HAS_BTF_TAG
+       def_bool PAHOLE_VERSION >= 123
+       depends on CC_IS_CLANG
+       help
+         Decide whether pahole emits btf_tag attributes (btf_type_tag and
+         btf_decl_tag). Currently only the clang compiler implements these
+         attributes, so the config depends on CC_IS_CLANG.
 
 config DEBUG_INFO_BTF_MODULES
        def_bool y
index a6789c0..dc7b14a 100644
@@ -20,6 +20,7 @@ void ref_tracker_dir_exit(struct ref_tracker_dir *dir)
        unsigned long flags;
        bool leak = false;
 
+       dir->dead = true;
        spin_lock_irqsave(&dir->lock, flags);
        list_for_each_entry_safe(tracker, n, &dir->quarantine, head) {
                list_del(&tracker->head);
@@ -37,6 +38,7 @@ void ref_tracker_dir_exit(struct ref_tracker_dir *dir)
        spin_unlock_irqrestore(&dir->lock, flags);
        WARN_ON_ONCE(leak);
        WARN_ON_ONCE(refcount_read(&dir->untracked) != 1);
+       WARN_ON_ONCE(refcount_read(&dir->no_tracker) != 1);
 }
 EXPORT_SYMBOL(ref_tracker_dir_exit);
 
@@ -72,6 +74,12 @@ int ref_tracker_alloc(struct ref_tracker_dir *dir,
        gfp_t gfp_mask = gfp;
        unsigned long flags;
 
+       WARN_ON_ONCE(dir->dead);
+
+       if (!trackerp) {
+               refcount_inc(&dir->no_tracker);
+               return 0;
+       }
        if (gfp & __GFP_DIRECT_RECLAIM)
                gfp_mask |= __GFP_NOFAIL;
        *trackerp = tracker = kzalloc(sizeof(*tracker), gfp_mask);
@@ -81,7 +89,6 @@ int ref_tracker_alloc(struct ref_tracker_dir *dir,
                return -ENOMEM;
        }
        nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 1);
-       nr_entries = filter_irq_stacks(entries, nr_entries);
        tracker->alloc_stack_handle = stack_depot_save(entries, nr_entries, gfp);
 
        spin_lock_irqsave(&dir->lock, flags);
@@ -95,17 +102,23 @@ int ref_tracker_free(struct ref_tracker_dir *dir,
                     struct ref_tracker **trackerp)
 {
        unsigned long entries[REF_TRACKER_STACK_ENTRIES];
-       struct ref_tracker *tracker = *trackerp;
        depot_stack_handle_t stack_handle;
+       struct ref_tracker *tracker;
        unsigned int nr_entries;
        unsigned long flags;
 
+       WARN_ON_ONCE(dir->dead);
+
+       if (!trackerp) {
+               refcount_dec(&dir->no_tracker);
+               return 0;
+       }
+       tracker = *trackerp;
        if (!tracker) {
                refcount_dec(&dir->untracked);
                return -EEXIST;
        }
        nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 1);
-       nr_entries = filter_irq_stacks(entries, nr_entries);
        stack_handle = stack_depot_save(entries, nr_entries, GFP_ATOMIC);
 
        spin_lock_irqsave(&dir->lock, flags);
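Taken together, the ref_tracker hunks make a NULL trackerp an explicit "untracked but counted" mode (dir->no_tracker) and flag alloc/free after ref_tracker_dir_exit() via dir->dead. A minimal usage sketch, assuming the two-argument ref_tracker_dir_init() of this era:

struct ref_tracker_dir dir;
struct ref_tracker *tracker;

ref_tracker_dir_init(&dir, 16);

ref_tracker_alloc(&dir, &tracker, GFP_KERNEL); /* stack-tracked reference */
ref_tracker_alloc(&dir, NULL, GFP_ATOMIC);     /* counted in dir->no_tracker */

ref_tracker_free(&dir, NULL);                  /* pairs with the NULL alloc */
ref_tracker_free(&dir, &tracker);

ref_tracker_dir_exit(&dir);                    /* WARNs if anything leaked */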
index a068757..7b3341c 100644
@@ -5,6 +5,7 @@
  * (C) 2015 Pengutronix, Alexander Aring <aar@pengutronix.de>
  */
 
+#include <linux/if_arp.h>
 #include <linux/module.h>
 
 #include <net/6lowpan.h>
index 9751207..b7c4d65 100644
@@ -116,7 +116,6 @@ static int __must_check ax25_rt_add(struct ax25_routes_struct *route)
                return -ENOMEM;
        }
 
-       refcount_set(&ax25_rt->refcount, 1);
        ax25_rt->callsign     = route->dest_addr;
        ax25_rt->dev          = ax25_dev->dev;
        ax25_rt->digipeat     = NULL;
@@ -167,12 +166,12 @@ static int ax25_rt_del(struct ax25_routes_struct *route)
                    ax25cmp(&route->dest_addr, &s->callsign) == 0) {
                        if (ax25_route_list == s) {
                                ax25_route_list = s->next;
-                               ax25_put_route(s);
+                               __ax25_put_route(s);
                        } else {
                                for (t = ax25_route_list; t != NULL; t = t->next) {
                                        if (t->next == s) {
                                                t->next = s->next;
-                                               ax25_put_route(s);
+                                               __ax25_put_route(s);
                                                break;
                                        }
                                }
index f4004cf..9f311fd 100644
@@ -134,7 +134,7 @@ static u8 batadv_mcast_mla_rtr_flags_softif_get_ipv6(struct net_device *dev)
 {
        struct inet6_dev *in6_dev = __in6_dev_get(dev);
 
-       if (in6_dev && in6_dev->cnf.mc_forwarding)
+       if (in6_dev && atomic_read(&in6_dev->cnf.mc_forwarding))
                return BATADV_NO_FLAGS;
        else
                return BATADV_MCAST_WANT_NO_RTR6;
index 04ebe90..d106511 100644
@@ -689,6 +689,7 @@ struct hci_conn *hci_conn_add(struct hci_dev *hdev, int type, bdaddr_t *dst,
 
        bacpy(&conn->dst, dst);
        bacpy(&conn->src, &hdev->bdaddr);
+       conn->handle = HCI_CONN_HANDLE_UNSET;
        conn->hdev  = hdev;
        conn->type  = type;
        conn->role  = role;
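HCI_CONN_HANDLE_UNSET is a sentinel outside the valid handle range, so the completion handlers below can distinguish a first-time setup from a duplicate event. The definitions are presumably along these lines (the spec caps connection handles at 0x0eff, so 0xffff can never collide with a real handle):

/* assumed definitions in the Bluetooth headers */
#define HCI_CONN_HANDLE_UNSET	0xffff
#define HCI_CONN_HANDLE_MAX	0x0eff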
index 2b7bd36..5bde0ec 100644
@@ -2503,6 +2503,7 @@ struct hci_dev *hci_alloc_dev_priv(int sizeof_priv)
        INIT_LIST_HEAD(&hdev->conn_hash.list);
        INIT_LIST_HEAD(&hdev->adv_instances);
        INIT_LIST_HEAD(&hdev->blocked_keys);
+       INIT_LIST_HEAD(&hdev->monitored_devices);
 
        INIT_LIST_HEAD(&hdev->local_codecs);
        INIT_WORK(&hdev->rx_work, hci_rx_work);
@@ -3666,8 +3667,8 @@ static void hci_scodata_packet(struct hci_dev *hdev, struct sk_buff *skb)
                sco_recv_scodata(conn, skb);
                return;
        } else {
-               bt_dev_err(hdev, "SCO packet for unknown connection handle %d",
-                          handle);
+               bt_dev_err_ratelimited(hdev, "SCO packet for unknown connection handle %d",
+                                      handle);
        }
 
        kfree_skb(skb);
index fc30f4c..63b9259 100644
@@ -3068,6 +3068,11 @@ static void hci_conn_complete_evt(struct hci_dev *hdev, void *data,
        struct hci_ev_conn_complete *ev = data;
        struct hci_conn *conn;
 
+       if (__le16_to_cpu(ev->handle) > HCI_CONN_HANDLE_MAX) {
+               bt_dev_err(hdev, "Ignoring HCI_Connection_Complete for invalid handle");
+               return;
+       }
+
        bt_dev_dbg(hdev, "status 0x%2.2x", ev->status);
 
        hci_dev_lock(hdev);
@@ -3106,6 +3111,17 @@ static void hci_conn_complete_evt(struct hci_dev *hdev, void *data,
                }
        }
 
+       /* The HCI_Connection_Complete event is only sent once per connection.
+        * Processing it more than once per connection can corrupt kernel memory.
+        *
+        * As the connection handle is set here for the first time, it indicates
+        * whether the connection is already set up.
+        */
+       if (conn->handle != HCI_CONN_HANDLE_UNSET) {
+               bt_dev_err(hdev, "Ignoring HCI_Connection_Complete for existing connection");
+               goto unlock;
+       }
+
        if (!ev->status) {
                conn->handle = __le16_to_cpu(ev->handle);
 
@@ -4534,7 +4550,7 @@ static void hci_inquiry_result_with_rssi_evt(struct hci_dev *hdev, void *edata,
                        if (!info) {
                                bt_dev_err(hdev, "Malformed HCI Event: 0x%2.2x",
                                           HCI_EV_INQUIRY_RESULT_WITH_RSSI);
-                               return;
+                               goto unlock;
                        }
 
                        bacpy(&data.bdaddr, &info->bdaddr);
@@ -4565,7 +4581,7 @@ static void hci_inquiry_result_with_rssi_evt(struct hci_dev *hdev, void *edata,
                        if (!info) {
                                bt_dev_err(hdev, "Malformed HCI Event: 0x%2.2x",
                                           HCI_EV_INQUIRY_RESULT_WITH_RSSI);
-                               return;
+                               goto unlock;
                        }
 
                        bacpy(&data.bdaddr, &info->bdaddr);
@@ -4587,7 +4603,7 @@ static void hci_inquiry_result_with_rssi_evt(struct hci_dev *hdev, void *edata,
                bt_dev_err(hdev, "Malformed HCI Event: 0x%2.2x",
                           HCI_EV_INQUIRY_RESULT_WITH_RSSI);
        }
-
+unlock:
        hci_dev_unlock(hdev);
 }
 
@@ -4661,6 +4677,24 @@ static void hci_sync_conn_complete_evt(struct hci_dev *hdev, void *data,
        struct hci_ev_sync_conn_complete *ev = data;
        struct hci_conn *conn;
 
+       switch (ev->link_type) {
+       case SCO_LINK:
+       case ESCO_LINK:
+               break;
+       default:
+               /* As per Core 5.3 Vol 4 Part E 7.7.35 (p.2219), Link_Type
+                * for HCI_Synchronous_Connection_Complete is limited to
+                * either SCO or eSCO
+                */
+               bt_dev_err(hdev, "Ignoring connect complete event for invalid link type");
+               return;
+       }
+
+       if (__le16_to_cpu(ev->handle) > HCI_CONN_HANDLE_MAX) {
+               bt_dev_err(hdev, "Ignoring HCI_Sync_Conn_Complete for invalid handle");
+               return;
+       }
+
        bt_dev_dbg(hdev, "status 0x%2.2x", ev->status);
 
        hci_dev_lock(hdev);
@@ -4684,23 +4718,19 @@ static void hci_sync_conn_complete_evt(struct hci_dev *hdev, void *data,
                        goto unlock;
        }
 
+       /* The HCI_Synchronous_Connection_Complete event is only sent once per connection.
+        * Processing it more than once per connection can corrupt kernel memory.
+        *
+        * As the connection handle is set here for the first time, it indicates
+        * whether the connection is already set up.
+        */
+       if (conn->handle != HCI_CONN_HANDLE_UNSET) {
+               bt_dev_err(hdev, "Ignoring HCI_Sync_Conn_Complete event for existing connection");
+               goto unlock;
+       }
+
        switch (ev->status) {
        case 0x00:
-               /* The synchronous connection complete event should only be
-                * sent once per new connection. Receiving a successful
-                * complete event when the connection status is already
-                * BT_CONNECTED means that the device is misbehaving and sent
-                * multiple complete event packets for the same new connection.
-                *
-                * Registering the device more than once can corrupt kernel
-                * memory, hence upon detecting this invalid event, we report
-                * an error and ignore the packet.
-                */
-               if (conn->state == BT_CONNECTED) {
-                       bt_dev_err(hdev, "Ignoring connect complete event for existing connection");
-                       goto unlock;
-               }
-
                conn->handle = __le16_to_cpu(ev->handle);
                conn->state  = BT_CONNECTED;
                conn->type   = ev->link_type;
@@ -5496,6 +5526,11 @@ static void le_conn_complete_evt(struct hci_dev *hdev, u8 status,
        struct smp_irk *irk;
        u8 addr_type;
 
+       if (handle > HCI_CONN_HANDLE_MAX) {
+               bt_dev_err(hdev, "Ignoring HCI_LE_Connection_Complete for invalid handle");
+               return;
+       }
+
        hci_dev_lock(hdev);
 
        /* All controllers implicitly stop advertising in the event of a
@@ -5537,6 +5572,17 @@ static void le_conn_complete_evt(struct hci_dev *hdev, u8 status,
                cancel_delayed_work(&conn->le_conn_timeout);
        }
 
+       /* The HCI_LE_Connection_Complete event is only sent once per connection.
+        * Processing it more than once per connection can corrupt kernel memory.
+        *
+        * As the connection handle is set here for the first time, it indicates
+        * whether the connection is already set up.
+        */
+       if (conn->handle != HCI_CONN_HANDLE_UNSET) {
+               bt_dev_err(hdev, "Ignoring HCI_Connection_Complete for existing connection");
+               goto unlock;
+       }
+
        le_conn_update_addr(conn, bdaddr, bdaddr_type, local_rpa);
 
        /* Lookup the identity address from the stored connection
@@ -6798,7 +6844,7 @@ static const struct hci_ev {
        HCI_EV(HCI_EV_NUM_COMP_BLOCKS, hci_num_comp_blocks_evt,
               sizeof(struct hci_ev_num_comp_blocks)),
        /* [0xff = HCI_EV_VENDOR] */
-       HCI_EV(HCI_EV_VENDOR, msft_vendor_evt, 0),
+       HCI_EV_VL(HCI_EV_VENDOR, msft_vendor_evt, 0, HCI_MAX_EVENT_SIZE),
 };
 
 static void hci_event_func(struct hci_dev *hdev, u8 event, struct sk_buff *skb,
@@ -6823,8 +6869,9 @@ static void hci_event_func(struct hci_dev *hdev, u8 event, struct sk_buff *skb,
         * decide if that is acceptable.
         */
        if (skb->len > ev->max_len)
-               bt_dev_warn(hdev, "unexpected event 0x%2.2x length: %u > %u",
-                           event, skb->len, ev->max_len);
+               bt_dev_warn_ratelimited(hdev,
+                                       "unexpected event 0x%2.2x length: %u > %u",
+                                       event, skb->len, ev->max_len);
 
        data = hci_ev_skb_pull(hdev, skb, event, ev->min_len);
        if (!data)
index 0feb68f..6e71aa6 100644
@@ -382,6 +382,9 @@ int hci_cmd_sync_queue(struct hci_dev *hdev, hci_cmd_sync_work_func_t func,
 {
        struct hci_cmd_sync_work_entry *entry;
 
+       if (hci_dev_test_flag(hdev, HCI_UNREGISTER))
+               return -ENODEV;
+
        entry = kmalloc(sizeof(*entry), GFP_KERNEL);
        if (!entry)
                return -ENOMEM;
@@ -5140,8 +5143,8 @@ static void set_ext_conn_params(struct hci_conn *conn,
        p->max_ce_len = cpu_to_le16(0x0000);
 }
 
-int hci_le_ext_create_conn_sync(struct hci_dev *hdev, struct hci_conn *conn,
-                               u8 own_addr_type)
+static int hci_le_ext_create_conn_sync(struct hci_dev *hdev,
+                                      struct hci_conn *conn, u8 own_addr_type)
 {
        struct hci_cp_le_ext_create_conn *cp;
        struct hci_cp_le_ext_conn_param *p;
index 37087cf..5dd684e 100644
@@ -42,7 +42,7 @@
 #include "aosp.h"
 
 #define MGMT_VERSION   1
-#define MGMT_REVISION  21
+#define MGMT_REVISION  22
 
 static const u16 mgmt_commands[] = {
        MGMT_OP_READ_INDEX_LIST,
@@ -174,6 +174,8 @@ static const u16 mgmt_events[] = {
        MGMT_EV_ADV_MONITOR_REMOVED,
        MGMT_EV_CONTROLLER_SUSPEND,
        MGMT_EV_CONTROLLER_RESUME,
+       MGMT_EV_ADV_MONITOR_DEVICE_FOUND,
+       MGMT_EV_ADV_MONITOR_DEVICE_LOST,
 };
 
 static const u16 mgmt_untrusted_commands[] = {
@@ -9589,12 +9591,116 @@ static bool is_filter_match(struct hci_dev *hdev, s8 rssi, u8 *eir,
        return true;
 }
 
+void mgmt_adv_monitor_device_lost(struct hci_dev *hdev, u16 handle,
+                                 bdaddr_t *bdaddr, u8 addr_type)
+{
+       struct mgmt_ev_adv_monitor_device_lost ev;
+
+       ev.monitor_handle = cpu_to_le16(handle);
+       bacpy(&ev.addr.bdaddr, bdaddr);
+       ev.addr.type = addr_type;
+
+       mgmt_event(MGMT_EV_ADV_MONITOR_DEVICE_LOST, hdev, &ev, sizeof(ev),
+                  NULL);
+}
+
+static void mgmt_adv_monitor_device_found(struct hci_dev *hdev,
+                                         bdaddr_t *bdaddr, bool report_device,
+                                         struct sk_buff *skb,
+                                         struct sock *skip_sk)
+{
+       struct sk_buff *advmon_skb;
+       size_t advmon_skb_len;
+       __le16 *monitor_handle;
+       struct monitored_device *dev, *tmp;
+       bool matched = false;
+       bool notify = false;
+
+       /* We have received the Advertisement Report because:
+        * 1. the kernel has initiated active discovery
+        * 2. if not, we have pend_le_reports > 0 in which case we are doing
+        *    passive scanning
+        * 3. if none of the above is true, we have one or more active
+        *    Advertisement Monitors
+        *
+        * For case 1 and 2, report all advertisements via MGMT_EV_DEVICE_FOUND
+        * and report ONLY one advertisement per device for the matched Monitor
+        * via MGMT_EV_ADV_MONITOR_DEVICE_FOUND event.
+        *
+        * For case 3, since we are not actively scanning and all advertisements
+        * received are due to a matched Advertisement Monitor, report all
+        * advertisements ONLY via MGMT_EV_ADV_MONITOR_DEVICE_FOUND event.
+        */
+       if (report_device && !hdev->advmon_pend_notify) {
+               mgmt_event_skb(skb, skip_sk);
+               return;
+       }
+
+       advmon_skb_len = (sizeof(struct mgmt_ev_adv_monitor_device_found) -
+                         sizeof(struct mgmt_ev_device_found)) + skb->len;
+       advmon_skb = mgmt_alloc_skb(hdev, MGMT_EV_ADV_MONITOR_DEVICE_FOUND,
+                                   advmon_skb_len);
+       if (!advmon_skb) {
+               if (report_device)
+                       mgmt_event_skb(skb, skip_sk);
+               else
+                       kfree_skb(skb);
+               return;
+       }
+
+       /* ADV_MONITOR_DEVICE_FOUND is similar to DEVICE_FOUND event except
+        * that it also has 'monitor_handle'. Make a copy of DEVICE_FOUND and
+        * store monitor_handle of the matched monitor.
+        */
+       monitor_handle = skb_put(advmon_skb, sizeof(*monitor_handle));
+       skb_put_data(advmon_skb, skb->data, skb->len);
+
+       hdev->advmon_pend_notify = false;
+
+       list_for_each_entry_safe(dev, tmp, &hdev->monitored_devices, list) {
+               if (!bacmp(&dev->bdaddr, bdaddr)) {
+                       matched = true;
+
+                       if (!dev->notified) {
+                               *monitor_handle = cpu_to_le16(dev->handle);
+                               notify = true;
+                               dev->notified = true;
+                       }
+               }
+
+               if (!dev->notified)
+                       hdev->advmon_pend_notify = true;
+       }
+
+       if (!report_device &&
+           ((matched && !notify) || !msft_monitor_supported(hdev))) {
+               /* Handle 0 indicates that we are not actively scanning and this
+                * is a subsequent advertisement report for an already matched
+                * Advertisement Monitor or the controller offloading support
+                * is not available.
+                */
+               *monitor_handle = 0;
+               notify = true;
+       }
+
+       if (report_device)
+               mgmt_event_skb(skb, skip_sk);
+       else
+               kfree_skb(skb);
+
+       if (notify)
+               mgmt_event_skb(advmon_skb, skip_sk);
+       else
+               kfree_skb(advmon_skb);
+}
+
 void mgmt_device_found(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 link_type,
                       u8 addr_type, u8 *dev_class, s8 rssi, u32 flags,
                       u8 *eir, u16 eir_len, u8 *scan_rsp, u8 scan_rsp_len)
 {
        struct sk_buff *skb;
        struct mgmt_ev_device_found *ev;
+       bool report_device = hci_discovery_active(hdev);
 
        /* Don't send events for a non-kernel initiated discovery. With
         * LE one exception is if we have pend_le_reports > 0 in which
@@ -9603,11 +9709,10 @@ void mgmt_device_found(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 link_type,
        if (!hci_discovery_active(hdev)) {
                if (link_type == ACL_LINK)
                        return;
-               if (link_type == LE_LINK &&
-                   list_empty(&hdev->pend_le_reports) &&
-                   !hci_is_adv_monitoring(hdev)) {
+               if (link_type == LE_LINK && !list_empty(&hdev->pend_le_reports))
+                       report_device = true;
+               else if (!hci_is_adv_monitoring(hdev))
                        return;
-               }
        }
 
        if (hdev->discovery.result_filtering) {
@@ -9672,7 +9777,7 @@ void mgmt_device_found(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 link_type,
 
        ev->eir_len = cpu_to_le16(eir_len + scan_rsp_len);
 
-       mgmt_event_skb(skb, NULL);
+       mgmt_adv_monitor_device_found(hdev, bdaddr, report_device, skb, NULL);
 }
 
 void mgmt_remote_name(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 link_type,
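
The event built in the hunk above is DEVICE_FOUND with a little-endian monitor handle prepended: skb_put() reserves the two-byte slot, skb_put_data() appends a copy of the DEVICE_FOUND payload, and the slot is patched once the matching monitor is known. A minimal sketch of that reserve-then-patch layout (hypothetical payload bytes, host byte order standing in for cpu_to_le16()):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
        uint8_t device_found[] = { 0xde, 0xad };        /* stand-in payload */
        uint8_t advmon[sizeof(uint16_t) + sizeof(device_found)];
        uint16_t handle = 0;

        /* reserve the handle slot, then append the copied payload */
        memcpy(advmon + sizeof(handle), device_found, sizeof(device_found));

        /* ...walk the monitored-device list; on the first unnotified
         * match, patch the reserved slot in place:
         */
        handle = 0x0001;
        memcpy(advmon, &handle, sizeof(handle));

        printf("event is %zu bytes\n", sizeof(advmon));
        return 0;
}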
index 6a94363..9a3d77d 100644
@@ -80,6 +80,14 @@ struct msft_rp_le_set_advertisement_filter_enable {
        __u8 sub_opcode;
 } __packed;
 
+#define MSFT_EV_LE_MONITOR_DEVICE      0x02
+struct msft_ev_le_monitor_device {
+       __u8     addr_type;
+       bdaddr_t bdaddr;
+       __u8     monitor_handle;
+       __u8     monitor_state;
+} __packed;
+
 struct msft_monitor_advertisement_handle_data {
        __u8  msft_handle;
        __u16 mgmt_handle;
@@ -204,6 +212,37 @@ static struct msft_monitor_advertisement_handle_data *msft_find_handle_data
        return NULL;
 }
 
+/* This function requires the caller holds hdev->lock */
+static int msft_monitor_device_del(struct hci_dev *hdev, __u16 mgmt_handle,
+                                  bdaddr_t *bdaddr, __u8 addr_type,
+                                  bool notify)
+{
+       struct monitored_device *dev, *tmp;
+       int count = 0;
+
+       list_for_each_entry_safe(dev, tmp, &hdev->monitored_devices, list) {
+               /* mgmt_handle == 0 indicates remove all devices, while
+                * bdaddr == NULL indicates remove all devices matching the
+                * mgmt_handle.
+                */
+               if ((!mgmt_handle || dev->handle == mgmt_handle) &&
+                   (!bdaddr || (!bacmp(bdaddr, &dev->bdaddr) &&
+                                addr_type == dev->addr_type))) {
+                       if (notify && dev->notified) {
+                               mgmt_adv_monitor_device_lost(hdev, dev->handle,
+                                                            &dev->bdaddr,
+                                                            dev->addr_type);
+                       }
+
+                       list_del(&dev->list);
+                       kfree(dev);
+                       count++;
+               }
+       }
+
+       return count;
+}
+
 static void msft_le_monitor_advertisement_cb(struct hci_dev *hdev,
                                             u8 status, u16 opcode,
                                             struct sk_buff *skb)
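
msft_monitor_device_del() leans on the _safe list iterator so a matched entry can be unlinked and freed without breaking the walk. The same delete-while-iterating pattern in stand-alone C, using a pointer to the next-pointer instead of a lookahead copy (types are illustrative; handle == 0 matches every entry, as above):

#include <stdlib.h>

struct mon_dev {
        struct mon_dev *next;
        unsigned short handle;
};

static int del_matching(struct mon_dev **pprev, unsigned short handle)
{
        struct mon_dev *dev;
        int count = 0;

        while ((dev = *pprev) != NULL) {
                if (!handle || dev->handle == handle) {
                        *pprev = dev->next;     /* unlink before freeing */
                        free(dev);
                        count++;
                } else {
                        pprev = &dev->next;
                }
        }
        return count;
}

int main(void)
{
        struct mon_dev *head = NULL;

        for (unsigned short h = 1; h <= 3; h++) {
                struct mon_dev *d = malloc(sizeof(*d));

                if (!d)
                        return 1;
                d->handle = h;
                d->next = head;
                head = d;
        }
        return del_matching(&head, 0) == 3 ? 0 : 1;
}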
@@ -294,6 +333,10 @@ static void msft_le_cancel_monitor_advertisement_cb(struct hci_dev *hdev,
                if (monitor && !msft->suspending)
                        hci_free_adv_monitor(hdev, monitor);
 
+               /* Clear any monitored devices by this Adv Monitor */
+               msft_monitor_device_del(hdev, handle_data->mgmt_handle, NULL,
+                                       0, false);
+
                list_del(&handle_data->list);
                kfree(handle_data);
        }
@@ -557,6 +600,14 @@ void msft_do_close(struct hci_dev *hdev)
                list_del(&handle_data->list);
                kfree(handle_data);
        }
+
+       hci_dev_lock(hdev);
+
+       /* Clear any devices that are being monitored and notify device lost */
+       hdev->advmon_pend_notify = false;
+       msft_monitor_device_del(hdev, 0, NULL, 0, true);
+
+       hci_dev_unlock(hdev);
 }
 
 void msft_register(struct hci_dev *hdev)
@@ -590,10 +641,101 @@ void msft_unregister(struct hci_dev *hdev)
        kfree(msft);
 }
 
+/* This function requires the caller holds hdev->lock */
+static void msft_device_found(struct hci_dev *hdev, bdaddr_t *bdaddr,
+                             __u8 addr_type, __u16 mgmt_handle)
+{
+       struct monitored_device *dev;
+
+       dev = kmalloc(sizeof(*dev), GFP_KERNEL);
+       if (!dev) {
+               bt_dev_err(hdev, "MSFT vendor event %u: no memory",
+                          MSFT_EV_LE_MONITOR_DEVICE);
+               return;
+       }
+
+       bacpy(&dev->bdaddr, bdaddr);
+       dev->addr_type = addr_type;
+       dev->handle = mgmt_handle;
+       dev->notified = false;
+
+       INIT_LIST_HEAD(&dev->list);
+       list_add(&dev->list, &hdev->monitored_devices);
+       hdev->advmon_pend_notify = true;
+}
+
+/* This function requires the caller holds hdev->lock */
+static void msft_device_lost(struct hci_dev *hdev, bdaddr_t *bdaddr,
+                            __u8 addr_type, __u16 mgmt_handle)
+{
+       if (!msft_monitor_device_del(hdev, mgmt_handle, bdaddr, addr_type,
+                                    true)) {
+               bt_dev_err(hdev, "MSFT vendor event %u: dev %pMR not in list",
+                          MSFT_EV_LE_MONITOR_DEVICE, bdaddr);
+       }
+}
+
+static void *msft_skb_pull(struct hci_dev *hdev, struct sk_buff *skb,
+                          u8 ev, size_t len)
+{
+       void *data;
+
+       data = skb_pull_data(skb, len);
+       if (!data)
+               bt_dev_err(hdev, "Malformed MSFT vendor event: 0x%02x", ev);
+
+       return data;
+}
+
+/* This function requires the caller holds hdev->lock */
+static void msft_monitor_device_evt(struct hci_dev *hdev, struct sk_buff *skb)
+{
+       struct msft_ev_le_monitor_device *ev;
+       struct msft_monitor_advertisement_handle_data *handle_data;
+       u8 addr_type;
+
+       ev = msft_skb_pull(hdev, skb, MSFT_EV_LE_MONITOR_DEVICE, sizeof(*ev));
+       if (!ev)
+               return;
+
+       bt_dev_dbg(hdev,
+                  "MSFT vendor event 0x%02x: handle 0x%04x state %d addr %pMR",
+                  MSFT_EV_LE_MONITOR_DEVICE, ev->monitor_handle,
+                  ev->monitor_state, &ev->bdaddr);
+
+       handle_data = msft_find_handle_data(hdev, ev->monitor_handle, false);
+       if (!handle_data)
+               return;
+
+       switch (ev->addr_type) {
+       case ADDR_LE_DEV_PUBLIC:
+               addr_type = BDADDR_LE_PUBLIC;
+               break;
+
+       case ADDR_LE_DEV_RANDOM:
+               addr_type = BDADDR_LE_RANDOM;
+               break;
+
+       default:
+               bt_dev_err(hdev,
+                          "MSFT vendor event 0x%02x: unknown addr type 0x%02x",
+                          MSFT_EV_LE_MONITOR_DEVICE, ev->addr_type);
+               return;
+       }
+
+       if (ev->monitor_state)
+               msft_device_found(hdev, &ev->bdaddr, addr_type,
+                                 handle_data->mgmt_handle);
+       else
+               msft_device_lost(hdev, &ev->bdaddr, addr_type,
+                                handle_data->mgmt_handle);
+}
+
 void msft_vendor_evt(struct hci_dev *hdev, void *data, struct sk_buff *skb)
 {
        struct msft_data *msft = hdev->msft_data;
-       u8 event;
+       u8 *evt_prefix;
+       u8 *evt;
 
        if (!msft)
                return;
@@ -602,13 +744,12 @@ void msft_vendor_evt(struct hci_dev *hdev, void *data, struct sk_buff *skb)
         * matches, and otherwise just return.
         */
        if (msft->evt_prefix_len > 0) {
-               if (skb->len < msft->evt_prefix_len)
+               evt_prefix = msft_skb_pull(hdev, skb, 0, msft->evt_prefix_len);
+               if (!evt_prefix)
                        return;
 
-               if (memcmp(skb->data, msft->evt_prefix, msft->evt_prefix_len))
+               if (memcmp(evt_prefix, msft->evt_prefix, msft->evt_prefix_len))
                        return;
-
-               skb_pull(skb, msft->evt_prefix_len);
        }
 
        /* Every event starts at least with an event code and the rest of
@@ -617,10 +758,23 @@ void msft_vendor_evt(struct hci_dev *hdev, void *data, struct sk_buff *skb)
        if (skb->len < 1)
                return;
 
-       event = *skb->data;
-       skb_pull(skb, 1);
+       evt = msft_skb_pull(hdev, skb, 0, sizeof(*evt));
+       if (!evt)
+               return;
+
+       hci_dev_lock(hdev);
+
+       switch (*evt) {
+       case MSFT_EV_LE_MONITOR_DEVICE:
+               msft_monitor_device_evt(hdev, skb);
+               break;
 
-       bt_dev_dbg(hdev, "MSFT vendor event %u", event);
+       default:
+               bt_dev_dbg(hdev, "MSFT vendor event 0x%02x", *evt);
+               break;
+       }
+
+       hci_dev_unlock(hdev);
 }
 
 __u64 msft_get_features(struct hci_dev *hdev)
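
msft_skb_pull() wraps skb_pull_data() so every field of a vendor event is consumed through one bounds-checked helper instead of ad-hoc length checks before each skb_pull(). A minimal sketch of the idiom over a plain byte buffer (the struct and names are assumptions, not kernel API):

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

struct buf {
        uint8_t *data;
        size_t len;
};

/* Consume `len` bytes from the front; NULL if not enough remain. */
static void *buf_pull(struct buf *b, size_t len)
{
        void *p;

        if (b->len < len)
                return NULL;
        p = b->data;
        b->data += len;
        b->len -= len;
        return p;
}

int main(void)
{
        uint8_t raw[] = { 0x02, 0xaa, 0xbb };   /* event code + payload */
        struct buf b = { raw, sizeof(raw) };
        uint8_t *evt = buf_pull(&b, 1);

        if (evt)
                printf("event 0x%02x, %zu payload bytes left\n", *evt, b.len);
        return 0;
}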
index fbc8963..d0e54e3 100644
@@ -145,7 +145,8 @@ static int bpf_dummy_ops_btf_struct_access(struct bpf_verifier_log *log,
                                           const struct btf *btf,
                                           const struct btf_type *t, int off,
                                           int size, enum bpf_access_type atype,
-                                          u32 *next_btf_id)
+                                          u32 *next_btf_id,
+                                          enum bpf_type_flag *flag)
 {
        const struct btf_type *state;
        s32 type_id;
@@ -162,7 +163,8 @@ static int bpf_dummy_ops_btf_struct_access(struct bpf_verifier_log *log,
                return -EACCES;
        }
 
-       err = btf_struct_access(log, btf, t, off, size, atype, next_btf_id);
+       err = btf_struct_access(log, btf, t, off, size, atype, next_btf_id,
+                               flag);
        if (err < 0)
                return err;
 
index 46dd957..f080345 100644
@@ -5,6 +5,7 @@
 #include <linux/btf.h>
 #include <linux/btf_ids.h>
 #include <linux/slab.h>
+#include <linux/init.h>
 #include <linux/vmalloc.h>
 #include <linux/etherdevice.h>
 #include <linux/filter.h>
@@ -130,7 +131,8 @@ static int bpf_test_run(struct bpf_prog *prog, void *ctx, u32 repeat,
 
 static int bpf_test_finish(const union bpf_attr *kattr,
                           union bpf_attr __user *uattr, const void *data,
-                          u32 size, u32 retval, u32 duration)
+                          struct skb_shared_info *sinfo, u32 size,
+                          u32 retval, u32 duration)
 {
        void __user *data_out = u64_to_user_ptr(kattr->test.data_out);
        int err = -EFAULT;
@@ -145,8 +147,37 @@ static int bpf_test_finish(const union bpf_attr *kattr,
                err = -ENOSPC;
        }
 
-       if (data_out && copy_to_user(data_out, data, copy_size))
-               goto out;
+       if (data_out) {
+               int len = sinfo ? copy_size - sinfo->xdp_frags_size : copy_size;
+
+               if (copy_to_user(data_out, data, len))
+                       goto out;
+
+               if (sinfo) {
+                       int i, offset = len;
+                       u32 data_len;
+
+                       for (i = 0; i < sinfo->nr_frags; i++) {
+                               skb_frag_t *frag = &sinfo->frags[i];
+
+                               if (offset >= copy_size) {
+                                       err = -ENOSPC;
+                                       break;
+                               }
+
+                               data_len = min_t(u32, copy_size - offset,
+                                                skb_frag_size(frag));
+
+                               if (copy_to_user(data_out + offset,
+                                                skb_frag_address(frag),
+                                                data_len))
+                                       goto out;
+
+                               offset += data_len;
+                       }
+               }
+       }
+
        if (copy_to_user(&uattr->test.data_size_out, &size, sizeof(size)))
                goto out;
        if (copy_to_user(&uattr->test.retval, &retval, sizeof(retval)))
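
The extended bpf_test_finish() streams the linear area first and then walks the fragment array, clamping each chunk to the space the caller provided. A sketch of the same gather-copy bookkeeping with memcpy() standing in for copy_to_user() (the frag type is illustrative):

#include <string.h>

struct frag {
        const char *addr;
        unsigned int size;
};

/* Returns bytes copied, or -1 when the caller's buffer runs out
 * (the kernel version records -ENOSPC and still reports the full
 * size so user space can retry with a bigger buffer).
 */
static int gather_copy(char *out, unsigned int copy_size,
                       const char *linear, unsigned int linear_len,
                       const struct frag *frags, int nr_frags)
{
        unsigned int offset = linear_len < copy_size ? linear_len : copy_size;
        int i;

        memcpy(out, linear, offset);

        for (i = 0; i < nr_frags; i++) {
                unsigned int data_len = frags[i].size;

                if (offset >= copy_size)
                        return -1;
                if (data_len > copy_size - offset)
                        data_len = copy_size - offset;
                memcpy(out + offset, frags[i].addr, data_len);
                offset += data_len;
        }
        return (int)offset;
}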
@@ -171,6 +202,8 @@ int noinline bpf_fentry_test1(int a)
 {
        return a + 1;
 }
+EXPORT_SYMBOL_GPL(bpf_fentry_test1);
+ALLOW_ERROR_INJECTION(bpf_fentry_test1, ERRNO);
 
 int noinline bpf_fentry_test2(int a, u64 b)
 {
@@ -232,28 +265,142 @@ struct sock * noinline bpf_kfunc_call_test3(struct sock *sk)
        return sk;
 }
 
+struct prog_test_ref_kfunc {
+       int a;
+       int b;
+       struct prog_test_ref_kfunc *next;
+};
+
+static struct prog_test_ref_kfunc prog_test_struct = {
+       .a = 42,
+       .b = 108,
+       .next = &prog_test_struct,
+};
+
+noinline struct prog_test_ref_kfunc *
+bpf_kfunc_call_test_acquire(unsigned long *scalar_ptr)
+{
+       /* randomly return NULL */
+       if (get_jiffies_64() % 2)
+               return NULL;
+       return &prog_test_struct;
+}
+
+noinline void bpf_kfunc_call_test_release(struct prog_test_ref_kfunc *p)
+{
+}
+
+struct prog_test_pass1 {
+       int x0;
+       struct {
+               int x1;
+               struct {
+                       int x2;
+                       struct {
+                               int x3;
+                       };
+               };
+       };
+};
+
+struct prog_test_pass2 {
+       int len;
+       short arr1[4];
+       struct {
+               char arr2[4];
+               unsigned long arr3[8];
+       } x;
+};
+
+struct prog_test_fail1 {
+       void *p;
+       int x;
+};
+
+struct prog_test_fail2 {
+       int x8;
+       struct prog_test_pass1 x;
+};
+
+struct prog_test_fail3 {
+       int len;
+       char arr1[2];
+       char arr2[];
+};
+
+noinline void bpf_kfunc_call_test_pass_ctx(struct __sk_buff *skb)
+{
+}
+
+noinline void bpf_kfunc_call_test_pass1(struct prog_test_pass1 *p)
+{
+}
+
+noinline void bpf_kfunc_call_test_pass2(struct prog_test_pass2 *p)
+{
+}
+
+noinline void bpf_kfunc_call_test_fail1(struct prog_test_fail1 *p)
+{
+}
+
+noinline void bpf_kfunc_call_test_fail2(struct prog_test_fail2 *p)
+{
+}
+
+noinline void bpf_kfunc_call_test_fail3(struct prog_test_fail3 *p)
+{
+}
+
+noinline void bpf_kfunc_call_test_mem_len_pass1(void *mem, int mem__sz)
+{
+}
+
+noinline void bpf_kfunc_call_test_mem_len_fail1(void *mem, int len)
+{
+}
+
+noinline void bpf_kfunc_call_test_mem_len_fail2(u64 *mem, int len)
+{
+}
+
 __diag_pop();
 
 ALLOW_ERROR_INJECTION(bpf_modify_return_test, ERRNO);
 
-BTF_SET_START(test_sk_kfunc_ids)
+BTF_SET_START(test_sk_check_kfunc_ids)
 BTF_ID(func, bpf_kfunc_call_test1)
 BTF_ID(func, bpf_kfunc_call_test2)
 BTF_ID(func, bpf_kfunc_call_test3)
-BTF_SET_END(test_sk_kfunc_ids)
-
-bool bpf_prog_test_check_kfunc_call(u32 kfunc_id, struct module *owner)
-{
-       if (btf_id_set_contains(&test_sk_kfunc_ids, kfunc_id))
-               return true;
-       return bpf_check_mod_kfunc_call(&prog_test_kfunc_list, kfunc_id, owner);
-}
-
-static void *bpf_test_init(const union bpf_attr *kattr, u32 size,
-                          u32 headroom, u32 tailroom)
+BTF_ID(func, bpf_kfunc_call_test_acquire)
+BTF_ID(func, bpf_kfunc_call_test_release)
+BTF_ID(func, bpf_kfunc_call_test_pass_ctx)
+BTF_ID(func, bpf_kfunc_call_test_pass1)
+BTF_ID(func, bpf_kfunc_call_test_pass2)
+BTF_ID(func, bpf_kfunc_call_test_fail1)
+BTF_ID(func, bpf_kfunc_call_test_fail2)
+BTF_ID(func, bpf_kfunc_call_test_fail3)
+BTF_ID(func, bpf_kfunc_call_test_mem_len_pass1)
+BTF_ID(func, bpf_kfunc_call_test_mem_len_fail1)
+BTF_ID(func, bpf_kfunc_call_test_mem_len_fail2)
+BTF_SET_END(test_sk_check_kfunc_ids)
+
+BTF_SET_START(test_sk_acquire_kfunc_ids)
+BTF_ID(func, bpf_kfunc_call_test_acquire)
+BTF_SET_END(test_sk_acquire_kfunc_ids)
+
+BTF_SET_START(test_sk_release_kfunc_ids)
+BTF_ID(func, bpf_kfunc_call_test_release)
+BTF_SET_END(test_sk_release_kfunc_ids)
+
+BTF_SET_START(test_sk_ret_null_kfunc_ids)
+BTF_ID(func, bpf_kfunc_call_test_acquire)
+BTF_SET_END(test_sk_ret_null_kfunc_ids)
+
+static void *bpf_test_init(const union bpf_attr *kattr, u32 user_size,
+                          u32 size, u32 headroom, u32 tailroom)
 {
        void __user *data_in = u64_to_user_ptr(kattr->test.data_in);
-       u32 user_size = kattr->test.data_size_in;
        void *data;
 
        if (size < ETH_HLEN || size > PAGE_SIZE - headroom - tailroom)
@@ -581,7 +728,8 @@ int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr,
        if (kattr->test.flags || kattr->test.cpu)
                return -EINVAL;
 
-       data = bpf_test_init(kattr, size, NET_SKB_PAD + NET_IP_ALIGN,
+       data = bpf_test_init(kattr, kattr->test.data_size_in,
+                            size, NET_SKB_PAD + NET_IP_ALIGN,
                             SKB_DATA_ALIGN(sizeof(struct skb_shared_info)));
        if (IS_ERR(data))
                return PTR_ERR(data);
@@ -683,7 +831,8 @@ int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr,
        /* bpf program can never convert linear skb to non-linear */
        if (WARN_ON_ONCE(skb_is_nonlinear(skb)))
                size = skb_headlen(skb);
-       ret = bpf_test_finish(kattr, uattr, skb->data, size, retval, duration);
+       ret = bpf_test_finish(kattr, uattr, skb->data, NULL, size, retval,
+                             duration);
        if (!ret)
                ret = bpf_ctx_finish(kattr, uattr, ctx,
                                     sizeof(struct __sk_buff));
@@ -758,16 +907,16 @@ int bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr,
                          union bpf_attr __user *uattr)
 {
        u32 tailroom = SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
-       u32 headroom = XDP_PACKET_HEADROOM;
        u32 size = kattr->test.data_size_in;
+       u32 headroom = XDP_PACKET_HEADROOM;
+       u32 retval, duration, max_data_sz;
        u32 repeat = kattr->test.repeat;
        struct netdev_rx_queue *rxqueue;
+       struct skb_shared_info *sinfo;
        struct xdp_buff xdp = {};
-       u32 retval, duration;
+       int i, ret = -EINVAL;
        struct xdp_md *ctx;
-       u32 max_data_sz;
        void *data;
-       int ret = -EINVAL;
 
        if (prog->expected_attach_type == BPF_XDP_DEVMAP ||
            prog->expected_attach_type == BPF_XDP_CPUMAP)
@@ -787,26 +936,65 @@ int bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr,
                headroom -= ctx->data;
        }
 
-       /* XDP have extra tailroom as (most) drivers use full page */
        max_data_sz = 4096 - headroom - tailroom;
+       size = min_t(u32, size, max_data_sz);
 
-       data = bpf_test_init(kattr, max_data_sz, headroom, tailroom);
+       data = bpf_test_init(kattr, size, max_data_sz, headroom, tailroom);
        if (IS_ERR(data)) {
                ret = PTR_ERR(data);
                goto free_ctx;
        }
 
        rxqueue = __netif_get_rx_queue(current->nsproxy->net_ns->loopback_dev, 0);
-       xdp_init_buff(&xdp, headroom + max_data_sz + tailroom,
-                     &rxqueue->xdp_rxq);
+       rxqueue->xdp_rxq.frag_size = headroom + max_data_sz + tailroom;
+       xdp_init_buff(&xdp, rxqueue->xdp_rxq.frag_size, &rxqueue->xdp_rxq);
        xdp_prepare_buff(&xdp, data, headroom, size, true);
+       sinfo = xdp_get_shared_info_from_buff(&xdp);
 
        ret = xdp_convert_md_to_buff(ctx, &xdp);
        if (ret)
                goto free_data;
 
+       if (unlikely(kattr->test.data_size_in > size)) {
+               void __user *data_in = u64_to_user_ptr(kattr->test.data_in);
+
+               while (size < kattr->test.data_size_in) {
+                       struct page *page;
+                       skb_frag_t *frag;
+                       u32 data_len;
+
+                       if (sinfo->nr_frags == MAX_SKB_FRAGS) {
+                               ret = -ENOMEM;
+                               goto out;
+                       }
+
+                       page = alloc_page(GFP_KERNEL);
+                       if (!page) {
+                               ret = -ENOMEM;
+                               goto out;
+                       }
+
+                       frag = &sinfo->frags[sinfo->nr_frags++];
+                       __skb_frag_set_page(frag, page);
+
+                       data_len = min_t(u32, kattr->test.data_size_in - size,
+                                        PAGE_SIZE);
+                       skb_frag_size_set(frag, data_len);
+
+                       if (copy_from_user(page_address(page), data_in + size,
+                                          data_len)) {
+                               ret = -EFAULT;
+                               goto out;
+                       }
+                       sinfo->xdp_frags_size += data_len;
+                       size += data_len;
+               }
+               xdp_buff_set_frags_flag(&xdp);
+       }
+
        if (repeat > 1)
                bpf_prog_change_xdp(NULL, prog);
+
        ret = bpf_test_run(prog, &xdp, repeat, &retval, &duration, true);
        /* We convert the xdp_buff back to an xdp_md before checking the return
         * code so the reference count of any held netdevice will be decremented
@@ -816,12 +1004,9 @@ int bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr,
        if (ret)
                goto out;
 
-       if (xdp.data_meta != data + headroom ||
-           xdp.data_end != xdp.data_meta + size)
-               size = xdp.data_end - xdp.data_meta;
-
-       ret = bpf_test_finish(kattr, uattr, xdp.data_meta, size, retval,
-                             duration);
+       size = xdp.data_end - xdp.data_meta + sinfo->xdp_frags_size;
+       ret = bpf_test_finish(kattr, uattr, xdp.data_meta, sinfo, size,
+                             retval, duration);
        if (!ret)
                ret = bpf_ctx_finish(kattr, uattr, ctx,
                                     sizeof(struct xdp_md));
@@ -830,6 +1015,8 @@ out:
        if (repeat > 1)
                bpf_prog_change_xdp(prog, NULL);
 free_data:
+       for (i = 0; i < sinfo->nr_frags; i++)
+               __free_page(skb_frag_page(&sinfo->frags[i]));
        kfree(data);
 free_ctx:
        kfree(ctx);
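
Seen from user space, the hunk above means BPF_PROG_TEST_RUN no longer rejects an XDP data_size_in larger than one page: the head is clamped to the linear area and the overflow is copied into page fragments. A hedged libbpf sketch (assumes a libbpf recent enough for LIBBPF_OPTS/bpf_prog_test_run_opts, and a program loaded as frags-aware where the kernel requires it):

#include <bpf/libbpf.h>
#include <string.h>

static int run_with_frags(int prog_fd)
{
        static char data_in[8192];      /* > PAGE_SIZE: tail goes to frags */
        static char data_out[8192];

        LIBBPF_OPTS(bpf_test_run_opts, opts,
                .data_in = data_in,
                .data_size_in = sizeof(data_in),
                .data_out = data_out,
                .data_size_out = sizeof(data_out),
                .repeat = 1,
        );

        memset(data_in, 0xab, sizeof(data_in));
        return bpf_prog_test_run_opts(prog_fd, &opts);
}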
@@ -876,7 +1063,7 @@ int bpf_prog_test_run_flow_dissector(struct bpf_prog *prog,
        if (size < ETH_HLEN)
                return -EINVAL;
 
-       data = bpf_test_init(kattr, size, 0, 0);
+       data = bpf_test_init(kattr, kattr->test.data_size_in, size, 0, 0);
        if (IS_ERR(data))
                return PTR_ERR(data);
 
@@ -911,8 +1098,8 @@ int bpf_prog_test_run_flow_dissector(struct bpf_prog *prog,
        if (ret < 0)
                goto out;
 
-       ret = bpf_test_finish(kattr, uattr, &flow_keys, sizeof(flow_keys),
-                             retval, duration);
+       ret = bpf_test_finish(kattr, uattr, &flow_keys, NULL,
+                             sizeof(flow_keys), retval, duration);
        if (!ret)
                ret = bpf_ctx_finish(kattr, uattr, user_ctx,
                                     sizeof(struct bpf_flow_keys));
@@ -960,7 +1147,7 @@ int bpf_prog_test_run_sk_lookup(struct bpf_prog *prog, const union bpf_attr *kat
        if (!range_is_zero(user_ctx, offsetofend(typeof(*user_ctx), local_port), sizeof(*user_ctx)))
                goto out;
 
-       if (user_ctx->local_port > U16_MAX || user_ctx->remote_port > U16_MAX) {
+       if (user_ctx->local_port > U16_MAX) {
                ret = -ERANGE;
                goto out;
        }
@@ -968,7 +1155,7 @@ int bpf_prog_test_run_sk_lookup(struct bpf_prog *prog, const union bpf_attr *kat
        ctx.family = (u16)user_ctx->family;
        ctx.protocol = (u16)user_ctx->protocol;
        ctx.dport = (u16)user_ctx->local_port;
-       ctx.sport = (__force __be16)user_ctx->remote_port;
+       ctx.sport = user_ctx->remote_port;
 
        switch (ctx.family) {
        case AF_INET:
@@ -1016,7 +1203,7 @@ int bpf_prog_test_run_sk_lookup(struct bpf_prog *prog, const union bpf_attr *kat
                user_ctx->cookie = sock_gen_cookie(ctx.selected_sk);
        }
 
-       ret = bpf_test_finish(kattr, uattr, NULL, 0, retval, duration);
+       ret = bpf_test_finish(kattr, uattr, NULL, NULL, 0, retval, duration);
        if (!ret)
                ret = bpf_ctx_finish(kattr, uattr, user_ctx, sizeof(*user_ctx));
 
@@ -1067,3 +1254,17 @@ out:
        kfree(ctx);
        return err;
 }
+
+static const struct btf_kfunc_id_set bpf_prog_test_kfunc_set = {
+       .owner        = THIS_MODULE,
+       .check_set    = &test_sk_check_kfunc_ids,
+       .acquire_set  = &test_sk_acquire_kfunc_ids,
+       .release_set  = &test_sk_release_kfunc_ids,
+       .ret_null_set = &test_sk_ret_null_kfunc_ids,
+};
+
+static int __init bpf_prog_test_run_init(void)
+{
+       return register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, &bpf_prog_test_kfunc_set);
+}
+late_initcall(bpf_prog_test_run_init);
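
The registration above replaces the old per-verifier check_kfunc_call hook: each property (allowed, acquire, release, may-return-NULL) is a BTF id set, handed over in a single btf_kfunc_id_set. A kernel-context sketch of the same pattern for a hypothetical module (my_* names are illustrative, and this matches the API as of this series, which later kernels changed):

#include <linux/bpf.h>
#include <linux/btf.h>
#include <linux/btf_ids.h>
#include <linux/module.h>

noinline int my_kfunc(int x)
{
        return x + 1;
}

BTF_SET_START(my_check_ids)
BTF_ID(func, my_kfunc)
BTF_SET_END(my_check_ids)

static const struct btf_kfunc_id_set my_kfunc_set = {
        .owner     = THIS_MODULE,
        .check_set = &my_check_ids,
};

static int __init my_mod_init(void)
{
        /* make my_kfunc() callable from BPF_PROG_TYPE_SCHED_CLS progs */
        return register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS,
                                         &my_kfunc_set);
}
module_init(my_mod_init);
MODULE_LICENSE("GPL");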
index 4401397..52dd0b6 100644
@@ -268,7 +268,7 @@ static int receive(struct sk_buff *skb, struct net_device *dev,
 
        err = caifd->layer.up->receive(caifd->layer.up, pkt);
 
-       /* For -EILSEQ the packet is not freed so so it now */
+       /* For -EILSEQ the packet is not freed so free it now */
        if (err == -EILSEQ)
                cfpkt_destroy(pkt);
 
index d8861e8..2422135 100644
@@ -1239,16 +1239,19 @@ static int __net_init cangw_pernet_init(struct net *net)
        return 0;
 }
 
-static void __net_exit cangw_pernet_exit(struct net *net)
+static void __net_exit cangw_pernet_exit_batch(struct list_head *net_list)
 {
+       struct net *net;
+
        rtnl_lock();
-       cgw_remove_all_jobs(net);
+       list_for_each_entry(net, net_list, exit_list)
+               cgw_remove_all_jobs(net);
        rtnl_unlock();
 }
 
 static struct pernet_operations cangw_pernet_ops = {
        .init = cangw_pernet_init,
-       .exit = cangw_pernet_exit,
+       .exit_batch = cangw_pernet_exit_batch,
 };
 
 static __init int cgw_module_init(void)
index 1baab07..2c3b874 100644
@@ -1037,7 +1037,7 @@ static int __dev_alloc_name(struct net *net, const char *name, char *buf)
                                /*  avoid cases where sscanf is not exact inverse of printf */
                                snprintf(buf, IFNAMSIZ, name, i);
                                if (!strncmp(buf, name_node->name, IFNAMSIZ))
-                                       set_bit(i, inuse);
+                                       __set_bit(i, inuse);
                        }
                        if (!sscanf(d->name, name, &i))
                                continue;
@@ -1047,7 +1047,7 @@ static int __dev_alloc_name(struct net *net, const char *name, char *buf)
                        /*  avoid cases where sscanf is not exact inverse of printf */
                        snprintf(buf, IFNAMSIZ, name, i);
                        if (!strncmp(buf, d->name, IFNAMSIZ))
-                               set_bit(i, inuse);
+                               __set_bit(i, inuse);
                }
 
                i = find_first_zero_bit(inuse, max_netdevices);
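
set_bit() is an atomic read-modify-write (LOCK-prefixed on x86), which is wasted on `inuse`: the bitmap is local to the function, so no other CPU can observe it, and __set_bit() is the plain store. The user-space equivalent of the non-atomic variant:

#include <limits.h>
#include <stdio.h>

#define BITS_PER_LONG (sizeof(unsigned long) * CHAR_BIT)

/* Plain read-modify-write: fine because the bitmap has one owner. */
static void nonatomic_set_bit(unsigned long *map, unsigned int bit)
{
        map[bit / BITS_PER_LONG] |= 1UL << (bit % BITS_PER_LONG);
}

int main(void)
{
        unsigned long inuse[4] = { 0 }; /* stack-local: no concurrency */

        nonatomic_set_bit(inuse, 5);
        nonatomic_set_bit(inuse, 70);
        printf("%lx %lx\n", inuse[0], inuse[1]);
        return 0;
}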
@@ -9143,7 +9143,7 @@ DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq);
 static void net_set_todo(struct net_device *dev)
 {
        list_add_tail(&dev->todo_list, &net_todo_list);
-       dev_net(dev)->dev_unreg_count++;
+       atomic_inc(&dev_net(dev)->dev_unreg_count);
 }
 
 static netdev_features_t netdev_sync_upper_features(struct net_device *lower,
@@ -9683,8 +9683,10 @@ int register_netdevice(struct net_device *dev)
        linkwatch_init_dev(dev);
 
        dev_init_scheduler(dev);
-       dev_hold(dev);
+
+       dev_hold_track(dev, &dev->dev_registered_tracker, GFP_KERNEL);
        list_netdevice(dev);
+
        add_device_randomness(dev->dev_addr, dev->addr_len);
 
        /* If the device has permanent device address, driver should
@@ -9963,11 +9965,8 @@ void netdev_run_todo(void)
                if (dev->needs_free_netdev)
                        free_netdev(dev);
 
-               /* Report a network device has been unregistered */
-               rtnl_lock();
-               dev_net(dev)->dev_unreg_count--;
-               __rtnl_unlock();
-               wake_up(&netdev_unregistering_wq);
+               if (atomic_dec_and_test(&dev_net(dev)->dev_unreg_count))
+                       wake_up(&netdev_unregistering_wq);
 
                /* Free network device */
                kobject_put(&dev->dev.kobj);
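
The dev_unreg_count rework above removes an rtnl_lock round-trip per device: the counter becomes atomic and the waitqueue is only kicked when it reaches zero. A user-space analogue of the dec-and-test-then-wake pattern, with a condition variable in place of the kernel waitqueue:

#include <pthread.h>
#include <stdatomic.h>

static atomic_int unreg_count;
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t all_done = PTHREAD_COND_INITIALIZER;

/* Called once per finished unregistration; wakes waiters only when
 * the last one completes, like atomic_dec_and_test() + wake_up().
 */
static void one_unregistration_done(void)
{
        if (atomic_fetch_sub(&unreg_count, 1) == 1) {   /* hit zero */
                pthread_mutex_lock(&lock);
                pthread_cond_broadcast(&all_done);
                pthread_mutex_unlock(&lock);
        }
}

static void wait_for_unregistrations(void)
{
        pthread_mutex_lock(&lock);
        while (atomic_load(&unreg_count) > 0)
                pthread_cond_wait(&all_done, &lock);
        pthread_mutex_unlock(&lock);
}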
@@ -10172,7 +10171,7 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
        dev->pcpu_refcnt = alloc_percpu(int);
        if (!dev->pcpu_refcnt)
                goto free_dev;
-       dev_hold(dev);
+       __dev_hold(dev);
 #else
        refcount_set(&dev->dev_refcnt, 1);
 #endif
@@ -10449,7 +10448,7 @@ void unregister_netdevice_many(struct list_head *head)
        synchronize_net();
 
        list_for_each_entry(dev, head, unreg_list) {
-               dev_put(dev);
+               dev_put_track(dev, &dev->dev_registered_tracker);
                net_set_todo(dev);
        }
 
@@ -10732,8 +10731,7 @@ static int __net_init netdev_init(struct net *net)
        BUILD_BUG_ON(GRO_HASH_BUCKETS >
                     8 * sizeof_field(struct napi_struct, gro_bitmask));
 
-       if (net != &init_net)
-               INIT_LIST_HEAD(&net->dev_base_head);
+       INIT_LIST_HEAD(&net->dev_base_head);
 
        net->dev_name_head = netdev_create_hash();
        if (net->dev_name_head == NULL)
@@ -10849,14 +10847,14 @@ static struct pernet_operations __net_initdata netdev_net_ops = {
        .exit = netdev_exit,
 };
 
-static void __net_exit default_device_exit(struct net *net)
+static void __net_exit default_device_exit_net(struct net *net)
 {
        struct net_device *dev, *aux;
        /*
         * Push all migratable network devices back to the
         * initial network namespace
         */
-       rtnl_lock();
+       ASSERT_RTNL();
        for_each_netdev_safe(net, dev, aux) {
                int err;
                char fb_name[IFNAMSIZ];
@@ -10880,24 +10878,24 @@ static void __net_exit default_device_exit(struct net *net)
                        BUG();
                }
        }
-       rtnl_unlock();
 }
 
 static void __net_exit rtnl_lock_unregistering(struct list_head *net_list)
 {
-       /* Return with the rtnl_lock held when there are no network
+       /* Return (with the rtnl_lock held) when there are no network
         * devices unregistering in any network namespace in net_list.
         */
-       struct net *net;
-       bool unregistering;
        DEFINE_WAIT_FUNC(wait, woken_wake_function);
+       bool unregistering;
+       struct net *net;
 
+       ASSERT_RTNL();
        add_wait_queue(&netdev_unregistering_wq, &wait);
        for (;;) {
                unregistering = false;
-               rtnl_lock();
+
                list_for_each_entry(net, net_list, exit_list) {
-                       if (net->dev_unreg_count > 0) {
+                       if (atomic_read(&net->dev_unreg_count) > 0) {
                                unregistering = true;
                                break;
                        }
@@ -10907,6 +10905,7 @@ static void __net_exit rtnl_lock_unregistering(struct list_head *net_list)
                __rtnl_unlock();
 
                wait_woken(&wait, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
+               rtnl_lock();
        }
        remove_wait_queue(&netdev_unregistering_wq, &wait);
 }
@@ -10922,6 +10921,11 @@ static void __net_exit default_device_exit_batch(struct list_head *net_list)
        struct net *net;
        LIST_HEAD(dev_kill_list);
 
+       rtnl_lock();
+       list_for_each_entry(net, net_list, exit_list) {
+               default_device_exit_net(net);
+               cond_resched();
+       }
        /* To prevent network device cleanup code from dereferencing
         * loopback devices or network devices that have been freed
         * wait here for all pending unregistrations to complete,
@@ -10934,6 +10938,7 @@ static void __net_exit default_device_exit_batch(struct list_head *net_list)
         * default_device_exit_batch.
         */
        rtnl_lock_unregistering(net_list);
+
        list_for_each_entry(net, net_list, exit_list) {
                for_each_netdev_reverse(net, dev) {
                        if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink)
@@ -10947,7 +10952,6 @@ static void __net_exit default_device_exit_batch(struct list_head *net_list)
 }
 
 static struct pernet_operations __net_initdata default_device_ops = {
-       .exit = default_device_exit,
        .exit_batch = default_device_exit_batch,
 };
 
index 7b288a1..4641126 100644
 static int trace_state = TRACE_OFF;
 static bool monitor_hw;
 
+#undef EM
+#undef EMe
+
+#define EM(a, b)       [a] = #b,
+#define EMe(a, b)      [a] = #b
+
+/* drop_reasons is used to translate 'enum skb_drop_reason' to string,
+ * which is reported to user space.
+ */
+static const char * const drop_reasons[] = {
+       TRACE_SKB_DROP_REASON
+};
+
 /* net_dm_mutex
  *
  * An overall lock guarding every operation coming from userspace.
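
drop_monitor reuses the trace-event EM()/EMe() machinery here: TRACE_SKB_DROP_REASON is an X-macro list that already produced the enum, and redefining EM/EMe before expanding it again yields the matching string table, so the two can never drift apart. A self-contained sketch of the idiom (COLOR_LIST stands in for TRACE_SKB_DROP_REASON):

#include <stdio.h>

#define COLOR_LIST      \
        EM(COLOR_RED, RED)      \
        EM(COLOR_GREEN, GREEN)  \
        EMe(COLOR_BLUE, BLUE)

/* first expansion: enum values */
#define EM(a, b)  a,
#define EMe(a, b) a
enum color { COLOR_LIST };
#undef EM
#undef EMe

/* second expansion: enum-indexed string table */
#define EM(a, b)  [a] = #b,
#define EMe(a, b) [a] = #b
static const char * const color_names[] = { COLOR_LIST };
#undef EM
#undef EMe

int main(void)
{
        printf("%s\n", color_names[COLOR_GREEN]);       /* prints GREEN */
        return 0;
}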
@@ -126,6 +139,7 @@ struct net_dm_skb_cb {
                struct devlink_trap_metadata *hw_metadata;
                void *pc;
        };
+       enum skb_drop_reason reason;
 };
 
 #define NET_DM_SKB_CB(__skb) ((struct net_dm_skb_cb *)&((__skb)->cb[0]))
@@ -498,6 +512,7 @@ static void net_dm_packet_trace_kfree_skb_hit(void *ignore,
 {
        ktime_t tstamp = ktime_get_real();
        struct per_cpu_dm_data *data;
+       struct net_dm_skb_cb *cb;
        struct sk_buff *nskb;
        unsigned long flags;
 
@@ -508,7 +523,11 @@ static void net_dm_packet_trace_kfree_skb_hit(void *ignore,
        if (!nskb)
                return;
 
-       NET_DM_SKB_CB(nskb)->pc = location;
+       if ((unsigned int)reason >= SKB_DROP_REASON_MAX)
+               reason = SKB_DROP_REASON_NOT_SPECIFIED;
+       cb = NET_DM_SKB_CB(nskb);
+       cb->reason = reason;
+       cb->pc = location;
        /* Override the timestamp because we care about the time when the
         * packet was dropped.
         */
@@ -553,7 +572,8 @@ static size_t net_dm_in_port_size(void)
 
 #define NET_DM_MAX_SYMBOL_LEN 40
 
-static size_t net_dm_packet_report_size(size_t payload_len)
+static size_t net_dm_packet_report_size(size_t payload_len,
+                                       enum skb_drop_reason reason)
 {
        size_t size;
 
@@ -574,6 +594,8 @@ static size_t net_dm_packet_report_size(size_t payload_len)
               nla_total_size(sizeof(u32)) +
               /* NET_DM_ATTR_PROTO */
               nla_total_size(sizeof(u16)) +
+              /* NET_DM_ATTR_REASON */
+              nla_total_size(strlen(drop_reasons[reason]) + 1) +
               /* NET_DM_ATTR_PAYLOAD */
               nla_total_size(payload_len);
 }
@@ -606,7 +628,7 @@ nla_put_failure:
 static int net_dm_packet_report_fill(struct sk_buff *msg, struct sk_buff *skb,
                                     size_t payload_len)
 {
-       u64 pc = (u64)(uintptr_t) NET_DM_SKB_CB(skb)->pc;
+       struct net_dm_skb_cb *cb = NET_DM_SKB_CB(skb);
        char buf[NET_DM_MAX_SYMBOL_LEN];
        struct nlattr *attr;
        void *hdr;
@@ -620,10 +642,15 @@ static int net_dm_packet_report_fill(struct sk_buff *msg, struct sk_buff *skb,
        if (nla_put_u16(msg, NET_DM_ATTR_ORIGIN, NET_DM_ORIGIN_SW))
                goto nla_put_failure;
 
-       if (nla_put_u64_64bit(msg, NET_DM_ATTR_PC, pc, NET_DM_ATTR_PAD))
+       if (nla_put_u64_64bit(msg, NET_DM_ATTR_PC, (u64)(uintptr_t)cb->pc,
+                             NET_DM_ATTR_PAD))
+               goto nla_put_failure;
+
+       if (nla_put_string(msg, NET_DM_ATTR_REASON,
+                          drop_reasons[cb->reason]))
                goto nla_put_failure;
 
-       snprintf(buf, sizeof(buf), "%pS", NET_DM_SKB_CB(skb)->pc);
+       snprintf(buf, sizeof(buf), "%pS", cb->pc);
        if (nla_put_string(msg, NET_DM_ATTR_SYMBOL, buf))
                goto nla_put_failure;
 
@@ -679,7 +706,9 @@ static void net_dm_packet_report(struct sk_buff *skb)
        if (net_dm_trunc_len)
                payload_len = min_t(size_t, net_dm_trunc_len, payload_len);
 
-       msg = nlmsg_new(net_dm_packet_report_size(payload_len), GFP_KERNEL);
+       msg = nlmsg_new(net_dm_packet_report_size(payload_len,
+                                                 NET_DM_SKB_CB(skb)->reason),
+                       GFP_KERNEL);
        if (!msg)
                goto out;
 
index 4603b7c..8182440 100644
@@ -2603,7 +2603,7 @@ BPF_CALL_4(bpf_msg_pull_data, struct sk_msg *, msg, u32, start,
         * account for the headroom.
         */
        bytes_sg_total = start - offset + bytes;
-       if (!test_bit(i, &msg->sg.copy) && bytes_sg_total <= len)
+       if (!test_bit(i, msg->sg.copy) && bytes_sg_total <= len)
                goto out;
 
        /* At this point we need to linearize multiple scatterlist
@@ -2809,7 +2809,7 @@ BPF_CALL_4(bpf_msg_push_data, struct sk_msg *, msg, u32, start,
        /* Place newly allocated data buffer */
        sk_mem_charge(msg->sk, len);
        msg->sg.size += len;
-       __clear_bit(new, &msg->sg.copy);
+       __clear_bit(new, msg->sg.copy);
        sg_set_page(&msg->sg.data[new], page, len + copy, 0);
        if (rsge.length) {
                get_page(sg_page(&rsge));
@@ -3783,6 +3783,28 @@ static const struct bpf_func_proto sk_skb_change_head_proto = {
        .arg2_type      = ARG_ANYTHING,
        .arg3_type      = ARG_ANYTHING,
 };
+
+BPF_CALL_1(bpf_xdp_get_buff_len, struct xdp_buff *, xdp)
+{
+       return xdp_get_buff_len(xdp);
+}
+
+static const struct bpf_func_proto bpf_xdp_get_buff_len_proto = {
+       .func           = bpf_xdp_get_buff_len,
+       .gpl_only       = false,
+       .ret_type       = RET_INTEGER,
+       .arg1_type      = ARG_PTR_TO_CTX,
+};
+
+BTF_ID_LIST_SINGLE(bpf_xdp_get_buff_len_bpf_ids, struct, xdp_buff)
+
+const struct bpf_func_proto bpf_xdp_get_buff_len_trace_proto = {
+       .func           = bpf_xdp_get_buff_len,
+       .gpl_only       = false,
+       .arg1_type      = ARG_PTR_TO_BTF_ID,
+       .arg1_btf_id    = &bpf_xdp_get_buff_len_bpf_ids[0],
+};
+
 static unsigned long xdp_get_metalen(const struct xdp_buff *xdp)
 {
        return xdp_data_meta_unsupported(xdp) ? 0 :
@@ -3817,11 +3839,208 @@ static const struct bpf_func_proto bpf_xdp_adjust_head_proto = {
        .arg2_type      = ARG_ANYTHING,
 };
 
+static void bpf_xdp_copy_buf(struct xdp_buff *xdp, unsigned long off,
+                            void *buf, unsigned long len, bool flush)
+{
+       unsigned long ptr_len, ptr_off = 0;
+       skb_frag_t *next_frag, *end_frag;
+       struct skb_shared_info *sinfo;
+       void *src, *dst;
+       u8 *ptr_buf;
+
+       if (likely(xdp->data_end - xdp->data >= off + len)) {
+               src = flush ? buf : xdp->data + off;
+               dst = flush ? xdp->data + off : buf;
+               memcpy(dst, src, len);
+               return;
+       }
+
+       sinfo = xdp_get_shared_info_from_buff(xdp);
+       end_frag = &sinfo->frags[sinfo->nr_frags];
+       next_frag = &sinfo->frags[0];
+
+       ptr_len = xdp->data_end - xdp->data;
+       ptr_buf = xdp->data;
+
+       while (true) {
+               if (off < ptr_off + ptr_len) {
+                       unsigned long copy_off = off - ptr_off;
+                       unsigned long copy_len = min(len, ptr_len - copy_off);
+
+                       src = flush ? buf : ptr_buf + copy_off;
+                       dst = flush ? ptr_buf + copy_off : buf;
+                       memcpy(dst, src, copy_len);
+
+                       off += copy_len;
+                       len -= copy_len;
+                       buf += copy_len;
+               }
+
+               if (!len || next_frag == end_frag)
+                       break;
+
+               ptr_off += ptr_len;
+               ptr_buf = skb_frag_address(next_frag);
+               ptr_len = skb_frag_size(next_frag);
+               next_frag++;
+       }
+}
+
+static void *bpf_xdp_pointer(struct xdp_buff *xdp, u32 offset, u32 len)
+{
+       struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(xdp);
+       u32 size = xdp->data_end - xdp->data;
+       void *addr = xdp->data;
+       int i;
+
+       if (unlikely(offset > 0xffff || len > 0xffff))
+               return ERR_PTR(-EFAULT);
+
+       if (offset + len > xdp_get_buff_len(xdp))
+               return ERR_PTR(-EINVAL);
+
+       if (offset < size) /* linear area */
+               goto out;
+
+       offset -= size;
+       for (i = 0; i < sinfo->nr_frags; i++) { /* paged area */
+               u32 frag_size = skb_frag_size(&sinfo->frags[i]);
+
+               if (offset < frag_size) {
+                       addr = skb_frag_address(&sinfo->frags[i]);
+                       size = frag_size;
+                       break;
+               }
+               offset -= frag_size;
+       }
+out:
+       return offset + len < size ? addr + offset : NULL;
+}
+
+BPF_CALL_4(bpf_xdp_load_bytes, struct xdp_buff *, xdp, u32, offset,
+          void *, buf, u32, len)
+{
+       void *ptr;
+
+       ptr = bpf_xdp_pointer(xdp, offset, len);
+       if (IS_ERR(ptr))
+               return PTR_ERR(ptr);
+
+       if (!ptr)
+               bpf_xdp_copy_buf(xdp, offset, buf, len, false);
+       else
+               memcpy(buf, ptr, len);
+
+       return 0;
+}
+
+static const struct bpf_func_proto bpf_xdp_load_bytes_proto = {
+       .func           = bpf_xdp_load_bytes,
+       .gpl_only       = false,
+       .ret_type       = RET_INTEGER,
+       .arg1_type      = ARG_PTR_TO_CTX,
+       .arg2_type      = ARG_ANYTHING,
+       .arg3_type      = ARG_PTR_TO_UNINIT_MEM,
+       .arg4_type      = ARG_CONST_SIZE,
+};
+
+BPF_CALL_4(bpf_xdp_store_bytes, struct xdp_buff *, xdp, u32, offset,
+          void *, buf, u32, len)
+{
+       void *ptr;
+
+       ptr = bpf_xdp_pointer(xdp, offset, len);
+       if (IS_ERR(ptr))
+               return PTR_ERR(ptr);
+
+       if (!ptr)
+               bpf_xdp_copy_buf(xdp, offset, buf, len, true);
+       else
+               memcpy(ptr, buf, len);
+
+       return 0;
+}
+
+static const struct bpf_func_proto bpf_xdp_store_bytes_proto = {
+       .func           = bpf_xdp_store_bytes,
+       .gpl_only       = false,
+       .ret_type       = RET_INTEGER,
+       .arg1_type      = ARG_PTR_TO_CTX,
+       .arg2_type      = ARG_ANYTHING,
+       .arg3_type      = ARG_PTR_TO_UNINIT_MEM,
+       .arg4_type      = ARG_CONST_SIZE,
+};
+
+static int bpf_xdp_frags_increase_tail(struct xdp_buff *xdp, int offset)
+{
+       struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(xdp);
+       skb_frag_t *frag = &sinfo->frags[sinfo->nr_frags - 1];
+       struct xdp_rxq_info *rxq = xdp->rxq;
+       unsigned int tailroom;
+
+       if (!rxq->frag_size || rxq->frag_size > xdp->frame_sz)
+               return -EOPNOTSUPP;
+
+       tailroom = rxq->frag_size - skb_frag_size(frag) - skb_frag_off(frag);
+       if (unlikely(offset > tailroom))
+               return -EINVAL;
+
+       memset(skb_frag_address(frag) + skb_frag_size(frag), 0, offset);
+       skb_frag_size_add(frag, offset);
+       sinfo->xdp_frags_size += offset;
+
+       return 0;
+}
+
+static int bpf_xdp_frags_shrink_tail(struct xdp_buff *xdp, int offset)
+{
+       struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(xdp);
+       int i, n_frags_free = 0, len_free = 0;
+
+       if (unlikely(offset > (int)xdp_get_buff_len(xdp) - ETH_HLEN))
+               return -EINVAL;
+
+       for (i = sinfo->nr_frags - 1; i >= 0 && offset > 0; i--) {
+               skb_frag_t *frag = &sinfo->frags[i];
+               int shrink = min_t(int, offset, skb_frag_size(frag));
+
+               len_free += shrink;
+               offset -= shrink;
+
+               if (skb_frag_size(frag) == shrink) {
+                       struct page *page = skb_frag_page(frag);
+
+                       __xdp_return(page_address(page), &xdp->rxq->mem,
+                                    false, NULL);
+                       n_frags_free++;
+               } else {
+                       skb_frag_size_sub(frag, shrink);
+                       break;
+               }
+       }
+       sinfo->nr_frags -= n_frags_free;
+       sinfo->xdp_frags_size -= len_free;
+
+       if (unlikely(!sinfo->nr_frags)) {
+               xdp_buff_clear_frags_flag(xdp);
+               xdp->data_end -= offset;
+       }
+
+       return 0;
+}
+
 BPF_CALL_2(bpf_xdp_adjust_tail, struct xdp_buff *, xdp, int, offset)
 {
        void *data_hard_end = xdp_data_hard_end(xdp); /* use xdp->frame_sz */
        void *data_end = xdp->data_end + offset;
 
+       if (unlikely(xdp_buff_has_frags(xdp))) { /* non-linear xdp buff */
+               if (offset < 0)
+                       return bpf_xdp_frags_shrink_tail(xdp, -offset);
+
+               return bpf_xdp_frags_increase_tail(xdp, offset);
+       }
+
        /* Notice that xdp_data_hard_end has reserved some tailroom */
        if (unlikely(data_end > data_hard_end))
                return -EINVAL;
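
bpf_xdp_load_bytes()/bpf_xdp_store_bytes() give programs a frag-safe alternative to direct packet-pointer access: the copy helpers above walk the linear area and the fragments transparently. A hedged BPF-C sketch (assumes helper definitions generated from a kernel with this series, and BPF_F_XDP_HAS_FRAGS at load time where multi-buffer packets are expected):

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

SEC("xdp")
int rewrite_first_bytes(struct xdp_md *ctx)
{
        __u8 buf[6];

        /* works even when these bytes straddle a fragment boundary */
        if (bpf_xdp_load_bytes(ctx, 0, buf, sizeof(buf)))
                return XDP_DROP;

        buf[0] ^= 0x01;         /* flip a bit, purely for illustration */

        if (bpf_xdp_store_bytes(ctx, 0, buf, sizeof(buf)))
                return XDP_DROP;

        return XDP_PASS;
}

char _license[] SEC("license") = "GPL";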
@@ -4047,6 +4266,14 @@ int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp,
        struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
        enum bpf_map_type map_type = ri->map_type;
 
+       /* XDP_REDIRECT is not fully supported yet for xdp frags since
+        * not all XDP capable drivers can map non-linear xdp_frame in
+        * ndo_xdp_xmit.
+        */
+       if (unlikely(xdp_buff_has_frags(xdp) &&
+                    map_type != BPF_MAP_TYPE_CPUMAP))
+               return -EOPNOTSUPP;
+
        if (map_type == BPF_MAP_TYPE_XSKMAP)
                return __xdp_do_redirect_xsk(ri, dev, xdp, xdp_prog);
 
@@ -4590,10 +4817,12 @@ static const struct bpf_func_proto bpf_sk_ancestor_cgroup_id_proto = {
 };
 #endif
 
-static unsigned long bpf_xdp_copy(void *dst_buff, const void *src_buff,
+static unsigned long bpf_xdp_copy(void *dst, const void *ctx,
                                  unsigned long off, unsigned long len)
 {
-       memcpy(dst_buff, src_buff + off, len);
+       struct xdp_buff *xdp = (struct xdp_buff *)ctx;
+
+       bpf_xdp_copy_buf(xdp, off, dst, len, false);
        return 0;
 }
 
@@ -4604,11 +4833,11 @@ BPF_CALL_5(bpf_xdp_event_output, struct xdp_buff *, xdp, struct bpf_map *, map,
 
        if (unlikely(flags & ~(BPF_F_CTXLEN_MASK | BPF_F_INDEX_MASK)))
                return -EINVAL;
-       if (unlikely(!xdp ||
-                    xdp_size > (unsigned long)(xdp->data_end - xdp->data)))
+
+       if (unlikely(!xdp || xdp_size > xdp_get_buff_len(xdp)))
                return -EFAULT;
 
-       return bpf_event_output(map, flags, meta, meta_size, xdp->data,
+       return bpf_event_output(map, flags, meta, meta_size, xdp,
                                xdp_size, bpf_xdp_copy);
 }
 
@@ -4862,6 +5091,13 @@ static int _bpf_setsockopt(struct sock *sk, int level, int optname,
                case SO_REUSEPORT:
                        sk->sk_reuseport = valbool;
                        break;
+               case SO_TXREHASH:
+                       if (val < -1 || val > 1) {
+                               ret = -EINVAL;
+                               break;
+                       }
+                       sk->sk_txrehash = (u8)val;
+                       break;
                default:
                        ret = -EINVAL;
                }
@@ -5040,6 +5276,9 @@ static int _bpf_getsockopt(struct sock *sk, int level, int optname,
                case SO_REUSEPORT:
                        *((int *)optval) = sk->sk_reuseport;
                        break;
+               case SO_TXREHASH:
+                       *((int *)optval) = sk->sk_txrehash;
+                       break;
                default:
                        goto err_clear;
                }
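
Together with the uapi additions elsewhere in this merge, SO_TXREHASH is also reachable through plain setsockopt(); the BPF path above accepts the same -1/0/1 range. A hedged user-space sketch (the option value is 74 in this series' asm-generic socket.h; defined locally in case libc headers predate it):

#include <stdio.h>
#include <sys/socket.h>

#ifndef SO_TXREHASH
#define SO_TXREHASH 74
#endif

int main(void)
{
        int fd = socket(AF_INET, SOCK_STREAM, 0);
        int val = 1;    /* SOCK_TXREHASH_ENABLED */

        if (fd < 0) {
                perror("socket");
                return 1;
        }
        if (setsockopt(fd, SOL_SOCKET, SO_TXREHASH, &val, sizeof(val)))
                perror("setsockopt(SO_TXREHASH)");
        return 0;
}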
@@ -7533,6 +7772,12 @@ xdp_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
                return &bpf_xdp_redirect_map_proto;
        case BPF_FUNC_xdp_adjust_tail:
                return &bpf_xdp_adjust_tail_proto;
+       case BPF_FUNC_xdp_get_buff_len:
+               return &bpf_xdp_get_buff_len_proto;
+       case BPF_FUNC_xdp_load_bytes:
+               return &bpf_xdp_load_bytes_proto;
+       case BPF_FUNC_xdp_store_bytes:
+               return &bpf_xdp_store_bytes_proto;
        case BPF_FUNC_fib_lookup:
                return &bpf_xdp_fib_lookup_proto;
        case BPF_FUNC_check_mtu:
@@ -8030,6 +8275,7 @@ bool bpf_sock_is_valid_access(int off, int size, enum bpf_access_type type,
                              struct bpf_insn_access_aux *info)
 {
        const int size_default = sizeof(__u32);
+       int field_size;
 
        if (off < 0 || off >= sizeof(struct bpf_sock))
                return false;
@@ -8041,7 +8287,6 @@ bool bpf_sock_is_valid_access(int off, int size, enum bpf_access_type type,
        case offsetof(struct bpf_sock, family):
        case offsetof(struct bpf_sock, type):
        case offsetof(struct bpf_sock, protocol):
-       case offsetof(struct bpf_sock, dst_port):
        case offsetof(struct bpf_sock, src_port):
        case offsetof(struct bpf_sock, rx_queue_mapping):
        case bpf_ctx_range(struct bpf_sock, src_ip4):
@@ -8050,6 +8295,14 @@ bool bpf_sock_is_valid_access(int off, int size, enum bpf_access_type type,
        case bpf_ctx_range_till(struct bpf_sock, dst_ip6[0], dst_ip6[3]):
                bpf_ctx_record_field_size(info, size_default);
                return bpf_ctx_narrow_access_ok(off, size, size_default);
+       case bpf_ctx_range(struct bpf_sock, dst_port):
+               field_size = size == size_default ?
+                       size_default : sizeof_field(struct bpf_sock, dst_port);
+               bpf_ctx_record_field_size(info, field_size);
+               return bpf_ctx_narrow_access_ok(off, size, field_size);
+       case offsetofend(struct bpf_sock, dst_port) ...
+            offsetof(struct bpf_sock, dst_ip4) - 1:
+               return false;
        }
 
        return size == size_default;
@@ -10062,7 +10315,6 @@ const struct bpf_verifier_ops tc_cls_act_verifier_ops = {
        .convert_ctx_access     = tc_cls_act_convert_ctx_access,
        .gen_prologue           = tc_cls_act_prologue,
        .gen_ld_abs             = bpf_gen_ld_abs,
-       .check_kfunc_call       = bpf_prog_test_check_kfunc_call,
 };
 
 const struct bpf_prog_ops tc_cls_act_prog_ops = {
@@ -10601,7 +10853,8 @@ static bool sk_lookup_is_valid_access(int off, int size,
        case bpf_ctx_range(struct bpf_sk_lookup, local_ip4):
        case bpf_ctx_range_till(struct bpf_sk_lookup, remote_ip6[0], remote_ip6[3]):
        case bpf_ctx_range_till(struct bpf_sk_lookup, local_ip6[0], local_ip6[3]):
-       case bpf_ctx_range(struct bpf_sk_lookup, remote_port):
+       case offsetof(struct bpf_sk_lookup, remote_port) ...
+            offsetof(struct bpf_sk_lookup, local_ip4) - 1:
        case bpf_ctx_range(struct bpf_sk_lookup, local_port):
        case bpf_ctx_range(struct bpf_sk_lookup, ingress_ifindex):
                bpf_ctx_record_field_size(info, sizeof(__u32));
index a11b286..ee5e7e8 100644
@@ -459,29 +459,22 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff
 
                skb_set_network_header(skb, skb_gro_offset(skb));
                skb_reset_mac_len(skb);
-               NAPI_GRO_CB(skb)->same_flow = 0;
+               BUILD_BUG_ON(sizeof_field(struct napi_gro_cb, zeroed) != sizeof(u32));
+               BUILD_BUG_ON(!IS_ALIGNED(offsetof(struct napi_gro_cb, zeroed),
+                                        sizeof(u32))); /* Avoid slow unaligned acc */
+               *(u32 *)&NAPI_GRO_CB(skb)->zeroed = 0;
                NAPI_GRO_CB(skb)->flush = skb_is_gso(skb) || skb_has_frag_list(skb);
-               NAPI_GRO_CB(skb)->free = 0;
-               NAPI_GRO_CB(skb)->encap_mark = 0;
-               NAPI_GRO_CB(skb)->recursion_counter = 0;
-               NAPI_GRO_CB(skb)->is_fou = 0;
                NAPI_GRO_CB(skb)->is_atomic = 1;
-               NAPI_GRO_CB(skb)->gro_remcsum_start = 0;
 
                /* Setup for GRO checksum validation */
                switch (skb->ip_summed) {
                case CHECKSUM_COMPLETE:
                        NAPI_GRO_CB(skb)->csum = skb->csum;
                        NAPI_GRO_CB(skb)->csum_valid = 1;
-                       NAPI_GRO_CB(skb)->csum_cnt = 0;
                        break;
                case CHECKSUM_UNNECESSARY:
                        NAPI_GRO_CB(skb)->csum_cnt = skb->csum_level + 1;
-                       NAPI_GRO_CB(skb)->csum_valid = 0;
                        break;
-               default:
-                       NAPI_GRO_CB(skb)->csum_cnt = 0;
-                       NAPI_GRO_CB(skb)->csum_valid = 0;
                }
 
                pp = INDIRECT_CALL_INET(ptype->callbacks.gro_receive,
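
The dev_gro_receive() rewrite replaces a series of per-field clears with one aligned 32-bit store over a `zeroed` sub-struct of napi_gro_cb; the two BUILD_BUG_ON()s pin that region's size and alignment at compile time. A stand-alone sketch of the grouping idiom (the field layout is illustrative, not the real napi_gro_cb):

#include <assert.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct gro_cb_like {
        struct {                        /* everything cleared per packet */
                uint8_t same_flow:1;
                uint8_t encap_mark:1;
                uint8_t csum_valid:1;
                uint8_t is_fou:1;
                uint8_t csum_cnt:3;
                uint8_t free:2;
                uint16_t gro_remcsum_start;
        } zeroed;
        uint16_t flush;
        uint8_t is_atomic;
};

int main(void)
{
        struct gro_cb_like cb;

        /* compile-time guards, like the BUILD_BUG_ON()s above */
        static_assert(sizeof(cb.zeroed) == sizeof(uint32_t), "size");
        static_assert(offsetof(struct gro_cb_like, zeroed)
                      % sizeof(uint32_t) == 0, "alignment");

        memcpy(&cb.zeroed, &(uint32_t){ 0 }, sizeof(uint32_t)); /* one store */
        cb.is_atomic = 1;
        printf("cleared %zu bytes at once\n", sizeof(cb.zeroed));
        return 0;
}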
@@ -634,7 +627,6 @@ static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
 
        skb->encapsulation = 0;
        skb_shinfo(skb)->gso_type = 0;
-       skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
        if (unlikely(skb->slow_gro)) {
                skb_orphan(skb);
                skb_ext_reset(skb);
index b0f5344..95098d1 100644
@@ -166,10 +166,10 @@ static void linkwatch_do_dev(struct net_device *dev)
 
                netdev_state_change(dev);
        }
-       /* Note: our callers are responsible for
-        * calling netdev_tracker_free().
+       /* Note: our callers are responsible for calling netdev_tracker_free().
+        * This is the reason we use __dev_put() instead of dev_put().
         */
-       dev_put(dev);
+       __dev_put(dev);
 }
 
 static void __linkwatch_run_queue(int urgent_only)
index a5b5bb9..0ec2f59 100644 (file)
@@ -44,13 +44,7 @@ EXPORT_SYMBOL_GPL(net_rwsem);
 static struct key_tag init_net_key_domain = { .usage = REFCOUNT_INIT(1) };
 #endif
 
-struct net init_net = {
-       .ns.count       = REFCOUNT_INIT(1),
-       .dev_base_head  = LIST_HEAD_INIT(init_net.dev_base_head),
-#ifdef CONFIG_KEYS
-       .key_domain     = &init_net_key_domain,
-#endif
-};
+struct net init_net;
 EXPORT_SYMBOL(init_net);
 
 static bool init_net_initialized;
@@ -301,6 +295,7 @@ struct net *get_net_ns_by_id(const struct net *net, int id)
 
        return peer;
 }
+EXPORT_SYMBOL_GPL(get_net_ns_by_id);
 
 /*
  * setup_net runs the initializers for the network namespace object.
@@ -363,6 +358,8 @@ out_undo:
 static int __net_init net_defaults_init_net(struct net *net)
 {
        net->core.sysctl_somaxconn = SOMAXCONN;
+       net->core.sysctl_txrehash = SOCK_TXREHASH_ENABLED;
+
        return 0;
 }
 
@@ -1084,7 +1081,7 @@ out:
        rtnl_set_sk_err(net, RTNLGRP_NSID, err);
 }
 
-static int __init net_ns_init(void)
+void __init net_ns_init(void)
 {
        struct net_generic *ng;
 
@@ -1105,6 +1102,9 @@ static int __init net_ns_init(void)
 
        rcu_assign_pointer(init_net.gen, ng);
 
+#ifdef CONFIG_KEYS
+       init_net.key_domain = &init_net_key_domain;
+#endif
        down_write(&pernet_ops_rwsem);
        if (setup_net(&init_net, &init_user_ns))
                panic("Could not setup the initial network namespace");
@@ -1119,12 +1119,8 @@ static int __init net_ns_init(void)
                      RTNL_FLAG_DOIT_UNLOCKED);
        rtnl_register(PF_UNSPEC, RTM_GETNSID, rtnl_net_getid, rtnl_net_dumpid,
                      RTNL_FLAG_DOIT_UNLOCKED);
-
-       return 0;
 }
 
-pure_initcall(net_ns_init);
-
 static void free_exit_list(struct pernet_operations *ops, struct list_head *net_exit_list)
 {
        ops_pre_exit_list(ops, net_exit_list);
index bd62c01..e25d359 100644 (file)
@@ -423,11 +423,6 @@ static __always_inline struct page *
 __page_pool_put_page(struct page_pool *pool, struct page *page,
                     unsigned int dma_sync_size, bool allow_direct)
 {
-       /* It is not the last user for the page frag case */
-       if (pool->p.flags & PP_FLAG_PAGE_FRAG &&
-           page_pool_atomic_sub_frag_count_return(page, 1))
-               return NULL;
-
        /* This allocator is optimized for the XDP mode that uses
         * one-frame-per-page, but have fallbacks that act like the
         * regular page allocator APIs.
@@ -471,8 +466,8 @@ __page_pool_put_page(struct page_pool *pool, struct page *page,
        return NULL;
 }
 
-void page_pool_put_page(struct page_pool *pool, struct page *page,
-                       unsigned int dma_sync_size, bool allow_direct)
+void page_pool_put_defragged_page(struct page_pool *pool, struct page *page,
+                                 unsigned int dma_sync_size, bool allow_direct)
 {
        page = __page_pool_put_page(pool, page, dma_sync_size, allow_direct);
        if (page && !page_pool_recycle_in_ring(pool, page)) {
@@ -480,7 +475,7 @@ void page_pool_put_page(struct page_pool *pool, struct page *page,
                page_pool_return_page(pool, page);
        }
 }
-EXPORT_SYMBOL(page_pool_put_page);
+EXPORT_SYMBOL(page_pool_put_defragged_page);
 
 /* Caller must not use data area after call, as this function overwrites it */
 void page_pool_put_page_bulk(struct page_pool *pool, void **data,
@@ -491,6 +486,10 @@ void page_pool_put_page_bulk(struct page_pool *pool, void **data,
        for (i = 0; i < count; i++) {
                struct page *page = virt_to_head_page(data[i]);
 
+               /* It is not the last user for the page frag case */
+               if (!page_pool_is_last_frag(pool, page))
+                       continue;
+
                page = __page_pool_put_page(pool, page, -1, false);
                /* Approved for bulk recycling in ptr_ring cache */
                if (page)
@@ -526,8 +525,7 @@ static struct page *page_pool_drain_frag(struct page_pool *pool,
        long drain_count = BIAS_MAX - pool->frag_users;
 
        /* Some user is still using the page frag */
-       if (likely(page_pool_atomic_sub_frag_count_return(page,
-                                                         drain_count)))
+       if (likely(page_pool_defrag_page(page, drain_count)))
                return NULL;
 
        if (page_ref_count(page) == 1 && !page_is_pfmemalloc(page)) {
@@ -548,8 +546,7 @@ static void page_pool_free_frag(struct page_pool *pool)
 
        pool->frag_page = NULL;
 
-       if (!page ||
-           page_pool_atomic_sub_frag_count_return(page, drain_count))
+       if (!page || page_pool_defrag_page(page, drain_count))
                return;
 
        page_pool_return_page(pool, page);
@@ -588,7 +585,7 @@ frag_reset:
                pool->frag_users = 1;
                *offset = 0;
                pool->frag_offset = size;
-               page_pool_set_frag_count(page, BIAS_MAX);
+               page_pool_fragment_page(page, BIAS_MAX);
                return page;
        }
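
page_pool_fragment_page() and page_pool_defrag_page() implement a bias trick:
the frag count is pre-charged with a large constant (BIAS_MAX) so handing out
fragments needs no atomic per frag, and the unused remainder is subtracted
when the pool retires the page; whoever brings the count to zero is the last
user. A minimal userspace sketch of that counting scheme (names are
illustrative):

#include <stdatomic.h>
#include <stdio.h>

#define BIAS_MAX 1024L

struct frag_page {
        _Atomic long frag_count;
};

/* Pre-charge the page; no atomics needed while handing out frags. */
static void fragment_page(struct frag_page *p, long bias)
{
        atomic_store_explicit(&p->frag_count, bias, memory_order_release);
}

/* Drop 'nr' references; returns the remaining count (0 => last user). */
static long defrag_page(struct frag_page *p, long nr)
{
        return atomic_fetch_sub_explicit(&p->frag_count, nr,
                                         memory_order_acq_rel) - nr;
}

int main(void)
{
        struct frag_page page;
        long users = 3;         /* frags actually handed out */

        fragment_page(&page, BIAS_MAX);
        /* On retirement, subtract the bias the pool never used. */
        printf("after drain: %ld users left\n",
               defrag_page(&page, BIAS_MAX - users));

        /* Each user drops one reference; zero means the page can go back. */
        while (users--)
                if (defrag_page(&page, 1) == 0)
                        printf("last user: page can be recycled\n");
        return 0;
}
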
 
index 710da8a..a6fad3d 100644 (file)
@@ -459,7 +459,7 @@ static void rtnl_lock_unregistering_all(void)
                 * setup_net() and cleanup_net() are not possible.
                 */
                for_each_net(net) {
-                       if (net->dev_unreg_count > 0) {
+                       if (atomic_read(&net->dev_unreg_count) > 0) {
                                unregistering = true;
                                break;
                        }
index 4ff806d..09d31a7 100644 (file)
@@ -1447,6 +1447,15 @@ set_sndbuf:
                break;
        }
 
+       case SO_TXREHASH:
+               if (val < -1 || val > 1) {
+                       ret = -EINVAL;
+                       break;
+               }
+               /* Paired with READ_ONCE() in tcp_rtx_synack() */
+               WRITE_ONCE(sk->sk_txrehash, (u8)val);
+               break;
+
        default:
                ret = -ENOPROTOOPT;
                break;
@@ -1834,6 +1843,10 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
                v.val = sk->sk_reserved_mem;
                break;
 
+       case SO_TXREHASH:
+               v.val = sk->sk_txrehash;
+               break;
+
        default:
                /* We implement the SO_SNDLOWAT etc to not be settable
                 * (1003.1g 7).
@@ -2266,6 +2279,7 @@ void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
                        sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
                        /* pairs with the WRITE_ONCE() in netif_set_gso_max_size() */
                        sk->sk_gso_max_size = READ_ONCE(dst->dev->gso_max_size);
+                       sk->sk_gso_max_size -= (MAX_TCP_HEADER + 1);
                        /* pairs with the WRITE_ONCE() in netif_set_gso_max_segs() */
                        max_segs = max_t(u32, READ_ONCE(dst->dev->gso_max_segs), 1);
                }
@@ -2611,7 +2625,8 @@ int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg,
 
        switch (cmsg->cmsg_type) {
        case SO_MARK:
-               if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
+               if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) &&
+                   !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
                        return -EPERM;
                if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
                        return -EINVAL;
@@ -3278,6 +3293,7 @@ void sock_init_data(struct socket *sock, struct sock *sk)
        sk->sk_pacing_rate = ~0UL;
        WRITE_ONCE(sk->sk_pacing_shift, 10);
        sk->sk_incoming_cpu = -1;
+       sk->sk_txrehash = SOCK_TXREHASH_DEFAULT;
 
        sk_rx_queue_clear(sk);
        /*
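
The SO_TXREHASH hunks accept -1, 0 or 1, where -1 (SOCK_TXREHASH_DEFAULT)
defers to the new net.core.txrehash sysctl. A minimal userspace sketch of
exercising the option; the fallback value 74 is the asm-generic constant
(alpha, mips, parisc and sparc number their socket options differently), and
kernels without this patch return ENOPROTOOPT:

#include <stdio.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <unistd.h>

#ifndef SO_TXREHASH
#define SO_TXREHASH 74  /* asm-generic value; arch headers may differ */
#endif

int main(void)
{
        int fd = socket(AF_INET, SOCK_STREAM, 0);
        int val = 1;    /* 1 = enable, 0 = disable, -1 = sysctl default */
        socklen_t len = sizeof(val);

        if (fd < 0) {
                perror("socket");
                return 1;
        }
        if (setsockopt(fd, SOL_SOCKET, SO_TXREHASH, &val, sizeof(val)) < 0)
                perror("setsockopt(SO_TXREHASH)");
        if (getsockopt(fd, SOL_SOCKET, SO_TXREHASH, &val, &len) == 0)
                printf("txrehash = %d\n", val);
        close(fd);
        return 0;
}
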
index 1827669..2d213c4 100644 (file)
@@ -1416,38 +1416,50 @@ static struct sk_psock_progs *sock_map_progs(struct bpf_map *map)
        return NULL;
 }
 
-static int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog,
-                               struct bpf_prog *old, u32 which)
+static int sock_map_prog_lookup(struct bpf_map *map, struct bpf_prog ***pprog,
+                               u32 which)
 {
        struct sk_psock_progs *progs = sock_map_progs(map);
-       struct bpf_prog **pprog;
 
        if (!progs)
                return -EOPNOTSUPP;
 
        switch (which) {
        case BPF_SK_MSG_VERDICT:
-               pprog = &progs->msg_parser;
+               *pprog = &progs->msg_parser;
                break;
 #if IS_ENABLED(CONFIG_BPF_STREAM_PARSER)
        case BPF_SK_SKB_STREAM_PARSER:
-               pprog = &progs->stream_parser;
+               *pprog = &progs->stream_parser;
                break;
 #endif
        case BPF_SK_SKB_STREAM_VERDICT:
                if (progs->skb_verdict)
                        return -EBUSY;
-               pprog = &progs->stream_verdict;
+               *pprog = &progs->stream_verdict;
                break;
        case BPF_SK_SKB_VERDICT:
                if (progs->stream_verdict)
                        return -EBUSY;
-               pprog = &progs->skb_verdict;
+               *pprog = &progs->skb_verdict;
                break;
        default:
                return -EOPNOTSUPP;
        }
 
+       return 0;
+}
+
+static int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog,
+                               struct bpf_prog *old, u32 which)
+{
+       struct bpf_prog **pprog;
+       int ret;
+
+       ret = sock_map_prog_lookup(map, &pprog, which);
+       if (ret)
+               return ret;
+
        if (old)
                return psock_replace_prog(pprog, prog, old);
 
@@ -1455,6 +1467,57 @@ static int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog,
        return 0;
 }
 
+int sock_map_bpf_prog_query(const union bpf_attr *attr,
+                           union bpf_attr __user *uattr)
+{
+       __u32 __user *prog_ids = u64_to_user_ptr(attr->query.prog_ids);
+       u32 prog_cnt = 0, flags = 0, ufd = attr->target_fd;
+       struct bpf_prog **pprog;
+       struct bpf_prog *prog;
+       struct bpf_map *map;
+       struct fd f;
+       u32 id = 0;
+       int ret;
+
+       if (attr->query.query_flags)
+               return -EINVAL;
+
+       f = fdget(ufd);
+       map = __bpf_map_get(f);
+       if (IS_ERR(map))
+               return PTR_ERR(map);
+
+       rcu_read_lock();
+
+       ret = sock_map_prog_lookup(map, &pprog, attr->query.attach_type);
+       if (ret)
+               goto end;
+
+       prog = *pprog;
+       prog_cnt = !prog ? 0 : 1;
+
+       if (!attr->query.prog_cnt || !prog_ids || !prog_cnt)
+               goto end;
+
+       /* We do not hold the refcnt; the bpf prog may be released
+        * asynchronously, in which case its id is set to 0.
+        */
+       id = data_race(prog->aux->id);
+       if (id == 0)
+               prog_cnt = 0;
+
+end:
+       rcu_read_unlock();
+
+       if (copy_to_user(&uattr->query.attach_flags, &flags, sizeof(flags)) ||
+           (id != 0 && copy_to_user(prog_ids, &id, sizeof(u32))) ||
+           copy_to_user(&uattr->query.prog_cnt, &prog_cnt, sizeof(prog_cnt)))
+               ret = -EFAULT;
+
+       fdput(f);
+       return ret;
+}
+
 static void sock_map_unlink(struct sock *sk, struct sk_psock_link *link)
 {
        switch (link->map->map_type) {
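
sock_map_bpf_prog_query() above makes BPF_PROG_QUERY work on sockmap/sockhash
maps. A sketch of driving it from userspace via libbpf's bpf_prog_query()
wrapper; map_fd is assumed to be an existing BPF_MAP_TYPE_SOCKMAP descriptor
created elsewhere:

#include <stdio.h>
#include <bpf/bpf.h>

static void query_sockmap_prog(int map_fd)
{
        __u32 prog_ids[1] = { 0 };
        __u32 prog_cnt = 1;     /* in: array capacity, out: prog count */
        __u32 attach_flags = 0;
        int err;

        err = bpf_prog_query(map_fd, BPF_SK_SKB_STREAM_VERDICT, 0,
                             &attach_flags, prog_ids, &prog_cnt);
        if (err) {
                fprintf(stderr, "bpf_prog_query: %d\n", err);
                return;
        }
        if (prog_cnt)
                printf("stream verdict prog id: %u\n", prog_ids[0]);
        else
                printf("no stream verdict program attached\n");
}
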
index 7b4d485..dbeb8ec 100644 (file)
@@ -593,6 +593,15 @@ static struct ctl_table netns_core_table[] = {
                .extra1         = SYSCTL_ZERO,
                .proc_handler   = proc_dointvec_minmax
        },
+       {
+               .procname       = "txrehash",
+               .data           = &init_net.core.sysctl_txrehash,
+               .maxlen         = sizeof(u8),
+               .mode           = 0644,
+               .extra1         = SYSCTL_ZERO,
+               .extra2         = SYSCTL_ONE,
+               .proc_handler   = proc_dou8vec_minmax,
+       },
        { }
 };
 
@@ -611,7 +620,7 @@ __setup("fb_tunnels=", fb_tunnels_only_for_init_net_sysctl_setup);
 
 static __net_init int sysctl_core_net_init(struct net *net)
 {
-       struct ctl_table *tbl;
+       struct ctl_table *tbl, *tmp;
 
        tbl = netns_core_table;
        if (!net_eq(net, &init_net)) {
@@ -619,7 +628,8 @@ static __net_init int sysctl_core_net_init(struct net *net)
                if (tbl == NULL)
                        goto err_dup;
 
-               tbl[0].data = &net->core.sysctl_somaxconn;
+               for (tmp = tbl; tmp->procname; tmp++)
+                       tmp->data += (char *)net - (char *)&init_net;
 
                /* Don't export any sysctls to unprivileged users */
                if (net->user_ns != &init_user_ns) {
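
The sysctl_core_net_init() loop rebases every entry's data pointer by the byte
distance between the per-netns struct and init_net, rather than patching
entries by index, which silently broke each time a new entry such as txrehash
was added. A userspace sketch of the rebasing idiom with invented struct names
(the cross-object pointer arithmetic is the same trick the kernel relies on):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct ns {
        int somaxconn;
        unsigned char txrehash;
};

struct ctl {
        const char *procname;
        void *data;
};

static struct ns init_ns = { .somaxconn = 4096, .txrehash = 1 };

/* Template table: every data pointer targets a field of init_ns. */
static struct ctl template[] = {
        { "somaxconn", &init_ns.somaxconn },
        { "txrehash",  &init_ns.txrehash  },
        { NULL, NULL },
};

int main(void)
{
        struct ns other = { .somaxconn = 128, .txrehash = 0 };
        struct ctl *tbl = malloc(sizeof(template));

        memcpy(tbl, template, sizeof(template));
        /* Rebase all entries from init_ns to other in one loop. */
        for (struct ctl *t = tbl; t->procname; t++)
                t->data = (char *)t->data +
                          ((char *)&other - (char *)&init_ns);

        printf("somaxconn=%d\n", *(int *)tbl[0].data);  /* prints 128 */
        free(tbl);
        return 0;
}
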
index 7aba355..361df31 100644 (file)
@@ -162,8 +162,9 @@ static void xdp_rxq_info_init(struct xdp_rxq_info *xdp_rxq)
 }
 
 /* Returns 0 on success, negative on failure */
-int xdp_rxq_info_reg(struct xdp_rxq_info *xdp_rxq,
-                    struct net_device *dev, u32 queue_index, unsigned int napi_id)
+int __xdp_rxq_info_reg(struct xdp_rxq_info *xdp_rxq,
+                      struct net_device *dev, u32 queue_index,
+                      unsigned int napi_id, u32 frag_size)
 {
        if (!dev) {
                WARN(1, "Missing net_device from driver");
@@ -185,11 +186,12 @@ int xdp_rxq_info_reg(struct xdp_rxq_info *xdp_rxq,
        xdp_rxq->dev = dev;
        xdp_rxq->queue_index = queue_index;
        xdp_rxq->napi_id = napi_id;
+       xdp_rxq->frag_size = frag_size;
 
        xdp_rxq->reg_state = REG_STATE_REGISTERED;
        return 0;
 }
-EXPORT_SYMBOL_GPL(xdp_rxq_info_reg);
+EXPORT_SYMBOL_GPL(__xdp_rxq_info_reg);
 
 void xdp_rxq_info_unused(struct xdp_rxq_info *xdp_rxq)
 {
@@ -369,8 +371,8 @@ EXPORT_SYMBOL_GPL(xdp_rxq_info_reg_mem_model);
 * is used for those call sites, thus allowing for faster recycling
  * of xdp_frames/pages in those cases.
  */
-static void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct,
-                        struct xdp_buff *xdp)
+void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct,
+                 struct xdp_buff *xdp)
 {
        struct xdp_mem_allocator *xa;
        struct page *page;
@@ -406,12 +408,38 @@ static void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct,
 
 void xdp_return_frame(struct xdp_frame *xdpf)
 {
+       struct skb_shared_info *sinfo;
+       int i;
+
+       if (likely(!xdp_frame_has_frags(xdpf)))
+               goto out;
+
+       sinfo = xdp_get_shared_info_from_frame(xdpf);
+       for (i = 0; i < sinfo->nr_frags; i++) {
+               struct page *page = skb_frag_page(&sinfo->frags[i]);
+
+               __xdp_return(page_address(page), &xdpf->mem, false, NULL);
+       }
+out:
        __xdp_return(xdpf->data, &xdpf->mem, false, NULL);
 }
 EXPORT_SYMBOL_GPL(xdp_return_frame);
 
 void xdp_return_frame_rx_napi(struct xdp_frame *xdpf)
 {
+       struct skb_shared_info *sinfo;
+       int i;
+
+       if (likely(!xdp_frame_has_frags(xdpf)))
+               goto out;
+
+       sinfo = xdp_get_shared_info_from_frame(xdpf);
+       for (i = 0; i < sinfo->nr_frags; i++) {
+               struct page *page = skb_frag_page(&sinfo->frags[i]);
+
+               __xdp_return(page_address(page), &xdpf->mem, true, NULL);
+       }
+out:
        __xdp_return(xdpf->data, &xdpf->mem, true, NULL);
 }
 EXPORT_SYMBOL_GPL(xdp_return_frame_rx_napi);
@@ -447,7 +475,7 @@ void xdp_return_frame_bulk(struct xdp_frame *xdpf,
        struct xdp_mem_allocator *xa;
 
        if (mem->type != MEM_TYPE_PAGE_POOL) {
-               __xdp_return(xdpf->data, &xdpf->mem, false, NULL);
+               xdp_return_frame(xdpf);
                return;
        }
 
@@ -466,12 +494,38 @@ void xdp_return_frame_bulk(struct xdp_frame *xdpf,
                bq->xa = rhashtable_lookup(mem_id_ht, &mem->id, mem_id_rht_params);
        }
 
+       if (unlikely(xdp_frame_has_frags(xdpf))) {
+               struct skb_shared_info *sinfo;
+               int i;
+
+               sinfo = xdp_get_shared_info_from_frame(xdpf);
+               for (i = 0; i < sinfo->nr_frags; i++) {
+                       skb_frag_t *frag = &sinfo->frags[i];
+
+                       bq->q[bq->count++] = skb_frag_address(frag);
+                       if (bq->count == XDP_BULK_QUEUE_SIZE)
+                               xdp_flush_frame_bulk(bq);
+               }
+       }
        bq->q[bq->count++] = xdpf->data;
 }
 EXPORT_SYMBOL_GPL(xdp_return_frame_bulk);
 
 void xdp_return_buff(struct xdp_buff *xdp)
 {
+       struct skb_shared_info *sinfo;
+       int i;
+
+       if (likely(!xdp_buff_has_frags(xdp)))
+               goto out;
+
+       sinfo = xdp_get_shared_info_from_buff(xdp);
+       for (i = 0; i < sinfo->nr_frags; i++) {
+               struct page *page = skb_frag_page(&sinfo->frags[i]);
+
+               __xdp_return(page_address(page), &xdp->rxq->mem, true, xdp);
+       }
+out:
        __xdp_return(xdp->data, &xdp->rxq->mem, true, xdp);
 }
 
@@ -561,8 +615,14 @@ struct sk_buff *__xdp_build_skb_from_frame(struct xdp_frame *xdpf,
                                           struct sk_buff *skb,
                                           struct net_device *dev)
 {
+       struct skb_shared_info *sinfo = xdp_get_shared_info_from_frame(xdpf);
        unsigned int headroom, frame_size;
        void *hard_start;
+       u8 nr_frags;
+
+       /* xdp frags frame */
+       if (unlikely(xdp_frame_has_frags(xdpf)))
+               nr_frags = sinfo->nr_frags;
 
        /* Part of headroom was reserved to xdpf */
        headroom = sizeof(*xdpf) + xdpf->headroom;
@@ -582,6 +642,12 @@ struct sk_buff *__xdp_build_skb_from_frame(struct xdp_frame *xdpf,
        if (xdpf->metasize)
                skb_metadata_set(skb, xdpf->metasize);
 
+       if (unlikely(xdp_frame_has_frags(xdpf)))
+               xdp_update_skb_shared_info(skb, nr_frags,
+                                          sinfo->xdp_frags_size,
+                                          nr_frags * xdpf->frame_sz,
+                                          xdp_frame_is_frag_pfmemalloc(xdpf));
+
        /* Essential SKB info: protocol and skb->dev */
        skb->protocol = eth_type_trans(skb, dev);
 
index 5183e62..671c377 100644 (file)
@@ -136,11 +136,6 @@ static inline int between48(const u64 seq1, const u64 seq2, const u64 seq3)
        return (seq3 << 16) - (seq2 << 16) >= (seq1 << 16) - (seq2 << 16);
 }
 
-static inline u64 max48(const u64 seq1, const u64 seq2)
-{
-       return after48(seq1, seq2) ? seq1 : seq2;
-}
-
 /**
  * dccp_loss_count - Approximate the number of lost data packets in a burst loss
  * @s1:  last known sequence number before the loss ('hole')
index 0ea2927..ae66256 100644 (file)
@@ -1030,15 +1030,9 @@ static void __net_exit dccp_v4_exit_net(struct net *net)
        inet_ctl_sock_destroy(pn->v4_ctl_sk);
 }
 
-static void __net_exit dccp_v4_exit_batch(struct list_head *net_exit_list)
-{
-       inet_twsk_purge(&dccp_hashinfo, AF_INET);
-}
-
 static struct pernet_operations dccp_v4_ops = {
        .init   = dccp_v4_init_net,
        .exit   = dccp_v4_exit_net,
-       .exit_batch = dccp_v4_exit_batch,
        .id     = &dccp_v4_pernet_id,
        .size   = sizeof(struct dccp_v4_pernet),
 };
index fa66351..eab3bd1 100644 (file)
@@ -1115,15 +1115,9 @@ static void __net_exit dccp_v6_exit_net(struct net *net)
        inet_ctl_sock_destroy(pn->v6_ctl_sk);
 }
 
-static void __net_exit dccp_v6_exit_batch(struct list_head *net_exit_list)
-{
-       inet_twsk_purge(&dccp_hashinfo, AF_INET6);
-}
-
 static struct pernet_operations dccp_v6_ops = {
        .init   = dccp_v6_init_net,
        .exit   = dccp_v6_exit_net,
-       .exit_batch = dccp_v6_exit_batch,
        .id     = &dccp_v6_pernet_id,
        .size   = sizeof(struct dccp_v6_pernet),
 };
index 91e7a22..64d805b 100644 (file)
@@ -22,6 +22,7 @@
 #include "feat.h"
 
 struct inet_timewait_death_row dccp_death_row = {
+       .tw_refcount = REFCOUNT_INIT(1),
        .sysctl_max_tw_buckets = NR_FILE * 2,
        .hashinfo       = &dccp_hashinfo,
 };
index dcad310..e498c92 100644 (file)
@@ -15,6 +15,7 @@
 #include <linux/of.h>
 #include <linux/of_net.h>
 #include <net/devlink.h>
+#include <net/sch_generic.h>
 
 #include "dsa_priv.h"
 
@@ -1064,9 +1065,18 @@ static int dsa_tree_setup_master(struct dsa_switch_tree *dst)
 
        list_for_each_entry(dp, &dst->ports, list) {
                if (dsa_port_is_cpu(dp)) {
-                       err = dsa_master_setup(dp->master, dp);
+                       struct net_device *master = dp->master;
+                       bool admin_up = (master->flags & IFF_UP) &&
+                                       !qdisc_tx_is_noop(master);
+
+                       err = dsa_master_setup(master, dp);
                        if (err)
                                return err;
+
+                       /* Replay master state event */
+                       dsa_tree_master_admin_state_change(dst, master, admin_up);
+                       dsa_tree_master_oper_state_change(dst, master,
+                                                         netif_oper_up(master));
                }
        }
 
@@ -1081,9 +1091,19 @@ static void dsa_tree_teardown_master(struct dsa_switch_tree *dst)
 
        rtnl_lock();
 
-       list_for_each_entry(dp, &dst->ports, list)
-               if (dsa_port_is_cpu(dp))
-                       dsa_master_teardown(dp->master);
+       list_for_each_entry(dp, &dst->ports, list) {
+               if (dsa_port_is_cpu(dp)) {
+                       struct net_device *master = dp->master;
+
+                       /* Synthesizing an "admin down" state is sufficient for
+                        * the switches to get a notification if the master is
+                        * currently up and running.
+                        */
+                       dsa_tree_master_admin_state_change(dst, master, false);
+
+                       dsa_master_teardown(master);
+               }
+       }
 
        rtnl_unlock();
 }
@@ -1279,6 +1299,52 @@ out_unlock:
        return err;
 }
 
+static void dsa_tree_master_state_change(struct dsa_switch_tree *dst,
+                                        struct net_device *master)
+{
+       struct dsa_notifier_master_state_info info;
+       struct dsa_port *cpu_dp = master->dsa_ptr;
+
+       info.master = master;
+       info.operational = dsa_port_master_is_operational(cpu_dp);
+
+       dsa_tree_notify(dst, DSA_NOTIFIER_MASTER_STATE_CHANGE, &info);
+}
+
+void dsa_tree_master_admin_state_change(struct dsa_switch_tree *dst,
+                                       struct net_device *master,
+                                       bool up)
+{
+       struct dsa_port *cpu_dp = master->dsa_ptr;
+       bool notify = false;
+
+       if ((dsa_port_master_is_operational(cpu_dp)) !=
+           (up && cpu_dp->master_oper_up))
+               notify = true;
+
+       cpu_dp->master_admin_up = up;
+
+       if (notify)
+               dsa_tree_master_state_change(dst, master);
+}
+
+void dsa_tree_master_oper_state_change(struct dsa_switch_tree *dst,
+                                      struct net_device *master,
+                                      bool up)
+{
+       struct dsa_port *cpu_dp = master->dsa_ptr;
+       bool notify = false;
+
+       if ((dsa_port_master_is_operational(cpu_dp)) !=
+           (cpu_dp->master_admin_up && up))
+               notify = true;
+
+       cpu_dp->master_oper_up = up;
+
+       if (notify)
+               dsa_tree_master_state_change(dst, master);
+}
+
 static struct dsa_port *dsa_port_touch(struct dsa_switch *ds, int index)
 {
        struct dsa_switch_tree *dst = ds->dst;
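
dsa_tree_master_admin_state_change() and its oper counterpart only send
DSA_NOTIFIER_MASTER_STATE_CHANGE when the combined admin-up && oper-up state
actually flips, not on every netdev event. A minimal sketch of that
edge-triggered pattern (printf stands in for the notifier call):

#include <stdbool.h>
#include <stdio.h>

struct port {
        bool admin_up;
        bool oper_up;
};

static bool operational(const struct port *p)
{
        return p->admin_up && p->oper_up;
}

static void set_admin(struct port *p, bool up)
{
        /* Compare old combined state against the would-be new one. */
        bool notify = operational(p) != (up && p->oper_up);

        p->admin_up = up;
        if (notify)
                printf("notify: master %s\n",
                       operational(p) ? "operational" : "down");
}

int main(void)
{
        struct port p = { .oper_up = true };

        set_admin(&p, true);    /* down -> operational: notifies */
        set_admin(&p, true);    /* no change: silent */
        set_admin(&p, false);   /* operational -> down: notifies */
        return 0;
}
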
index 760306f..2bbfa9e 100644 (file)
@@ -40,6 +40,7 @@ enum {
        DSA_NOTIFIER_TAG_PROTO_DISCONNECT,
        DSA_NOTIFIER_TAG_8021Q_VLAN_ADD,
        DSA_NOTIFIER_TAG_8021Q_VLAN_DEL,
+       DSA_NOTIFIER_MASTER_STATE_CHANGE,
 };
 
 /* DSA_NOTIFIER_AGEING_TIME */
@@ -109,6 +110,12 @@ struct dsa_notifier_tag_8021q_vlan_info {
        u16 vid;
 };
 
+/* DSA_NOTIFIER_MASTER_STATE_CHANGE */
+struct dsa_notifier_master_state_info {
+       const struct net_device *master;
+       bool operational;
+};
+
 struct dsa_switchdev_event_work {
        struct dsa_switch *ds;
        int port;
@@ -482,6 +489,12 @@ int dsa_tree_change_tag_proto(struct dsa_switch_tree *dst,
                              struct net_device *master,
                              const struct dsa_device_ops *tag_ops,
                              const struct dsa_device_ops *old_tag_ops);
+void dsa_tree_master_admin_state_change(struct dsa_switch_tree *dst,
+                                       struct net_device *master,
+                                       bool up);
+void dsa_tree_master_oper_state_change(struct dsa_switch_tree *dst,
+                                      struct net_device *master,
+                                      bool up);
 unsigned int dsa_bridge_num_get(const struct net_device *bridge_dev, int max);
 void dsa_bridge_num_put(const struct net_device *bridge_dev,
                        unsigned int bridge_num);
index 22241af..2b5b0f2 100644 (file)
@@ -2346,6 +2346,36 @@ static int dsa_slave_netdevice_event(struct notifier_block *nb,
                err = dsa_port_lag_change(dp, info->lower_state_info);
                return notifier_from_errno(err);
        }
+       case NETDEV_CHANGE:
+       case NETDEV_UP: {
+               /* Track the state of the master port.
+                * A DSA driver may require the master port (and indirectly
+                * the tagger) to be available for some special operations.
+                */
+               if (netdev_uses_dsa(dev)) {
+                       struct dsa_port *cpu_dp = dev->dsa_ptr;
+                       struct dsa_switch_tree *dst = cpu_dp->ds->dst;
+
+                       /* Track when the master port is UP */
+                       dsa_tree_master_oper_state_change(dst, dev,
+                                                         netif_oper_up(dev));
+
+                       /* Track when the master port is ready and can accept
+                        * packets.
+                        * A NETDEV_UP event is not enough to flag a port as
+                        * ready; we also have to wait for linkwatch_do_dev to
+                        * run dev_activate and emit a NETDEV_CHANGE event.
+                        * A master port is considered ready once the device
+                        * has a qdisc assigned and it is not the noop qdisc.
+                        */
+                       dsa_tree_master_admin_state_change(dst, dev,
+                                                          !qdisc_tx_is_noop(dev));
+
+                       return NOTIFY_OK;
+               }
+
+               return NOTIFY_DONE;
+       }
        case NETDEV_GOING_DOWN: {
                struct dsa_port *dp, *cpu_dp;
                struct dsa_switch_tree *dst;
@@ -2357,6 +2387,8 @@ static int dsa_slave_netdevice_event(struct notifier_block *nb,
                cpu_dp = dev->dsa_ptr;
                dst = cpu_dp->ds->dst;
 
+               dsa_tree_master_admin_state_change(dst, dev, false);
+
                list_for_each_entry(dp, &dst->ports, list) {
                        if (!dsa_port_is_user(dp))
                                continue;
index e3c7d26..4866b58 100644 (file)
@@ -113,26 +113,15 @@ static int dsa_switch_bridge_join(struct dsa_switch *ds,
        return dsa_tag_8021q_bridge_join(ds, info);
 }
 
-static int dsa_switch_bridge_leave(struct dsa_switch *ds,
-                                  struct dsa_notifier_bridge_info *info)
+static int dsa_switch_sync_vlan_filtering(struct dsa_switch *ds,
+                                         struct dsa_notifier_bridge_info *info)
 {
-       struct dsa_switch_tree *dst = ds->dst;
        struct netlink_ext_ack extack = {0};
        bool change_vlan_filtering = false;
        bool vlan_filtering;
        struct dsa_port *dp;
        int err;
 
-       if (dst->index == info->tree_index && ds->index == info->sw_index &&
-           ds->ops->port_bridge_leave)
-               ds->ops->port_bridge_leave(ds, info->port, info->bridge);
-
-       if ((dst->index != info->tree_index || ds->index != info->sw_index) &&
-           ds->ops->crosschip_bridge_leave)
-               ds->ops->crosschip_bridge_leave(ds, info->tree_index,
-                                               info->sw_index, info->port,
-                                               info->bridge);
-
        if (ds->needs_standalone_vlan_filtering &&
            !br_vlan_enabled(info->bridge.dev)) {
                change_vlan_filtering = true;
@@ -172,6 +161,31 @@ static int dsa_switch_bridge_leave(struct dsa_switch *ds,
                        return err;
        }
 
+       return 0;
+}
+
+static int dsa_switch_bridge_leave(struct dsa_switch *ds,
+                                  struct dsa_notifier_bridge_info *info)
+{
+       struct dsa_switch_tree *dst = ds->dst;
+       int err;
+
+       if (dst->index == info->tree_index && ds->index == info->sw_index &&
+           ds->ops->port_bridge_leave)
+               ds->ops->port_bridge_leave(ds, info->port, info->bridge);
+
+       if ((dst->index != info->tree_index || ds->index != info->sw_index) &&
+           ds->ops->crosschip_bridge_leave)
+               ds->ops->crosschip_bridge_leave(ds, info->tree_index,
+                                               info->sw_index, info->port,
+                                               info->bridge);
+
+       if (ds->dst->index == info->tree_index && ds->index == info->sw_index) {
+               err = dsa_switch_sync_vlan_filtering(ds, info);
+               if (err)
+                       return err;
+       }
+
        return dsa_tag_8021q_bridge_leave(ds, info);
 }
 
@@ -683,6 +697,18 @@ dsa_switch_disconnect_tag_proto(struct dsa_switch *ds,
        return 0;
 }
 
+static int
+dsa_switch_master_state_change(struct dsa_switch *ds,
+                              struct dsa_notifier_master_state_info *info)
+{
+       if (!ds->ops->master_state_change)
+               return 0;
+
+       ds->ops->master_state_change(ds, info->master, info->operational);
+
+       return 0;
+}
+
 static int dsa_switch_event(struct notifier_block *nb,
                            unsigned long event, void *info)
 {
@@ -756,6 +782,9 @@ static int dsa_switch_event(struct notifier_block *nb,
        case DSA_NOTIFIER_TAG_8021Q_VLAN_DEL:
                err = dsa_switch_tag_8021q_vlan_del(ds, info);
                break;
+       case DSA_NOTIFIER_MASTER_STATE_CHANGE:
+               err = dsa_switch_master_state_change(ds, info);
+               break;
        default:
                err = -EOPNOTSUPP;
                break;
index 1ea9401..57d2e00 100644 (file)
@@ -4,30 +4,12 @@
  */
 
 #include <linux/etherdevice.h>
+#include <linux/bitfield.h>
+#include <net/dsa.h>
+#include <linux/dsa/tag_qca.h>
 
 #include "dsa_priv.h"
 
-#define QCA_HDR_LEN    2
-#define QCA_HDR_VERSION        0x2
-
-#define QCA_HDR_RECV_VERSION_MASK      GENMASK(15, 14)
-#define QCA_HDR_RECV_VERSION_S         14
-#define QCA_HDR_RECV_PRIORITY_MASK     GENMASK(13, 11)
-#define QCA_HDR_RECV_PRIORITY_S                11
-#define QCA_HDR_RECV_TYPE_MASK         GENMASK(10, 6)
-#define QCA_HDR_RECV_TYPE_S            6
-#define QCA_HDR_RECV_FRAME_IS_TAGGED   BIT(3)
-#define QCA_HDR_RECV_SOURCE_PORT_MASK  GENMASK(2, 0)
-
-#define QCA_HDR_XMIT_VERSION_MASK      GENMASK(15, 14)
-#define QCA_HDR_XMIT_VERSION_S         14
-#define QCA_HDR_XMIT_PRIORITY_MASK     GENMASK(13, 11)
-#define QCA_HDR_XMIT_PRIORITY_S                11
-#define QCA_HDR_XMIT_CONTROL_MASK      GENMASK(10, 8)
-#define QCA_HDR_XMIT_CONTROL_S         8
-#define QCA_HDR_XMIT_FROM_CPU          BIT(7)
-#define QCA_HDR_XMIT_DP_BIT_MASK       GENMASK(6, 0)
-
 static struct sk_buff *qca_tag_xmit(struct sk_buff *skb, struct net_device *dev)
 {
        struct dsa_port *dp = dsa_slave_to_port(dev);
@@ -40,8 +22,9 @@ static struct sk_buff *qca_tag_xmit(struct sk_buff *skb, struct net_device *dev)
        phdr = dsa_etype_header_pos_tx(skb);
 
        /* Set the version field, and set destination port information */
-       hdr = QCA_HDR_VERSION << QCA_HDR_XMIT_VERSION_S |
-               QCA_HDR_XMIT_FROM_CPU | BIT(dp->index);
+       hdr = FIELD_PREP(QCA_HDR_XMIT_VERSION, QCA_HDR_VERSION);
+       hdr |= QCA_HDR_XMIT_FROM_CPU;
+       hdr |= FIELD_PREP(QCA_HDR_XMIT_DP_BIT, BIT(dp->index));
 
        *phdr = htons(hdr);
 
@@ -50,10 +33,17 @@ static struct sk_buff *qca_tag_xmit(struct sk_buff *skb, struct net_device *dev)
 
 static struct sk_buff *qca_tag_rcv(struct sk_buff *skb, struct net_device *dev)
 {
-       u8 ver;
-       u16  hdr;
-       int port;
+       struct qca_tagger_data *tagger_data;
+       struct dsa_port *dp = dev->dsa_ptr;
+       struct dsa_switch *ds = dp->ds;
+       u8 ver, pk_type;
        __be16 *phdr;
+       int port;
+       u16 hdr;
+
+       BUILD_BUG_ON(sizeof(struct qca_mgmt_ethhdr) != QCA_HDR_MGMT_HEADER_LEN + QCA_HDR_LEN);
+
+       tagger_data = ds->tagger_data;
 
        if (unlikely(!pskb_may_pull(skb, QCA_HDR_LEN)))
                return NULL;
@@ -62,16 +52,33 @@ static struct sk_buff *qca_tag_rcv(struct sk_buff *skb, struct net_device *dev)
        hdr = ntohs(*phdr);
 
        /* Make sure the version is correct */
-       ver = (hdr & QCA_HDR_RECV_VERSION_MASK) >> QCA_HDR_RECV_VERSION_S;
+       ver = FIELD_GET(QCA_HDR_RECV_VERSION, hdr);
        if (unlikely(ver != QCA_HDR_VERSION))
                return NULL;
 
+       /* Get the packet type */
+       pk_type = FIELD_GET(QCA_HDR_RECV_TYPE, hdr);
+
+       /* Ethernet mgmt read/write packet */
+       if (pk_type == QCA_HDR_RECV_TYPE_RW_REG_ACK) {
+               if (likely(tagger_data->rw_reg_ack_handler))
+                       tagger_data->rw_reg_ack_handler(ds, skb);
+               return NULL;
+       }
+
+       /* Ethernet MIB counter packet */
+       if (pk_type == QCA_HDR_RECV_TYPE_MIB) {
+               if (likely(tagger_data->mib_autocast_handler))
+                       tagger_data->mib_autocast_handler(ds, skb);
+               return NULL;
+       }
+
        /* Remove QCA tag and recalculate checksum */
        skb_pull_rcsum(skb, QCA_HDR_LEN);
        dsa_strip_etype_header(skb, QCA_HDR_LEN);
 
        /* Get source port information */
-       port = (hdr & QCA_HDR_RECV_SOURCE_PORT_MASK);
+       port = FIELD_GET(QCA_HDR_RECV_SOURCE_PORT, hdr);
 
        skb->dev = dsa_master_find_slave(dev, 0, port);
        if (!skb->dev)
@@ -80,12 +87,34 @@ static struct sk_buff *qca_tag_rcv(struct sk_buff *skb, struct net_device *dev)
        return skb;
 }
 
+static int qca_tag_connect(struct dsa_switch *ds)
+{
+       struct qca_tagger_data *tagger_data;
+
+       tagger_data = kzalloc(sizeof(*tagger_data), GFP_KERNEL);
+       if (!tagger_data)
+               return -ENOMEM;
+
+       ds->tagger_data = tagger_data;
+
+       return 0;
+}
+
+static void qca_tag_disconnect(struct dsa_switch *ds)
+{
+       kfree(ds->tagger_data);
+       ds->tagger_data = NULL;
+}
+
 static const struct dsa_device_ops qca_netdev_ops = {
        .name   = "qca",
        .proto  = DSA_TAG_PROTO_QCA,
+       .connect = qca_tag_connect,
+       .disconnect = qca_tag_disconnect,
        .xmit   = qca_tag_xmit,
        .rcv    = qca_tag_rcv,
        .needed_headroom = QCA_HDR_LEN,
+       .promisc_on_master = true,
 };
 
 MODULE_LICENSE("GPL");
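
The rewritten tagger derives shift amounts from the masks themselves via
FIELD_PREP()/FIELD_GET(), which is why the old QCA_HDR_*_S shift constants
could be dropped. Simplified userspace stand-ins for those macros (the real
versions in <linux/bitfield.h> add compile-time checking that these omit):

#include <stdint.h>
#include <stdio.h>

#define GENMASK(h, l)   (((~0u) << (l)) & (~0u >> (31 - (h))))
/* Derive the shift from the mask's lowest set bit. */
#define FIELD_PREP(mask, val) (((val) << __builtin_ctz(mask)) & (mask))
#define FIELD_GET(mask, reg)  (((reg) & (mask)) >> __builtin_ctz(mask))

#define HDR_VERSION_MASK        GENMASK(15, 14)
#define HDR_SOURCE_PORT_MASK    GENMASK(2, 0)

int main(void)
{
        uint16_t hdr = FIELD_PREP(HDR_VERSION_MASK, 0x2) |
                       FIELD_PREP(HDR_SOURCE_PORT_MASK, 5);

        printf("version=%u port=%u\n",
               (unsigned)FIELD_GET(HDR_VERSION_MASK, hdr),
               (unsigned)FIELD_GET(HDR_SOURCE_PORT_MASK, hdr));
        return 0;
}
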
index c1d5f5e..18a5035 100644 (file)
@@ -53,7 +53,8 @@ static int rings_reply_size(const struct ethnl_req_info *req_base,
               nla_total_size(sizeof(u32)) +    /* _RINGS_RX_MINI */
               nla_total_size(sizeof(u32)) +    /* _RINGS_RX_JUMBO */
               nla_total_size(sizeof(u32)) +    /* _RINGS_TX */
-              nla_total_size(sizeof(u32));     /* _RINGS_RX_BUF_LEN */
+              nla_total_size(sizeof(u32)) +    /* _RINGS_RX_BUF_LEN */
+              nla_total_size(sizeof(u8));      /* _RINGS_TCP_DATA_SPLIT */
 }
 
 static int rings_fill_reply(struct sk_buff *skb,
@@ -61,9 +62,11 @@ static int rings_fill_reply(struct sk_buff *skb,
                            const struct ethnl_reply_data *reply_base)
 {
        const struct rings_reply_data *data = RINGS_REPDATA(reply_base);
-       const struct kernel_ethtool_ringparam *kernel_ringparam = &data->kernel_ringparam;
+       const struct kernel_ethtool_ringparam *kr = &data->kernel_ringparam;
        const struct ethtool_ringparam *ringparam = &data->ringparam;
 
+       WARN_ON(kr->tcp_data_split > ETHTOOL_TCP_DATA_SPLIT_ENABLED);
+
        if ((ringparam->rx_max_pending &&
             (nla_put_u32(skb, ETHTOOL_A_RINGS_RX_MAX,
                          ringparam->rx_max_pending) ||
@@ -84,9 +87,11 @@ static int rings_fill_reply(struct sk_buff *skb,
                          ringparam->tx_max_pending) ||
              nla_put_u32(skb, ETHTOOL_A_RINGS_TX,
                          ringparam->tx_pending)))  ||
-           (kernel_ringparam->rx_buf_len &&
-            (nla_put_u32(skb, ETHTOOL_A_RINGS_RX_BUF_LEN,
-                         kernel_ringparam->rx_buf_len))))
+           (kr->rx_buf_len &&
+            (nla_put_u32(skb, ETHTOOL_A_RINGS_RX_BUF_LEN, kr->rx_buf_len))) ||
+           (kr->tcp_data_split &&
+            (nla_put_u8(skb, ETHTOOL_A_RINGS_TCP_DATA_SPLIT,
+                        kr->tcp_data_split))))
                return -EMSGSIZE;
 
        return 0;
index 99f3af1..fe6094e 100644 (file)
@@ -17,6 +17,7 @@
 #include <linux/module.h>
 #include <linux/errno.h>
 #include <linux/debugfs.h>
+#include <linux/jhash.h>
 #include "hsr_main.h"
 #include "hsr_framereg.h"
 
@@ -28,6 +29,7 @@ hsr_node_table_show(struct seq_file *sfp, void *data)
 {
        struct hsr_priv *priv = (struct hsr_priv *)sfp->private;
        struct hsr_node *node;
+       int i;
 
        seq_printf(sfp, "Node Table entries for (%s) device\n",
                   (priv->prot_version == PRP_V1 ? "PRP" : "HSR"));
@@ -39,22 +41,28 @@ hsr_node_table_show(struct seq_file *sfp, void *data)
                seq_puts(sfp, "DAN-H\n");
 
        rcu_read_lock();
-       list_for_each_entry_rcu(node, &priv->node_db, mac_list) {
-               /* skip self node */
-               if (hsr_addr_is_self(priv, node->macaddress_A))
-                       continue;
-               seq_printf(sfp, "%pM ", &node->macaddress_A[0]);
-               seq_printf(sfp, "%pM ", &node->macaddress_B[0]);
-               seq_printf(sfp, "%10lx, ", node->time_in[HSR_PT_SLAVE_A]);
-               seq_printf(sfp, "%10lx, ", node->time_in[HSR_PT_SLAVE_B]);
-               seq_printf(sfp, "%14x, ", node->addr_B_port);
-
-               if (priv->prot_version == PRP_V1)
-                       seq_printf(sfp, "%5x, %5x, %5x\n",
-                                  node->san_a, node->san_b,
-                                  (node->san_a == 0 && node->san_b == 0));
-               else
-                       seq_printf(sfp, "%5x\n", 1);
+
+       for (i = 0 ; i < priv->hash_buckets; i++) {
+               hlist_for_each_entry_rcu(node, &priv->node_db[i], mac_list) {
+                       /* skip self node */
+                       if (hsr_addr_is_self(priv, node->macaddress_A))
+                               continue;
+                       seq_printf(sfp, "%pM ", &node->macaddress_A[0]);
+                       seq_printf(sfp, "%pM ", &node->macaddress_B[0]);
+                       seq_printf(sfp, "%10lx, ",
+                                  node->time_in[HSR_PT_SLAVE_A]);
+                       seq_printf(sfp, "%10lx, ",
+                                  node->time_in[HSR_PT_SLAVE_B]);
+                       seq_printf(sfp, "%14x, ", node->addr_B_port);
+
+                       if (priv->prot_version == PRP_V1)
+                               seq_printf(sfp, "%5x, %5x, %5x\n",
+                                          node->san_a, node->san_b,
+                                          (node->san_a == 0 &&
+                                           node->san_b == 0));
+                       else
+                               seq_printf(sfp, "%5x\n", 1);
+               }
        }
        rcu_read_unlock();
        return 0;
index e57fdad..7f25021 100644 (file)
@@ -485,12 +485,16 @@ int hsr_dev_finalize(struct net_device *hsr_dev, struct net_device *slave[2],
 {
        bool unregister = false;
        struct hsr_priv *hsr;
-       int res;
+       int res, i;
 
        hsr = netdev_priv(hsr_dev);
        INIT_LIST_HEAD(&hsr->ports);
-       INIT_LIST_HEAD(&hsr->node_db);
-       INIT_LIST_HEAD(&hsr->self_node_db);
+       INIT_HLIST_HEAD(&hsr->self_node_db);
+       hsr->hash_buckets = HSR_HSIZE;
+       get_random_bytes(&hsr->hash_seed, sizeof(hsr->hash_seed));
+       for (i = 0; i < hsr->hash_buckets; i++)
+               INIT_HLIST_HEAD(&hsr->node_db[i]);
+
        spin_lock_init(&hsr->list_lock);
 
        eth_hw_addr_set(hsr_dev, slave[0]->dev_addr);
index e59cbb4..5bf3577 100644 (file)
@@ -570,20 +570,23 @@ static int fill_frame_info(struct hsr_frame_info *frame,
        struct ethhdr *ethhdr;
        __be16 proto;
        int ret;
+       u32 hash;
 
        /* Check if skb contains ethhdr */
        if (skb->mac_len < sizeof(struct ethhdr))
                return -EINVAL;
 
        memset(frame, 0, sizeof(*frame));
+
+       ethhdr = (struct ethhdr *)skb_mac_header(skb);
+       hash = hsr_mac_hash(port->hsr, ethhdr->h_source);
        frame->is_supervision = is_supervision_frame(port->hsr, skb);
-       frame->node_src = hsr_get_node(port, &hsr->node_db, skb,
+       frame->node_src = hsr_get_node(port, &hsr->node_db[hash], skb,
                                       frame->is_supervision,
                                       port->type);
        if (!frame->node_src)
                return -1; /* Unknown node and !is_supervision, or no mem */
 
-       ethhdr = (struct ethhdr *)skb_mac_header(skb);
        frame->is_vlan = false;
        proto = ethhdr->h_proto;
 
index 0775f0f..b3c6ffa 100644 (file)
 #include <linux/etherdevice.h>
 #include <linux/slab.h>
 #include <linux/rculist.h>
+#include <linux/jhash.h>
 #include "hsr_main.h"
 #include "hsr_framereg.h"
 #include "hsr_netlink.h"
 
-/*     TODO: use hash lists for mac addresses (linux/jhash.h)?    */
+u32 hsr_mac_hash(struct hsr_priv *hsr, const unsigned char *addr)
+{
+       u32 hash = jhash(addr, ETH_ALEN, hsr->hash_seed);
+
+       return reciprocal_scale(hash, hsr->hash_buckets);
+}
+
+struct hsr_node *hsr_node_get_first(struct hlist_head *head)
+{
+       struct hlist_node *first;
+
+       first = rcu_dereference(hlist_first_rcu(head));
+       if (first)
+               return hlist_entry(first, struct hsr_node, mac_list);
+
+       return NULL;
+}
 
 /* seq_nr_after(a, b) - return true if a is after (higher in sequence than) b,
  * false otherwise.
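
hsr_mac_hash() above hashes the MAC address with a per-instance random seed
and maps the 32-bit result onto the bucket array with reciprocal_scale(), a
multiply-and-shift that avoids a modulo. A userspace sketch; jhash() is
swapped for a toy FNV-1a purely for illustration, and HSIZE mirrors
HSR_HSIZE (1 << HSR_HSIZE_SHIFT):

#include <stdint.h>
#include <stdio.h>

#define HSIZE 256

/* Map a 32-bit hash onto [0, ep_ro) without a division. */
static uint32_t reciprocal_scale(uint32_t val, uint32_t ep_ro)
{
        return (uint32_t)(((uint64_t)val * ep_ro) >> 32);
}

static uint32_t fnv1a(const unsigned char *data, int len, uint32_t seed)
{
        uint32_t h = 2166136261u ^ seed;

        while (len--)
                h = (h ^ *data++) * 16777619u;
        return h;
}

int main(void)
{
        const unsigned char mac[6] = { 0x02, 0x00, 0x00, 0xaa, 0xbb, 0xcc };
        uint32_t seed = 0xdeadbeef;     /* kernel uses get_random_bytes() */
        uint32_t bucket = reciprocal_scale(fnv1a(mac, 6, seed), HSIZE);

        printf("bucket %u of %u\n", bucket, HSIZE);
        return 0;
}
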
@@ -42,8 +59,7 @@ bool hsr_addr_is_self(struct hsr_priv *hsr, unsigned char *addr)
 {
        struct hsr_node *node;
 
-       node = list_first_or_null_rcu(&hsr->self_node_db, struct hsr_node,
-                                     mac_list);
+       node = hsr_node_get_first(&hsr->self_node_db);
        if (!node) {
                WARN_ONCE(1, "HSR: No self node\n");
                return false;
@@ -59,12 +75,12 @@ bool hsr_addr_is_self(struct hsr_priv *hsr, unsigned char *addr)
 
 /* Search for mac entry. Caller must hold rcu read lock.
  */
-static struct hsr_node *find_node_by_addr_A(struct list_head *node_db,
+static struct hsr_node *find_node_by_addr_A(struct hlist_head *node_db,
                                            const unsigned char addr[ETH_ALEN])
 {
        struct hsr_node *node;
 
-       list_for_each_entry_rcu(node, node_db, mac_list) {
+       hlist_for_each_entry_rcu(node, node_db, mac_list) {
                if (ether_addr_equal(node->macaddress_A, addr))
                        return node;
        }
@@ -79,7 +95,7 @@ int hsr_create_self_node(struct hsr_priv *hsr,
                         const unsigned char addr_a[ETH_ALEN],
                         const unsigned char addr_b[ETH_ALEN])
 {
-       struct list_head *self_node_db = &hsr->self_node_db;
+       struct hlist_head *self_node_db = &hsr->self_node_db;
        struct hsr_node *node, *oldnode;
 
        node = kmalloc(sizeof(*node), GFP_KERNEL);
@@ -90,14 +106,13 @@ int hsr_create_self_node(struct hsr_priv *hsr,
        ether_addr_copy(node->macaddress_B, addr_b);
 
        spin_lock_bh(&hsr->list_lock);
-       oldnode = list_first_or_null_rcu(self_node_db,
-                                        struct hsr_node, mac_list);
+       oldnode = hsr_node_get_first(self_node_db);
        if (oldnode) {
-               list_replace_rcu(&oldnode->mac_list, &node->mac_list);
+               hlist_replace_rcu(&oldnode->mac_list, &node->mac_list);
                spin_unlock_bh(&hsr->list_lock);
                kfree_rcu(oldnode, rcu_head);
        } else {
-               list_add_tail_rcu(&node->mac_list, self_node_db);
+               hlist_add_tail_rcu(&node->mac_list, self_node_db);
                spin_unlock_bh(&hsr->list_lock);
        }
 
@@ -106,25 +121,25 @@ int hsr_create_self_node(struct hsr_priv *hsr,
 
 void hsr_del_self_node(struct hsr_priv *hsr)
 {
-       struct list_head *self_node_db = &hsr->self_node_db;
+       struct hlist_head *self_node_db = &hsr->self_node_db;
        struct hsr_node *node;
 
        spin_lock_bh(&hsr->list_lock);
-       node = list_first_or_null_rcu(self_node_db, struct hsr_node, mac_list);
+       node = hsr_node_get_first(self_node_db);
        if (node) {
-               list_del_rcu(&node->mac_list);
+               hlist_del_rcu(&node->mac_list);
                kfree_rcu(node, rcu_head);
        }
        spin_unlock_bh(&hsr->list_lock);
 }
 
-void hsr_del_nodes(struct list_head *node_db)
+void hsr_del_nodes(struct hlist_head *node_db)
 {
        struct hsr_node *node;
-       struct hsr_node *tmp;
+       struct hlist_node *tmp;
 
-       list_for_each_entry_safe(node, tmp, node_db, mac_list)
-               kfree(node);
+       hlist_for_each_entry_safe(node, tmp, node_db, mac_list)
+               kfree_rcu(node, rcu_head);
 }
 
 void prp_handle_san_frame(bool san, enum hsr_port_type port,
@@ -145,7 +160,7 @@ void prp_handle_san_frame(bool san, enum hsr_port_type port,
  * originating from the newly added node.
  */
 static struct hsr_node *hsr_add_node(struct hsr_priv *hsr,
-                                    struct list_head *node_db,
+                                    struct hlist_head *node_db,
                                     unsigned char addr[],
                                     u16 seq_out, bool san,
                                     enum hsr_port_type rx_port)
@@ -175,14 +190,14 @@ static struct hsr_node *hsr_add_node(struct hsr_priv *hsr,
                hsr->proto_ops->handle_san_frame(san, rx_port, new_node);
 
        spin_lock_bh(&hsr->list_lock);
-       list_for_each_entry_rcu(node, node_db, mac_list,
-                               lockdep_is_held(&hsr->list_lock)) {
+       hlist_for_each_entry_rcu(node, node_db, mac_list,
+                                lockdep_is_held(&hsr->list_lock)) {
                if (ether_addr_equal(node->macaddress_A, addr))
                        goto out;
                if (ether_addr_equal(node->macaddress_B, addr))
                        goto out;
        }
-       list_add_tail_rcu(&new_node->mac_list, node_db);
+       hlist_add_tail_rcu(&new_node->mac_list, node_db);
        spin_unlock_bh(&hsr->list_lock);
        return new_node;
 out:
@@ -202,7 +217,7 @@ void prp_update_san_info(struct hsr_node *node, bool is_sup)
 
 /* Get the hsr_node from which 'skb' was sent.
  */
-struct hsr_node *hsr_get_node(struct hsr_port *port, struct list_head *node_db,
+struct hsr_node *hsr_get_node(struct hsr_port *port, struct hlist_head *node_db,
                              struct sk_buff *skb, bool is_sup,
                              enum hsr_port_type rx_port)
 {
@@ -218,7 +233,7 @@ struct hsr_node *hsr_get_node(struct hsr_port *port, struct list_head *node_db,
 
        ethhdr = (struct ethhdr *)skb_mac_header(skb);
 
-       list_for_each_entry_rcu(node, node_db, mac_list) {
+       hlist_for_each_entry_rcu(node, node_db, mac_list) {
                if (ether_addr_equal(node->macaddress_A, ethhdr->h_source)) {
                        if (hsr->proto_ops->update_san_info)
                                hsr->proto_ops->update_san_info(node, is_sup);
@@ -268,11 +283,12 @@ void hsr_handle_sup_frame(struct hsr_frame_info *frame)
        struct hsr_sup_tlv *hsr_sup_tlv;
        struct hsr_node *node_real;
        struct sk_buff *skb = NULL;
-       struct list_head *node_db;
+       struct hlist_head *node_db;
        struct ethhdr *ethhdr;
        int i;
        unsigned int pull_size = 0;
        unsigned int total_pull_size = 0;
+       u32 hash;
 
        /* Here either frame->skb_hsr or frame->skb_prp should be
         * valid as supervision frame always will have protocol
@@ -310,11 +326,13 @@ void hsr_handle_sup_frame(struct hsr_frame_info *frame)
        hsr_sp = (struct hsr_sup_payload *)skb->data;
 
        /* Merge node_curr (registered on macaddress_B) into node_real */
-       node_db = &port_rcv->hsr->node_db;
-       node_real = find_node_by_addr_A(node_db, hsr_sp->macaddress_A);
+       node_db = port_rcv->hsr->node_db;
+       hash = hsr_mac_hash(hsr, hsr_sp->macaddress_A);
+       node_real = find_node_by_addr_A(&node_db[hash], hsr_sp->macaddress_A);
        if (!node_real)
                /* No frame received from AddrA of this node yet */
-               node_real = hsr_add_node(hsr, node_db, hsr_sp->macaddress_A,
+               node_real = hsr_add_node(hsr, &node_db[hash],
+                                        hsr_sp->macaddress_A,
                                         HSR_SEQNR_START - 1, true,
                                         port_rcv->type);
        if (!node_real)
@@ -348,7 +366,8 @@ void hsr_handle_sup_frame(struct hsr_frame_info *frame)
                hsr_sp = (struct hsr_sup_payload *)skb->data;
 
                /* Check if redbox mac and node mac are equal. */
-               if (!ether_addr_equal(node_real->macaddress_A, hsr_sp->macaddress_A)) {
+               if (!ether_addr_equal(node_real->macaddress_A,
+                                     hsr_sp->macaddress_A)) {
                        /* This is a redbox supervision frame for a VDAN! */
                        goto done;
                }
@@ -368,7 +387,7 @@ void hsr_handle_sup_frame(struct hsr_frame_info *frame)
        node_real->addr_B_port = port_rcv->type;
 
        spin_lock_bh(&hsr->list_lock);
-       list_del_rcu(&node_curr->mac_list);
+       hlist_del_rcu(&node_curr->mac_list);
        spin_unlock_bh(&hsr->list_lock);
        kfree_rcu(node_curr, rcu_head);
 
@@ -406,6 +425,7 @@ void hsr_addr_subst_dest(struct hsr_node *node_src, struct sk_buff *skb,
                         struct hsr_port *port)
 {
        struct hsr_node *node_dst;
+       u32 hash;
 
        if (!skb_mac_header_was_set(skb)) {
                WARN_ONCE(1, "%s: Mac header not set\n", __func__);
@@ -415,7 +435,8 @@ void hsr_addr_subst_dest(struct hsr_node *node_src, struct sk_buff *skb,
        if (!is_unicast_ether_addr(eth_hdr(skb)->h_dest))
                return;
 
-       node_dst = find_node_by_addr_A(&port->hsr->node_db,
+       hash = hsr_mac_hash(port->hsr, eth_hdr(skb)->h_dest);
+       node_dst = find_node_by_addr_A(&port->hsr->node_db[hash],
                                       eth_hdr(skb)->h_dest);
        if (!node_dst) {
                if (net_ratelimit())
@@ -491,59 +512,73 @@ static struct hsr_port *get_late_port(struct hsr_priv *hsr,
 void hsr_prune_nodes(struct timer_list *t)
 {
        struct hsr_priv *hsr = from_timer(hsr, t, prune_timer);
+       struct hlist_node *tmp;
        struct hsr_node *node;
-       struct hsr_node *tmp;
        struct hsr_port *port;
        unsigned long timestamp;
        unsigned long time_a, time_b;
+       int i;
 
        spin_lock_bh(&hsr->list_lock);
-       list_for_each_entry_safe(node, tmp, &hsr->node_db, mac_list) {
-               /* Don't prune own node. Neither time_in[HSR_PT_SLAVE_A]
-                * nor time_in[HSR_PT_SLAVE_B], will ever be updated for
-                * the master port. Thus the master node will be repeatedly
-                * pruned leading to packet loss.
-                */
-               if (hsr_addr_is_self(hsr, node->macaddress_A))
-                       continue;
-
-               /* Shorthand */
-               time_a = node->time_in[HSR_PT_SLAVE_A];
-               time_b = node->time_in[HSR_PT_SLAVE_B];
-
-               /* Check for timestamps old enough to risk wrap-around */
-               if (time_after(jiffies, time_a + MAX_JIFFY_OFFSET / 2))
-                       node->time_in_stale[HSR_PT_SLAVE_A] = true;
-               if (time_after(jiffies, time_b + MAX_JIFFY_OFFSET / 2))
-                       node->time_in_stale[HSR_PT_SLAVE_B] = true;
-
-               /* Get age of newest frame from node.
-                * At least one time_in is OK here; nodes get pruned long
-                * before both time_ins can get stale
-                */
-               timestamp = time_a;
-               if (node->time_in_stale[HSR_PT_SLAVE_A] ||
-                   (!node->time_in_stale[HSR_PT_SLAVE_B] &&
-                   time_after(time_b, time_a)))
-                       timestamp = time_b;
-
-               /* Warn of ring error only as long as we get frames at all */
-               if (time_is_after_jiffies(timestamp +
-                               msecs_to_jiffies(1.5 * MAX_SLAVE_DIFF))) {
-                       rcu_read_lock();
-                       port = get_late_port(hsr, node);
-                       if (port)
-                               hsr_nl_ringerror(hsr, node->macaddress_A, port);
-                       rcu_read_unlock();
-               }
 
-               /* Prune old entries */
-               if (time_is_before_jiffies(timestamp +
-                               msecs_to_jiffies(HSR_NODE_FORGET_TIME))) {
-                       hsr_nl_nodedown(hsr, node->macaddress_A);
-                       list_del_rcu(&node->mac_list);
-                       /* Note that we need to free this entry later: */
-                       kfree_rcu(node, rcu_head);
+       for (i = 0; i < hsr->hash_buckets; i++) {
+               hlist_for_each_entry_safe(node, tmp, &hsr->node_db[i],
+                                         mac_list) {
+                       /* Don't prune own node.
+                        * Neither time_in[HSR_PT_SLAVE_A]
+                        * nor time_in[HSR_PT_SLAVE_B] will ever be updated
+                        * for the master port. Thus the master node will be
+                        * repeatedly pruned, leading to packet loss.
+                        */
+                       if (hsr_addr_is_self(hsr, node->macaddress_A))
+                               continue;
+
+                       /* Shorthand */
+                       time_a = node->time_in[HSR_PT_SLAVE_A];
+                       time_b = node->time_in[HSR_PT_SLAVE_B];
+
+                       /* Check for timestamps old enough to
+                        * risk wrap-around
+                        */
+                       if (time_after(jiffies, time_a + MAX_JIFFY_OFFSET / 2))
+                               node->time_in_stale[HSR_PT_SLAVE_A] = true;
+                       if (time_after(jiffies, time_b + MAX_JIFFY_OFFSET / 2))
+                               node->time_in_stale[HSR_PT_SLAVE_B] = true;
+
+                       /* Get age of newest frame from node.
+                        * At least one time_in is OK here; nodes get pruned
+                        * long before both time_ins can get stale
+                        */
+                       timestamp = time_a;
+                       if (node->time_in_stale[HSR_PT_SLAVE_A] ||
+                           (!node->time_in_stale[HSR_PT_SLAVE_B] &&
+                            time_after(time_b, time_a)))
+                               timestamp = time_b;
+
+                       /* Warn of ring error only as long as we get
+                        * frames at all
+                        */
+                       if (time_is_after_jiffies(timestamp +
+                                                 msecs_to_jiffies(1.5 * MAX_SLAVE_DIFF))) {
+                               rcu_read_lock();
+                               port = get_late_port(hsr, node);
+                               if (port)
+                                       hsr_nl_ringerror(hsr,
+                                                        node->macaddress_A,
+                                                        port);
+                               rcu_read_unlock();
+                       }
+
+                       /* Prune old entries */
+                       if (time_is_before_jiffies(timestamp +
+                                                  msecs_to_jiffies(HSR_NODE_FORGET_TIME))) {
+                               hsr_nl_nodedown(hsr, node->macaddress_A);
+                               hlist_del_rcu(&node->mac_list);
+                               /* Note that we need to free this
+                                * entry later:
+                                */
+                               kfree_rcu(node, rcu_head);
+                       }
                }
        }
        spin_unlock_bh(&hsr->list_lock);
@@ -557,17 +592,19 @@ void *hsr_get_next_node(struct hsr_priv *hsr, void *_pos,
                        unsigned char addr[ETH_ALEN])
 {
        struct hsr_node *node;
+       u32 hash;
+
+       hash = hsr_mac_hash(hsr, addr);
 
        if (!_pos) {
-               node = list_first_or_null_rcu(&hsr->node_db,
-                                             struct hsr_node, mac_list);
+               node = hsr_node_get_first(&hsr->node_db[hash]);
                if (node)
                        ether_addr_copy(addr, node->macaddress_A);
                return node;
        }
 
        node = _pos;
-       list_for_each_entry_continue_rcu(node, &hsr->node_db, mac_list) {
+       hlist_for_each_entry_continue_rcu(node, mac_list) {
                ether_addr_copy(addr, node->macaddress_A);
                return node;
        }
@@ -587,8 +624,11 @@ int hsr_get_node_data(struct hsr_priv *hsr,
        struct hsr_node *node;
        struct hsr_port *port;
        unsigned long tdiff;
+       u32 hash;
+
+       hash = hsr_mac_hash(hsr, addr);
 
-       node = find_node_by_addr_A(&hsr->node_db, addr);
+       node = find_node_by_addr_A(&hsr->node_db[hash], addr);
        if (!node)
                return -ENOENT;
 
index bdbb8c8..d7cce6b 100644 (file)
@@ -28,9 +28,11 @@ struct hsr_frame_info {
        bool is_from_san;
 };
 
+u32 hsr_mac_hash(struct hsr_priv *hsr, const unsigned char *addr);
+struct hsr_node *hsr_node_get_first(struct hlist_head *head);
 void hsr_del_self_node(struct hsr_priv *hsr);
-void hsr_del_nodes(struct list_head *node_db);
-struct hsr_node *hsr_get_node(struct hsr_port *port, struct list_head *node_db,
+void hsr_del_nodes(struct hlist_head *node_db);
+struct hsr_node *hsr_get_node(struct hsr_port *port, struct hlist_head *node_db,
                              struct sk_buff *skb, bool is_sup,
                              enum hsr_port_type rx_port);
 void hsr_handle_sup_frame(struct hsr_frame_info *frame);
@@ -68,7 +70,7 @@ void prp_handle_san_frame(bool san, enum hsr_port_type port,
 void prp_update_san_info(struct hsr_node *node, bool is_sup);
 
 struct hsr_node {
-       struct list_head        mac_list;
+       struct hlist_node       mac_list;
        unsigned char           macaddress_A[ETH_ALEN];
        unsigned char           macaddress_B[ETH_ALEN];
        /* Local slave through which AddrB frames are received from this node */
index 043e4e9..ca556bd 100644 (file)
@@ -63,6 +63,9 @@ struct hsr_tag {
 
 #define HSR_V1_SUP_LSDUSIZE            52
 
+#define HSR_HSIZE_SHIFT        8
+#define HSR_HSIZE      BIT(HSR_HSIZE_SHIFT)
+
 /* The helper functions below assumes that 'path' occupies the 4 most
  * significant bits of the 16-bit field shared by 'path' and 'LSDU_size' (or
  * equivalently, the 4 most significant bits of HSR tag byte 14).
@@ -201,8 +204,8 @@ struct hsr_proto_ops {
 struct hsr_priv {
        struct rcu_head         rcu_head;
        struct list_head        ports;
-       struct list_head        node_db;        /* Known HSR nodes */
-       struct list_head        self_node_db;   /* MACs of slaves */
+       struct hlist_head       node_db[HSR_HSIZE];     /* Known HSR nodes */
+       struct hlist_head       self_node_db;   /* MACs of slaves */
        struct timer_list       announce_timer; /* Supervision frame dispatch */
        struct timer_list       prune_timer;
        int announce_count;
@@ -212,6 +215,8 @@ struct hsr_priv {
        spinlock_t seqnr_lock;  /* locking for sequence_nr */
        spinlock_t list_lock;   /* locking for node list */
        struct hsr_proto_ops    *proto_ops;
+       u32 hash_buckets;
+       u32 hash_seed;
 #define PRP_LAN_ID     0x5     /* 0x1010 for A and 0x1011 for B. Bit 0 is set
                                 * based on SLAVE_A or SLAVE_B
                                 */
@@ -259,11 +264,6 @@ static inline u16 prp_get_skb_sequence_nr(struct prp_rct *rct)
        return ntohs(rct->sequence_nr);
 }
 
-static inline u16 get_prp_lan_id(struct prp_rct *rct)
-{
-       return ntohs(rct->lan_id_and_LSDU_size) >> 12;
-}
-
 /* assume there is a valid rct */
 static inline bool prp_check_lsdu_size(struct sk_buff *skb,
                                       struct prp_rct *rct,
index f3c8f91..1405c03 100644 (file)
@@ -105,6 +105,7 @@ static int hsr_newlink(struct net *src_net, struct net_device *dev,
 static void hsr_dellink(struct net_device *dev, struct list_head *head)
 {
        struct hsr_priv *hsr = netdev_priv(dev);
+       int i;
 
        del_timer_sync(&hsr->prune_timer);
        del_timer_sync(&hsr->announce_timer);
@@ -113,7 +114,8 @@ static void hsr_dellink(struct net_device *dev, struct list_head *head)
        hsr_del_ports(hsr);
 
        hsr_del_self_node(hsr);
-       hsr_del_nodes(&hsr->node_db);
+       for (i = 0; i < hsr->hash_buckets; i++)
+               hsr_del_nodes(&hsr->node_db[i]);
 
        unregister_netdevice_queue(dev, head);
 }
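
The HSR/PRP node table above moves from a single RCU list to an array of HSR_HSIZE (1 << 8) hlist buckets, selected by a seeded hash of the node's MAC address; hash_seed randomizes the bucket layout per device. The hash helper itself is not shown in these hunks; a minimal sketch of what hsr_mac_hash() could look like under those assumptions (jhash over the six address bytes, scaled into hash_buckets -- the in-tree helper may differ):

	#include <linux/etherdevice.h>
	#include <linux/jhash.h>

	/* Sketch only.  hash_seed is chosen at random when the hsr device
	 * is created, so a remote sender cannot force every node entry
	 * into a single bucket chain.
	 */
	u32 hsr_mac_hash(struct hsr_priv *hsr, const unsigned char *addr)
	{
		u32 hash = jhash(addr, ETH_ALEN, hsr->hash_seed);

		return reciprocal_scale(hash, hsr->hash_buckets);
	}
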
index 2cf6271..2c087b7 100644 (file)
@@ -47,6 +47,7 @@
 #include <linux/module.h>
 #include <linux/netdevice.h>
 #include <linux/ieee802154.h>
+#include <linux/if_arp.h>
 
 #include <net/ipv6.h>
 
index dd5a45f..359249a 100644 (file)
@@ -30,7 +30,7 @@ static int ieee802154_nl_fill_phy(struct sk_buff *msg, u32 portid,
 {
        void *hdr;
        int i, pages = 0;
-       uint32_t *buf = kcalloc(32, sizeof(uint32_t), GFP_KERNEL);
+       u32 *buf = kcalloc(IEEE802154_MAX_PAGE + 1, sizeof(u32), GFP_KERNEL);
 
        pr_debug("%s\n", __func__);
 
@@ -47,7 +47,7 @@ static int ieee802154_nl_fill_phy(struct sk_buff *msg, u32 portid,
            nla_put_u8(msg, IEEE802154_ATTR_PAGE, phy->current_page) ||
            nla_put_u8(msg, IEEE802154_ATTR_CHANNEL, phy->current_channel))
                goto nla_put_failure;
-       for (i = 0; i < 32; i++) {
+       for (i = 0; i <= IEEE802154_MAX_PAGE; i++) {
                if (phy->supported.channels[i])
                        buf[pages++] = phy->supported.channels[i] | (i << 27);
        }
index de610cb..f79ab94 100644 (file)
@@ -1,6 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0
 /* Copyright (c) 2019 Facebook  */
 
+#include <linux/init.h>
 #include <linux/types.h>
 #include <linux/bpf_verifier.h>
 #include <linux/bpf.h>
@@ -95,12 +96,14 @@ static int bpf_tcp_ca_btf_struct_access(struct bpf_verifier_log *log,
                                        const struct btf *btf,
                                        const struct btf_type *t, int off,
                                        int size, enum bpf_access_type atype,
-                                       u32 *next_btf_id)
+                                       u32 *next_btf_id,
+                                       enum bpf_type_flag *flag)
 {
        size_t end;
 
        if (atype == BPF_READ)
-               return btf_struct_access(log, btf, t, off, size, atype, next_btf_id);
+               return btf_struct_access(log, btf, t, off, size, atype, next_btf_id,
+                                        flag);
 
        if (t != tcp_sock_type) {
                bpf_log(log, "only read is supported\n");
@@ -212,26 +215,23 @@ bpf_tcp_ca_get_func_proto(enum bpf_func_id func_id,
        }
 }
 
-BTF_SET_START(bpf_tcp_ca_kfunc_ids)
+BTF_SET_START(bpf_tcp_ca_check_kfunc_ids)
 BTF_ID(func, tcp_reno_ssthresh)
 BTF_ID(func, tcp_reno_cong_avoid)
 BTF_ID(func, tcp_reno_undo_cwnd)
 BTF_ID(func, tcp_slow_start)
 BTF_ID(func, tcp_cong_avoid_ai)
-BTF_SET_END(bpf_tcp_ca_kfunc_ids)
+BTF_SET_END(bpf_tcp_ca_check_kfunc_ids)
 
-static bool bpf_tcp_ca_check_kfunc_call(u32 kfunc_btf_id, struct module *owner)
-{
-       if (btf_id_set_contains(&bpf_tcp_ca_kfunc_ids, kfunc_btf_id))
-               return true;
-       return bpf_check_mod_kfunc_call(&bpf_tcp_ca_kfunc_list, kfunc_btf_id, owner);
-}
+static const struct btf_kfunc_id_set bpf_tcp_ca_kfunc_set = {
+       .owner     = THIS_MODULE,
+       .check_set = &bpf_tcp_ca_check_kfunc_ids,
+};
 
 static const struct bpf_verifier_ops bpf_tcp_ca_verifier_ops = {
        .get_func_proto         = bpf_tcp_ca_get_func_proto,
        .is_valid_access        = bpf_tcp_ca_is_valid_access,
        .btf_struct_access      = bpf_tcp_ca_btf_struct_access,
-       .check_kfunc_call       = bpf_tcp_ca_check_kfunc_call,
 };
 
 static int bpf_tcp_ca_init_member(const struct btf_type *t,
@@ -300,3 +300,9 @@ struct bpf_struct_ops bpf_tcp_congestion_ops = {
        .init = bpf_tcp_ca_init,
        .name = "tcp_congestion_ops",
 };
+
+static int __init bpf_tcp_ca_kfunc_init(void)
+{
+       return register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &bpf_tcp_ca_kfunc_set);
+}
+late_initcall(bpf_tcp_ca_kfunc_init);
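
With check_kfunc_call gone, the set of kernel functions callable from this program type is published once through a btf_kfunc_id_set and looked up by the verifier itself. The same registration pattern for a hypothetical set (the example_* names are illustrative, not part of this patch):

	#include <linux/btf.h>
	#include <linux/btf_ids.h>
	#include <linux/init.h>

	/* Hypothetical example set; only the names differ from the
	 * bpf_tcp_ca registration above.
	 */
	BTF_SET_START(example_check_kfunc_ids)
	BTF_ID(func, tcp_slow_start)
	BTF_ID(func, tcp_cong_avoid_ai)
	BTF_SET_END(example_check_kfunc_ids)

	static const struct btf_kfunc_id_set example_kfunc_set = {
		.owner     = THIS_MODULE,
		.check_set = &example_check_kfunc_ids,
	};

	static int __init example_kfunc_init(void)
	{
		/* Registration is keyed by program type; the verifier
		 * consults the set when it sees a kernel-function call.
		 */
		return register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS,
						 &example_kfunc_set);
	}
	late_initcall(example_kfunc_init);
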
index 4d61ddd..5481172 100644 (file)
@@ -32,6 +32,7 @@
 #include <linux/list.h>
 #include <linux/slab.h>
 
+#include <net/inet_dscp.h>
 #include <net/ip.h>
 #include <net/protocol.h>
 #include <net/route.h>
@@ -735,8 +736,16 @@ static int rtm_to_fib_config(struct net *net, struct sk_buff *skb,
        memset(cfg, 0, sizeof(*cfg));
 
        rtm = nlmsg_data(nlh);
+
+       if (!inet_validate_dscp(rtm->rtm_tos)) {
+               NL_SET_ERR_MSG(extack,
+                              "Invalid dsfield (tos): ECN bits must be 0");
+               err = -EINVAL;
+               goto errout;
+       }
+       cfg->fc_dscp = inet_dsfield_to_dscp(rtm->rtm_tos);
+
        cfg->fc_dst_len = rtm->rtm_dst_len;
-       cfg->fc_tos = rtm->rtm_tos;
        cfg->fc_table = rtm->rtm_table;
        cfg->fc_protocol = rtm->rtm_protocol;
        cfg->fc_scope = rtm->rtm_scope;
@@ -1547,7 +1556,7 @@ static void ip_fib_net_exit(struct net *net)
 {
        int i;
 
-       rtnl_lock();
+       ASSERT_RTNL();
 #ifdef CONFIG_IP_MULTIPLE_TABLES
        RCU_INIT_POINTER(net->ipv4.fib_main, NULL);
        RCU_INIT_POINTER(net->ipv4.fib_default, NULL);
@@ -1572,7 +1581,7 @@ static void ip_fib_net_exit(struct net *net)
 #ifdef CONFIG_IP_MULTIPLE_TABLES
        fib4_rules_exit(net);
 #endif
-       rtnl_unlock();
+
        kfree(net->ipv4.fib_table_hash);
        fib4_notifier_exit(net);
 }
@@ -1599,7 +1608,9 @@ out:
 out_proc:
        nl_fib_lookup_exit(net);
 out_nlfl:
+       rtnl_lock();
        ip_fib_net_exit(net);
+       rtnl_unlock();
        goto out;
 }
 
@@ -1607,12 +1618,23 @@ static void __net_exit fib_net_exit(struct net *net)
 {
        fib_proc_exit(net);
        nl_fib_lookup_exit(net);
-       ip_fib_net_exit(net);
+}
+
+static void __net_exit fib_net_exit_batch(struct list_head *net_list)
+{
+       struct net *net;
+
+       rtnl_lock();
+       list_for_each_entry(net, net_list, exit_list)
+               ip_fib_net_exit(net);
+
+       rtnl_unlock();
 }
 
 static struct pernet_operations fib_net_ops = {
        .init = fib_net_init,
        .exit = fib_net_exit,
+       .exit_batch = fib_net_exit_batch,
 };
 
 void __init ip_fib_init(void)
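
ip_fib_net_exit() now asserts RTNL instead of taking it, and the new pernet .exit_batch hook wraps the teardown of every dying namespace in a single rtnl_lock()/rtnl_unlock() pair (the error unwind in fib_net_init() takes the lock explicitly). The same shape is applied to ipmr and nexthops further down in this merge; as a sketch, with example_* names standing in for the subsystem hooks:

	static void __net_exit example_net_exit_batch(struct list_head *net_list)
	{
		struct net *net;

		/* One RTNL round-trip covers all namespaces on the list,
		 * instead of one lock/unlock per namespace.
		 */
		rtnl_lock();
		list_for_each_entry(net, net_list, exit_list)
			example_rtnl_teardown(net);	/* hypothetical; must run under RTNL */
		rtnl_unlock();
	}

	static struct pernet_operations example_net_ops = {
		.init	    = example_net_init,
		.exit	    = example_net_exit,	/* non-RTNL cleanup stays here */
		.exit_batch = example_net_exit_batch,
	};
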
index e184bcb..a63014b 100644 (file)
@@ -4,13 +4,14 @@
 
 #include <linux/types.h>
 #include <linux/list.h>
+#include <net/inet_dscp.h>
 #include <net/ip_fib.h>
 #include <net/nexthop.h>
 
 struct fib_alias {
        struct hlist_node       fa_list;
        struct fib_info         *fa_info;
-       u8                      fa_tos;
+       dscp_t                  fa_dscp;
        u8                      fa_type;
        u8                      fa_state;
        u8                      fa_slen;
index e0b6c8b..001fea3 100644 (file)
@@ -23,6 +23,7 @@
 #include <linux/list.h>
 #include <linux/rcupdate.h>
 #include <linux/export.h>
+#include <net/inet_dscp.h>
 #include <net/ip.h>
 #include <net/route.h>
 #include <net/tcp.h>
@@ -35,7 +36,7 @@ struct fib4_rule {
        struct fib_rule         common;
        u8                      dst_len;
        u8                      src_len;
-       u8                      tos;
+       dscp_t                  dscp;
        __be32                  src;
        __be32                  srcmask;
        __be32                  dst;
@@ -49,7 +50,7 @@ static bool fib4_rule_matchall(const struct fib_rule *rule)
 {
        struct fib4_rule *r = container_of(rule, struct fib4_rule, common);
 
-       if (r->dst_len || r->src_len || r->tos)
+       if (r->dst_len || r->src_len || r->dscp)
                return false;
        return fib_rule_matchall(rule);
 }
@@ -185,7 +186,7 @@ INDIRECT_CALLABLE_SCOPE int fib4_rule_match(struct fib_rule *rule,
            ((daddr ^ r->dst) & r->dstmask))
                return 0;
 
-       if (r->tos && (r->tos != fl4->flowi4_tos))
+       if (r->dscp && r->dscp != inet_dsfield_to_dscp(fl4->flowi4_tos))
                return 0;
 
        if (rule->ip_proto && (rule->ip_proto != fl4->flowi4_proto))
@@ -225,10 +226,17 @@ static int fib4_rule_configure(struct fib_rule *rule, struct sk_buff *skb,
        int err = -EINVAL;
        struct fib4_rule *rule4 = (struct fib4_rule *) rule;
 
+       if (!inet_validate_dscp(frh->tos)) {
+               NL_SET_ERR_MSG(extack,
+                              "Invalid dsfield (tos): ECN bits must be 0");
+               goto errout;
+       }
+       /* IPv4 currently doesn't handle high-order DSCP bits correctly */
        if (frh->tos & ~IPTOS_TOS_MASK) {
                NL_SET_ERR_MSG(extack, "Invalid tos");
                goto errout;
        }
+       rule4->dscp = inet_dsfield_to_dscp(frh->tos);
 
        /* split local/main if they are not already split */
        err = fib_unmerge(net);
@@ -270,7 +278,6 @@ static int fib4_rule_configure(struct fib_rule *rule, struct sk_buff *skb,
        rule4->srcmask = inet_make_mask(rule4->src_len);
        rule4->dst_len = frh->dst_len;
        rule4->dstmask = inet_make_mask(rule4->dst_len);
-       rule4->tos = frh->tos;
 
        net->ipv4.fib_has_custom_rules = true;
 
@@ -313,7 +320,7 @@ static int fib4_rule_compare(struct fib_rule *rule, struct fib_rule_hdr *frh,
        if (frh->dst_len && (rule4->dst_len != frh->dst_len))
                return 0;
 
-       if (frh->tos && (rule4->tos != frh->tos))
+       if (frh->tos && inet_dscp_to_dsfield(rule4->dscp) != frh->tos)
                return 0;
 
 #ifdef CONFIG_IP_ROUTE_CLASSID
@@ -337,7 +344,7 @@ static int fib4_rule_fill(struct fib_rule *rule, struct sk_buff *skb,
 
        frh->dst_len = rule4->dst_len;
        frh->src_len = rule4->src_len;
-       frh->tos = rule4->tos;
+       frh->tos = inet_dscp_to_dsfield(rule4->dscp);
 
        if ((rule4->dst_len &&
             nla_put_in_addr(skb, FRA_DST, rule4->dst)) ||
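
These conversions depend on the dscp_t type introduced by net/inet_dscp.h: a __bitwise wrapper that Sparse treats as distinct from u8, so a raw TOS byte can no longer be stored or compared without going through the helpers. A sketch of the helpers as the hunks above use them (the in-tree header may differ in detail):

	typedef u8 __bitwise dscp_t;

	#define INET_DSCP_MASK 0xfc	/* the six DSCP bits of the dsfield */

	static inline dscp_t inet_dsfield_to_dscp(__u8 dsfield)
	{
		return (__force dscp_t)(dsfield & INET_DSCP_MASK);
	}

	static inline __u8 inet_dscp_to_dsfield(dscp_t dscp)
	{
		return (__force __u8)dscp;
	}

	/* True iff the ECN bits are clear, i.e. the value is pure DSCP. */
	static inline bool inet_validate_dscp(__u8 val)
	{
		return !(val & ~INET_DSCP_MASK);
	}
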
index b458986..c9c4f2f 100644 (file)
@@ -32,6 +32,7 @@
 #include <linux/hash.h>
 
 #include <net/arp.h>
+#include <net/inet_dscp.h>
 #include <net/ip.h>
 #include <net/protocol.h>
 #include <net/route.h>
@@ -523,7 +524,7 @@ void rtmsg_fib(int event, __be32 key, struct fib_alias *fa,
        fri.tb_id = tb_id;
        fri.dst = key;
        fri.dst_len = dst_len;
-       fri.tos = fa->fa_tos;
+       fri.tos = inet_dscp_to_dsfield(fa->fa_dscp);
        fri.type = fa->fa_type;
        fri.offload = fa->offload;
        fri.trap = fa->trap;
@@ -1257,34 +1258,13 @@ fib_info_laddrhash_bucket(const struct net *net, __be32 val)
        return &fib_info_laddrhash[slot];
 }
 
-static struct hlist_head *fib_info_hash_alloc(int bytes)
-{
-       if (bytes <= PAGE_SIZE)
-               return kzalloc(bytes, GFP_KERNEL);
-       else
-               return (struct hlist_head *)
-                       __get_free_pages(GFP_KERNEL | __GFP_ZERO,
-                                        get_order(bytes));
-}
-
-static void fib_info_hash_free(struct hlist_head *hash, int bytes)
-{
-       if (!hash)
-               return;
-
-       if (bytes <= PAGE_SIZE)
-               kfree(hash);
-       else
-               free_pages((unsigned long) hash, get_order(bytes));
-}
-
 static void fib_info_hash_move(struct hlist_head *new_info_hash,
                               struct hlist_head *new_laddrhash,
                               unsigned int new_size)
 {
        struct hlist_head *old_info_hash, *old_laddrhash;
        unsigned int old_size = fib_info_hash_size;
-       unsigned int i, bytes;
+       unsigned int i;
 
        spin_lock_bh(&fib_info_lock);
        old_info_hash = fib_info_hash;
@@ -1325,9 +1305,8 @@ static void fib_info_hash_move(struct hlist_head *new_info_hash,
 
        spin_unlock_bh(&fib_info_lock);
 
-       bytes = old_size * sizeof(struct hlist_head *);
-       fib_info_hash_free(old_info_hash, bytes);
-       fib_info_hash_free(old_laddrhash, bytes);
+       kvfree(old_info_hash);
+       kvfree(old_laddrhash);
 }
 
 __be32 fib_info_update_nhc_saddr(struct net *net, struct fib_nh_common *nhc,
@@ -1444,19 +1423,19 @@ struct fib_info *fib_create_info(struct fib_config *cfg,
                unsigned int new_size = fib_info_hash_size << 1;
                struct hlist_head *new_info_hash;
                struct hlist_head *new_laddrhash;
-               unsigned int bytes;
+               size_t bytes;
 
                if (!new_size)
                        new_size = 16;
-               bytes = new_size * sizeof(struct hlist_head *);
-               new_info_hash = fib_info_hash_alloc(bytes);
-               new_laddrhash = fib_info_hash_alloc(bytes);
+               bytes = (size_t)new_size * sizeof(struct hlist_head *);
+               new_info_hash = kvzalloc(bytes, GFP_KERNEL);
+               new_laddrhash = kvzalloc(bytes, GFP_KERNEL);
                if (!new_info_hash || !new_laddrhash) {
-                       fib_info_hash_free(new_info_hash, bytes);
-                       fib_info_hash_free(new_laddrhash, bytes);
-               } else
+                       kvfree(new_info_hash);
+                       kvfree(new_laddrhash);
+               } else {
                        fib_info_hash_move(new_info_hash, new_laddrhash, new_size);
-
+               }
                if (!fib_info_hash_size)
                        goto failure;
        }
@@ -2061,7 +2040,7 @@ static void fib_select_default(const struct flowi4 *flp, struct fib_result *res)
        int order = -1, last_idx = -1;
        struct fib_alias *fa, *fa1 = NULL;
        u32 last_prio = res->fi->fib_priority;
-       u8 last_tos = 0;
+       dscp_t last_dscp = 0;
 
        hlist_for_each_entry_rcu(fa, fa_head, fa_list) {
                struct fib_info *next_fi = fa->fa_info;
@@ -2069,19 +2048,20 @@ static void fib_select_default(const struct flowi4 *flp, struct fib_result *res)
 
                if (fa->fa_slen != slen)
                        continue;
-               if (fa->fa_tos && fa->fa_tos != flp->flowi4_tos)
+               if (fa->fa_dscp &&
+                   fa->fa_dscp != inet_dsfield_to_dscp(flp->flowi4_tos))
                        continue;
                if (fa->tb_id != tb->tb_id)
                        continue;
                if (next_fi->fib_priority > last_prio &&
-                   fa->fa_tos == last_tos) {
-                       if (last_tos)
+                   fa->fa_dscp == last_dscp) {
+                       if (last_dscp)
                                continue;
                        break;
                }
                if (next_fi->fib_flags & RTNH_F_DEAD)
                        continue;
-               last_tos = fa->fa_tos;
+               last_dscp = fa->fa_dscp;
                last_prio = next_fi->fib_priority;
 
                if (next_fi->fib_scope != res->scope ||
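
The fib_info hash hunks above drop a hand-rolled allocator that chose between kzalloc() and __get_free_pages() by size, and that therefore had to be told the size again at free time. kvzalloc()/kvfree() make that decision internally; a sketch of the resulting call sites:

	/* Sketch: kvzalloc() uses kmalloc for small sizes and falls back
	 * to vmalloc for large ones; kvfree() releases either kind, so
	 * no byte count has to be carried around for the free path.
	 */
	static struct hlist_head *example_hash_alloc(unsigned int entries)
	{
		return kvzalloc((size_t)entries * sizeof(struct hlist_head *),
				GFP_KERNEL);
	}

	static void example_hash_free(struct hlist_head *hash)
	{
		kvfree(hash);	/* NULL-safe, like kfree() */
	}
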
index 8060524..c05cd10 100644 (file)
@@ -61,6 +61,7 @@
 #include <linux/vmalloc.h>
 #include <linux/notifier.h>
 #include <net/net_namespace.h>
+#include <net/inet_dscp.h>
 #include <net/ip.h>
 #include <net/protocol.h>
 #include <net/route.h>
@@ -81,7 +82,7 @@ static int call_fib_entry_notifier(struct notifier_block *nb,
                .dst = dst,
                .dst_len = dst_len,
                .fi = fa->fa_info,
-               .tos = fa->fa_tos,
+               .tos = inet_dscp_to_dsfield(fa->fa_dscp),
                .type = fa->fa_type,
                .tb_id = fa->tb_id,
        };
@@ -98,7 +99,7 @@ static int call_fib_entry_notifiers(struct net *net,
                .dst = dst,
                .dst_len = dst_len,
                .fi = fa->fa_info,
-               .tos = fa->fa_tos,
+               .tos = inet_dscp_to_dsfield(fa->fa_dscp),
                .type = fa->fa_type,
                .tb_id = fa->tb_id,
        };
@@ -973,13 +974,13 @@ static struct key_vector *fib_find_node(struct trie *t,
        return n;
 }
 
-/* Return the first fib alias matching TOS with
+/* Return the first fib alias matching DSCP with
  * priority less than or equal to PRIO.
  * If 'find_first' is set, return the first matching
- * fib alias, regardless of TOS and priority.
+ * fib alias, regardless of DSCP and priority.
  */
 static struct fib_alias *fib_find_alias(struct hlist_head *fah, u8 slen,
-                                       u8 tos, u32 prio, u32 tb_id,
+                                       dscp_t dscp, u32 prio, u32 tb_id,
                                        bool find_first)
 {
        struct fib_alias *fa;
@@ -988,6 +989,10 @@ static struct fib_alias *fib_find_alias(struct hlist_head *fah, u8 slen,
                return NULL;
 
        hlist_for_each_entry(fa, fah, fa_list) {
+               /* Avoid Sparse warning when using dscp_t in inequalities */
+               u8 __fa_dscp = inet_dscp_to_dsfield(fa->fa_dscp);
+               u8 __dscp = inet_dscp_to_dsfield(dscp);
+
                if (fa->fa_slen < slen)
                        continue;
                if (fa->fa_slen != slen)
@@ -998,9 +1003,9 @@ static struct fib_alias *fib_find_alias(struct hlist_head *fah, u8 slen,
                        break;
                if (find_first)
                        return fa;
-               if (fa->fa_tos > tos)
+               if (__fa_dscp > __dscp)
                        continue;
-               if (fa->fa_info->fib_priority >= prio || fa->fa_tos < tos)
+               if (fa->fa_info->fib_priority >= prio || __fa_dscp < __dscp)
                        return fa;
        }
 
@@ -1027,8 +1032,8 @@ fib_find_matching_alias(struct net *net, const struct fib_rt_info *fri)
 
        hlist_for_each_entry_rcu(fa, &l->leaf, fa_list) {
                if (fa->fa_slen == slen && fa->tb_id == fri->tb_id &&
-                   fa->fa_tos == fri->tos && fa->fa_info == fri->fi &&
-                   fa->fa_type == fri->type)
+                   fa->fa_dscp == inet_dsfield_to_dscp(fri->tos) &&
+                   fa->fa_info == fri->fi && fa->fa_type == fri->type)
                        return fa;
        }
 
@@ -1210,7 +1215,7 @@ int fib_table_insert(struct net *net, struct fib_table *tb,
        struct fib_info *fi;
        u8 plen = cfg->fc_dst_len;
        u8 slen = KEYLENGTH - plen;
-       u8 tos = cfg->fc_tos;
+       dscp_t dscp;
        u32 key;
        int err;
 
@@ -1227,12 +1232,13 @@ int fib_table_insert(struct net *net, struct fib_table *tb,
                goto err;
        }
 
+       dscp = cfg->fc_dscp;
        l = fib_find_node(t, &tp, key);
-       fa = l ? fib_find_alias(&l->leaf, slen, tos, fi->fib_priority,
+       fa = l ? fib_find_alias(&l->leaf, slen, dscp, fi->fib_priority,
                                tb->tb_id, false) : NULL;
 
        /* Now fa, if non-NULL, points to the first fib alias
-        * with the same keys [prefix,tos,priority], if such key already
+        * with the same keys [prefix,dscp,priority], if such key already
         * exists or to the node before which we will insert new one.
         *
         * If fa is NULL, we will need to allocate a new one and
@@ -1240,7 +1246,7 @@ int fib_table_insert(struct net *net, struct fib_table *tb,
         * of the new alias.
         */
 
-       if (fa && fa->fa_tos == tos &&
+       if (fa && fa->fa_dscp == dscp &&
            fa->fa_info->fib_priority == fi->fib_priority) {
                struct fib_alias *fa_first, *fa_match;
 
@@ -1260,7 +1266,7 @@ int fib_table_insert(struct net *net, struct fib_table *tb,
                hlist_for_each_entry_from(fa, fa_list) {
                        if ((fa->fa_slen != slen) ||
                            (fa->tb_id != tb->tb_id) ||
-                           (fa->fa_tos != tos))
+                           (fa->fa_dscp != dscp))
                                break;
                        if (fa->fa_info->fib_priority != fi->fib_priority)
                                break;
@@ -1288,7 +1294,7 @@ int fib_table_insert(struct net *net, struct fib_table *tb,
                                goto out;
 
                        fi_drop = fa->fa_info;
-                       new_fa->fa_tos = fa->fa_tos;
+                       new_fa->fa_dscp = fa->fa_dscp;
                        new_fa->fa_info = fi;
                        new_fa->fa_type = cfg->fc_type;
                        state = fa->fa_state;
@@ -1351,7 +1357,7 @@ int fib_table_insert(struct net *net, struct fib_table *tb,
                goto out;
 
        new_fa->fa_info = fi;
-       new_fa->fa_tos = tos;
+       new_fa->fa_dscp = dscp;
        new_fa->fa_type = cfg->fc_type;
        new_fa->fa_state = 0;
        new_fa->fa_slen = slen;
@@ -1567,7 +1573,8 @@ found:
                        if (index >= (1ul << fa->fa_slen))
                                continue;
                }
-               if (fa->fa_tos && fa->fa_tos != flp->flowi4_tos)
+               if (fa->fa_dscp &&
+                   inet_dscp_to_dsfield(fa->fa_dscp) != flp->flowi4_tos)
                        continue;
                if (fi->fib_dead)
                        continue;
@@ -1703,7 +1710,7 @@ int fib_table_delete(struct net *net, struct fib_table *tb,
        struct key_vector *l, *tp;
        u8 plen = cfg->fc_dst_len;
        u8 slen = KEYLENGTH - plen;
-       u8 tos = cfg->fc_tos;
+       dscp_t dscp;
        u32 key;
 
        key = ntohl(cfg->fc_dst);
@@ -1715,11 +1722,13 @@ int fib_table_delete(struct net *net, struct fib_table *tb,
        if (!l)
                return -ESRCH;
 
-       fa = fib_find_alias(&l->leaf, slen, tos, 0, tb->tb_id, false);
+       dscp = cfg->fc_dscp;
+       fa = fib_find_alias(&l->leaf, slen, dscp, 0, tb->tb_id, false);
        if (!fa)
                return -ESRCH;
 
-       pr_debug("Deleting %08x/%d tos=%d t=%p\n", key, plen, tos, t);
+       pr_debug("Deleting %08x/%d dsfield=0x%02x t=%p\n", key, plen,
+                inet_dscp_to_dsfield(dscp), t);
 
        fa_to_delete = NULL;
        hlist_for_each_entry_from(fa, fa_list) {
@@ -1727,7 +1736,7 @@ int fib_table_delete(struct net *net, struct fib_table *tb,
 
                if ((fa->fa_slen != slen) ||
                    (fa->tb_id != tb->tb_id) ||
-                   (fa->fa_tos != tos))
+                   (fa->fa_dscp != dscp))
                        break;
 
                if ((!cfg->fc_type || fa->fa_type == cfg->fc_type) &&
@@ -2295,7 +2304,7 @@ static int fn_trie_dump_leaf(struct key_vector *l, struct fib_table *tb,
                                fri.tb_id = tb->tb_id;
                                fri.dst = xkey;
                                fri.dst_len = KEYLENGTH - fa->fa_slen;
-                               fri.tos = fa->fa_tos;
+                               fri.tos = inet_dscp_to_dsfield(fa->fa_dscp);
                                fri.type = fa->fa_type;
                                fri.offload = fa->offload;
                                fri.trap = fa->trap;
@@ -2807,8 +2816,9 @@ static int fib_trie_seq_show(struct seq_file *seq, void *v)
                                             fa->fa_info->fib_scope),
                                   rtn_type(buf2, sizeof(buf2),
                                            fa->fa_type));
-                       if (fa->fa_tos)
-                               seq_printf(seq, " tos=%d", fa->fa_tos);
+                       if (fa->fa_dscp)
+                               seq_printf(seq, " tos=%d",
+                                          inet_dscp_to_dsfield(fa->fa_dscp));
                        seq_putc(seq, '\n');
                }
        }
index b7e277d..72a375c 100644 (file)
@@ -192,24 +192,14 @@ struct icmp_control {
 
 static const struct icmp_control icmp_pointers[NR_ICMP_TYPES+1];
 
-/*
- *     The ICMP socket(s). This is the most convenient way to flow control
- *     our ICMP output as well as maintain a clean interface throughout
- *     all layers. All Socketless IP sends will soon be gone.
- *
- *     On SMP we have one ICMP socket per-cpu.
- */
-static struct sock *icmp_sk(struct net *net)
-{
-       return this_cpu_read(*net->ipv4.icmp_sk);
-}
+static DEFINE_PER_CPU(struct sock *, ipv4_icmp_sk);
 
 /* Called with BH disabled */
 static inline struct sock *icmp_xmit_lock(struct net *net)
 {
        struct sock *sk;
 
-       sk = icmp_sk(net);
+       sk = this_cpu_read(ipv4_icmp_sk);
 
        if (unlikely(!spin_trylock(&sk->sk_lock.slock))) {
                /* This can happen if the output path signals a
                 * dst_link_failure() or an ICMP related error.
                 */
                 */
                return NULL;
        }
+       sock_net_set(sk, net);
        return sk;
 }
 
 static inline void icmp_xmit_unlock(struct sock *sk)
 {
+       sock_net_set(sk, &init_net);
        spin_unlock(&sk->sk_lock.slock);
 }
 
@@ -363,14 +355,13 @@ static int icmp_glue_bits(void *from, char *to, int offset, int len, int odd,
        return 0;
 }
 
-static void icmp_push_reply(struct icmp_bxm *icmp_param,
+static void icmp_push_reply(struct sock *sk,
+                           struct icmp_bxm *icmp_param,
                            struct flowi4 *fl4,
                            struct ipcm_cookie *ipc, struct rtable **rt)
 {
-       struct sock *sk;
        struct sk_buff *skb;
 
-       sk = icmp_sk(dev_net((*rt)->dst.dev));
        if (ip_append_data(sk, fl4, icmp_glue_bits, icmp_param,
                           icmp_param->data_len+icmp_param->head_len,
                           icmp_param->head_len,
@@ -452,7 +443,7 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
        if (IS_ERR(rt))
                goto out_unlock;
        if (icmpv4_xrlim_allow(net, rt, &fl4, type, code))
-               icmp_push_reply(icmp_param, &fl4, &ipc, &rt);
+               icmp_push_reply(sk, icmp_param, &fl4, &ipc, &rt);
        ip_rt_put(rt);
 out_unlock:
        icmp_xmit_unlock(sk);
@@ -766,7 +757,7 @@ void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info,
        if (!fl4.saddr)
                fl4.saddr = htonl(INADDR_DUMMY);
 
-       icmp_push_reply(&icmp_param, &fl4, &ipc, &rt);
+       icmp_push_reply(sk, &icmp_param, &fl4, &ipc, &rt);
 ende:
        ip_rt_put(rt);
 out_unlock:
@@ -1434,46 +1425,8 @@ static const struct icmp_control icmp_pointers[NR_ICMP_TYPES + 1] = {
        },
 };
 
-static void __net_exit icmp_sk_exit(struct net *net)
-{
-       int i;
-
-       for_each_possible_cpu(i)
-               inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.icmp_sk, i));
-       free_percpu(net->ipv4.icmp_sk);
-       net->ipv4.icmp_sk = NULL;
-}
-
 static int __net_init icmp_sk_init(struct net *net)
 {
-       int i, err;
-
-       net->ipv4.icmp_sk = alloc_percpu(struct sock *);
-       if (!net->ipv4.icmp_sk)
-               return -ENOMEM;
-
-       for_each_possible_cpu(i) {
-               struct sock *sk;
-
-               err = inet_ctl_sock_create(&sk, PF_INET,
-                                          SOCK_RAW, IPPROTO_ICMP, net);
-               if (err < 0)
-                       goto fail;
-
-               *per_cpu_ptr(net->ipv4.icmp_sk, i) = sk;
-
-               /* Enough space for 2 64K ICMP packets, including
-                * sk_buff/skb_shared_info struct overhead.
-                */
-               sk->sk_sndbuf = 2 * SKB_TRUESIZE(64 * 1024);
-
-               /*
-                * Speedup sock_wfree()
-                */
-               sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
-               inet_sk(sk)->pmtudisc = IP_PMTUDISC_DONT;
-       }
-
        /* Control parameters for ECHO replies. */
        net->ipv4.sysctl_icmp_echo_ignore_all = 0;
        net->ipv4.sysctl_icmp_echo_enable_probe = 0;
@@ -1499,18 +1452,36 @@ static int __net_init icmp_sk_init(struct net *net)
        net->ipv4.sysctl_icmp_errors_use_inbound_ifaddr = 0;
 
        return 0;
-
-fail:
-       icmp_sk_exit(net);
-       return err;
 }
 
 static struct pernet_operations __net_initdata icmp_sk_ops = {
        .init = icmp_sk_init,
-       .exit = icmp_sk_exit,
 };
 
 int __init icmp_init(void)
 {
+       int err, i;
+
+       for_each_possible_cpu(i) {
+               struct sock *sk;
+
+               err = inet_ctl_sock_create(&sk, PF_INET,
+                                          SOCK_RAW, IPPROTO_ICMP, &init_net);
+               if (err < 0)
+                       return err;
+
+               per_cpu(ipv4_icmp_sk, i) = sk;
+
+               /* Enough space for 2 64K ICMP packets, including
+                * sk_buff/skb_shared_info struct overhead.
+                */
+               sk->sk_sndbuf = 2 * SKB_TRUESIZE(64 * 1024);
+
+               /*
+                * Speedup sock_wfree()
+                */
+               sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
+               inet_sk(sk)->pmtudisc = IP_PMTUDISC_DONT;
+       }
        return register_pernet_subsys(&icmp_sk_ops);
 }
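
The per-namespace per-cpu ICMP socket array is replaced by one boot-time set of per-cpu sockets shared by all namespaces; sock_net_set() points the socket at the caller's namespace only while the transmit lock is held and parks it back on init_net afterwards. The locking pattern, reduced to its core (example_* names are illustrative):

	static DEFINE_PER_CPU(struct sock *, example_icmp_sk);

	/* Called with BH disabled, so this_cpu_read() is stable. */
	static struct sock *example_xmit_lock(struct net *net)
	{
		struct sock *sk = this_cpu_read(example_icmp_sk);

		if (unlikely(!spin_trylock(&sk->sk_lock.slock)))
			return NULL;	/* output path re-entered ICMP on this CPU */

		sock_net_set(sk, net);	/* borrow the socket for this netns */
		return sk;
	}

	static void example_xmit_unlock(struct sock *sk)
	{
		sock_net_set(sk, &init_net);	/* hand it back to the boot netns */
		spin_unlock(&sk->sk_lock.slock);
	}
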
index fc2a985..1e5b53c 100644 (file)
@@ -866,12 +866,9 @@ static void reqsk_timer_handler(struct timer_list *t)
            (!resend ||
             !inet_rtx_syn_ack(sk_listener, req) ||
             inet_rsk(req)->acked)) {
-               unsigned long timeo;
-
                if (req->num_timeout++ == 0)
                        atomic_dec(&queue->young);
-               timeo = min(TCP_TIMEOUT_INIT << req->num_timeout, TCP_RTO_MAX);
-               mod_timer(&req->rsk_timer, jiffies + timeo);
+               mod_timer(&req->rsk_timer, jiffies + reqsk_timeout(req, TCP_RTO_MAX));
 
                if (!nreq)
                        return;
@@ -1046,6 +1043,9 @@ int inet_csk_listen_start(struct sock *sk)
        sk->sk_ack_backlog = 0;
        inet_csk_delack_init(sk);
 
+       if (sk->sk_txrehash == SOCK_TXREHASH_DEFAULT)
+               sk->sk_txrehash = READ_ONCE(sock_net(sk)->core.sysctl_txrehash);
+
        /* There is race window here: we announce ourselves listening,
         * but this transition is still not validated by get_port().
         * It is OK, because this socket enters to hash table only
index 30ab717..1744084 100644 (file)
@@ -637,7 +637,9 @@ int __inet_hash(struct sock *sk, struct sock *osk)
        int err = 0;
 
        if (sk->sk_state != TCP_LISTEN) {
+               local_bh_disable();
                inet_ehash_nolisten(sk, osk, NULL);
+               local_bh_enable();
                return 0;
        }
        WARN_ON(!sk_unhashed(sk));
@@ -669,45 +671,54 @@ int inet_hash(struct sock *sk)
 {
        int err = 0;
 
-       if (sk->sk_state != TCP_CLOSE) {
-               local_bh_disable();
+       if (sk->sk_state != TCP_CLOSE)
                err = __inet_hash(sk, NULL);
-               local_bh_enable();
-       }
 
        return err;
 }
 EXPORT_SYMBOL_GPL(inet_hash);
 
-void inet_unhash(struct sock *sk)
+static void __inet_unhash(struct sock *sk, struct inet_listen_hashbucket *ilb)
 {
-       struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
-       struct inet_listen_hashbucket *ilb = NULL;
-       spinlock_t *lock;
-
        if (sk_unhashed(sk))
                return;
 
-       if (sk->sk_state == TCP_LISTEN) {
-               ilb = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)];
-               lock = &ilb->lock;
-       } else {
-               lock = inet_ehash_lockp(hashinfo, sk->sk_hash);
-       }
-       spin_lock_bh(lock);
-       if (sk_unhashed(sk))
-               goto unlock;
-
        if (rcu_access_pointer(sk->sk_reuseport_cb))
                reuseport_stop_listen_sock(sk);
        if (ilb) {
+               struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
+
                inet_unhash2(hashinfo, sk);
                ilb->count--;
        }
        __sk_nulls_del_node_init_rcu(sk);
        sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
-unlock:
-       spin_unlock_bh(lock);
+}
+
+void inet_unhash(struct sock *sk)
+{
+       struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
+
+       if (sk_unhashed(sk))
+               return;
+
+       if (sk->sk_state == TCP_LISTEN) {
+               struct inet_listen_hashbucket *ilb;
+
+               ilb = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)];
+               /* Don't disable bottom halves while acquiring the lock to
+                * avoid a circular locking dependency on PREEMPT_RT.
+                */
+               spin_lock(&ilb->lock);
+               __inet_unhash(sk, ilb);
+               spin_unlock(&ilb->lock);
+       } else {
+               spinlock_t *lock = inet_ehash_lockp(hashinfo, sk->sk_hash);
+
+               spin_lock_bh(lock);
+               __inet_unhash(sk, NULL);
+               spin_unlock_bh(lock);
+       }
 }
 EXPORT_SYMBOL_GPL(inet_unhash);
 
index 437afe3..9e0bbd0 100644 (file)
@@ -52,14 +52,15 @@ static void inet_twsk_kill(struct inet_timewait_sock *tw)
        spin_unlock(lock);
 
        /* Disassociate with bind bucket. */
-       bhead = &hashinfo->bhash[inet_bhashfn(twsk_net(tw), tw->tw_num,
-                       hashinfo->bhash_size)];
+       bhead = &hashinfo->bhash[tw->tw_bslot];
 
        spin_lock(&bhead->lock);
        inet_twsk_bind_unhash(tw, hashinfo);
        spin_unlock(&bhead->lock);
 
-       atomic_dec(&tw->tw_dr->tw_count);
+       if (refcount_dec_and_test(&tw->tw_dr->tw_refcount))
+               kfree(tw->tw_dr);
+
        inet_twsk_put(tw);
 }
 
@@ -110,8 +111,12 @@ void inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk,
           Note, that any socket with inet->num != 0 MUST be bound in
           binding cache, even if it is closed.
         */
-       bhead = &hashinfo->bhash[inet_bhashfn(twsk_net(tw), inet->inet_num,
-                       hashinfo->bhash_size)];
+       /* Cache inet_bhashfn(), because 'struct net' might no longer
+        * be available later in inet_twsk_kill().
+        */
+       tw->tw_bslot = inet_bhashfn(twsk_net(tw), inet->inet_num,
+                                   hashinfo->bhash_size);
+       bhead = &hashinfo->bhash[tw->tw_bslot];
        spin_lock(&bhead->lock);
        tw->tw_tb = icsk->icsk_bind_hash;
        WARN_ON(!icsk->icsk_bind_hash);
@@ -145,10 +150,6 @@ static void tw_timer_handler(struct timer_list *t)
 {
        struct inet_timewait_sock *tw = from_timer(tw, t, tw_timer);
 
-       if (tw->tw_kill)
-               __NET_INC_STATS(twsk_net(tw), LINUX_MIB_TIMEWAITKILLED);
-       else
-               __NET_INC_STATS(twsk_net(tw), LINUX_MIB_TIMEWAITED);
        inet_twsk_kill(tw);
 }
 
@@ -158,7 +159,7 @@ struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk,
 {
        struct inet_timewait_sock *tw;
 
-       if (atomic_read(&dr->tw_count) >= dr->sysctl_max_tw_buckets)
+       if (refcount_read(&dr->tw_refcount) - 1 >= dr->sysctl_max_tw_buckets)
                return NULL;
 
        tw = kmem_cache_alloc(sk->sk_prot_creator->twsk_prot->twsk_slab,
@@ -244,59 +245,15 @@ void __inet_twsk_schedule(struct inet_timewait_sock *tw, int timeo, bool rearm)
         * of PAWS.
         */
 
-       tw->tw_kill = timeo <= 4*HZ;
        if (!rearm) {
+               bool kill = timeo <= 4*HZ;
+
+               __NET_INC_STATS(twsk_net(tw), kill ? LINUX_MIB_TIMEWAITKILLED :
+                                                    LINUX_MIB_TIMEWAITED);
                BUG_ON(mod_timer(&tw->tw_timer, jiffies + timeo));
-               atomic_inc(&tw->tw_dr->tw_count);
+               refcount_inc(&tw->tw_dr->tw_refcount);
        } else {
                mod_timer_pending(&tw->tw_timer, jiffies + timeo);
        }
 }
 EXPORT_SYMBOL_GPL(__inet_twsk_schedule);
-
-void inet_twsk_purge(struct inet_hashinfo *hashinfo, int family)
-{
-       struct inet_timewait_sock *tw;
-       struct sock *sk;
-       struct hlist_nulls_node *node;
-       unsigned int slot;
-
-       for (slot = 0; slot <= hashinfo->ehash_mask; slot++) {
-               struct inet_ehash_bucket *head = &hashinfo->ehash[slot];
-restart_rcu:
-               cond_resched();
-               rcu_read_lock();
-restart:
-               sk_nulls_for_each_rcu(sk, node, &head->chain) {
-                       if (sk->sk_state != TCP_TIME_WAIT)
-                               continue;
-                       tw = inet_twsk(sk);
-                       if ((tw->tw_family != family) ||
-                               refcount_read(&twsk_net(tw)->ns.count))
-                               continue;
-
-                       if (unlikely(!refcount_inc_not_zero(&tw->tw_refcnt)))
-                               continue;
-
-                       if (unlikely((tw->tw_family != family) ||
-                                    refcount_read(&twsk_net(tw)->ns.count))) {
-                               inet_twsk_put(tw);
-                               goto restart;
-                       }
-
-                       rcu_read_unlock();
-                       local_bh_disable();
-                       inet_twsk_deschedule_put(tw);
-                       local_bh_enable();
-                       goto restart_rcu;
-               }
-               /* If the nulls value we got at the end of this lookup is
-                * not the expected one, we must restart lookup.
-                * We probably met an item that was moved to another chain.
-                */
-               if (get_nulls_value(node) != slot)
-                       goto restart;
-               rcu_read_unlock();
-       }
-}
-EXPORT_SYMBOL_GPL(inet_twsk_purge);
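
tw_count, previously a bare statistics counter, becomes tw_refcount and takes over lifetime management: every scheduled timewait socket pins the death row, and the final inet_twsk_kill() frees the now dynamically allocated structure, which is what lets inet_twsk_purge() disappear from this path. The ownership rule, sketched:

	/* Sketch: the namespace holds the initial reference; each armed
	 * timewait timer takes one more.  The capacity check subtracts
	 * the namespace's own reference, and the last dropper frees
	 * the row.
	 */
	static bool example_twsk_schedule(struct inet_timewait_death_row *dr)
	{
		if (refcount_read(&dr->tw_refcount) - 1 >= dr->sysctl_max_tw_buckets)
			return false;		/* table full */
		refcount_inc(&dr->tw_refcount);
		return true;
	}

	static void example_twsk_kill(struct inet_timewait_death_row *dr)
	{
		if (refcount_dec_and_test(&dr->tw_refcount))
			kfree(dr);
	}
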
index 3a025c0..d94f9f7 100644 (file)
@@ -196,7 +196,8 @@ resubmit:
        if (ipprot) {
                if (!ipprot->no_policy) {
                        if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
-                               kfree_skb(skb);
+                               kfree_skb_reason(skb,
+                                                SKB_DROP_REASON_XFRM_POLICY);
                                return;
                        }
                        nf_reset_ct(skb);
@@ -215,7 +216,7 @@ resubmit:
                                icmp_send(skb, ICMP_DEST_UNREACH,
                                          ICMP_PROT_UNREACH, 0);
                        }
-                       kfree_skb(skb);
+                       kfree_skb_reason(skb, SKB_DROP_REASON_IP_NOPROTO);
                } else {
                        __IP_INC_STATS(net, IPSTATS_MIB_INDELIVERS);
                        consume_skb(skb);
@@ -318,8 +319,10 @@ static int ip_rcv_finish_core(struct net *net, struct sock *sk,
 {
        const struct iphdr *iph = ip_hdr(skb);
        int (*edemux)(struct sk_buff *skb);
+       int err, drop_reason;
        struct rtable *rt;
-       int err;
+
+       drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
 
        if (ip_can_use_hint(skb, iph, hint)) {
                err = ip_route_use_hint(skb, iph->daddr, iph->saddr, iph->tos,
@@ -396,19 +399,23 @@ static int ip_rcv_finish_core(struct net *net, struct sock *sk,
                 * so-called "hole-196" attack) so do it for both.
                 */
                if (in_dev &&
-                   IN_DEV_ORCONF(in_dev, DROP_UNICAST_IN_L2_MULTICAST))
+                   IN_DEV_ORCONF(in_dev, DROP_UNICAST_IN_L2_MULTICAST)) {
+                       drop_reason = SKB_DROP_REASON_UNICAST_IN_L2_MULTICAST;
                        goto drop;
+               }
        }
 
        return NET_RX_SUCCESS;
 
 drop:
-       kfree_skb(skb);
+       kfree_skb_reason(skb, drop_reason);
        return NET_RX_DROP;
 
 drop_error:
-       if (err == -EXDEV)
+       if (err == -EXDEV) {
+               drop_reason = SKB_DROP_REASON_IP_RPFILTER;
                __NET_INC_STATS(net, LINUX_MIB_IPRPFILTER);
+       }
        goto drop;
 }
 
@@ -436,13 +443,16 @@ static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
 static struct sk_buff *ip_rcv_core(struct sk_buff *skb, struct net *net)
 {
        const struct iphdr *iph;
+       int drop_reason;
        u32 len;
 
        /* When the interface is in promisc. mode, drop all the crap
         * that it receives, do not try to analyse it.
         */
-       if (skb->pkt_type == PACKET_OTHERHOST)
+       if (skb->pkt_type == PACKET_OTHERHOST) {
+               drop_reason = SKB_DROP_REASON_OTHERHOST;
                goto drop;
+       }
 
        __IP_UPD_PO_STATS(net, IPSTATS_MIB_IN, skb->len);
 
@@ -452,6 +462,7 @@ static struct sk_buff *ip_rcv_core(struct sk_buff *skb, struct net *net)
                goto out;
        }
 
+       drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
        if (!pskb_may_pull(skb, sizeof(struct iphdr)))
                goto inhdr_error;
 
@@ -488,6 +499,7 @@ static struct sk_buff *ip_rcv_core(struct sk_buff *skb, struct net *net)
 
        len = ntohs(iph->tot_len);
        if (skb->len < len) {
+               drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL;
                __IP_INC_STATS(net, IPSTATS_MIB_INTRUNCATEDPKTS);
                goto drop;
        } else if (len < (iph->ihl*4))
@@ -516,11 +528,14 @@ static struct sk_buff *ip_rcv_core(struct sk_buff *skb, struct net *net)
        return skb;
 
 csum_error:
+       drop_reason = SKB_DROP_REASON_IP_CSUM;
        __IP_INC_STATS(net, IPSTATS_MIB_CSUMERRORS);
 inhdr_error:
+       if (drop_reason == SKB_DROP_REASON_NOT_SPECIFIED)
+               drop_reason = SKB_DROP_REASON_IP_INHDR;
        __IP_INC_STATS(net, IPSTATS_MIB_INHDRERRORS);
 drop:
-       kfree_skb(skb);
+       kfree_skb_reason(skb, drop_reason);
 out:
        return NULL;
 }
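
Each error path in the IPv4 receive hunks above now records why the packet died, so the kfree_skb tracepoint (and tools such as dropwatch) can distinguish a bad checksum from an rp_filter rejection. The idiom repeated throughout, as a self-contained sketch:

	/* Sketch of the drop-reason idiom: start from NOT_SPECIFIED,
	 * refine the reason at each failure site, and report it once
	 * at the shared exit label.
	 */
	static struct sk_buff *example_rcv_core(struct sk_buff *skb)
	{
		int drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;

		if (skb->pkt_type == PACKET_OTHERHOST) {
			drop_reason = SKB_DROP_REASON_OTHERHOST;
			goto drop;
		}
		if (!pskb_may_pull(skb, sizeof(struct iphdr)))
			goto drop;	/* falls through with the default reason */

		return skb;
	drop:
		kfree_skb_reason(skb, drop_reason);
		return NULL;
	}
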
index da1b503..a9e22a0 100644 (file)
@@ -42,7 +42,7 @@
  */
 
 void ip_options_build(struct sk_buff *skb, struct ip_options *opt,
-                     __be32 daddr, struct rtable *rt, int is_frag)
+                     __be32 daddr, struct rtable *rt)
 {
        unsigned char *iph = skb_network_header(skb);
 
@@ -53,28 +53,15 @@ void ip_options_build(struct sk_buff *skb, struct ip_options *opt,
        if (opt->srr)
                memcpy(iph + opt->srr + iph[opt->srr + 1] - 4, &daddr, 4);
 
-       if (!is_frag) {
-               if (opt->rr_needaddr)
-                       ip_rt_get_source(iph + opt->rr + iph[opt->rr + 2] - 5, skb, rt);
-               if (opt->ts_needaddr)
-                       ip_rt_get_source(iph + opt->ts + iph[opt->ts + 2] - 9, skb, rt);
-               if (opt->ts_needtime) {
-                       __be32 midtime;
+       if (opt->rr_needaddr)
+               ip_rt_get_source(iph + opt->rr + iph[opt->rr + 2] - 5, skb, rt);
+       if (opt->ts_needaddr)
+               ip_rt_get_source(iph + opt->ts + iph[opt->ts + 2] - 9, skb, rt);
+       if (opt->ts_needtime) {
+               __be32 midtime;
 
-                       midtime = inet_current_timestamp();
-                       memcpy(iph + opt->ts + iph[opt->ts + 2] - 5, &midtime, 4);
-               }
-               return;
-       }
-       if (opt->rr) {
-               memset(iph + opt->rr, IPOPT_NOP, iph[opt->rr + 1]);
-               opt->rr = 0;
-               opt->rr_needaddr = 0;
-       }
-       if (opt->ts) {
-               memset(iph + opt->ts, IPOPT_NOP, iph[opt->ts + 1]);
-               opt->ts = 0;
-               opt->ts_needaddr = opt->ts_needtime = 0;
+               midtime = inet_current_timestamp();
+               memcpy(iph + opt->ts + iph[opt->ts + 2] - 5, &midtime, 4);
        }
 }
 
index 139cec2..0c0574e 100644 (file)
@@ -179,7 +179,7 @@ int ip_build_and_send_pkt(struct sk_buff *skb, const struct sock *sk,
 
        if (opt && opt->opt.optlen) {
                iph->ihl += opt->opt.optlen>>2;
-               ip_options_build(skb, &opt->opt, daddr, rt, 0);
+               ip_options_build(skb, &opt->opt, daddr, rt);
        }
 
        skb->priority = sk->sk_priority;
@@ -519,7 +519,7 @@ packet_routed:
 
        if (inet_opt && inet_opt->opt.optlen) {
                iph->ihl += inet_opt->opt.optlen >> 2;
-               ip_options_build(skb, &inet_opt->opt, inet->inet_daddr, rt, 0);
+               ip_options_build(skb, &inet_opt->opt, inet->inet_daddr, rt);
        }
 
        ip_select_ident_segs(net, skb, sk,
@@ -1541,7 +1541,7 @@ struct sk_buff *__ip_make_skb(struct sock *sk,
 
        if (opt) {
                iph->ihl += opt->optlen >> 2;
-               ip_options_build(skb, opt, cork->addr, rt, 0);
+               ip_options_build(skb, opt, cork->addr, rt);
        }
 
        skb->priority = (cork->tos != -1) ? cork->priority: sk->sk_priority;
index 29bbe2b..c860519 100644 (file)
@@ -268,13 +268,12 @@ static void __net_exit ipmr_rules_exit(struct net *net)
 {
        struct mr_table *mrt, *next;
 
-       rtnl_lock();
+       ASSERT_RTNL();
        list_for_each_entry_safe(mrt, next, &net->ipv4.mr_tables, list) {
                list_del(&mrt->list);
                ipmr_free_table(mrt);
        }
        fib_rules_unregister(net->ipv4.mr_rules_ops);
-       rtnl_unlock();
 }
 
 static int ipmr_rules_dump(struct net *net, struct notifier_block *nb,
@@ -330,10 +329,9 @@ static int __net_init ipmr_rules_init(struct net *net)
 
 static void __net_exit ipmr_rules_exit(struct net *net)
 {
-       rtnl_lock();
+       ASSERT_RTNL();
        ipmr_free_table(net->ipv4.mrt);
        net->ipv4.mrt = NULL;
-       rtnl_unlock();
 }
 
 static int ipmr_rules_dump(struct net *net, struct notifier_block *nb,
@@ -3077,7 +3075,9 @@ static int __net_init ipmr_net_init(struct net *net)
 proc_cache_fail:
        remove_proc_entry("ip_mr_vif", net->proc_net);
 proc_vif_fail:
+       rtnl_lock();
        ipmr_rules_exit(net);
+       rtnl_unlock();
 #endif
 ipmr_rules_fail:
        ipmr_notifier_exit(net);
@@ -3092,12 +3092,22 @@ static void __net_exit ipmr_net_exit(struct net *net)
        remove_proc_entry("ip_mr_vif", net->proc_net);
 #endif
        ipmr_notifier_exit(net);
-       ipmr_rules_exit(net);
+}
+
+static void __net_exit ipmr_net_exit_batch(struct list_head *net_list)
+{
+       struct net *net;
+
+       rtnl_lock();
+       list_for_each_entry(net, net_list, exit_list)
+               ipmr_rules_exit(net);
+       rtnl_unlock();
 }
 
 static struct pernet_operations ipmr_net_ops = {
        .init = ipmr_net_init,
        .exit = ipmr_net_exit,
+       .exit_batch = ipmr_net_exit_batch,
 };
 
 int __init ip_mr_init(void)
index 3f248a1..fab357c 100644 (file)
@@ -295,28 +295,24 @@ pptp_inbound_pkt(struct sk_buff *skb,
        return NF_ACCEPT;
 }
 
+static const struct nf_nat_pptp_hook pptp_hooks = {
+       .outbound = pptp_outbound_pkt,
+       .inbound = pptp_inbound_pkt,
+       .exp_gre = pptp_exp_gre,
+       .expectfn = pptp_nat_expected,
+};
+
 static int __init nf_nat_helper_pptp_init(void)
 {
-       BUG_ON(nf_nat_pptp_hook_outbound != NULL);
-       RCU_INIT_POINTER(nf_nat_pptp_hook_outbound, pptp_outbound_pkt);
-
-       BUG_ON(nf_nat_pptp_hook_inbound != NULL);
-       RCU_INIT_POINTER(nf_nat_pptp_hook_inbound, pptp_inbound_pkt);
-
-       BUG_ON(nf_nat_pptp_hook_exp_gre != NULL);
-       RCU_INIT_POINTER(nf_nat_pptp_hook_exp_gre, pptp_exp_gre);
+       WARN_ON(nf_nat_pptp_hook != NULL);
+       RCU_INIT_POINTER(nf_nat_pptp_hook, &pptp_hooks);
 
-       BUG_ON(nf_nat_pptp_hook_expectfn != NULL);
-       RCU_INIT_POINTER(nf_nat_pptp_hook_expectfn, pptp_nat_expected);
        return 0;
 }
 
 static void __exit nf_nat_helper_pptp_fini(void)
 {
-       RCU_INIT_POINTER(nf_nat_pptp_hook_expectfn, NULL);
-       RCU_INIT_POINTER(nf_nat_pptp_hook_exp_gre, NULL);
-       RCU_INIT_POINTER(nf_nat_pptp_hook_inbound, NULL);
-       RCU_INIT_POINTER(nf_nat_pptp_hook_outbound, NULL);
+       RCU_INIT_POINTER(nf_nat_pptp_hook, NULL);
        synchronize_rcu();
 }
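
Collapsing the four separately published PPTP NAT hook pointers into one const ops structure leaves a single RCU pointer to publish at init and clear at exit, and a single rcu_dereference() for consumers. The consumer side is not in this hunk; a sketch with a hypothetical two-callback table shows the shape:

	struct example_hook_ops {
		int (*inbound)(struct sk_buff *skb);
		int (*outbound)(struct sk_buff *skb);
	};

	static const struct example_hook_ops __rcu *example_hook;

	static int example_call_inbound(struct sk_buff *skb)
	{
		const struct example_hook_ops *ops;
		int ret = NF_ACCEPT;

		rcu_read_lock();
		ops = rcu_dereference(example_hook);	/* whole table at once */
		if (ops)
			ret = ops->inbound(skb);
		rcu_read_unlock();
		return ret;
	}
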
 
index eeafecc..e459a39 100644 (file)
@@ -3733,12 +3733,16 @@ out:
 }
 EXPORT_SYMBOL(nexthop_res_grp_activity_update);
 
-static void __net_exit nexthop_net_exit(struct net *net)
+static void __net_exit nexthop_net_exit_batch(struct list_head *net_list)
 {
+       struct net *net;
+
        rtnl_lock();
-       flush_all_nexthops(net);
+       list_for_each_entry(net, net_list, exit_list) {
+               flush_all_nexthops(net);
+               kfree(net->nexthop.devhash);
+       }
        rtnl_unlock();
-       kfree(net->nexthop.devhash);
 }
 
 static int __net_init nexthop_net_init(struct net *net)
@@ -3756,7 +3760,7 @@ static int __net_init nexthop_net_init(struct net *net)
 
 static struct pernet_operations nexthop_net_ops = {
        .init = nexthop_net_init,
-       .exit = nexthop_net_exit,
+       .exit_batch = nexthop_net_exit_batch,
 };
 
 static int __init nexthop_init(void)
index f30273a..2883607 100644 (file)
@@ -59,8 +59,8 @@ static int sockstat_seq_show(struct seq_file *seq, void *v)
        socket_seq_show(seq);
        seq_printf(seq, "TCP: inuse %d orphan %d tw %d alloc %d mem %ld\n",
                   sock_prot_inuse_get(net, &tcp_prot), orphans,
-                  atomic_read(&net->ipv4.tcp_death_row.tw_count), sockets,
-                  proto_memory_allocated(&tcp_prot));
+                  refcount_read(&net->ipv4.tcp_death_row->tw_refcount) - 1,
+                  sockets, proto_memory_allocated(&tcp_prot));
        seq_printf(seq, "UDP: inuse %d mem %ld\n",
                   sock_prot_inuse_get(net, &udp_prot),
                   proto_memory_allocated(&udp_prot));
index ff6f91c..634766e 100644 (file)
@@ -84,6 +84,7 @@
 #include <linux/jhash.h>
 #include <net/dst.h>
 #include <net/dst_metadata.h>
+#include <net/inet_dscp.h>
 #include <net/net_namespace.h>
 #include <net/ip.h>
 #include <net/route.h>
 
 #define DEFAULT_MIN_PMTU (512 + 20 + 20)
 #define DEFAULT_MTU_EXPIRES (10 * 60 * HZ)
-
+#define DEFAULT_MIN_ADVMSS 256
 static int ip_rt_max_size;
 static int ip_rt_redirect_number __read_mostly = 9;
 static int ip_rt_redirect_load __read_mostly   = HZ / 50;
 static int ip_rt_redirect_silence __read_mostly        = ((HZ / 50) << (9 + 1));
 static int ip_rt_error_cost __read_mostly      = HZ;
 static int ip_rt_error_burst __read_mostly     = 5 * HZ;
-static int ip_rt_min_advmss __read_mostly      = 256;
 
 static int ip_rt_gc_timeout __read_mostly      = RT_GC_TIMEOUT;
 
@@ -458,7 +458,7 @@ static u32 *ip_tstamps __read_mostly;
  * if one generator is seldom used. This makes hard for an attacker
  * to infer how many packets were sent between two points in time.
  */
-u32 ip_idents_reserve(u32 hash, int segs)
+static u32 ip_idents_reserve(u32 hash, int segs)
 {
        u32 bucket, old, now = (u32)jiffies;
        atomic_t *p_id;
@@ -479,7 +479,6 @@ u32 ip_idents_reserve(u32 hash, int segs)
         */
        return atomic_add_return(segs + delta, p_id) - segs;
 }
-EXPORT_SYMBOL(ip_idents_reserve);
 
 void __ip_select_ident(struct net *net, struct iphdr *iph, int segs)
 {
@@ -1298,9 +1297,10 @@ static void set_class_tag(struct rtable *rt, u32 tag)
 
 static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
 {
+       struct net *net = dev_net(dst->dev);
        unsigned int header_size = sizeof(struct tcphdr) + sizeof(struct iphdr);
        unsigned int advmss = max_t(unsigned int, ipv4_mtu(dst) - header_size,
-                                   ip_rt_min_advmss);
+                                   net->ipv4.ip_rt_min_advmss);
 
        return min(advmss, IPV4_MAX_PMTU - header_size);
 }
@@ -3392,7 +3392,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
 
                                if (fa->fa_slen == slen &&
                                    fa->tb_id == fri.tb_id &&
-                                   fa->fa_tos == fri.tos &&
+                                   fa->fa_dscp == inet_dsfield_to_dscp(fri.tos) &&
                                    fa->fa_info == res.fi &&
                                    fa->fa_type == fri.type) {
                                        fri.offload = fa->offload;
@@ -3535,13 +3535,6 @@ static struct ctl_table ipv4_route_table[] = {
                .mode           = 0644,
                .proc_handler   = proc_dointvec,
        },
-       {
-               .procname       = "min_adv_mss",
-               .data           = &ip_rt_min_advmss,
-               .maxlen         = sizeof(int),
-               .mode           = 0644,
-               .proc_handler   = proc_dointvec,
-       },
        { }
 };
 
@@ -3569,6 +3562,13 @@ static struct ctl_table ipv4_route_netns_table[] = {
                .mode           = 0644,
                .proc_handler   = proc_dointvec_jiffies,
        },
+       {
+               .procname   = "min_adv_mss",
+               .data       = &init_net.ipv4.ip_rt_min_advmss,
+               .maxlen     = sizeof(int),
+               .mode       = 0644,
+               .proc_handler   = proc_dointvec,
+       },
        { },
 };
 
@@ -3631,6 +3631,7 @@ static __net_init int netns_ip_rt_init(struct net *net)
        /* Set default value for namespaceified sysctls */
        net->ipv4.ip_rt_min_pmtu = DEFAULT_MIN_PMTU;
        net->ipv4.ip_rt_mtu_expires = DEFAULT_MTU_EXPIRES;
+       net->ipv4.ip_rt_min_advmss = DEFAULT_MIN_ADVMSS;
        return 0;
 }
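
min_adv_mss joins min_pmtu and mtu_expires as a per-netns route sysctl. Entries in ipv4_route_netns_table declare their .data against init_net and are re-pointed when each namespace registers its copy; a sketch of that fixup, mirroring sysctl_route_net_init(), which is not part of these hunks:

static struct ctl_table *example_dup_route_table(struct net *net)
{
	struct ctl_table *tbl;
	int i;

	tbl = kmemdup(ipv4_route_netns_table,
		      sizeof(ipv4_route_netns_table), GFP_KERNEL);
	if (!tbl)
		return NULL;

	/* shift every .data pointer from init_net to this net */
	for (i = 0; i < ARRAY_SIZE(ipv4_route_netns_table) - 1; i++)
		tbl[i].data += (void *)net - (void *)&init_net;
	return tbl;
}
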
 
index 97eb547..1cae27b 100644 (file)
@@ -589,6 +589,14 @@ static struct ctl_table ipv4_table[] = {
 };
 
 static struct ctl_table ipv4_net_table[] = {
+       /* tcp_max_tw_buckets must be first in this table. */
+       {
+               .procname       = "tcp_max_tw_buckets",
+/*             .data           = &init_net.ipv4.tcp_death_row.sysctl_max_tw_buckets, */
+               .maxlen         = sizeof(int),
+               .mode           = 0644,
+               .proc_handler   = proc_dointvec
+       },
        {
                .procname       = "icmp_echo_ignore_all",
                .data           = &init_net.ipv4.sysctl_icmp_echo_ignore_all,
@@ -1001,13 +1009,6 @@ static struct ctl_table ipv4_net_table[] = {
                .extra2         = &two,
        },
        {
-               .procname       = "tcp_max_tw_buckets",
-               .data           = &init_net.ipv4.tcp_death_row.sysctl_max_tw_buckets,
-               .maxlen         = sizeof(int),
-               .mode           = 0644,
-               .proc_handler   = proc_dointvec
-       },
-       {
                .procname       = "tcp_max_syn_backlog",
                .data           = &init_net.ipv4.sysctl_max_syn_backlog,
                .maxlen         = sizeof(int),
@@ -1400,7 +1401,8 @@ static __net_init int ipv4_sysctl_init_net(struct net *net)
                if (!table)
                        goto err_alloc;
 
-               for (i = 0; i < ARRAY_SIZE(ipv4_net_table) - 1; i++) {
+               /* skip first entry (sysctl_max_tw_buckets) */
+               for (i = 1; i < ARRAY_SIZE(ipv4_net_table) - 1; i++) {
                        if (table[i].data) {
                                /* Update the variables to point into
                                 * the current struct net
@@ -1415,6 +1417,8 @@ static __net_init int ipv4_sysctl_init_net(struct net *net)
                }
        }
 
+       table[0].data = &net->ipv4.tcp_death_row->sysctl_max_tw_buckets;
+
        net->ipv4.ipv4_hdr = register_net_sysctl(net, "net/ipv4", table);
        if (!net->ipv4.ipv4_hdr)
                goto err_reg;
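
The skip-the-first-entry dance above exists because the usual "shift .data by the offset between net and init_net" trick only works for fields embedded directly in struct net. tcp_death_row is now a separately allocated object, so its entry is pinned at index 0, excluded from the generic loop, and patched by hand. Condensed from the hunks above:

	/* generic entries: .data lives inside struct net at a fixed offset */
	for (i = 1; i < ARRAY_SIZE(ipv4_net_table) - 1; i++)
		if (table[i].data)
			table[i].data += (void *)net - (void *)&init_net;

	/* entry 0: .data lives behind a per-netns pointer instead */
	table[0].data = &net->ipv4.tcp_death_row->sysctl_max_tw_buckets;
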
index 02cb275..760e822 100644 (file)
@@ -894,8 +894,7 @@ static unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now,
                return mss_now;
 
        /* Note : tcp_tso_autosize() will eventually split this later */
-       new_size_goal = sk->sk_gso_max_size - 1 - MAX_TCP_HEADER;
-       new_size_goal = tcp_bound_to_half_wnd(tp, new_size_goal);
+       new_size_goal = tcp_bound_to_half_wnd(tp, sk->sk_gso_max_size);
 
        /* We try hard to avoid divides here */
        size_goal = tp->gso_segs * mss_now;
index ec55500..02e8626 100644 (file)
@@ -1154,7 +1154,7 @@ static struct tcp_congestion_ops tcp_bbr_cong_ops __read_mostly = {
        .set_state      = bbr_set_state,
 };
 
-BTF_SET_START(tcp_bbr_kfunc_ids)
+BTF_SET_START(tcp_bbr_check_kfunc_ids)
 #ifdef CONFIG_X86
 #ifdef CONFIG_DYNAMIC_FTRACE
 BTF_ID(func, bbr_init)
@@ -1167,25 +1167,27 @@ BTF_ID(func, bbr_min_tso_segs)
 BTF_ID(func, bbr_set_state)
 #endif
 #endif
-BTF_SET_END(tcp_bbr_kfunc_ids)
+BTF_SET_END(tcp_bbr_check_kfunc_ids)
 
-static DEFINE_KFUNC_BTF_ID_SET(&tcp_bbr_kfunc_ids, tcp_bbr_kfunc_btf_set);
+static const struct btf_kfunc_id_set tcp_bbr_kfunc_set = {
+       .owner     = THIS_MODULE,
+       .check_set = &tcp_bbr_check_kfunc_ids,
+};
 
 static int __init bbr_register(void)
 {
        int ret;
 
        BUILD_BUG_ON(sizeof(struct bbr) > ICSK_CA_PRIV_SIZE);
-       ret = tcp_register_congestion_control(&tcp_bbr_cong_ops);
-       if (ret)
+
+       ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &tcp_bbr_kfunc_set);
+       if (ret < 0)
                return ret;
-       register_kfunc_btf_id_set(&bpf_tcp_ca_kfunc_list, &tcp_bbr_kfunc_btf_set);
-       return 0;
+       return tcp_register_congestion_control(&tcp_bbr_cong_ops);
 }
 
 static void __exit bbr_unregister(void)
 {
-       unregister_kfunc_btf_id_set(&bpf_tcp_ca_kfunc_list, &tcp_bbr_kfunc_btf_set);
        tcp_unregister_congestion_control(&tcp_bbr_cong_ops);
 }
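
BBR here, and CUBIC and DCTCP in the next two files, all carry the same migration: the old bpf_tcp_ca_kfunc_list register/unregister pair is replaced by the generic register_btf_kfunc_id_set(), which needs no matching unregister because the kfunc set is dropped together with the module. Skeleton of the pattern, with illustrative names:

BTF_SET_START(example_check_kfunc_ids)
BTF_ID(func, example_init)
BTF_SET_END(example_check_kfunc_ids)

static const struct btf_kfunc_id_set example_kfunc_set = {
	.owner	   = THIS_MODULE,
	.check_set = &example_check_kfunc_ids,
};

static int __init example_register(void)
{
	int ret;

	/* register kfuncs before the congestion control, so a CC
	 * registration failure leaves nothing to unwind by hand
	 */
	ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS,
					&example_kfunc_set);
	if (ret < 0)
		return ret;
	return tcp_register_congestion_control(&example_cong_ops);
}
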
 
index e07837e..24d562d 100644 (file)
@@ -485,7 +485,7 @@ static struct tcp_congestion_ops cubictcp __read_mostly = {
        .name           = "cubic",
 };
 
-BTF_SET_START(tcp_cubic_kfunc_ids)
+BTF_SET_START(tcp_cubic_check_kfunc_ids)
 #ifdef CONFIG_X86
 #ifdef CONFIG_DYNAMIC_FTRACE
 BTF_ID(func, cubictcp_init)
@@ -496,9 +496,12 @@ BTF_ID(func, cubictcp_cwnd_event)
 BTF_ID(func, cubictcp_acked)
 #endif
 #endif
-BTF_SET_END(tcp_cubic_kfunc_ids)
+BTF_SET_END(tcp_cubic_check_kfunc_ids)
 
-static DEFINE_KFUNC_BTF_ID_SET(&tcp_cubic_kfunc_ids, tcp_cubic_kfunc_btf_set);
+static const struct btf_kfunc_id_set tcp_cubic_kfunc_set = {
+       .owner     = THIS_MODULE,
+       .check_set = &tcp_cubic_check_kfunc_ids,
+};
 
 static int __init cubictcp_register(void)
 {
@@ -534,16 +537,14 @@ static int __init cubictcp_register(void)
        /* divide by bic_scale and by constant Srtt (100ms) */
        do_div(cube_factor, bic_scale * 10);
 
-       ret = tcp_register_congestion_control(&cubictcp);
-       if (ret)
+       ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &tcp_cubic_kfunc_set);
+       if (ret < 0)
                return ret;
-       register_kfunc_btf_id_set(&bpf_tcp_ca_kfunc_list, &tcp_cubic_kfunc_btf_set);
-       return 0;
+       return tcp_register_congestion_control(&cubictcp);
 }
 
 static void __exit cubictcp_unregister(void)
 {
-       unregister_kfunc_btf_id_set(&bpf_tcp_ca_kfunc_list, &tcp_cubic_kfunc_btf_set);
        tcp_unregister_congestion_control(&cubictcp);
 }
 
index 0d7ab3c..1943a66 100644 (file)
@@ -238,7 +238,7 @@ static struct tcp_congestion_ops dctcp_reno __read_mostly = {
        .name           = "dctcp-reno",
 };
 
-BTF_SET_START(tcp_dctcp_kfunc_ids)
+BTF_SET_START(tcp_dctcp_check_kfunc_ids)
 #ifdef CONFIG_X86
 #ifdef CONFIG_DYNAMIC_FTRACE
 BTF_ID(func, dctcp_init)
@@ -249,25 +249,27 @@ BTF_ID(func, dctcp_cwnd_undo)
 BTF_ID(func, dctcp_state)
 #endif
 #endif
-BTF_SET_END(tcp_dctcp_kfunc_ids)
+BTF_SET_END(tcp_dctcp_check_kfunc_ids)
 
-static DEFINE_KFUNC_BTF_ID_SET(&tcp_dctcp_kfunc_ids, tcp_dctcp_kfunc_btf_set);
+static const struct btf_kfunc_id_set tcp_dctcp_kfunc_set = {
+       .owner     = THIS_MODULE,
+       .check_set = &tcp_dctcp_check_kfunc_ids,
+};
 
 static int __init dctcp_register(void)
 {
        int ret;
 
        BUILD_BUG_ON(sizeof(struct dctcp) > ICSK_CA_PRIV_SIZE);
-       ret = tcp_register_congestion_control(&dctcp);
-       if (ret)
+
+       ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &tcp_dctcp_kfunc_set);
+       if (ret < 0)
                return ret;
-       register_kfunc_btf_id_set(&bpf_tcp_ca_kfunc_list, &tcp_dctcp_kfunc_btf_set);
-       return 0;
+       return tcp_register_congestion_control(&dctcp);
 }
 
 static void __exit dctcp_unregister(void)
 {
-       unregister_kfunc_btf_id_set(&bpf_tcp_ca_kfunc_list, &tcp_dctcp_kfunc_btf_set);
        tcp_unregister_congestion_control(&dctcp);
 }
 
index bfe4112..af94a6d 100644 (file)
@@ -6725,6 +6725,7 @@ struct request_sock *inet_reqsk_alloc(const struct request_sock_ops *ops,
                ireq->ireq_state = TCP_NEW_SYN_RECV;
                write_pnet(&ireq->ireq_net, sock_net(sk_listener));
                ireq->ireq_family = sk_listener->sk_family;
+               req->timeout = TCP_TIMEOUT_INIT;
        }
 
        return req;
@@ -6941,9 +6942,10 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
                sock_put(fastopen_sk);
        } else {
                tcp_rsk(req)->tfo_listener = false;
-               if (!want_cookie)
-                       inet_csk_reqsk_queue_hash_add(sk, req,
-                               tcp_timeout_init((struct sock *)req));
+               if (!want_cookie) {
+                       req->timeout = tcp_timeout_init((struct sock *)req);
+                       inet_csk_reqsk_queue_hash_add(sk, req, req->timeout);
+               }
                af_ops->send_synack(sk, dst, &fl, req, &foc,
                                    !want_cookie ? TCP_SYNACK_NORMAL :
                                                   TCP_SYNACK_COOKIE,
index fec656f..6873f46 100644 (file)
@@ -91,6 +91,8 @@ static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
 struct inet_hashinfo tcp_hashinfo;
 EXPORT_SYMBOL(tcp_hashinfo);
 
+static DEFINE_PER_CPU(struct sock *, ipv4_tcp_sk);
+
 static u32 tcp_v4_init_seq(const struct sk_buff *skb)
 {
        return secure_tcp_seq(ip_hdr(skb)->daddr,
@@ -206,7 +208,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
        struct rtable *rt;
        int err;
        struct ip_options_rcu *inet_opt;
-       struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
+       struct inet_timewait_death_row *tcp_death_row = sock_net(sk)->ipv4.tcp_death_row;
 
        if (addr_len < sizeof(struct sockaddr_in))
                return -EINVAL;
@@ -810,7 +812,8 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
        arg.tos = ip_hdr(skb)->tos;
        arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
        local_bh_disable();
-       ctl_sk = this_cpu_read(*net->ipv4.tcp_sk);
+       ctl_sk = this_cpu_read(ipv4_tcp_sk);
+       sock_net_set(ctl_sk, net);
        if (sk) {
                ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
                                   inet_twsk(sk)->tw_mark : sk->sk_mark;
@@ -825,6 +828,7 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
                              transmit_time);
 
        ctl_sk->sk_mark = 0;
+       sock_net_set(ctl_sk, &init_net);
        __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
        __TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
        local_bh_enable();
@@ -908,7 +912,8 @@ static void tcp_v4_send_ack(const struct sock *sk,
        arg.tos = tos;
        arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
        local_bh_disable();
-       ctl_sk = this_cpu_read(*net->ipv4.tcp_sk);
+       ctl_sk = this_cpu_read(ipv4_tcp_sk);
+       sock_net_set(ctl_sk, net);
        ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
                           inet_twsk(sk)->tw_mark : sk->sk_mark;
        ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
@@ -921,6 +926,7 @@ static void tcp_v4_send_ack(const struct sock *sk,
                              transmit_time);
 
        ctl_sk->sk_mark = 0;
+       sock_net_set(ctl_sk, &init_net);
        __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
        local_bh_enable();
 }
@@ -3111,41 +3117,18 @@ EXPORT_SYMBOL(tcp_prot);
 
 static void __net_exit tcp_sk_exit(struct net *net)
 {
-       int cpu;
+       struct inet_timewait_death_row *tcp_death_row = net->ipv4.tcp_death_row;
 
        if (net->ipv4.tcp_congestion_control)
                bpf_module_put(net->ipv4.tcp_congestion_control,
                               net->ipv4.tcp_congestion_control->owner);
-
-       for_each_possible_cpu(cpu)
-               inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
-       free_percpu(net->ipv4.tcp_sk);
+       if (refcount_dec_and_test(&tcp_death_row->tw_refcount))
+               kfree(tcp_death_row);
 }
 
 static int __net_init tcp_sk_init(struct net *net)
 {
-       int res, cpu, cnt;
-
-       net->ipv4.tcp_sk = alloc_percpu(struct sock *);
-       if (!net->ipv4.tcp_sk)
-               return -ENOMEM;
-
-       for_each_possible_cpu(cpu) {
-               struct sock *sk;
-
-               res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
-                                          IPPROTO_TCP, net);
-               if (res)
-                       goto fail;
-               sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
-
-               /* Please enforce IP_DF and IPID==0 for RST and
-                * ACK sent in SYN-RECV and TIME-WAIT state.
-                */
-               inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;
-
-               *per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
-       }
+       int cnt;
 
        net->ipv4.sysctl_tcp_ecn = 2;
        net->ipv4.sysctl_tcp_ecn_fallback = 1;
@@ -3172,9 +3155,13 @@ static int __net_init tcp_sk_init(struct net *net)
        net->ipv4.sysctl_tcp_tw_reuse = 2;
        net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1;
 
+       net->ipv4.tcp_death_row = kzalloc(sizeof(struct inet_timewait_death_row), GFP_KERNEL);
+       if (!net->ipv4.tcp_death_row)
+               return -ENOMEM;
+       refcount_set(&net->ipv4.tcp_death_row->tw_refcount, 1);
        cnt = tcp_hashinfo.ehash_mask + 1;
-       net->ipv4.tcp_death_row.sysctl_max_tw_buckets = cnt / 2;
-       net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo;
+       net->ipv4.tcp_death_row->sysctl_max_tw_buckets = cnt / 2;
+       net->ipv4.tcp_death_row->hashinfo = &tcp_hashinfo;
 
        net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 128);
        net->ipv4.sysctl_tcp_sack = 1;
@@ -3229,18 +3216,12 @@ static int __net_init tcp_sk_init(struct net *net)
                net->ipv4.tcp_congestion_control = &tcp_reno;
 
        return 0;
-fail:
-       tcp_sk_exit(net);
-
-       return res;
 }
 
 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
 {
        struct net *net;
 
-       inet_twsk_purge(&tcp_hashinfo, AF_INET);
-
        list_for_each_entry(net, net_exit_list, exit_list)
                tcp_fastopen_ctx_destroy(net);
 }
@@ -3326,6 +3307,24 @@ static void __init bpf_iter_register(void)
 
 void __init tcp_v4_init(void)
 {
+       int cpu, res;
+
+       for_each_possible_cpu(cpu) {
+               struct sock *sk;
+
+               res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
+                                          IPPROTO_TCP, &init_net);
+               if (res)
+                       panic("Failed to create the TCP control socket.\n");
+               sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
+
+               /* Please enforce IP_DF and IPID==0 for RST and
+                * ACK sent in SYN-RECV and TIME-WAIT state.
+                */
+               inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;
+
+               per_cpu(ipv4_tcp_sk, cpu) = sk;
+       }
        if (register_pernet_subsys(&tcp_sk_ops))
                panic("Failed to create the TCP control socket.\n");
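
The per-netns percpu control sockets become a single boot-time set shared by all namespaces. Since RST/ACK replies must still be accounted to the right netns, the sender borrows the socket and re-parents it with sock_net_set() for the duration of the transmit. The discipline, condensed from the tcp_v4_send_reset()/tcp_v4_send_ack() hunks above:

	local_bh_disable();
	ctl_sk = this_cpu_read(ipv4_tcp_sk);
	sock_net_set(ctl_sk, net);		/* borrow for this netns */

	/* ... set mark/priority and ip_send_unicast_reply() ... */

	sock_net_set(ctl_sk, &init_net);	/* hand back to boot netns */
	local_bh_enable();
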
 
index 7c2d3ac..6366df7 100644 (file)
@@ -248,7 +248,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
        const struct inet_connection_sock *icsk = inet_csk(sk);
        const struct tcp_sock *tp = tcp_sk(sk);
        struct inet_timewait_sock *tw;
-       struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
+       struct inet_timewait_death_row *tcp_death_row = sock_net(sk)->ipv4.tcp_death_row;
 
        tw = inet_twsk_alloc(sk, tcp_death_row, state);
 
@@ -583,7 +583,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
                         * it can be estimated (approximately)
                         * from another data.
                         */
-                       tmp_opt.ts_recent_stamp = ktime_get_seconds() - ((TCP_TIMEOUT_INIT/HZ)<<req->num_timeout);
+                       tmp_opt.ts_recent_stamp = ktime_get_seconds() - reqsk_timeout(req, TCP_RTO_MAX) / HZ;
                        paws_reject = tcp_paws_reject(&tmp_opt, th->rst);
                }
        }
@@ -622,8 +622,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
                    !inet_rtx_syn_ack(sk, req)) {
                        unsigned long expires = jiffies;
 
-                       expires += min(TCP_TIMEOUT_INIT << req->num_timeout,
-                                      TCP_RTO_MAX);
+                       expires += reqsk_timeout(req, TCP_RTO_MAX);
                        if (!fastopen)
                                mod_timer_pending(&req->rsk_timer, expires);
                        else
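
Both call sites now derive the expiry from req->timeout, stored when the request is hashed (see the tcp_input.c hunk above), through a shared reqsk_timeout() helper. Its presumed shape, hedged because the header hunk is not part of this excerpt:

static inline unsigned long reqsk_timeout(struct request_sock *req,
					  unsigned long max_timeout)
{
	u64 timeout = (u64)req->timeout << req->num_timeout;

	return (unsigned long)min_t(u64, timeout, max_timeout);
}
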
index 5079832..e76bf1e 100644 (file)
@@ -1960,7 +1960,7 @@ static u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now,
 
        bytes = min_t(unsigned long,
                      sk->sk_pacing_rate >> READ_ONCE(sk->sk_pacing_shift),
-                     sk->sk_gso_max_size - 1 - MAX_TCP_HEADER);
+                     sk->sk_gso_max_size);
 
        /* Goal is to send at least one packet per ms,
         * not one big TSO packet every 100 ms.
@@ -4092,7 +4092,9 @@ int tcp_rtx_synack(const struct sock *sk, struct request_sock *req)
        struct flowi fl;
        int res;
 
-       tcp_rsk(req)->txhash = net_tx_rndhash();
+       /* Paired with WRITE_ONCE() in sock_setsockopt() */
+       if (READ_ONCE(sk->sk_txrehash) == SOCK_TXREHASH_ENABLED)
+               tcp_rsk(req)->txhash = net_tx_rndhash();
        res = af_ops->send_synack(sk, NULL, &fl, req, NULL, TCP_SYNACK_NORMAL,
                                  NULL);
        if (!res) {
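
Hash rethink on SYN-ACK retransmit becomes conditional on the new SO_TXREHASH socket flag. The writer side lives in sock_setsockopt(), which is not shown here; presumably something like the following, with the field name taken from the READ_ONCE() above:

static void example_set_txrehash(struct sock *sk, int val)
{
	/* Paired with READ_ONCE() in tcp_rtx_synack() */
	WRITE_ONCE(sk->sk_txrehash, (u8)val);
}
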
index 0903609..6b4d836 100644 (file)
@@ -2093,16 +2093,20 @@ static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
        rc = __udp_enqueue_schedule_skb(sk, skb);
        if (rc < 0) {
                int is_udplite = IS_UDPLITE(sk);
+               int drop_reason;
 
                /* Note that an ENOMEM error is charged twice */
-               if (rc == -ENOMEM)
+               if (rc == -ENOMEM) {
                        UDP_INC_STATS(sock_net(sk), UDP_MIB_RCVBUFERRORS,
                                        is_udplite);
-               else
+                       drop_reason = SKB_DROP_REASON_SOCKET_RCVBUFF;
+               } else {
                        UDP_INC_STATS(sock_net(sk), UDP_MIB_MEMERRORS,
                                      is_udplite);
+                       drop_reason = SKB_DROP_REASON_PROTO_MEM;
+               }
                UDP_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, is_udplite);
-               kfree_skb(skb);
+               kfree_skb_reason(skb, drop_reason);
                trace_udp_fail_queue_rcv_skb(rc, sk);
                return -1;
        }
@@ -2120,14 +2124,17 @@ static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
  */
 static int udp_queue_rcv_one_skb(struct sock *sk, struct sk_buff *skb)
 {
+       int drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
        struct udp_sock *up = udp_sk(sk);
        int is_udplite = IS_UDPLITE(sk);
 
        /*
         *      Charge it to the socket, dropping if the queue is full.
         */
-       if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
+       if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) {
+               drop_reason = SKB_DROP_REASON_XFRM_POLICY;
                goto drop;
+       }
        nf_reset_ct(skb);
 
        if (static_branch_unlikely(&udp_encap_needed_key) && up->encap_type) {
@@ -2204,8 +2211,10 @@ static int udp_queue_rcv_one_skb(struct sock *sk, struct sk_buff *skb)
            udp_lib_checksum_complete(skb))
                        goto csum_error;
 
-       if (sk_filter_trim_cap(sk, skb, sizeof(struct udphdr)))
+       if (sk_filter_trim_cap(sk, skb, sizeof(struct udphdr))) {
+               drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
                goto drop;
+       }
 
        udp_csum_pull_header(skb);
 
@@ -2213,11 +2222,12 @@ static int udp_queue_rcv_one_skb(struct sock *sk, struct sk_buff *skb)
        return __udp_queue_rcv_skb(sk, skb);
 
 csum_error:
+       drop_reason = SKB_DROP_REASON_UDP_CSUM;
        __UDP_INC_STATS(sock_net(sk), UDP_MIB_CSUMERRORS, is_udplite);
 drop:
        __UDP_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, is_udplite);
        atomic_inc(&sk->sk_drops);
-       kfree_skb(skb);
+       kfree_skb_reason(skb, drop_reason);
        return -1;
 }
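
The UDP receive path moves from bare kfree_skb() to kfree_skb_reason(), so the kfree_skb tracepoint and drop monitor can report why each packet died. The conversion is mechanical: one reason variable threaded through the function, refined at each failure site. Condensed sketch, where example_policy_ok() stands in for any of the checks above:

static int example_rcv(struct sock *sk, struct sk_buff *skb)
{
	int drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;

	if (!example_policy_ok(sk, skb)) {
		drop_reason = SKB_DROP_REASON_XFRM_POLICY;
		goto drop;
	}
	return 0;

drop:
	kfree_skb_reason(skb, drop_reason);
	return -1;
}
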
 
index f927c19..4f402bc 100644 (file)
@@ -146,18 +146,11 @@ static int ipv6_generate_stable_address(struct in6_addr *addr,
 
 #define IN6_ADDR_HSIZE_SHIFT   8
 #define IN6_ADDR_HSIZE         (1 << IN6_ADDR_HSIZE_SHIFT)
-/*
- *     Configured unicast address hash table
- */
-static struct hlist_head inet6_addr_lst[IN6_ADDR_HSIZE];
-static DEFINE_SPINLOCK(addrconf_hash_lock);
 
-static void addrconf_verify(void);
-static void addrconf_verify_rtnl(void);
-static void addrconf_verify_work(struct work_struct *);
+static void addrconf_verify(struct net *net);
+static void addrconf_verify_rtnl(struct net *net);
 
 static struct workqueue_struct *addrconf_wq;
-static DECLARE_DELAYED_WORK(addr_chk_work, addrconf_verify_work);
 
 static void addrconf_join_anycast(struct inet6_ifaddr *ifp);
 static void addrconf_leave_anycast(struct inet6_ifaddr *ifp);
@@ -554,7 +547,7 @@ static int inet6_netconf_fill_devconf(struct sk_buff *skb, int ifindex,
 #ifdef CONFIG_IPV6_MROUTE
        if ((all || type == NETCONFA_MC_FORWARDING) &&
            nla_put_s32(skb, NETCONFA_MC_FORWARDING,
-                       devconf->mc_forwarding) < 0)
+                       atomic_read(&devconf->mc_forwarding)) < 0)
                goto nla_put_failure;
 #endif
        if ((all || type == NETCONFA_PROXY_NEIGH) &&
@@ -1011,9 +1004,7 @@ static bool ipv6_chk_same_addr(struct net *net, const struct in6_addr *addr,
 {
        struct inet6_ifaddr *ifp;
 
-       hlist_for_each_entry(ifp, &inet6_addr_lst[hash], addr_lst) {
-               if (!net_eq(dev_net(ifp->idev->dev), net))
-                       continue;
+       hlist_for_each_entry(ifp, &net->ipv6.inet6_addr_lst[hash], addr_lst) {
                if (ipv6_addr_equal(&ifp->addr, addr)) {
                        if (!dev || ifp->idev->dev == dev)
                                return true;
@@ -1024,20 +1015,21 @@ static bool ipv6_chk_same_addr(struct net *net, const struct in6_addr *addr,
 
 static int ipv6_add_addr_hash(struct net_device *dev, struct inet6_ifaddr *ifa)
 {
-       unsigned int hash = inet6_addr_hash(dev_net(dev), &ifa->addr);
+       struct net *net = dev_net(dev);
+       unsigned int hash = inet6_addr_hash(net, &ifa->addr);
        int err = 0;
 
-       spin_lock(&addrconf_hash_lock);
+       spin_lock(&net->ipv6.addrconf_hash_lock);
 
        /* Ignore adding duplicate addresses on an interface */
-       if (ipv6_chk_same_addr(dev_net(dev), &ifa->addr, dev, hash)) {
+       if (ipv6_chk_same_addr(net, &ifa->addr, dev, hash)) {
                netdev_dbg(dev, "ipv6_add_addr: already assigned\n");
                err = -EEXIST;
        } else {
-               hlist_add_head_rcu(&ifa->addr_lst, &inet6_addr_lst[hash]);
+               hlist_add_head_rcu(&ifa->addr_lst, &net->ipv6.inet6_addr_lst[hash]);
        }
 
-       spin_unlock(&addrconf_hash_lock);
+       spin_unlock(&net->ipv6.addrconf_hash_lock);
 
        return err;
 }
@@ -1261,9 +1253,10 @@ cleanup_prefix_route(struct inet6_ifaddr *ifp, unsigned long expires,
 
 static void ipv6_del_addr(struct inet6_ifaddr *ifp)
 {
-       int state;
        enum cleanup_prefix_rt_t action = CLEANUP_PREFIX_RT_NOP;
+       struct net *net = dev_net(ifp->idev->dev);
        unsigned long expires;
+       int state;
 
        ASSERT_RTNL();
 
@@ -1275,9 +1268,9 @@ static void ipv6_del_addr(struct inet6_ifaddr *ifp)
        if (state == INET6_IFADDR_STATE_DEAD)
                goto out;
 
-       spin_lock_bh(&addrconf_hash_lock);
+       spin_lock_bh(&net->ipv6.addrconf_hash_lock);
        hlist_del_init_rcu(&ifp->addr_lst);
-       spin_unlock_bh(&addrconf_hash_lock);
+       spin_unlock_bh(&net->ipv6.addrconf_hash_lock);
 
        write_lock_bh(&ifp->idev->lock);
 
@@ -1920,10 +1913,8 @@ __ipv6_chk_addr_and_flags(struct net *net, const struct in6_addr *addr,
        if (skip_dev_check)
                dev = NULL;
 
-       hlist_for_each_entry_rcu(ifp, &inet6_addr_lst[hash], addr_lst) {
+       hlist_for_each_entry_rcu(ifp, &net->ipv6.inet6_addr_lst[hash], addr_lst) {
                ndev = ifp->idev->dev;
-               if (!net_eq(dev_net(ndev), net))
-                       continue;
 
                if (l3mdev_master_dev_rcu(ndev) != l3mdev)
                        continue;
@@ -2027,9 +2018,7 @@ struct inet6_ifaddr *ipv6_get_ifaddr(struct net *net, const struct in6_addr *add
        struct inet6_ifaddr *ifp, *result = NULL;
 
        rcu_read_lock();
-       hlist_for_each_entry_rcu(ifp, &inet6_addr_lst[hash], addr_lst) {
-               if (!net_eq(dev_net(ifp->idev->dev), net))
-                       continue;
+       hlist_for_each_entry_rcu(ifp, &net->ipv6.inet6_addr_lst[hash], addr_lst) {
                if (ipv6_addr_equal(&ifp->addr, addr)) {
                        if (!dev || ifp->idev->dev == dev ||
                            !(ifp->scope&(IFA_LINK|IFA_HOST) || strict)) {
@@ -2096,7 +2085,7 @@ static int addrconf_dad_end(struct inet6_ifaddr *ifp)
 void addrconf_dad_failure(struct sk_buff *skb, struct inet6_ifaddr *ifp)
 {
        struct inet6_dev *idev = ifp->idev;
-       struct net *net = dev_net(ifp->idev->dev);
+       struct net *net = dev_net(idev->dev);
 
        if (addrconf_dad_end(ifp)) {
                in6_ifa_put(ifp);
@@ -2675,7 +2664,7 @@ int addrconf_prefix_rcv_add_addr(struct net *net, struct net_device *dev,
                                 create, now);
 
                in6_ifa_put(ifp);
-               addrconf_verify();
+               addrconf_verify(net);
        }
 
        return 0;
@@ -2987,7 +2976,7 @@ static int inet6_addr_add(struct net *net, int ifindex,
                        manage_tempaddrs(idev, ifp, cfg->valid_lft,
                                         cfg->preferred_lft, true, jiffies);
                in6_ifa_put(ifp);
-               addrconf_verify_rtnl();
+               addrconf_verify_rtnl(net);
                return 0;
        } else if (cfg->ifa_flags & IFA_F_MCAUTOJOIN) {
                ipv6_mc_config(net->ipv6.mc_autojoin_sk, false,
@@ -3027,7 +3016,7 @@ static int inet6_addr_del(struct net *net, int ifindex, u32 ifa_flags,
                                manage_tempaddrs(idev, ifp, 0, 0, false,
                                                 jiffies);
                        ipv6_del_addr(ifp);
-                       addrconf_verify_rtnl();
+                       addrconf_verify_rtnl(net);
                        if (ipv6_addr_is_multicast(pfx)) {
                                ipv6_mc_config(net->ipv6.mc_autojoin_sk,
                                               false, pfx, dev->ifindex);
@@ -3772,9 +3761,9 @@ static int addrconf_ifdown(struct net_device *dev, bool unregister)
 
        /* Step 2: clear hash table */
        for (i = 0; i < IN6_ADDR_HSIZE; i++) {
-               struct hlist_head *h = &inet6_addr_lst[i];
+               struct hlist_head *h = &net->ipv6.inet6_addr_lst[i];
 
-               spin_lock_bh(&addrconf_hash_lock);
+               spin_lock_bh(&net->ipv6.addrconf_hash_lock);
 restart:
                hlist_for_each_entry_rcu(ifa, h, addr_lst) {
                        if (ifa->idev == idev) {
@@ -3790,7 +3779,7 @@ restart:
                                }
                        }
                }
-               spin_unlock_bh(&addrconf_hash_lock);
+               spin_unlock_bh(&net->ipv6.addrconf_hash_lock);
        }
 
        write_lock_bh(&idev->lock);
@@ -4246,7 +4235,7 @@ static void addrconf_dad_completed(struct inet6_ifaddr *ifp, bool bump_id,
         * before this temporary address becomes deprecated.
         */
        if (ifp->flags & IFA_F_TEMPORARY)
-               addrconf_verify_rtnl();
+               addrconf_verify_rtnl(dev_net(dev));
 }
 
 static void addrconf_dad_run(struct inet6_dev *idev, bool restart)
@@ -4288,10 +4277,8 @@ static struct inet6_ifaddr *if6_get_first(struct seq_file *seq, loff_t pos)
        }
 
        for (; state->bucket < IN6_ADDR_HSIZE; ++state->bucket) {
-               hlist_for_each_entry_rcu(ifa, &inet6_addr_lst[state->bucket],
+               hlist_for_each_entry_rcu(ifa, &net->ipv6.inet6_addr_lst[state->bucket],
                                         addr_lst) {
-                       if (!net_eq(dev_net(ifa->idev->dev), net))
-                               continue;
                        /* sync with offset */
                        if (p < state->offset) {
                                p++;
@@ -4314,8 +4301,6 @@ static struct inet6_ifaddr *if6_get_next(struct seq_file *seq,
        struct net *net = seq_file_net(seq);
 
        hlist_for_each_entry_continue_rcu(ifa, addr_lst) {
-               if (!net_eq(dev_net(ifa->idev->dev), net))
-                       continue;
                state->offset++;
                return ifa;
        }
@@ -4323,9 +4308,7 @@ static struct inet6_ifaddr *if6_get_next(struct seq_file *seq,
        state->offset = 0;
        while (++state->bucket < IN6_ADDR_HSIZE) {
                hlist_for_each_entry_rcu(ifa,
-                                    &inet6_addr_lst[state->bucket], addr_lst) {
-                       if (!net_eq(dev_net(ifa->idev->dev), net))
-                               continue;
+                                    &net->ipv6.inet6_addr_lst[state->bucket], addr_lst) {
                        return ifa;
                }
        }
@@ -4413,9 +4396,7 @@ int ipv6_chk_home_addr(struct net *net, const struct in6_addr *addr)
        int ret = 0;
 
        rcu_read_lock();
-       hlist_for_each_entry_rcu(ifp, &inet6_addr_lst[hash], addr_lst) {
-               if (!net_eq(dev_net(ifp->idev->dev), net))
-                       continue;
+       hlist_for_each_entry_rcu(ifp, &net->ipv6.inet6_addr_lst[hash], addr_lst) {
                if (ipv6_addr_equal(&ifp->addr, addr) &&
                    (ifp->flags & IFA_F_HOMEADDRESS)) {
                        ret = 1;
@@ -4453,9 +4434,7 @@ int ipv6_chk_rpl_srh_loop(struct net *net, const struct in6_addr *segs,
                hash = inet6_addr_hash(net, addr);
 
                hash_found = false;
-               hlist_for_each_entry_rcu(ifp, &inet6_addr_lst[hash], addr_lst) {
-                       if (!net_eq(dev_net(ifp->idev->dev), net))
-                               continue;
+               hlist_for_each_entry_rcu(ifp, &net->ipv6.inet6_addr_lst[hash], addr_lst) {
 
                        if (ipv6_addr_equal(&ifp->addr, addr)) {
                                hash_found = true;
@@ -4484,7 +4463,7 @@ int ipv6_chk_rpl_srh_loop(struct net *net, const struct in6_addr *segs,
  *     Periodic address status verification
  */
 
-static void addrconf_verify_rtnl(void)
+static void addrconf_verify_rtnl(struct net *net)
 {
        unsigned long now, next, next_sec, next_sched;
        struct inet6_ifaddr *ifp;
@@ -4496,11 +4475,11 @@ static void addrconf_verify_rtnl(void)
        now = jiffies;
        next = round_jiffies_up(now + ADDR_CHECK_FREQUENCY);
 
-       cancel_delayed_work(&addr_chk_work);
+       cancel_delayed_work(&net->ipv6.addr_chk_work);
 
        for (i = 0; i < IN6_ADDR_HSIZE; i++) {
 restart:
-               hlist_for_each_entry_rcu_bh(ifp, &inet6_addr_lst[i], addr_lst) {
+               hlist_for_each_entry_rcu_bh(ifp, &net->ipv6.inet6_addr_lst[i], addr_lst) {
                        unsigned long age;
 
                        /* When setting preferred_lft to a value not zero or
@@ -4599,20 +4578,23 @@ restart:
 
        pr_debug("now = %lu, schedule = %lu, rounded schedule = %lu => %lu\n",
                 now, next, next_sec, next_sched);
-       mod_delayed_work(addrconf_wq, &addr_chk_work, next_sched - now);
+       mod_delayed_work(addrconf_wq, &net->ipv6.addr_chk_work, next_sched - now);
        rcu_read_unlock_bh();
 }
 
 static void addrconf_verify_work(struct work_struct *w)
 {
+       struct net *net = container_of(to_delayed_work(w), struct net,
+                                      ipv6.addr_chk_work);
+
        rtnl_lock();
-       addrconf_verify_rtnl();
+       addrconf_verify_rtnl(net);
        rtnl_unlock();
 }
 
-static void addrconf_verify(void)
+static void addrconf_verify(struct net *net)
 {
-       mod_delayed_work(addrconf_wq, &addr_chk_work, 0);
+       mod_delayed_work(addrconf_wq, &net->ipv6.addr_chk_work, 0);
 }
 
 static struct in6_addr *extract_addr(struct nlattr *addr, struct nlattr *local,
@@ -4708,7 +4690,8 @@ static int modify_prefix_route(struct inet6_ifaddr *ifp,
        return 0;
 }
 
-static int inet6_addr_modify(struct inet6_ifaddr *ifp, struct ifa6_config *cfg)
+static int inet6_addr_modify(struct net *net, struct inet6_ifaddr *ifp,
+                            struct ifa6_config *cfg)
 {
        u32 flags;
        clock_t expires;
@@ -4822,7 +4805,7 @@ static int inet6_addr_modify(struct inet6_ifaddr *ifp, struct ifa6_config *cfg)
                                 jiffies);
        }
 
-       addrconf_verify_rtnl();
+       addrconf_verify_rtnl(net);
 
        return 0;
 }
@@ -4909,7 +4892,7 @@ inet6_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh,
            !(nlh->nlmsg_flags & NLM_F_REPLACE))
                err = -EEXIST;
        else
-               err = inet6_addr_modify(ifa, &cfg);
+               err = inet6_addr_modify(net, ifa, &cfg);
 
        in6_ifa_put(ifa);
 
@@ -5533,7 +5516,7 @@ static inline void ipv6_store_devconf(struct ipv6_devconf *cnf,
        array[DEVCONF_USE_OPTIMISTIC] = cnf->use_optimistic;
 #endif
 #ifdef CONFIG_IPV6_MROUTE
-       array[DEVCONF_MC_FORWARDING] = cnf->mc_forwarding;
+       array[DEVCONF_MC_FORWARDING] = atomic_read(&cnf->mc_forwarding);
 #endif
        array[DEVCONF_DISABLE_IPV6] = cnf->disable_ipv6;
        array[DEVCONF_ACCEPT_DAD] = cnf->accept_dad;
@@ -5794,7 +5777,7 @@ update_lft:
 
        write_unlock_bh(&idev->lock);
        inet6_ifinfo_notify(RTM_NEWLINK, idev);
-       addrconf_verify_rtnl();
+       addrconf_verify_rtnl(dev_net(dev));
        return 0;
 }
 
@@ -7111,6 +7094,14 @@ static int __net_init addrconf_init_net(struct net *net)
        int err = -ENOMEM;
        struct ipv6_devconf *all, *dflt;
 
+       spin_lock_init(&net->ipv6.addrconf_hash_lock);
+       INIT_DEFERRABLE_WORK(&net->ipv6.addr_chk_work, addrconf_verify_work);
+       net->ipv6.inet6_addr_lst = kcalloc(IN6_ADDR_HSIZE,
+                                          sizeof(struct hlist_head),
+                                          GFP_KERNEL);
+       if (!net->ipv6.inet6_addr_lst)
+               goto err_alloc_addr;
+
        all = kmemdup(&ipv6_devconf, sizeof(ipv6_devconf), GFP_KERNEL);
        if (!all)
                goto err_alloc_all;
@@ -7172,11 +7163,15 @@ err_reg_all:
 err_alloc_dflt:
        kfree(all);
 err_alloc_all:
+       kfree(net->ipv6.inet6_addr_lst);
+err_alloc_addr:
        return err;
 }
 
 static void __net_exit addrconf_exit_net(struct net *net)
 {
+       int i;
+
 #ifdef CONFIG_SYSCTL
        __addrconf_sysctl_unregister(net, net->ipv6.devconf_dflt,
                                     NETCONFA_IFINDEX_DEFAULT);
@@ -7184,7 +7179,19 @@ static void __net_exit addrconf_exit_net(struct net *net)
                                     NETCONFA_IFINDEX_ALL);
 #endif
        kfree(net->ipv6.devconf_dflt);
+       net->ipv6.devconf_dflt = NULL;
        kfree(net->ipv6.devconf_all);
+       net->ipv6.devconf_all = NULL;
+
+       cancel_delayed_work(&net->ipv6.addr_chk_work);
+       /*
+        *      Check hash table, then free it.
+        */
+       for (i = 0; i < IN6_ADDR_HSIZE; i++)
+               WARN_ON_ONCE(!hlist_empty(&net->ipv6.inet6_addr_lst[i]));
+
+       kfree(net->ipv6.inet6_addr_lst);
+       net->ipv6.inet6_addr_lst = NULL;
 }
 
 static struct pernet_operations addrconf_ops = {
@@ -7207,7 +7214,7 @@ static struct rtnl_af_ops inet6_ops __read_mostly = {
 int __init addrconf_init(void)
 {
        struct inet6_dev *idev;
-       int i, err;
+       int err;
 
        err = ipv6_addr_label_init();
        if (err < 0) {
@@ -7254,12 +7261,9 @@ int __init addrconf_init(void)
 
        ip6_route_init_special_entries();
 
-       for (i = 0; i < IN6_ADDR_HSIZE; i++)
-               INIT_HLIST_HEAD(&inet6_addr_lst[i]);
-
        register_netdevice_notifier(&ipv6_dev_notf);
 
-       addrconf_verify();
+       addrconf_verify(&init_net);
 
        rtnl_af_register(&inet6_ops);
 
@@ -7317,7 +7321,6 @@ out:
 void addrconf_cleanup(void)
 {
        struct net_device *dev;
-       int i;
 
        unregister_netdevice_notifier(&ipv6_dev_notf);
        unregister_pernet_subsys(&addrconf_ops);
@@ -7335,14 +7338,6 @@ void addrconf_cleanup(void)
        }
        addrconf_ifdown(init_net.loopback_dev, true);
 
-       /*
-        *      Check hash table.
-        */
-       spin_lock_bh(&addrconf_hash_lock);
-       for (i = 0; i < IN6_ADDR_HSIZE; i++)
-               WARN_ON(!hlist_empty(&inet6_addr_lst[i]));
-       spin_unlock_bh(&addrconf_hash_lock);
-       cancel_delayed_work(&addr_chk_work);
        rtnl_unlock();
 
        destroy_workqueue(addrconf_wq);
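
The global inet6_addr_lst[]/addrconf_hash_lock pair and the delayed verification work all move into struct net::ipv6, which is why every walker above loses its net_eq() filter: a per-netns table can only ever contain that namespace's addresses. Lookup shape after the change (sketch):

static struct inet6_ifaddr *example_find(struct net *net,
					 const struct in6_addr *addr,
					 unsigned int hash)
{
	struct inet6_ifaddr *ifp;

	hlist_for_each_entry_rcu(ifp, &net->ipv6.inet6_addr_lst[hash],
				 addr_lst)
		if (ipv6_addr_equal(&ifp->addr, addr))
			return ifp;	/* no net_eq() filter needed */
	return NULL;
}
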
index 77e34ae..658d5ea 100644 (file)
@@ -1344,14 +1344,14 @@ ipv6_renew_options(struct sock *sk, struct ipv6_txoptions *opt,
        return opt2;
 }
 
-struct ipv6_txoptions *ipv6_fixup_options(struct ipv6_txoptions *opt_space,
-                                         struct ipv6_txoptions *opt)
+struct ipv6_txoptions *__ipv6_fixup_options(struct ipv6_txoptions *opt_space,
+                                           struct ipv6_txoptions *opt)
 {
        /*
         * ignore the dest before srcrt unless srcrt is being included.
         * --yoshfuji
         */
-       if (opt && opt->dst0opt && !opt->srcrt) {
+       if (opt->dst0opt && !opt->srcrt) {
                if (opt_space != opt) {
                        memcpy(opt_space, opt, sizeof(*opt_space));
                        opt = opt_space;
@@ -1362,7 +1362,7 @@ struct ipv6_txoptions *ipv6_fixup_options(struct ipv6_txoptions *opt_space,
 
        return opt;
 }
-EXPORT_SYMBOL_GPL(ipv6_fixup_options);
+EXPORT_SYMBOL_GPL(__ipv6_fixup_options);
 
 /**
  * fl6_update_dst - update flowi destination address with info given
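
Renaming the exported function to __ipv6_fixup_options() and dropping its internal NULL test implies a new inline wrapper that keeps the cheap opt == NULL check out of the out-of-line call; presumably, in include/net/ipv6.h (not part of this excerpt):

static inline struct ipv6_txoptions *
ipv6_fixup_options(struct ipv6_txoptions *opt_space,
		   struct ipv6_txoptions *opt)
{
	if (!opt)
		return NULL;
	return __ipv6_fixup_options(opt_space, opt);
}
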
index ec029c8..7c20038 100644 (file)
@@ -16,6 +16,7 @@
 #include <linux/indirect_call_wrapper.h>
 
 #include <net/fib_rules.h>
+#include <net/inet_dscp.h>
 #include <net/ipv6.h>
 #include <net/addrconf.h>
 #include <net/ip6_route.h>
@@ -25,14 +26,14 @@ struct fib6_rule {
        struct fib_rule         common;
        struct rt6key           src;
        struct rt6key           dst;
-       u8                      tclass;
+       dscp_t                  dscp;
 };
 
 static bool fib6_rule_matchall(const struct fib_rule *rule)
 {
        struct fib6_rule *r = container_of(rule, struct fib6_rule, common);
 
-       if (r->dst.plen || r->src.plen || r->tclass)
+       if (r->dst.plen || r->src.plen || r->dscp)
                return false;
        return fib_rule_matchall(rule);
 }
@@ -323,7 +324,7 @@ INDIRECT_CALLABLE_SCOPE int fib6_rule_match(struct fib_rule *rule,
                        return 0;
        }
 
-       if (r->tclass && r->tclass != ip6_tclass(fl6->flowlabel))
+       if (r->dscp && r->dscp != ip6_dscp(fl6->flowlabel))
                return 0;
 
        if (rule->ip_proto && (rule->ip_proto != fl6->flowi6_proto))
@@ -349,6 +350,13 @@ static int fib6_rule_configure(struct fib_rule *rule, struct sk_buff *skb,
        struct net *net = sock_net(skb->sk);
        struct fib6_rule *rule6 = (struct fib6_rule *) rule;
 
+       if (!inet_validate_dscp(frh->tos)) {
+               NL_SET_ERR_MSG(extack,
+                              "Invalid dsfield (tos): ECN bits must be 0");
+               goto errout;
+       }
+       rule6->dscp = inet_dsfield_to_dscp(frh->tos);
+
        if (rule->action == FR_ACT_TO_TBL && !rule->l3mdev) {
                if (rule->table == RT6_TABLE_UNSPEC) {
                        NL_SET_ERR_MSG(extack, "Invalid table");
@@ -369,7 +377,6 @@ static int fib6_rule_configure(struct fib_rule *rule, struct sk_buff *skb,
 
        rule6->src.plen = frh->src_len;
        rule6->dst.plen = frh->dst_len;
-       rule6->tclass = frh->tos;
 
        if (fib_rule_requires_fldissect(rule))
                net->ipv6.fib6_rules_require_fldissect++;
@@ -402,7 +409,7 @@ static int fib6_rule_compare(struct fib_rule *rule, struct fib_rule_hdr *frh,
        if (frh->dst_len && (rule6->dst.plen != frh->dst_len))
                return 0;
 
-       if (frh->tos && (rule6->tclass != frh->tos))
+       if (frh->tos && inet_dscp_to_dsfield(rule6->dscp) != frh->tos)
                return 0;
 
        if (frh->src_len &&
@@ -423,7 +430,7 @@ static int fib6_rule_fill(struct fib_rule *rule, struct sk_buff *skb,
 
        frh->dst_len = rule6->dst.plen;
        frh->src_len = rule6->src.plen;
-       frh->tos = rule6->tclass;
+       frh->tos = inet_dscp_to_dsfield(rule6->dscp);
 
        if ((rule6->dst.plen &&
             nla_put_in6_addr(skb, FRA_DST, &rule6->dst.addr)) ||
@@ -486,16 +493,21 @@ out_fib6_rules_ops:
        goto out;
 }
 
-static void __net_exit fib6_rules_net_exit(struct net *net)
+static void __net_exit fib6_rules_net_exit_batch(struct list_head *net_list)
 {
+       struct net *net;
+
        rtnl_lock();
-       fib_rules_unregister(net->ipv6.fib6_rules_ops);
+       list_for_each_entry(net, net_list, exit_list) {
+               fib_rules_unregister(net->ipv6.fib6_rules_ops);
+               cond_resched();
+       }
        rtnl_unlock();
 }
 
 static struct pernet_operations fib6_rules_net_ops = {
        .init = fib6_rules_net_init,
-       .exit = fib6_rules_net_exit,
+       .exit_batch = fib6_rules_net_exit_batch,
 };
 
 int __init fib6_rules_init(void)
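
Switching the rule selector from a raw u8 tclass to dscp_t makes "ECN bits are zero" a property of the type: values can only be produced by inet_dsfield_to_dscp(), which masks to the top six bits, after inet_validate_dscp() has rejected configurations with ECN bits set. Sketch of the configure-side contract:

static int example_set_dscp(struct fib6_rule *rule6, u8 dsfield)
{
	if (!inet_validate_dscp(dsfield))
		return -EINVAL;		/* ECN bits must be 0 */
	/* dscp_t keeps the six DSCP bits in dsfield position */
	rule6->dscp = inet_dsfield_to_dscp(dsfield);
	return 0;
}
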
index 96c5cc0..e6b978e 100644 (file)
 
 #include <linux/uaccess.h>
 
-/*
- *     The ICMP socket(s). This is the most convenient way to flow control
- *     our ICMP output as well as maintain a clean interface throughout
- *     all layers. All Socketless IP sends will soon be gone.
- *
- *     On SMP we have one ICMP socket per-cpu.
- */
-static struct sock *icmpv6_sk(struct net *net)
-{
-       return this_cpu_read(*net->ipv6.icmp_sk);
-}
+static DEFINE_PER_CPU(struct sock *, ipv6_icmp_sk);
 
 static int icmpv6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
                       u8 type, u8 code, int offset, __be32 info)
@@ -110,11 +100,11 @@ static const struct inet6_protocol icmpv6_protocol = {
 };
 
 /* Called with BH disabled */
-static __inline__ struct sock *icmpv6_xmit_lock(struct net *net)
+static struct sock *icmpv6_xmit_lock(struct net *net)
 {
        struct sock *sk;
 
-       sk = icmpv6_sk(net);
+       sk = this_cpu_read(ipv6_icmp_sk);
        if (unlikely(!spin_trylock(&sk->sk_lock.slock))) {
                /* This can happen if the output path (f.e. SIT or
                 * ip6ip6 tunnel) signals dst_link_failure() for an
@@ -122,11 +112,13 @@ static __inline__ struct sock *icmpv6_xmit_lock(struct net *net)
                 */
                return NULL;
        }
+       sock_net_set(sk, net);
        return sk;
 }
 
-static __inline__ void icmpv6_xmit_unlock(struct sock *sk)
+static void icmpv6_xmit_unlock(struct sock *sk)
 {
+       sock_net_set(sk, &init_net);
        spin_unlock(&sk->sk_lock.slock);
 }
 
@@ -1034,59 +1026,27 @@ void icmpv6_flow_init(struct sock *sk, struct flowi6 *fl6,
        security_sk_classify_flow(sk, flowi6_to_flowi_common(fl6));
 }
 
-static void __net_exit icmpv6_sk_exit(struct net *net)
-{
-       int i;
-
-       for_each_possible_cpu(i)
-               inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv6.icmp_sk, i));
-       free_percpu(net->ipv6.icmp_sk);
-}
-
-static int __net_init icmpv6_sk_init(struct net *net)
+int __init icmpv6_init(void)
 {
        struct sock *sk;
        int err, i;
 
-       net->ipv6.icmp_sk = alloc_percpu(struct sock *);
-       if (!net->ipv6.icmp_sk)
-               return -ENOMEM;
-
        for_each_possible_cpu(i) {
                err = inet_ctl_sock_create(&sk, PF_INET6,
-                                          SOCK_RAW, IPPROTO_ICMPV6, net);
+                                          SOCK_RAW, IPPROTO_ICMPV6, &init_net);
                if (err < 0) {
                        pr_err("Failed to initialize the ICMP6 control socket (err %d)\n",
                               err);
-                       goto fail;
+                       return err;
                }
 
-               *per_cpu_ptr(net->ipv6.icmp_sk, i) = sk;
+               per_cpu(ipv6_icmp_sk, i) = sk;
 
                /* Enough space for 2 64K ICMP packets, including
                 * sk_buff struct overhead.
                 */
                sk->sk_sndbuf = 2 * SKB_TRUESIZE(64 * 1024);
        }
-       return 0;
-
- fail:
-       icmpv6_sk_exit(net);
-       return err;
-}
-
-static struct pernet_operations icmpv6_sk_ops = {
-       .init = icmpv6_sk_init,
-       .exit = icmpv6_sk_exit,
-};
-
-int __init icmpv6_init(void)
-{
-       int err;
-
-       err = register_pernet_subsys(&icmpv6_sk_ops);
-       if (err < 0)
-               return err;
 
        err = -EAGAIN;
        if (inet6_add_protocol(&icmpv6_protocol, IPPROTO_ICMPV6) < 0)
@@ -1101,14 +1061,12 @@ sender_reg_err:
        inet6_del_protocol(&icmpv6_protocol, IPPROTO_ICMPV6);
 fail:
        pr_err("Failed to register ICMP6 protocol\n");
-       unregister_pernet_subsys(&icmpv6_sk_ops);
        return err;
 }
 
 void icmpv6_cleanup(void)
 {
        inet6_unregister_icmp_sender(icmp6_send);
-       unregister_pernet_subsys(&icmpv6_sk_ops);
        inet6_del_protocol(&icmpv6_protocol, IPPROTO_ICMPV6);
 }
 
index 4514444..4740afe 100644 (file)
@@ -333,11 +333,8 @@ int inet6_hash(struct sock *sk)
 {
        int err = 0;
 
-       if (sk->sk_state != TCP_CLOSE) {
-               local_bh_disable();
+       if (sk->sk_state != TCP_CLOSE)
                err = __inet_hash(sk, NULL);
-               local_bh_enable();
-       }
 
        return err;
 }
index f90a873..f6f5b83 100644 (file)
@@ -32,13 +32,25 @@ struct ioam6_lwt_encap {
        struct ioam6_trace_hdr traceh;
 } __packed;
 
+struct ioam6_lwt_freq {
+       u32 k;
+       u32 n;
+};
+
 struct ioam6_lwt {
        struct dst_cache cache;
+       struct ioam6_lwt_freq freq;
+       atomic_t pkt_cnt;
        u8 mode;
        struct in6_addr tundst;
        struct ioam6_lwt_encap  tuninfo;
 };
 
+static struct netlink_range_validation freq_range = {
+       .min = IOAM6_IPTUNNEL_FREQ_MIN,
+       .max = IOAM6_IPTUNNEL_FREQ_MAX,
+};
+
 static struct ioam6_lwt *ioam6_lwt_state(struct lwtunnel_state *lwt)
 {
        return (struct ioam6_lwt *)lwt->data;
@@ -55,6 +67,8 @@ static struct ioam6_trace_hdr *ioam6_lwt_trace(struct lwtunnel_state *lwt)
 }
 
 static const struct nla_policy ioam6_iptunnel_policy[IOAM6_IPTUNNEL_MAX + 1] = {
+       [IOAM6_IPTUNNEL_FREQ_K] = NLA_POLICY_FULL_RANGE(NLA_U32, &freq_range),
+       [IOAM6_IPTUNNEL_FREQ_N] = NLA_POLICY_FULL_RANGE(NLA_U32, &freq_range),
        [IOAM6_IPTUNNEL_MODE]   = NLA_POLICY_RANGE(NLA_U8,
                                                   IOAM6_IPTUNNEL_MODE_MIN,
                                                   IOAM6_IPTUNNEL_MODE_MAX),
@@ -96,6 +110,7 @@ static int ioam6_build_state(struct net *net, struct nlattr *nla,
        struct lwtunnel_state *lwt;
        struct ioam6_lwt *ilwt;
        int len_aligned, err;
+       u32 freq_k, freq_n;
        u8 mode;
 
        if (family != AF_INET6)
@@ -106,6 +121,23 @@ static int ioam6_build_state(struct net *net, struct nlattr *nla,
        if (err < 0)
                return err;
 
+       if ((!tb[IOAM6_IPTUNNEL_FREQ_K] && tb[IOAM6_IPTUNNEL_FREQ_N]) ||
+           (tb[IOAM6_IPTUNNEL_FREQ_K] && !tb[IOAM6_IPTUNNEL_FREQ_N])) {
+               NL_SET_ERR_MSG(extack, "freq: missing parameter");
+               return -EINVAL;
+       } else if (!tb[IOAM6_IPTUNNEL_FREQ_K] && !tb[IOAM6_IPTUNNEL_FREQ_N]) {
+               freq_k = IOAM6_IPTUNNEL_FREQ_MIN;
+               freq_n = IOAM6_IPTUNNEL_FREQ_MIN;
+       } else {
+               freq_k = nla_get_u32(tb[IOAM6_IPTUNNEL_FREQ_K]);
+               freq_n = nla_get_u32(tb[IOAM6_IPTUNNEL_FREQ_N]);
+
+               if (freq_k > freq_n) {
+                       NL_SET_ERR_MSG(extack, "freq: k > n is forbidden");
+                       return -EINVAL;
+               }
+       }
+
        if (!tb[IOAM6_IPTUNNEL_MODE])
                mode = IOAM6_IPTUNNEL_MODE_INLINE;
        else
@@ -140,6 +172,10 @@ static int ioam6_build_state(struct net *net, struct nlattr *nla,
                return err;
        }
 
+       atomic_set(&ilwt->pkt_cnt, 0);
+       ilwt->freq.k = freq_k;
+       ilwt->freq.n = freq_n;
+
        ilwt->mode = mode;
        if (tb[IOAM6_IPTUNNEL_DST])
                ilwt->tundst = nla_get_in6_addr(tb[IOAM6_IPTUNNEL_DST]);
@@ -263,11 +299,18 @@ static int ioam6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
        struct in6_addr orig_daddr;
        struct ioam6_lwt *ilwt;
        int err = -EINVAL;
+       u32 pkt_cnt;
 
        if (skb->protocol != htons(ETH_P_IPV6))
                goto drop;
 
        ilwt = ioam6_lwt_state(dst->lwtstate);
+
+       /* Check for insertion frequency (i.e., "k over n" insertions) */
+       pkt_cnt = atomic_fetch_inc(&ilwt->pkt_cnt);
+       if (pkt_cnt % ilwt->freq.n >= ilwt->freq.k)
+               goto out;
+
        orig_daddr = ipv6_hdr(skb)->daddr;
 
        switch (ilwt->mode) {
@@ -358,6 +401,14 @@ static int ioam6_fill_encap_info(struct sk_buff *skb,
        struct ioam6_lwt *ilwt = ioam6_lwt_state(lwtstate);
        int err;
 
+       err = nla_put_u32(skb, IOAM6_IPTUNNEL_FREQ_K, ilwt->freq.k);
+       if (err)
+               goto ret;
+
+       err = nla_put_u32(skb, IOAM6_IPTUNNEL_FREQ_N, ilwt->freq.n);
+       if (err)
+               goto ret;
+
        err = nla_put_u8(skb, IOAM6_IPTUNNEL_MODE, ilwt->mode);
        if (err)
                goto ret;
@@ -379,7 +430,9 @@ static int ioam6_encap_nlsize(struct lwtunnel_state *lwtstate)
        struct ioam6_lwt *ilwt = ioam6_lwt_state(lwtstate);
        int nlsize;
 
-       nlsize = nla_total_size(sizeof(ilwt->mode)) +
+       nlsize = nla_total_size(sizeof(ilwt->freq.k)) +
+                 nla_total_size(sizeof(ilwt->freq.n)) +
+                 nla_total_size(sizeof(ilwt->mode)) +
                  nla_total_size(sizeof(ilwt->tuninfo.traceh));
 
        if (ilwt->mode != IOAM6_IPTUNNEL_MODE_INLINE)
@@ -395,7 +448,9 @@ static int ioam6_encap_cmp(struct lwtunnel_state *a, struct lwtunnel_state *b)
        struct ioam6_lwt *ilwt_a = ioam6_lwt_state(a);
        struct ioam6_lwt *ilwt_b = ioam6_lwt_state(b);
 
-       return (ilwt_a->mode != ilwt_b->mode ||
+       return (ilwt_a->freq.k != ilwt_b->freq.k ||
+               ilwt_a->freq.n != ilwt_b->freq.n ||
+               ilwt_a->mode != ilwt_b->mode ||
                (ilwt_a->mode != IOAM6_IPTUNNEL_MODE_INLINE &&
                 !ipv6_addr_equal(&ilwt_a->tundst, &ilwt_b->tundst)) ||
                trace_a->namespace_id != trace_b->namespace_id);
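
The new FREQ_K/FREQ_N attributes implement "k over n" sampling: within each window of n packets, only the first k get IOAM data inserted, which is exactly what the modulo test in ioam6_output() above computes. Self-contained sketch:

static bool example_should_insert(atomic_t *pkt_cnt, u32 k, u32 n)
{
	/* k = 2, n = 5: counter values 0 and 1 insert, 2..4 skip */
	return (u32)atomic_fetch_inc(pkt_cnt) % n < k;
}
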
index 8025671..d4b1e2c 100644 (file)
@@ -508,7 +508,7 @@ int ip6_mc_input(struct sk_buff *skb)
        /*
         *      IPv6 multicast router mode is now supported ;)
         */
-       if (dev_net(skb->dev)->ipv6.devconf_all->mc_forwarding &&
+       if (atomic_read(&dev_net(skb->dev)->ipv6.devconf_all->mc_forwarding) &&
            !(ipv6_addr_type(&hdr->daddr) &
              (IPV6_ADDR_LOOPBACK|IPV6_ADDR_LINKLOCAL)) &&
            likely(!(IP6CB(skb)->flags & IP6SKB_FORWARDED))) {
index b29e9ba..d37a79a 100644 (file)
@@ -249,7 +249,7 @@ INDIRECT_CALLABLE_SCOPE struct sk_buff *ipv6_gro_receive(struct list_head *head,
                 if ((first_word & htonl(0xF00FFFFF)) ||
                     !ipv6_addr_equal(&iph->saddr, &iph2->saddr) ||
                     !ipv6_addr_equal(&iph->daddr, &iph2->daddr) ||
-                    *(u16 *)&iph->nexthdr != *(u16 *)&iph2->nexthdr) {
+                    iph->nexthdr != iph2->nexthdr) {
 not_same_flow:
                        NAPI_GRO_CB(p)->same_flow = 0;
                        continue;
@@ -260,7 +260,8 @@ not_same_flow:
                                goto not_same_flow;
                }
                /* flush if Traffic Class fields are different */
-               NAPI_GRO_CB(p)->flush |= !!(first_word & htonl(0x0FF00000));
+               NAPI_GRO_CB(p)->flush |= !!((first_word & htonl(0x0FF00000)) |
+                       (__force __be32)(iph->hop_limit ^ iph2->hop_limit));
                NAPI_GRO_CB(p)->flush |= flush;
 
                /* If the previous IP ID value was based on an atomic
index 2995f8d..0c6c971 100644 (file)
@@ -1350,11 +1350,16 @@ static void ip6_append_data_mtu(unsigned int *mtu,
 
 static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
                          struct inet6_cork *v6_cork, struct ipcm6_cookie *ipc6,
-                         struct rt6_info *rt, struct flowi6 *fl6)
+                         struct rt6_info *rt)
 {
        struct ipv6_pinfo *np = inet6_sk(sk);
        unsigned int mtu;
-       struct ipv6_txoptions *opt = ipc6->opt;
+       struct ipv6_txoptions *nopt, *opt = ipc6->opt;
+
+       /* callers pass dst together with a reference, set it first so
+        * ip6_cork_release() can put it down even in case of an error.
+        */
+       cork->base.dst = &rt->dst;
 
        /*
         * setup for corking
@@ -1363,39 +1368,32 @@ static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
                if (WARN_ON(v6_cork->opt))
                        return -EINVAL;
 
-               v6_cork->opt = kzalloc(sizeof(*opt), sk->sk_allocation);
-               if (unlikely(!v6_cork->opt))
+               nopt = v6_cork->opt = kzalloc(sizeof(*opt), sk->sk_allocation);
+               if (unlikely(!nopt))
                        return -ENOBUFS;
 
-               v6_cork->opt->tot_len = sizeof(*opt);
-               v6_cork->opt->opt_flen = opt->opt_flen;
-               v6_cork->opt->opt_nflen = opt->opt_nflen;
+               nopt->tot_len = sizeof(*opt);
+               nopt->opt_flen = opt->opt_flen;
+               nopt->opt_nflen = opt->opt_nflen;
 
-               v6_cork->opt->dst0opt = ip6_opt_dup(opt->dst0opt,
-                                                   sk->sk_allocation);
-               if (opt->dst0opt && !v6_cork->opt->dst0opt)
+               nopt->dst0opt = ip6_opt_dup(opt->dst0opt, sk->sk_allocation);
+               if (opt->dst0opt && !nopt->dst0opt)
                        return -ENOBUFS;
 
-               v6_cork->opt->dst1opt = ip6_opt_dup(opt->dst1opt,
-                                                   sk->sk_allocation);
-               if (opt->dst1opt && !v6_cork->opt->dst1opt)
+               nopt->dst1opt = ip6_opt_dup(opt->dst1opt, sk->sk_allocation);
+               if (opt->dst1opt && !nopt->dst1opt)
                        return -ENOBUFS;
 
-               v6_cork->opt->hopopt = ip6_opt_dup(opt->hopopt,
-                                                  sk->sk_allocation);
-               if (opt->hopopt && !v6_cork->opt->hopopt)
+               nopt->hopopt = ip6_opt_dup(opt->hopopt, sk->sk_allocation);
+               if (opt->hopopt && !nopt->hopopt)
                        return -ENOBUFS;
 
-               v6_cork->opt->srcrt = ip6_rthdr_dup(opt->srcrt,
-                                                   sk->sk_allocation);
-               if (opt->srcrt && !v6_cork->opt->srcrt)
+               nopt->srcrt = ip6_rthdr_dup(opt->srcrt, sk->sk_allocation);
+               if (opt->srcrt && !nopt->srcrt)
                        return -ENOBUFS;
 
                /* need source address above miyazawa*/
        }
-       dst_hold(&rt->dst);
-       cork->base.dst = &rt->dst;
-       cork->fl.u.ip6 = *fl6;
        v6_cork->hop_limit = ipc6->hlimit;
        v6_cork->tclass = ipc6->tclass;
        if (rt->dst.flags & DST_XFRM_TUNNEL)
@@ -1426,9 +1424,8 @@ static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
 }
 
 static int __ip6_append_data(struct sock *sk,
-                            struct flowi6 *fl6,
                             struct sk_buff_head *queue,
-                            struct inet_cork *cork,
+                            struct inet_cork_full *cork_full,
                             struct inet6_cork *v6_cork,
                             struct page_frag *pfrag,
                             int getfrag(void *from, char *to, int offset,
@@ -1437,6 +1434,8 @@ static int __ip6_append_data(struct sock *sk,
                             unsigned int flags, struct ipcm6_cookie *ipc6)
 {
        struct sk_buff *skb, *skb_prev = NULL;
+       struct inet_cork *cork = &cork_full->base;
+       struct flowi6 *fl6 = &cork_full->fl.u.ip6;
        unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu, pmtu;
        struct ubuf_info *uarg = NULL;
        int exthdrlen = 0;
@@ -1788,34 +1787,46 @@ int ip6_append_data(struct sock *sk,
                /*
                 * setup for corking
                 */
+               dst_hold(&rt->dst);
                err = ip6_setup_cork(sk, &inet->cork, &np->cork,
-                                    ipc6, rt, fl6);
+                                    ipc6, rt);
                if (err)
                        return err;
 
+               inet->cork.fl.u.ip6 = *fl6;
                exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
                length += exthdrlen;
                transhdrlen += exthdrlen;
        } else {
-               fl6 = &inet->cork.fl.u.ip6;
                transhdrlen = 0;
        }
 
-       return __ip6_append_data(sk, fl6, &sk->sk_write_queue, &inet->cork.base,
+       return __ip6_append_data(sk, &sk->sk_write_queue, &inet->cork,
                                 &np->cork, sk_page_frag(sk), getfrag,
                                 from, length, transhdrlen, flags, ipc6);
 }
 EXPORT_SYMBOL_GPL(ip6_append_data);
 
+static void ip6_cork_steal_dst(struct sk_buff *skb, struct inet_cork_full *cork)
+{
+       struct dst_entry *dst = cork->base.dst;
+
+       cork->base.dst = NULL;
+       cork->base.flags &= ~IPCORK_ALLFRAG;
+       skb_dst_set(skb, dst);
+}
+
 static void ip6_cork_release(struct inet_cork_full *cork,
                             struct inet6_cork *v6_cork)
 {
        if (v6_cork->opt) {
-               kfree(v6_cork->opt->dst0opt);
-               kfree(v6_cork->opt->dst1opt);
-               kfree(v6_cork->opt->hopopt);
-               kfree(v6_cork->opt->srcrt);
-               kfree(v6_cork->opt);
+               struct ipv6_txoptions *opt = v6_cork->opt;
+
+               kfree(opt->dst0opt);
+               kfree(opt->dst1opt);
+               kfree(opt->hopopt);
+               kfree(opt->srcrt);
+               kfree(opt);
                v6_cork->opt = NULL;
        }
 
@@ -1824,7 +1835,6 @@ static void ip6_cork_release(struct inet_cork_full *cork,
                cork->base.dst = NULL;
                cork->base.flags &= ~IPCORK_ALLFRAG;
        }
-       memset(&cork->fl, 0, sizeof(cork->fl));
 }
 
 struct sk_buff *__ip6_make_skb(struct sock *sk,
@@ -1834,7 +1844,7 @@ struct sk_buff *__ip6_make_skb(struct sock *sk,
 {
        struct sk_buff *skb, *tmp_skb;
        struct sk_buff **tail_skb;
-       struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
+       struct in6_addr *final_dst;
        struct ipv6_pinfo *np = inet6_sk(sk);
        struct net *net = sock_net(sk);
        struct ipv6hdr *hdr;
@@ -1864,9 +1874,9 @@ struct sk_buff *__ip6_make_skb(struct sock *sk,
 
        /* Allow local fragmentation. */
        skb->ignore_df = ip6_sk_ignore_df(sk);
-
-       *final_dst = fl6->daddr;
        __skb_pull(skb, skb_network_header_len(skb));
+
+       final_dst = &fl6->daddr;
        if (opt && opt->opt_flen)
                ipv6_push_frag_opts(skb, opt, &proto);
        if (opt && opt->opt_nflen)
@@ -1886,10 +1896,9 @@ struct sk_buff *__ip6_make_skb(struct sock *sk,
 
        skb->priority = sk->sk_priority;
        skb->mark = cork->base.mark;
-
        skb->tstamp = cork->base.transmit_time;
 
-       skb_dst_set(skb, dst_clone(&rt->dst));
+       ip6_cork_steal_dst(skb, cork);
        IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
        if (proto == IPPROTO_ICMPV6) {
                struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
@@ -1961,26 +1970,26 @@ struct sk_buff *ip6_make_skb(struct sock *sk,
                             int getfrag(void *from, char *to, int offset,
                                         int len, int odd, struct sk_buff *skb),
                             void *from, int length, int transhdrlen,
-                            struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
-                            struct rt6_info *rt, unsigned int flags,
-                            struct inet_cork_full *cork)
+                            struct ipcm6_cookie *ipc6, struct rt6_info *rt,
+                            unsigned int flags, struct inet_cork_full *cork)
 {
        struct inet6_cork v6_cork;
        struct sk_buff_head queue;
        int exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
        int err;
 
-       if (flags & MSG_PROBE)
+       if (flags & MSG_PROBE) {
+               dst_release(&rt->dst);
                return NULL;
+       }
 
        __skb_queue_head_init(&queue);
 
        cork->base.flags = 0;
        cork->base.addr = 0;
        cork->base.opt = NULL;
-       cork->base.dst = NULL;
        v6_cork.opt = NULL;
-       err = ip6_setup_cork(sk, cork, &v6_cork, ipc6, rt, fl6);
+       err = ip6_setup_cork(sk, cork, &v6_cork, ipc6, rt);
        if (err) {
                ip6_cork_release(cork, &v6_cork);
                return ERR_PTR(err);
@@ -1988,7 +1997,7 @@ struct sk_buff *ip6_make_skb(struct sock *sk,
        if (ipc6->dontfrag < 0)
                ipc6->dontfrag = inet6_sk(sk)->dontfrag;
 
-       err = __ip6_append_data(sk, fl6, &queue, &cork->base, &v6_cork,
+       err = __ip6_append_data(sk, &queue, cork, &v6_cork,
                                &current->task_frag, getfrag, from,
                                length + exthdrlen, transhdrlen + exthdrlen,
                                flags, ipc6);
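
The ip6_output.c hunks above add up to a tighter dst ownership contract:
the caller takes the dst reference before corking, ip6_setup_cork() stores
it in the cork straight away (so ip6_cork_release() can put it even on an
error path), the flowi6 now lives inside the cork instead of being passed
down every call, and __ip6_make_skb() moves the reference onto the skb via
ip6_cork_steal_dst() rather than cloning it. A minimal sketch of the
resulting convention, paraphrasing the hunks above rather than quoting
kernel code verbatim:

	dst_hold(&rt->dst);			/* caller takes the reference */
	err = ip6_setup_cork(sk, cork, v6_cork, ipc6, rt);
	if (err) {
		ip6_cork_release(cork, v6_cork);	/* puts cork->base.dst */
		return err;
	}
	cork->fl.u.ip6 = *fl6;			/* flow is kept in the cork */

	/* ... append data, then build the skb ... */

	skb = __ip6_make_skb(sk, queue, cork, v6_cork);
	/* ip6_cork_steal_dst() has moved the reference onto the skb:
	 * no dst_clone(), and no extra dst_release() by the caller.
	 */
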
index 97ade83..53f632a 100644 (file)
@@ -1121,6 +1121,14 @@ int ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev, __u8 dsfield,
 
                        memcpy(&fl6->daddr, addr6, sizeof(fl6->daddr));
                        neigh_release(neigh);
+               } else if (skb->protocol == htons(ETH_P_IP)) {
+                       const struct rtable *rt = skb_rtable(skb);
+
+                       if (!rt)
+                               goto tx_err_link_failure;
+
+                       if (rt->rt_gw_family == AF_INET6)
+                               memcpy(&fl6->daddr, &rt->rt_gw6, sizeof(fl6->daddr));
                }
        } else if (t->parms.proto != 0 && !(t->parms.flags &
                                            (IP6_TNL_F_USE_ORIG_TCLASS |
index 8a2db92..0ebaaec 100644 (file)
@@ -255,13 +255,12 @@ static void __net_exit ip6mr_rules_exit(struct net *net)
 {
        struct mr_table *mrt, *next;
 
-       rtnl_lock();
+       ASSERT_RTNL();
        list_for_each_entry_safe(mrt, next, &net->ipv6.mr6_tables, list) {
                list_del(&mrt->list);
                ip6mr_free_table(mrt);
        }
        fib_rules_unregister(net->ipv6.mr6_rules_ops);
-       rtnl_unlock();
 }
 
 static int ip6mr_rules_dump(struct net *net, struct notifier_block *nb,
@@ -318,10 +317,9 @@ static int __net_init ip6mr_rules_init(struct net *net)
 
 static void __net_exit ip6mr_rules_exit(struct net *net)
 {
-       rtnl_lock();
+       ASSERT_RTNL();
        ip6mr_free_table(net->ipv6.mrt6);
        net->ipv6.mrt6 = NULL;
-       rtnl_unlock();
 }
 
 static int ip6mr_rules_dump(struct net *net, struct notifier_block *nb,
@@ -734,7 +732,7 @@ static int mif6_delete(struct mr_table *mrt, int vifi, int notify,
 
        in6_dev = __in6_dev_get(dev);
        if (in6_dev) {
-               in6_dev->cnf.mc_forwarding--;
+               atomic_dec(&in6_dev->cnf.mc_forwarding);
                inet6_netconf_notify_devconf(dev_net(dev), RTM_NEWNETCONF,
                                             NETCONFA_MC_FORWARDING,
                                             dev->ifindex, &in6_dev->cnf);
@@ -902,7 +900,7 @@ static int mif6_add(struct net *net, struct mr_table *mrt,
 
        in6_dev = __in6_dev_get(dev);
        if (in6_dev) {
-               in6_dev->cnf.mc_forwarding++;
+               atomic_inc(&in6_dev->cnf.mc_forwarding);
                inet6_netconf_notify_devconf(dev_net(dev), RTM_NEWNETCONF,
                                             NETCONFA_MC_FORWARDING,
                                             dev->ifindex, &in6_dev->cnf);
@@ -1325,7 +1323,9 @@ static int __net_init ip6mr_net_init(struct net *net)
 proc_cache_fail:
        remove_proc_entry("ip6_mr_vif", net->proc_net);
 proc_vif_fail:
+       rtnl_lock();
        ip6mr_rules_exit(net);
+       rtnl_unlock();
 #endif
 ip6mr_rules_fail:
        ip6mr_notifier_exit(net);
@@ -1338,13 +1338,23 @@ static void __net_exit ip6mr_net_exit(struct net *net)
        remove_proc_entry("ip6_mr_cache", net->proc_net);
        remove_proc_entry("ip6_mr_vif", net->proc_net);
 #endif
-       ip6mr_rules_exit(net);
        ip6mr_notifier_exit(net);
 }
 
+static void __net_exit ip6mr_net_exit_batch(struct list_head *net_list)
+{
+       struct net *net;
+
+       rtnl_lock();
+       list_for_each_entry(net, net_list, exit_list)
+               ip6mr_rules_exit(net);
+       rtnl_unlock();
+}
+
 static struct pernet_operations ip6mr_net_ops = {
        .init = ip6mr_net_init,
        .exit = ip6mr_net_exit,
+       .exit_batch = ip6mr_net_exit_batch,
 };
 
 int __init ip6_mr_init(void)
@@ -1553,7 +1563,7 @@ static int ip6mr_sk_init(struct mr_table *mrt, struct sock *sk)
        } else {
                rcu_assign_pointer(mrt->mroute_sk, sk);
                sock_set_flag(sk, SOCK_RCU_FREE);
-               net->ipv6.devconf_all->mc_forwarding++;
+               atomic_inc(&net->ipv6.devconf_all->mc_forwarding);
        }
        write_unlock_bh(&mrt_lock);
 
@@ -1569,14 +1579,19 @@ static int ip6mr_sk_init(struct mr_table *mrt, struct sock *sk)
 
 int ip6mr_sk_done(struct sock *sk)
 {
-       int err = -EACCES;
        struct net *net = sock_net(sk);
+       struct ipv6_devconf *devconf;
        struct mr_table *mrt;
+       int err = -EACCES;
 
        if (sk->sk_type != SOCK_RAW ||
            inet_sk(sk)->inet_num != IPPROTO_ICMPV6)
                return err;
 
+       devconf = net->ipv6.devconf_all;
+       if (!devconf || !atomic_read(&devconf->mc_forwarding))
+               return err;
+
        rtnl_lock();
        ip6mr_for_each_table(mrt, net) {
                if (sk == rtnl_dereference(mrt->mroute_sk)) {
@@ -1586,7 +1601,7 @@ int ip6mr_sk_done(struct sock *sk)
                         * so the RCU grace period before sk freeing
                         * is guaranteed by sk_destruct()
                         */
-                       net->ipv6.devconf_all->mc_forwarding--;
+                       atomic_dec(&devconf->mc_forwarding);
                        write_unlock_bh(&mrt_lock);
                        inet6_netconf_notify_devconf(net, RTM_NEWNETCONF,
                                                     NETCONFA_MC_FORWARDING,
index 9256f6b..d5544cf 100644 (file)
@@ -59,8 +59,6 @@ static int ping_v6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
        struct pingfakehdr pfh;
        struct ipcm6_cookie ipc6;
 
-       pr_debug("ping_v6_sendmsg(sk=%p,sk->num=%u)\n", inet, inet->inet_num);
-
        err = ping_common_sendmsg(AF_INET6, msg, len, &user_icmph,
                                  sizeof(user_icmph));
        if (err)
@@ -99,6 +97,14 @@ static int ping_v6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
            (oif && sk->sk_bound_dev_if && oif != sk->sk_bound_dev_if))
                return -EINVAL;
 
+       ipcm6_init_sk(&ipc6, np);
+       ipc6.sockc.tsflags = sk->sk_tsflags;
+       ipc6.sockc.mark = sk->sk_mark;
+
+       err = sock_cmsg_send(sk, msg, &ipc6.sockc);
+       if (err)
+               return err;
+
        /* TODO: use ip6_datagram_send_ctl to get options from cmsg */
 
        memset(&fl6, 0, sizeof(fl6));
@@ -107,14 +113,12 @@ static int ping_v6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
        fl6.saddr = np->saddr;
        fl6.daddr = *daddr;
        fl6.flowi6_oif = oif;
-       fl6.flowi6_mark = sk->sk_mark;
+       fl6.flowi6_mark = ipc6.sockc.mark;
        fl6.flowi6_uid = sk->sk_uid;
        fl6.fl6_icmp_type = user_icmph.icmp6_type;
        fl6.fl6_icmp_code = user_icmph.icmp6_code;
        security_sk_classify_flow(sk, flowi6_to_flowi_common(&fl6));
 
-       ipcm6_init_sk(&ipc6, np);
-       ipc6.sockc.mark = sk->sk_mark;
        fl6.flowlabel = ip6_make_flowinfo(ipc6.tclass, fl6.flowlabel);
 
        dst = ip6_sk_dst_lookup_flow(sk, &fl6, daddr, false);
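
Initialising ipc6 and running sock_cmsg_send() before the flow is built
lets ancillary data influence ICMPv6 pings: the flow mark now comes from
ipc6.sockc.mark (which sock_cmsg_send() may override) rather than directly
from sk->sk_mark, and tx timestamp flags are picked up the same way. A
hedged userspace sketch of what this enables; the helper name is
hypothetical, and attaching SO_MARK as a cmsg typically requires
CAP_NET_ADMIN:

	#include <stdint.h>
	#include <string.h>
	#include <sys/socket.h>
	#include <netinet/in.h>

	static ssize_t send_marked_ping(int fd, const void *icmp, size_t len,
					const struct sockaddr_in6 *dst,
					uint32_t mark)
	{
		union {			/* cmsg buffer, correctly aligned */
			char buf[CMSG_SPACE(sizeof(mark))];
			struct cmsghdr align;
		} u = { 0 };
		struct iovec iov = {
			.iov_base = (void *)icmp,
			.iov_len = len,
		};
		struct msghdr msg = {
			.msg_name = (void *)dst,
			.msg_namelen = sizeof(*dst),
			.msg_iov = &iov,
			.msg_iovlen = 1,
			.msg_control = u.buf,
			.msg_controllen = sizeof(u.buf),
		};
		struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);

		/* per-sendmsg mark, consumed by sock_cmsg_send() above */
		cmsg->cmsg_level = SOL_SOCKET;
		cmsg->cmsg_type = SO_MARK;
		cmsg->cmsg_len = CMSG_LEN(sizeof(mark));
		memcpy(CMSG_DATA(cmsg), &mark, sizeof(mark));

		return sendmsg(fd, &msg, 0);
	}
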
index 075ee8a..0c648bf 100644 (file)
@@ -148,6 +148,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
        struct sockaddr_in6 *usin = (struct sockaddr_in6 *) uaddr;
        struct inet_sock *inet = inet_sk(sk);
        struct inet_connection_sock *icsk = inet_csk(sk);
+       struct inet_timewait_death_row *tcp_death_row;
        struct ipv6_pinfo *np = tcp_inet6_sk(sk);
        struct tcp_sock *tp = tcp_sk(sk);
        struct in6_addr *saddr = NULL, *final_p, final;
@@ -156,7 +157,6 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
        struct dst_entry *dst;
        int addr_type;
        int err;
-       struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
 
        if (addr_len < SIN6_LEN_RFC2133)
                return -EINVAL;
@@ -308,6 +308,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
        inet->inet_dport = usin->sin6_port;
 
        tcp_set_state(sk, TCP_SYN_SENT);
+       tcp_death_row = sock_net(sk)->ipv4.tcp_death_row;
        err = inet6_hash_connect(tcp_death_row, sk);
        if (err)
                goto late_failure;
@@ -2237,15 +2238,9 @@ static void __net_exit tcpv6_net_exit(struct net *net)
        inet_ctl_sock_destroy(net->ipv6.tcp_sk);
 }
 
-static void __net_exit tcpv6_net_exit_batch(struct list_head *net_exit_list)
-{
-       inet_twsk_purge(&tcp_hashinfo, AF_INET6);
-}
-
 static struct pernet_operations tcpv6_net_ops = {
        .init       = tcpv6_net_init,
        .exit       = tcpv6_net_exit,
-       .exit_batch = tcpv6_net_exit_batch,
 };
 
 int __init tcpv6_init(void)
index 528b81e..c687259 100644 (file)
@@ -1266,23 +1266,17 @@ static int udp_v6_push_pending_frames(struct sock *sk)
 {
        struct sk_buff *skb;
        struct udp_sock  *up = udp_sk(sk);
-       struct flowi6 fl6;
        int err = 0;
 
        if (up->pending == AF_INET)
                return udp_push_pending_frames(sk);
 
-       /* ip6_finish_skb will release the cork, so make a copy of
-        * fl6 here.
-        */
-       fl6 = inet_sk(sk)->cork.fl.u.ip6;
-
        skb = ip6_finish_skb(sk);
        if (!skb)
                goto out;
 
-       err = udp_v6_send_skb(skb, &fl6, &inet_sk(sk)->cork.base);
-
+       err = udp_v6_send_skb(skb, &inet_sk(sk)->cork.fl.u.ip6,
+                             &inet_sk(sk)->cork.base);
 out:
        up->len = 0;
        up->pending = 0;
@@ -1300,7 +1294,8 @@ int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
        struct ipv6_txoptions *opt = NULL;
        struct ipv6_txoptions *opt_to_free = NULL;
        struct ip6_flowlabel *flowlabel = NULL;
-       struct flowi6 fl6;
+       struct inet_cork_full cork;
+       struct flowi6 *fl6 = &cork.fl.u.ip6;
        struct dst_entry *dst;
        struct ipcm6_cookie ipc6;
        int addr_len = msg->msg_namelen;
@@ -1363,9 +1358,6 @@ do_udp_sendmsg:
                }
        }
 
-       if (up->pending == AF_INET)
-               return udp_sendmsg(sk, msg, len);
-
        /* Rough check on arithmetic overflow,
           better check is made in ip6_append_data().
           */
@@ -1374,6 +1366,8 @@ do_udp_sendmsg:
 
        getfrag  =  is_udplite ?  udplite_getfrag : ip_generic_getfrag;
        if (up->pending) {
+               if (up->pending == AF_INET)
+                       return udp_sendmsg(sk, msg, len);
                /*
                 * There are pending frames.
                 * The socket lock must be held while it's corked.
@@ -1391,19 +1385,19 @@ do_udp_sendmsg:
        }
        ulen += sizeof(struct udphdr);
 
-       memset(&fl6, 0, sizeof(fl6));
+       memset(fl6, 0, sizeof(*fl6));
 
        if (sin6) {
                if (sin6->sin6_port == 0)
                        return -EINVAL;
 
-               fl6.fl6_dport = sin6->sin6_port;
+               fl6->fl6_dport = sin6->sin6_port;
                daddr = &sin6->sin6_addr;
 
                if (np->sndflow) {
-                       fl6.flowlabel = sin6->sin6_flowinfo&IPV6_FLOWINFO_MASK;
-                       if (fl6.flowlabel&IPV6_FLOWLABEL_MASK) {
-                               flowlabel = fl6_sock_lookup(sk, fl6.flowlabel);
+                       fl6->flowlabel = sin6->sin6_flowinfo&IPV6_FLOWINFO_MASK;
+                       if (fl6->flowlabel & IPV6_FLOWLABEL_MASK) {
+                               flowlabel = fl6_sock_lookup(sk, fl6->flowlabel);
                                if (IS_ERR(flowlabel))
                                        return -EINVAL;
                        }
@@ -1420,24 +1414,24 @@ do_udp_sendmsg:
                if (addr_len >= sizeof(struct sockaddr_in6) &&
                    sin6->sin6_scope_id &&
                    __ipv6_addr_needs_scope_id(__ipv6_addr_type(daddr)))
-                       fl6.flowi6_oif = sin6->sin6_scope_id;
+                       fl6->flowi6_oif = sin6->sin6_scope_id;
        } else {
                if (sk->sk_state != TCP_ESTABLISHED)
                        return -EDESTADDRREQ;
 
-               fl6.fl6_dport = inet->inet_dport;
+               fl6->fl6_dport = inet->inet_dport;
                daddr = &sk->sk_v6_daddr;
-               fl6.flowlabel = np->flow_label;
+               fl6->flowlabel = np->flow_label;
                connected = true;
        }
 
-       if (!fl6.flowi6_oif)
-               fl6.flowi6_oif = sk->sk_bound_dev_if;
+       if (!fl6->flowi6_oif)
+               fl6->flowi6_oif = sk->sk_bound_dev_if;
 
-       if (!fl6.flowi6_oif)
-               fl6.flowi6_oif = np->sticky_pktinfo.ipi6_ifindex;
+       if (!fl6->flowi6_oif)
+               fl6->flowi6_oif = np->sticky_pktinfo.ipi6_ifindex;
 
-       fl6.flowi6_uid = sk->sk_uid;
+       fl6->flowi6_uid = sk->sk_uid;
 
        if (msg->msg_controllen) {
                opt = &opt_space;
@@ -1447,14 +1441,14 @@ do_udp_sendmsg:
 
                err = udp_cmsg_send(sk, msg, &ipc6.gso_size);
                if (err > 0)
-                       err = ip6_datagram_send_ctl(sock_net(sk), sk, msg, &fl6,
+                       err = ip6_datagram_send_ctl(sock_net(sk), sk, msg, fl6,
                                                    &ipc6);
                if (err < 0) {
                        fl6_sock_release(flowlabel);
                        return err;
                }
-               if ((fl6.flowlabel&IPV6_FLOWLABEL_MASK) && !flowlabel) {
-                       flowlabel = fl6_sock_lookup(sk, fl6.flowlabel);
+               if ((fl6->flowlabel&IPV6_FLOWLABEL_MASK) && !flowlabel) {
+                       flowlabel = fl6_sock_lookup(sk, fl6->flowlabel);
                        if (IS_ERR(flowlabel))
                                return -EINVAL;
                }
@@ -1471,16 +1465,17 @@ do_udp_sendmsg:
        opt = ipv6_fixup_options(&opt_space, opt);
        ipc6.opt = opt;
 
-       fl6.flowi6_proto = sk->sk_protocol;
-       fl6.flowi6_mark = ipc6.sockc.mark;
-       fl6.daddr = *daddr;
-       if (ipv6_addr_any(&fl6.saddr) && !ipv6_addr_any(&np->saddr))
-               fl6.saddr = np->saddr;
-       fl6.fl6_sport = inet->inet_sport;
+       fl6->flowi6_proto = sk->sk_protocol;
+       fl6->flowi6_mark = ipc6.sockc.mark;
+       fl6->daddr = *daddr;
+       if (ipv6_addr_any(&fl6->saddr) && !ipv6_addr_any(&np->saddr))
+               fl6->saddr = np->saddr;
+       fl6->fl6_sport = inet->inet_sport;
 
        if (cgroup_bpf_enabled(CGROUP_UDP6_SENDMSG) && !connected) {
                err = BPF_CGROUP_RUN_PROG_UDP6_SENDMSG_LOCK(sk,
-                                          (struct sockaddr *)sin6, &fl6.saddr);
+                                          (struct sockaddr *)sin6,
+                                          &fl6->saddr);
                if (err)
                        goto out_no_dst;
                if (sin6) {
@@ -1496,32 +1491,32 @@ do_udp_sendmsg:
                                err = -EINVAL;
                                goto out_no_dst;
                        }
-                       fl6.fl6_dport = sin6->sin6_port;
-                       fl6.daddr = sin6->sin6_addr;
+                       fl6->fl6_dport = sin6->sin6_port;
+                       fl6->daddr = sin6->sin6_addr;
                }
        }
 
-       if (ipv6_addr_any(&fl6.daddr))
-               fl6.daddr.s6_addr[15] = 0x1; /* :: means loopback (BSD'ism) */
+       if (ipv6_addr_any(&fl6->daddr))
+               fl6->daddr.s6_addr[15] = 0x1; /* :: means loopback (BSD'ism) */
 
-       final_p = fl6_update_dst(&fl6, opt, &final);
+       final_p = fl6_update_dst(fl6, opt, &final);
        if (final_p)
                connected = false;
 
-       if (!fl6.flowi6_oif && ipv6_addr_is_multicast(&fl6.daddr)) {
-               fl6.flowi6_oif = np->mcast_oif;
+       if (!fl6->flowi6_oif && ipv6_addr_is_multicast(&fl6->daddr)) {
+               fl6->flowi6_oif = np->mcast_oif;
                connected = false;
-       } else if (!fl6.flowi6_oif)
-               fl6.flowi6_oif = np->ucast_oif;
+       } else if (!fl6->flowi6_oif)
+               fl6->flowi6_oif = np->ucast_oif;
 
-       security_sk_classify_flow(sk, flowi6_to_flowi_common(&fl6));
+       security_sk_classify_flow(sk, flowi6_to_flowi_common(fl6));
 
        if (ipc6.tclass < 0)
                ipc6.tclass = np->tclass;
 
-       fl6.flowlabel = ip6_make_flowinfo(ipc6.tclass, fl6.flowlabel);
+       fl6->flowlabel = ip6_make_flowinfo(ipc6.tclass, fl6->flowlabel);
 
-       dst = ip6_sk_dst_lookup_flow(sk, &fl6, final_p, connected);
+       dst = ip6_sk_dst_lookup_flow(sk, fl6, final_p, connected);
        if (IS_ERR(dst)) {
                err = PTR_ERR(dst);
                dst = NULL;
@@ -1529,7 +1524,7 @@ do_udp_sendmsg:
        }
 
        if (ipc6.hlimit < 0)
-               ipc6.hlimit = ip6_sk_dst_hoplimit(np, &fl6, dst);
+               ipc6.hlimit = ip6_sk_dst_hoplimit(np, fl6, dst);
 
        if (msg->msg_flags&MSG_CONFIRM)
                goto do_confirm;
@@ -1537,17 +1532,17 @@ back_from_confirm:
 
        /* Lockless fast path for the non-corking case */
        if (!corkreq) {
-               struct inet_cork_full cork;
                struct sk_buff *skb;
 
                skb = ip6_make_skb(sk, getfrag, msg, ulen,
                                   sizeof(struct udphdr), &ipc6,
-                                  &fl6, (struct rt6_info *)dst,
+                                  (struct rt6_info *)dst,
                                   msg->msg_flags, &cork);
                err = PTR_ERR(skb);
                if (!IS_ERR_OR_NULL(skb))
-                       err = udp_v6_send_skb(skb, &fl6, &cork.base);
-               goto out;
+                       err = udp_v6_send_skb(skb, fl6, &cork.base);
+               /* ip6_make_skb() steals the dst reference */
+               goto out_no_dst;
        }
 
        lock_sock(sk);
@@ -1568,7 +1563,7 @@ do_append_data:
                ipc6.dontfrag = np->dontfrag;
        up->len += ulen;
        err = ip6_append_data(sk, getfrag, msg, ulen, sizeof(struct udphdr),
-                             &ipc6, &fl6, (struct rt6_info *)dst,
+                             &ipc6, fl6, (struct rt6_info *)dst,
                              corkreq ? msg->msg_flags|MSG_MORE : msg->msg_flags);
        if (err)
                udp_v6_flush_pending_frames(sk);
@@ -1603,7 +1598,7 @@ out_no_dst:
 
 do_confirm:
        if (msg->msg_flags & MSG_PROBE)
-               dst_confirm_neigh(dst, &fl6.daddr);
+               dst_confirm_neigh(dst, &fl6->daddr);
        if (!(msg->msg_flags&MSG_PROBE) || len)
                goto back_from_confirm;
        err = 0;
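
With the flow stored in the cork (cork.fl.u.ip6), the lockless non-corked
path must also follow the new ownership rule: ip6_make_skb() consumes the
dst reference in every outcome, including MSG_PROBE and errors, which is
why the fast path above jumps to out_no_dst instead of out (out would
dst_release() a reference the caller no longer owns). A condensed sketch
of the reference flow, paraphrasing the code above:

	dst = ip6_sk_dst_lookup_flow(sk, fl6, final_p, connected); /* +1 ref */

	skb = ip6_make_skb(sk, getfrag, msg, ulen, sizeof(struct udphdr),
			   &ipc6, (struct rt6_info *)dst, msg->msg_flags,
			   &cork);
	/* the reference now belongs to ip6_make_skb(): stolen onto the
	 * skb on success, released internally on MSG_PROBE or error
	 */
	if (!IS_ERR_OR_NULL(skb))
		err = udp_v6_send_skb(skb, &cork.fl.u.ip6, &cork.base);
	goto out_no_dst;	/* must skip the dst_release() at out: */
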
index c921de6..f0702d9 100644 (file)
@@ -6,6 +6,7 @@
  * Copyright (c) 2021 Google
  */
 
+#include <linux/compat.h>
 #include <linux/if_arp.h>
 #include <linux/net.h>
 #include <linux/mctp.h>
@@ -21,6 +22,8 @@
 
 /* socket implementation */
 
+static void mctp_sk_expire_keys(struct timer_list *timer);
+
 static int mctp_release(struct socket *sock)
 {
        struct sock *sk = sock->sk;
@@ -99,13 +102,20 @@ static int mctp_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
        struct sk_buff *skb;
 
        if (addr) {
+               const u8 tagbits = MCTP_TAG_MASK | MCTP_TAG_OWNER |
+                       MCTP_TAG_PREALLOC;
+
                if (addrlen < sizeof(struct sockaddr_mctp))
                        return -EINVAL;
                if (addr->smctp_family != AF_MCTP)
                        return -EINVAL;
                if (!mctp_sockaddr_is_ok(addr))
                        return -EINVAL;
-               if (addr->smctp_tag & ~(MCTP_TAG_MASK | MCTP_TAG_OWNER))
+               if (addr->smctp_tag & ~tagbits)
+                       return -EINVAL;
+               /* can't preallocate a non-owned tag */
+               if (addr->smctp_tag & MCTP_TAG_PREALLOC &&
+                   !(addr->smctp_tag & MCTP_TAG_OWNER))
                        return -EINVAL;
 
        } else {
@@ -248,6 +258,32 @@ out_free:
        return rc;
 }
 
+/* We're done with the key; invalidate, stop reassembly, and remove from lists.
+ */
+static void __mctp_key_remove(struct mctp_sk_key *key, struct net *net,
+                             unsigned long flags, unsigned long reason)
+__releases(&key->lock)
+__must_hold(&net->mctp.keys_lock)
+{
+       struct sk_buff *skb;
+
+       trace_mctp_key_release(key, reason);
+       skb = key->reasm_head;
+       key->reasm_head = NULL;
+       key->reasm_dead = true;
+       key->valid = false;
+       mctp_dev_release_key(key->dev, key);
+       spin_unlock_irqrestore(&key->lock, flags);
+
+       hlist_del(&key->hlist);
+       hlist_del(&key->sklist);
+
+       /* unref for the lists */
+       mctp_key_unref(key);
+
+       kfree_skb(skb);
+}
+
 static int mctp_setsockopt(struct socket *sock, int level, int optname,
                           sockptr_t optval, unsigned int optlen)
 {
@@ -293,6 +329,115 @@ static int mctp_getsockopt(struct socket *sock, int level, int optname,
        return -EINVAL;
 }
 
+static int mctp_ioctl_alloctag(struct mctp_sock *msk, unsigned long arg)
+{
+       struct net *net = sock_net(&msk->sk);
+       struct mctp_sk_key *key = NULL;
+       struct mctp_ioc_tag_ctl ctl;
+       unsigned long flags;
+       u8 tag;
+
+       if (copy_from_user(&ctl, (void __user *)arg, sizeof(ctl)))
+               return -EFAULT;
+
+       if (ctl.tag)
+               return -EINVAL;
+
+       if (ctl.flags)
+               return -EINVAL;
+
+       key = mctp_alloc_local_tag(msk, ctl.peer_addr, MCTP_ADDR_ANY,
+                                  true, &tag);
+       if (IS_ERR(key))
+               return PTR_ERR(key);
+
+       ctl.tag = tag | MCTP_TAG_OWNER | MCTP_TAG_PREALLOC;
+       if (copy_to_user((void __user *)arg, &ctl, sizeof(ctl))) {
+               spin_lock_irqsave(&key->lock, flags);
+               __mctp_key_remove(key, net, flags, MCTP_TRACE_KEY_DROPPED);
+               mctp_key_unref(key);
+               return -EFAULT;
+       }
+
+       mctp_key_unref(key);
+       return 0;
+}
+
+static int mctp_ioctl_droptag(struct mctp_sock *msk, unsigned long arg)
+{
+       struct net *net = sock_net(&msk->sk);
+       struct mctp_ioc_tag_ctl ctl;
+       unsigned long flags, fl2;
+       struct mctp_sk_key *key;
+       struct hlist_node *tmp;
+       int rc;
+       u8 tag;
+
+       if (copy_from_user(&ctl, (void __user *)arg, sizeof(ctl)))
+               return -EFAULT;
+
+       if (ctl.flags)
+               return -EINVAL;
+
+       /* Must be a local tag, TO set, preallocated */
+       if ((ctl.tag & ~MCTP_TAG_MASK) != (MCTP_TAG_OWNER | MCTP_TAG_PREALLOC))
+               return -EINVAL;
+
+       tag = ctl.tag & MCTP_TAG_MASK;
+       rc = -EINVAL;
+
+       spin_lock_irqsave(&net->mctp.keys_lock, flags);
+       hlist_for_each_entry_safe(key, tmp, &msk->keys, sklist) {
+               /* we do an irqsave here, even though we know the irq state,
+                * so we have the flags to pass to __mctp_key_remove
+                */
+               spin_lock_irqsave(&key->lock, fl2);
+               if (key->manual_alloc &&
+                   ctl.peer_addr == key->peer_addr &&
+                   tag == key->tag) {
+                       __mctp_key_remove(key, net, fl2,
+                                         MCTP_TRACE_KEY_DROPPED);
+                       rc = 0;
+               } else {
+                       spin_unlock_irqrestore(&key->lock, fl2);
+               }
+       }
+       spin_unlock_irqrestore(&net->mctp.keys_lock, flags);
+
+       return rc;
+}
+
+static int mctp_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
+{
+       struct mctp_sock *msk = container_of(sock->sk, struct mctp_sock, sk);
+
+       switch (cmd) {
+       case SIOCMCTPALLOCTAG:
+               return mctp_ioctl_alloctag(msk, arg);
+       case SIOCMCTPDROPTAG:
+               return mctp_ioctl_droptag(msk, arg);
+       }
+
+       return -EINVAL;
+}
+
+#ifdef CONFIG_COMPAT
+static int mctp_compat_ioctl(struct socket *sock, unsigned int cmd,
+                            unsigned long arg)
+{
+       void __user *argp = compat_ptr(arg);
+
+       switch (cmd) {
+       /* These have compatible ptr layouts */
+       case SIOCMCTPALLOCTAG:
+       case SIOCMCTPDROPTAG:
+               return mctp_ioctl(sock, cmd, (unsigned long)argp);
+       }
+
+       return -ENOIOCTLCMD;
+}
+#endif
+
 static const struct proto_ops mctp_dgram_ops = {
        .family         = PF_MCTP,
        .release        = mctp_release,
@@ -302,7 +447,7 @@ static const struct proto_ops mctp_dgram_ops = {
        .accept         = sock_no_accept,
        .getname        = sock_no_getname,
        .poll           = datagram_poll,
-       .ioctl          = sock_no_ioctl,
+       .ioctl          = mctp_ioctl,
        .gettstamp      = sock_gettstamp,
        .listen         = sock_no_listen,
        .shutdown       = sock_no_shutdown,
@@ -312,6 +457,9 @@ static const struct proto_ops mctp_dgram_ops = {
        .recvmsg        = mctp_recvmsg,
        .mmap           = sock_no_mmap,
        .sendpage       = sock_no_sendpage,
+#ifdef CONFIG_COMPAT
+       .compat_ioctl   = mctp_compat_ioctl,
+#endif
 };
 
 static void mctp_sk_expire_keys(struct timer_list *timer)
@@ -319,7 +467,7 @@ static void mctp_sk_expire_keys(struct timer_list *timer)
        struct mctp_sock *msk = container_of(timer, struct mctp_sock,
                                             key_expiry);
        struct net *net = sock_net(&msk->sk);
-       unsigned long next_expiry, flags;
+       unsigned long next_expiry, flags, fl2;
        struct mctp_sk_key *key;
        struct hlist_node *tmp;
        bool next_expiry_valid = false;
@@ -327,15 +475,16 @@ static void mctp_sk_expire_keys(struct timer_list *timer)
        spin_lock_irqsave(&net->mctp.keys_lock, flags);
 
        hlist_for_each_entry_safe(key, tmp, &msk->keys, sklist) {
-               spin_lock(&key->lock);
+               /* manually-allocated keys don't expire. manual_alloc is
+                * immutable, so no locking is required to read it.
+                */
+               if (key->manual_alloc)
+                       continue;
 
+               spin_lock_irqsave(&key->lock, fl2);
                if (!time_after_eq(key->expiry, jiffies)) {
-                       trace_mctp_key_release(key, MCTP_TRACE_KEY_TIMEOUT);
-                       key->valid = false;
-                       hlist_del_rcu(&key->hlist);
-                       hlist_del_rcu(&key->sklist);
-                       spin_unlock(&key->lock);
-                       mctp_key_unref(key);
+                       __mctp_key_remove(key, net, fl2,
+                                         MCTP_TRACE_KEY_TIMEOUT);
                        continue;
                }
 
@@ -346,7 +495,7 @@ static void mctp_sk_expire_keys(struct timer_list *timer)
                        next_expiry = key->expiry;
                        next_expiry_valid = true;
                }
-               spin_unlock(&key->lock);
+               spin_unlock_irqrestore(&key->lock, fl2);
        }
 
        spin_unlock_irqrestore(&net->mctp.keys_lock, flags);
@@ -387,9 +536,9 @@ static void mctp_sk_unhash(struct sock *sk)
 {
        struct mctp_sock *msk = container_of(sk, struct mctp_sock, sk);
        struct net *net = sock_net(sk);
+       unsigned long flags, fl2;
        struct mctp_sk_key *key;
        struct hlist_node *tmp;
-       unsigned long flags;
 
        /* remove from any type-based binds */
        mutex_lock(&net->mctp.bind_lock);
@@ -399,20 +548,8 @@ static void mctp_sk_unhash(struct sock *sk)
        /* remove tag allocations */
        spin_lock_irqsave(&net->mctp.keys_lock, flags);
        hlist_for_each_entry_safe(key, tmp, &msk->keys, sklist) {
-               hlist_del(&key->sklist);
-               hlist_del(&key->hlist);
-
-               trace_mctp_key_release(key, MCTP_TRACE_KEY_CLOSED);
-
-               spin_lock(&key->lock);
-               kfree_skb(key->reasm_head);
-               key->reasm_head = NULL;
-               key->reasm_dead = true;
-               key->valid = false;
-               spin_unlock(&key->lock);
-
-               /* key is no longer on the lookup lists, unref */
-               mctp_key_unref(key);
+               spin_lock_irqsave(&key->lock, fl2);
+               __mctp_key_remove(key, net, fl2, MCTP_TRACE_KEY_CLOSED);
        }
        spin_unlock_irqrestore(&net->mctp.keys_lock, flags);
 }
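
The two new ioctls give userspace explicit control over tag lifetime:
allocate a tag once, reuse it across several messages, then drop it. A
hedged sketch of the intended flow; struct and constant names are the ones
introduced above, while the helper name, message type, and error handling
are illustrative only:

	#include <string.h>
	#include <sys/ioctl.h>
	#include <sys/socket.h>
	#include <linux/mctp.h>	/* struct mctp_ioc_tag_ctl, SIOCMCTP* */

	static int mctp_send_prealloc(int sd, mctp_eid_t peer,
				      const void *buf, size_t len)
	{
		struct mctp_ioc_tag_ctl ctl = { .peer_addr = peer };
		struct sockaddr_mctp addr = {
			.smctp_family = AF_MCTP,
			.smctp_network = MCTP_NET_ANY,
			.smctp_addr.s_addr = peer,
			.smctp_type = 1,	/* hypothetical message type */
		};
		int rc;

		if (ioctl(sd, SIOCMCTPALLOCTAG, &ctl) < 0)
			return -1;

		/* ctl.tag comes back as value | MCTP_TAG_OWNER |
		 * MCTP_TAG_PREALLOC, exactly the form mctp_sendmsg()
		 * accepts and mctp_ioctl_droptag() expects in return.
		 */
		addr.smctp_tag = ctl.tag;
		rc = sendto(sd, buf, len, 0,
			    (struct sockaddr *)&addr, sizeof(addr));

		ioctl(sd, SIOCMCTPDROPTAG, &ctl);	/* release the key */
		return rc < 0 ? -1 : 0;
	}
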
index ef2755f..02ddc0f 100644 (file)
@@ -6,6 +6,7 @@
  * Copyright (c) 2021 Google
  */
 
+#include <linux/if_arp.h>
 #include <linux/if_link.h>
 #include <linux/mctp.h>
 #include <linux/netdevice.h>
index 8d9f4ff..17e3482 100644 (file)
@@ -64,8 +64,7 @@ static struct mctp_sock *mctp_lookup_bind(struct net *net, struct sk_buff *skb)
                if (msk->bind_type != type)
                        continue;
 
-               if (msk->bind_addr != MCTP_ADDR_ANY &&
-                   msk->bind_addr != mh->dest)
+               if (!mctp_address_matches(msk->bind_addr, mh->dest))
                        continue;
 
                return msk;
@@ -77,7 +76,7 @@ static struct mctp_sock *mctp_lookup_bind(struct net *net, struct sk_buff *skb)
 static bool mctp_key_match(struct mctp_sk_key *key, mctp_eid_t local,
                           mctp_eid_t peer, u8 tag)
 {
-       if (key->local_addr != local)
+       if (!mctp_address_matches(key->local_addr, local))
                return false;
 
        if (key->peer_addr != peer)
@@ -204,29 +203,38 @@ static int mctp_key_add(struct mctp_sk_key *key, struct mctp_sock *msk)
        return rc;
 }
 
-/* We're done with the key; unset valid and remove from lists. There may still
- * be outstanding refs on the key though...
+/* Helper for mctp_route_input().
+ * We're done with the key; unlock and unref the key.
+ * For the usual case of automatic expiry we remove the key from lists.
+ * If manual allocation is set on the key, we release the lock and the
+ * local ref and reset reassembly, but don't remove it from the lists.
  */
-static void __mctp_key_unlock_drop(struct mctp_sk_key *key, struct net *net,
-                                  unsigned long flags)
-       __releases(&key->lock)
+static void __mctp_key_done_in(struct mctp_sk_key *key, struct net *net,
+                              unsigned long flags, unsigned long reason)
+__releases(&key->lock)
 {
        struct sk_buff *skb;
 
+       trace_mctp_key_release(key, reason);
        skb = key->reasm_head;
        key->reasm_head = NULL;
-       key->reasm_dead = true;
-       key->valid = false;
-       mctp_dev_release_key(key->dev, key);
+
+       if (!key->manual_alloc) {
+               key->reasm_dead = true;
+               key->valid = false;
+               mctp_dev_release_key(key->dev, key);
+       }
        spin_unlock_irqrestore(&key->lock, flags);
 
-       spin_lock_irqsave(&net->mctp.keys_lock, flags);
-       hlist_del(&key->hlist);
-       hlist_del(&key->sklist);
-       spin_unlock_irqrestore(&net->mctp.keys_lock, flags);
+       if (!key->manual_alloc) {
+               spin_lock_irqsave(&net->mctp.keys_lock, flags);
+               hlist_del(&key->hlist);
+               hlist_del(&key->sklist);
+               spin_unlock_irqrestore(&net->mctp.keys_lock, flags);
 
-       /* one unref for the lists */
-       mctp_key_unref(key);
+               /* unref for the lists */
+               mctp_key_unref(key);
+       }
 
        /* and one for the local reference */
        mctp_key_unref(key);
@@ -380,9 +388,8 @@ static int mctp_route_input(struct mctp_route *route, struct sk_buff *skb)
                                /* we've hit a pending reassembly; not much we
                                 * can do but drop it
                                 */
-                               trace_mctp_key_release(key,
-                                                      MCTP_TRACE_KEY_REPLIED);
-                               __mctp_key_unlock_drop(key, net, f);
+                               __mctp_key_done_in(key, net, f,
+                                                  MCTP_TRACE_KEY_REPLIED);
                                key = NULL;
                        }
                        rc = 0;
@@ -424,9 +431,8 @@ static int mctp_route_input(struct mctp_route *route, struct sk_buff *skb)
                } else {
                        if (key->reasm_head || key->reasm_dead) {
                                /* duplicate start? drop everything */
-                               trace_mctp_key_release(key,
-                                                      MCTP_TRACE_KEY_INVALIDATED);
-                               __mctp_key_unlock_drop(key, net, f);
+                               __mctp_key_done_in(key, net, f,
+                                                  MCTP_TRACE_KEY_INVALIDATED);
                                rc = -EEXIST;
                                key = NULL;
                        } else {
@@ -449,10 +455,10 @@ static int mctp_route_input(struct mctp_route *route, struct sk_buff *skb)
                 * the reassembly/response key
                 */
                if (!rc && flags & MCTP_HDR_FLAG_EOM) {
+                       msk = container_of(key->sk, struct mctp_sock, sk);
                        sock_queue_rcv_skb(key->sk, key->reasm_head);
                        key->reasm_head = NULL;
-                       trace_mctp_key_release(key, MCTP_TRACE_KEY_REPLIED);
-                       __mctp_key_unlock_drop(key, net, f);
+                       __mctp_key_done_in(key, net, f, MCTP_TRACE_KEY_REPLIED);
                        key = NULL;
                }
 
@@ -580,9 +586,9 @@ static void mctp_reserve_tag(struct net *net, struct mctp_sk_key *key,
 /* Allocate a locally-owned tag value for (saddr, daddr), and reserve
  * it for the socket msk
  */
-static struct mctp_sk_key *mctp_alloc_local_tag(struct mctp_sock *msk,
-                                               mctp_eid_t saddr,
-                                               mctp_eid_t daddr, u8 *tagp)
+struct mctp_sk_key *mctp_alloc_local_tag(struct mctp_sock *msk,
+                                        mctp_eid_t daddr, mctp_eid_t saddr,
+                                        bool manual, u8 *tagp)
 {
        struct net *net = sock_net(&msk->sk);
        struct netns_mctp *mns = &net->mctp;
@@ -616,9 +622,8 @@ static struct mctp_sk_key *mctp_alloc_local_tag(struct mctp_sock *msk,
                if (tmp->tag & MCTP_HDR_FLAG_TO)
                        continue;
 
-               if (!((tmp->peer_addr == daddr ||
-                      tmp->peer_addr == MCTP_ADDR_ANY) &&
-                      tmp->local_addr == saddr))
+               if (!(mctp_address_matches(tmp->peer_addr, daddr) &&
+                     mctp_address_matches(tmp->local_addr, saddr)))
                        continue;
 
                spin_lock(&tmp->lock);
@@ -638,6 +643,7 @@ static struct mctp_sk_key *mctp_alloc_local_tag(struct mctp_sock *msk,
                mctp_reserve_tag(net, key, msk);
                trace_mctp_key_acquire(key);
 
+               key->manual_alloc = manual;
                *tagp = key->tag;
        }
 
@@ -651,6 +657,50 @@ static struct mctp_sk_key *mctp_alloc_local_tag(struct mctp_sock *msk,
        return key;
 }
 
+static struct mctp_sk_key *mctp_lookup_prealloc_tag(struct mctp_sock *msk,
+                                                   mctp_eid_t daddr,
+                                                   u8 req_tag, u8 *tagp)
+{
+       struct net *net = sock_net(&msk->sk);
+       struct netns_mctp *mns = &net->mctp;
+       struct mctp_sk_key *key, *tmp;
+       unsigned long flags;
+
+       req_tag &= ~(MCTP_TAG_PREALLOC | MCTP_TAG_OWNER);
+       key = NULL;
+
+       spin_lock_irqsave(&mns->keys_lock, flags);
+
+       hlist_for_each_entry(tmp, &mns->keys, hlist) {
+               if (tmp->tag != req_tag)
+                       continue;
+
+               if (!mctp_address_matches(tmp->peer_addr, daddr))
+                       continue;
+
+               if (!tmp->manual_alloc)
+                       continue;
+
+               spin_lock(&tmp->lock);
+               if (tmp->valid) {
+                       key = tmp;
+                       refcount_inc(&key->refs);
+                       spin_unlock(&tmp->lock);
+                       break;
+               }
+               spin_unlock(&tmp->lock);
+       }
+       spin_unlock_irqrestore(&mns->keys_lock, flags);
+
+       if (!key)
+               return ERR_PTR(-ENOENT);
+
+       if (tagp)
+               *tagp = key->tag;
+
+       return key;
+}
+
 /* routing lookups */
 static bool mctp_rt_match_eid(struct mctp_route *rt,
                              unsigned int net, mctp_eid_t eid)
@@ -845,8 +895,14 @@ int mctp_local_output(struct sock *sk, struct mctp_route *rt,
        if (rc)
                goto out_release;
 
-       if (req_tag & MCTP_HDR_FLAG_TO) {
-               key = mctp_alloc_local_tag(msk, saddr, daddr, &tag);
+       if (req_tag & MCTP_TAG_OWNER) {
+               if (req_tag & MCTP_TAG_PREALLOC)
+                       key = mctp_lookup_prealloc_tag(msk, daddr,
+                                                      req_tag, &tag);
+               else
+                       key = mctp_alloc_local_tag(msk, daddr, saddr,
+                                                  false, &tag);
+
                if (IS_ERR(key)) {
                        rc = PTR_ERR(key);
                        goto out_release;
@@ -857,7 +913,7 @@ int mctp_local_output(struct sock *sk, struct mctp_route *rt,
                tag |= MCTP_HDR_FLAG_TO;
        } else {
                key = NULL;
-               tag = req_tag;
+               tag = req_tag & MCTP_TAG_MASK;
        }
 
        skb->protocol = htons(ETH_P_MCTP);
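
Taken together with the af_mctp.c changes, key lifetime now splits into
two classes. A hedged summary of the hunks above, not normative
documentation:

	/* MCTP key lifetime after this series:
	 *
	 * automatic (allocated by sendmsg with MCTP_TAG_OWNER):
	 *	removed on reply completion (EOM), duplicate-start or
	 *	reassembly error, timeout in mctp_sk_expire_keys(), or
	 *	socket close.
	 *
	 * manual (allocated via SIOCMCTPALLOCTAG):
	 *	skipped by the expiry timer and kept on the lookup lists
	 *	across replies by __mctp_key_done_in(); removed only by
	 *	SIOCMCTPDROPTAG or socket close.
	 */
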
index 750f9f9..61205cf 100644 (file)
@@ -369,14 +369,15 @@ static void mctp_test_route_input_sk(struct kunit *test)
 
 #define FL_S   (MCTP_HDR_FLAG_SOM)
 #define FL_E   (MCTP_HDR_FLAG_EOM)
-#define FL_T   (MCTP_HDR_FLAG_TO)
+#define FL_TO  (MCTP_HDR_FLAG_TO)
+#define FL_T(t)        ((t) & MCTP_HDR_TAG_MASK)
 
 static const struct mctp_route_input_sk_test mctp_route_input_sk_tests[] = {
-       { .hdr = RX_HDR(1, 10, 8, FL_S | FL_E | FL_T), .type = 0, .deliver = true },
-       { .hdr = RX_HDR(1, 10, 8, FL_S | FL_E | FL_T), .type = 1, .deliver = false },
+       { .hdr = RX_HDR(1, 10, 8, FL_S | FL_E | FL_TO), .type = 0, .deliver = true },
+       { .hdr = RX_HDR(1, 10, 8, FL_S | FL_E | FL_TO), .type = 1, .deliver = false },
        { .hdr = RX_HDR(1, 10, 8, FL_S | FL_E), .type = 0, .deliver = false },
-       { .hdr = RX_HDR(1, 10, 8, FL_E | FL_T), .type = 0, .deliver = false },
-       { .hdr = RX_HDR(1, 10, 8, FL_T), .type = 0, .deliver = false },
+       { .hdr = RX_HDR(1, 10, 8, FL_E | FL_TO), .type = 0, .deliver = false },
+       { .hdr = RX_HDR(1, 10, 8, FL_TO), .type = 0, .deliver = false },
        { .hdr = RX_HDR(1, 10, 8, 0), .type = 0, .deliver = false },
 };
 
@@ -436,7 +437,7 @@ static void mctp_test_route_input_sk_reasm(struct kunit *test)
        __mctp_route_test_fini(test, dev, rt, sock);
 }
 
-#define RX_FRAG(f, s) RX_HDR(1, 10, 8, FL_T | (f) | ((s) << MCTP_HDR_SEQ_SHIFT))
+#define RX_FRAG(f, s) RX_HDR(1, 10, 8, FL_TO | (f) | ((s) << MCTP_HDR_SEQ_SHIFT))
 
 static const struct mctp_route_input_sk_reasm_test mctp_route_input_sk_reasm_tests[] = {
        {
@@ -522,12 +523,156 @@ static void mctp_route_input_sk_reasm_to_desc(
 KUNIT_ARRAY_PARAM(mctp_route_input_sk_reasm, mctp_route_input_sk_reasm_tests,
                  mctp_route_input_sk_reasm_to_desc);
 
+struct mctp_route_input_sk_keys_test {
+       const char      *name;
+       mctp_eid_t      key_peer_addr;
+       mctp_eid_t      key_local_addr;
+       u8              key_tag;
+       struct mctp_hdr hdr;
+       bool            deliver;
+};
+
+/* test packet rx in the presence of various key configurations */
+static void mctp_test_route_input_sk_keys(struct kunit *test)
+{
+       const struct mctp_route_input_sk_keys_test *params;
+       struct mctp_test_route *rt;
+       struct sk_buff *skb, *skb2;
+       struct mctp_test_dev *dev;
+       struct mctp_sk_key *key;
+       struct netns_mctp *mns;
+       struct mctp_sock *msk;
+       struct socket *sock;
+       unsigned long flags;
+       int rc;
+       u8 c;
+
+       params = test->param_value;
+
+       dev = mctp_test_create_dev();
+       KUNIT_ASSERT_NOT_ERR_OR_NULL(test, dev);
+
+       rt = mctp_test_create_route(&init_net, dev->mdev, 8, 68);
+       KUNIT_ASSERT_NOT_ERR_OR_NULL(test, rt);
+
+       rc = sock_create_kern(&init_net, AF_MCTP, SOCK_DGRAM, 0, &sock);
+       KUNIT_ASSERT_EQ(test, rc, 0);
+
+       msk = container_of(sock->sk, struct mctp_sock, sk);
+       mns = &sock_net(sock->sk)->mctp;
+
+       /* set the incoming tag according to test params */
+       key = mctp_key_alloc(msk, params->key_local_addr, params->key_peer_addr,
+                            params->key_tag, GFP_KERNEL);
+
+       KUNIT_ASSERT_NOT_ERR_OR_NULL(test, key);
+
+       spin_lock_irqsave(&mns->keys_lock, flags);
+       mctp_reserve_tag(&init_net, key, msk);
+       spin_unlock_irqrestore(&mns->keys_lock, flags);
+
+       /* create packet and route */
+       c = 0;
+       skb = mctp_test_create_skb_data(&params->hdr, &c);
+       KUNIT_ASSERT_NOT_ERR_OR_NULL(test, skb);
+
+       skb->dev = dev->ndev;
+       __mctp_cb(skb);
+
+       rc = mctp_route_input(&rt->rt, skb);
+
+       /* (potentially) receive message */
+       skb2 = skb_recv_datagram(sock->sk, 0, 1, &rc);
+
+       if (params->deliver)
+               KUNIT_EXPECT_NOT_ERR_OR_NULL(test, skb2);
+       else
+               KUNIT_EXPECT_PTR_EQ(test, skb2, NULL);
+
+       if (skb2)
+               skb_free_datagram(sock->sk, skb2);
+
+       mctp_key_unref(key);
+       __mctp_route_test_fini(test, dev, rt, sock);
+}
+
+static const struct mctp_route_input_sk_keys_test mctp_route_input_sk_keys_tests[] = {
+       {
+               .name = "direct match",
+               .key_peer_addr = 9,
+               .key_local_addr = 8,
+               .key_tag = 1,
+               .hdr = RX_HDR(1, 9, 8, FL_S | FL_E | FL_T(1)),
+               .deliver = true,
+       },
+       {
+               .name = "flipped src/dest",
+               .key_peer_addr = 8,
+               .key_local_addr = 9,
+               .key_tag = 1,
+               .hdr = RX_HDR(1, 9, 8, FL_S | FL_E | FL_T(1)),
+               .deliver = false,
+       },
+       {
+               .name = "peer addr mismatch",
+               .key_peer_addr = 9,
+               .key_local_addr = 8,
+               .key_tag = 1,
+               .hdr = RX_HDR(1, 10, 8, FL_S | FL_E | FL_T(1)),
+               .deliver = false,
+       },
+       {
+               .name = "tag value mismatch",
+               .key_peer_addr = 9,
+               .key_local_addr = 8,
+               .key_tag = 1,
+               .hdr = RX_HDR(1, 9, 8, FL_S | FL_E | FL_T(2)),
+               .deliver = false,
+       },
+       {
+               .name = "TO mismatch",
+               .key_peer_addr = 9,
+               .key_local_addr = 8,
+               .key_tag = 1,
+               .hdr = RX_HDR(1, 9, 8, FL_S | FL_E | FL_T(1) | FL_TO),
+               .deliver = false,
+       },
+       {
+               .name = "broadcast response",
+               .key_peer_addr = MCTP_ADDR_ANY,
+               .key_local_addr = 8,
+               .key_tag = 1,
+               .hdr = RX_HDR(1, 11, 8, FL_S | FL_E | FL_T(1)),
+               .deliver = true,
+       },
+       {
+               .name = "any local match",
+               .key_peer_addr = 12,
+               .key_local_addr = MCTP_ADDR_ANY,
+               .key_tag = 1,
+               .hdr = RX_HDR(1, 12, 8, FL_S | FL_E | FL_T(1)),
+               .deliver = true,
+       },
+};
+
+static void mctp_route_input_sk_keys_to_desc(
+                               const struct mctp_route_input_sk_keys_test *t,
+                               char *desc)
+{
+       sprintf(desc, "%s", t->name);
+}
+
+KUNIT_ARRAY_PARAM(mctp_route_input_sk_keys, mctp_route_input_sk_keys_tests,
+                 mctp_route_input_sk_keys_to_desc);
+
 static struct kunit_case mctp_test_cases[] = {
        KUNIT_CASE_PARAM(mctp_test_fragment, mctp_frag_gen_params),
        KUNIT_CASE_PARAM(mctp_test_rx_input, mctp_rx_input_gen_params),
        KUNIT_CASE_PARAM(mctp_test_route_input_sk, mctp_route_input_sk_gen_params),
        KUNIT_CASE_PARAM(mctp_test_route_input_sk_reasm,
                         mctp_route_input_sk_reasm_gen_params),
+       KUNIT_CASE_PARAM(mctp_test_route_input_sk_keys,
+                        mctp_route_input_sk_keys_gen_params),
        {}
 };
 
index 645dd98..3e82ac2 100644 (file)
@@ -336,6 +336,8 @@ static void mptcp_parse_option(const struct sk_buff *skb,
                flags = *ptr++;
                mp_opt->reset_transient = flags & MPTCP_RST_TRANSIENT;
                mp_opt->reset_reason = *ptr;
+               pr_debug("MP_RST: transient=%u reason=%u",
+                        mp_opt->reset_transient, mp_opt->reset_reason);
                break;
 
        case MPTCPOPT_MP_FAIL:
@@ -1264,22 +1266,30 @@ static u16 mptcp_make_csum(const struct mptcp_ext *mpext)
 void mptcp_write_options(__be32 *ptr, const struct tcp_sock *tp,
                         struct mptcp_out_options *opts)
 {
-       if (unlikely(OPTION_MPTCP_FAIL & opts->suboptions)) {
-               const struct sock *ssk = (const struct sock *)tp;
-               struct mptcp_subflow_context *subflow;
-
-               subflow = mptcp_subflow_ctx(ssk);
-               subflow->send_mp_fail = 0;
-
-               *ptr++ = mptcp_option(MPTCPOPT_MP_FAIL,
-                                     TCPOLEN_MPTCP_FAIL,
-                                     0, 0);
-               put_unaligned_be64(opts->fail_seq, ptr);
-               ptr += 2;
-       }
-
-       /* DSS, MPC, MPJ, ADD_ADDR, FASTCLOSE and RST are mutually exclusive,
-        * see mptcp_established_options*()
+       const struct sock *ssk = (const struct sock *)tp;
+       struct mptcp_subflow_context *subflow;
+
+       /* Which options can be used together?
+        *
+        * X: mutually exclusive
+        * O: often used together
+        * C: can be used together in some cases
+        * P: could be used together but we prefer not to (optimisations)
+        *
+        *  Opt: | MPC  | MPJ  | DSS  | ADD  |  RM  | PRIO | FAIL |  FC  |
+        * ------|------|------|------|------|------|------|------|------|
+        *  MPC  |------|------|------|------|------|------|------|------|
+        *  MPJ  |  X   |------|------|------|------|------|------|------|
+        *  DSS  |  X   |  X   |------|------|------|------|------|------|
+        *  ADD  |  X   |  X   |  P   |------|------|------|------|------|
+        *  RM   |  C   |  C   |  C   |  P   |------|------|------|------|
+        *  PRIO |  X   |  C   |  C   |  C   |  C   |------|------|------|
+        *  FAIL |  X   |  X   |  C   |  X   |  X   |  X   |------|------|
+        *  FC   |  X   |  X   |  X   |  X   |  X   |  X   |  X   |------|
+        *  RST  |  X   |  X   |  X   |  X   |  X   |  X   |  O   |  O   |
+        * ------|------|------|------|------|------|------|------|------|
+        *
+        * The same constraints apply in mptcp_established_options().
         */
        if (likely(OPTION_MPTCP_DSS & opts->suboptions)) {
                struct mptcp_ext *mpext = &opts->ext_copy;
@@ -1336,6 +1346,10 @@ void mptcp_write_options(__be32 *ptr, const struct tcp_sock *tp,
                        }
                        ptr += 1;
                }
+
+               /* We might need to add MP_FAIL options in rare cases */
+               if (unlikely(OPTION_MPTCP_FAIL & opts->suboptions))
+                       goto mp_fail;
        } else if (OPTIONS_MPTCP_MPC & opts->suboptions) {
                u8 len, flag = MPTCP_CAP_HMAC_SHA256;
 
@@ -1479,6 +1493,21 @@ void mptcp_write_options(__be32 *ptr, const struct tcp_sock *tp,
                if (OPTION_MPTCP_RST & opts->suboptions)
                        goto mp_rst;
                return;
+       } else if (unlikely(OPTION_MPTCP_FAIL & opts->suboptions)) {
+mp_fail:
+               /* MP_FAIL is mutually exclusive with others except RST */
+               subflow = mptcp_subflow_ctx(ssk);
+               subflow->send_mp_fail = 0;
+
+               *ptr++ = mptcp_option(MPTCPOPT_MP_FAIL,
+                                     TCPOLEN_MPTCP_FAIL,
+                                     0, 0);
+               put_unaligned_be64(opts->fail_seq, ptr);
+               ptr += 2;
+
+               if (OPTION_MPTCP_RST & opts->suboptions)
+                       goto mp_rst;
+               return;
        } else if (unlikely(OPTION_MPTCP_RST & opts->suboptions)) {
 mp_rst:
                *ptr++ = mptcp_option(MPTCPOPT_RST,
@@ -1489,9 +1518,6 @@ mp_rst:
        }
 
        if (OPTION_MPTCP_PRIO & opts->suboptions) {
-               const struct sock *ssk = (const struct sock *)tp;
-               struct mptcp_subflow_context *subflow;
-
                subflow = mptcp_subflow_ctx(ssk);
                subflow->send_mp_prio = 0;
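
The practical effect of the restructuring: MP_FAIL is no longer written in
isolation at the top of mptcp_write_options(). Per the 'C' cell in the
table above it can ride along with DSS, and per the 'O' cell it can be
followed by RST. A condensed sketch of the resulting dispatch order,
paraphrasing the two hunks above (the MPC/MPJ branches are elided):

	if (likely(OPTION_MPTCP_DSS & opts->suboptions)) {
		/* ... write DSS ... */
		if (unlikely(OPTION_MPTCP_FAIL & opts->suboptions))
			goto mp_fail;			/* DSS + MP_FAIL */
	} else if (unlikely(OPTION_MPTCP_FAIL & opts->suboptions)) {
mp_fail:
		/* ... write MP_FAIL ... */
		if (OPTION_MPTCP_RST & opts->suboptions)
			goto mp_rst;			/* MP_FAIL + RST */
		return;
	}
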
 
index 356f596..e4fd54f 100644 (file)
@@ -1178,14 +1178,8 @@ skip_family:
        if (tb[MPTCP_PM_ADDR_ATTR_FLAGS])
                entry->flags = nla_get_u32(tb[MPTCP_PM_ADDR_ATTR_FLAGS]);
 
-       if (tb[MPTCP_PM_ADDR_ATTR_PORT]) {
-               if (!(entry->flags & MPTCP_PM_ADDR_FLAG_SIGNAL)) {
-                       NL_SET_ERR_MSG_ATTR(info->extack, attr,
-                                           "flags must have signal when using port");
-                       return -EINVAL;
-               }
+       if (tb[MPTCP_PM_ADDR_ATTR_PORT])
                entry->addr.port = htons(nla_get_u16(tb[MPTCP_PM_ADDR_ATTR_PORT]));
-       }
 
        return 0;
 }
@@ -1231,6 +1225,11 @@ static int mptcp_nl_cmd_add_addr(struct sk_buff *skb, struct genl_info *info)
        if (ret < 0)
                return ret;
 
+       if (addr.addr.port && !(addr.flags & MPTCP_PM_ADDR_FLAG_SIGNAL)) {
+               GENL_SET_ERR_MSG(info, "flags must have signal when using port");
+               return -EINVAL;
+       }
+
        entry = kmalloc(sizeof(*entry), GFP_KERNEL);
        if (!entry) {
                GENL_SET_ERR_MSG(info, "can't allocate addr");
@@ -1732,9 +1731,20 @@ fail:
        return -EMSGSIZE;
 }
 
-static int mptcp_nl_addr_backup(struct net *net,
-                               struct mptcp_addr_info *addr,
-                               u8 bkup)
+static void mptcp_pm_nl_fullmesh(struct mptcp_sock *msk,
+                                struct mptcp_addr_info *addr)
+{
+       struct mptcp_rm_list list = { .nr = 0 };
+
+       list.ids[list.nr++] = addr->id;
+
+       mptcp_pm_nl_rm_subflow_received(msk, &list);
+       mptcp_pm_create_subflow_or_signal_addr(msk);
+}
+
+static int mptcp_nl_set_flags(struct net *net,
+                             struct mptcp_addr_info *addr,
+                             u8 bkup, u8 changed)
 {
        long s_slot = 0, s_num = 0;
        struct mptcp_sock *msk;
@@ -1748,7 +1758,10 @@ static int mptcp_nl_addr_backup(struct net *net,
 
                lock_sock(sk);
                spin_lock_bh(&msk->pm.lock);
-               ret = mptcp_pm_nl_mp_prio_send_ack(msk, addr, bkup);
+               if (changed & MPTCP_PM_ADDR_FLAG_BACKUP)
+                       ret = mptcp_pm_nl_mp_prio_send_ack(msk, addr, bkup);
+               if (changed & MPTCP_PM_ADDR_FLAG_FULLMESH)
+                       mptcp_pm_nl_fullmesh(msk, addr);
                spin_unlock_bh(&msk->pm.lock);
                release_sock(sk);
 
@@ -1765,6 +1778,8 @@ static int mptcp_nl_cmd_set_flags(struct sk_buff *skb, struct genl_info *info)
        struct mptcp_pm_addr_entry addr = { .addr = { .family = AF_UNSPEC }, }, *entry;
        struct nlattr *attr = info->attrs[MPTCP_PM_ATTR_ADDR];
        struct pm_nl_pernet *pernet = genl_info_pm_nl(info);
+       u8 changed, mask = MPTCP_PM_ADDR_FLAG_BACKUP |
+                          MPTCP_PM_ADDR_FLAG_FULLMESH;
        struct net *net = sock_net(skb->sk);
        u8 bkup = 0, lookup_by_id = 0;
        int ret;
@@ -1787,15 +1802,18 @@ static int mptcp_nl_cmd_set_flags(struct sk_buff *skb, struct genl_info *info)
                spin_unlock_bh(&pernet->lock);
                return -EINVAL;
        }
+       if ((addr.flags & MPTCP_PM_ADDR_FLAG_FULLMESH) &&
+           (entry->flags & MPTCP_PM_ADDR_FLAG_SIGNAL)) {
+               spin_unlock_bh(&pernet->lock);
+               return -EINVAL;
+       }
 
-       if (bkup)
-               entry->flags |= MPTCP_PM_ADDR_FLAG_BACKUP;
-       else
-               entry->flags &= ~MPTCP_PM_ADDR_FLAG_BACKUP;
+       changed = (addr.flags ^ entry->flags) & mask;
+       entry->flags = (entry->flags & ~mask) | (addr.flags & mask);
        addr = *entry;
        spin_unlock_bh(&pernet->lock);
 
-       mptcp_nl_addr_backup(net, &addr.addr, bkup);
+       mptcp_nl_set_flags(net, &addr.addr, bkup, changed);
        return 0;
 }
 
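Annotation: the set_flags rework above replaces a dedicated backup toggle with a generic bitmask update — XOR finds the bits that actually differ, the mask limits the change to the mutable flags (BACKUP, FULLMESH), and bits outside the mask are preserved. A small runnable demo of the idiom; the flag values mirror the MPTCP uapi bits but should be treated as illustrative here:

#include <stdint.h>
#include <stdio.h>

#define FLAG_SIGNAL   0x1
#define FLAG_SUBFLOW  0x2
#define FLAG_BACKUP   0x4
#define FLAG_FULLMESH 0x8

int main(void)
{
	uint8_t mask  = FLAG_BACKUP | FLAG_FULLMESH;
	uint8_t entry = FLAG_SUBFLOW | FLAG_BACKUP;	/* stored flags    */
	uint8_t req   = FLAG_FULLMESH;			/* requested flags */

	uint8_t changed = (req ^ entry) & mask;		/* bits that flip  */

	entry = (entry & ~mask) | (req & mask);
	printf("changed=%#x entry=%#x\n", changed, entry);	/* 0xc, 0xa */
	return 0;
}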
index a135b1a..238b6a6 100644 (file)
@@ -14,6 +14,11 @@ nf_conntrack-$(CONFIG_NF_CONNTRACK_LABELS) += nf_conntrack_labels.o
 nf_conntrack-$(CONFIG_NF_CT_PROTO_DCCP) += nf_conntrack_proto_dccp.o
 nf_conntrack-$(CONFIG_NF_CT_PROTO_SCTP) += nf_conntrack_proto_sctp.o
 nf_conntrack-$(CONFIG_NF_CT_PROTO_GRE) += nf_conntrack_proto_gre.o
+ifeq ($(CONFIG_NF_CONNTRACK),m)
+nf_conntrack-$(CONFIG_DEBUG_INFO_BTF_MODULES) += nf_conntrack_bpf.o
+else ifeq ($(CONFIG_NF_CONNTRACK),y)
+nf_conntrack-$(CONFIG_DEBUG_INFO_BTF) += nf_conntrack_bpf.o
+endif
 
 obj-$(CONFIG_NETFILTER) = netfilter.o
 
index 354cb47..d1c9dfb 100644 (file)
@@ -621,7 +621,8 @@ int nf_hook_slow(struct sk_buff *skb, struct nf_hook_state *state,
                case NF_ACCEPT:
                        break;
                case NF_DROP:
-                       kfree_skb(skb);
+                       kfree_skb_reason(skb,
+                                        SKB_DROP_REASON_NETFILTER_DROP);
                        ret = NF_DROP_GETERR(verdict);
                        if (ret == 0)
                                ret = -EPERM;
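Annotation: two details in this hunk are worth noting — kfree_skb_reason() frees the skb exactly like kfree_skb() but feeds a drop reason to the skb:kfree_skb tracepoint, and NF_DROP_GETERR() recovers an errno a hook may have packed into the verdict, with -EPERM as the fallback. A userspace demo of the verdict packing; the two macros are copied here for the demo and should be checked against include/linux/netfilter.h:

#include <stdio.h>

#define NF_DROP		  0
#define NF_DROP_ERR(x)	  (((-x) << 16) | NF_DROP)
#define NF_DROP_GETERR(x) (-((x) >> 16))

int main(void)
{
	int verdict = NF_DROP_ERR(-13);	/* -EACCES packed by a hook */

	printf("embedded err=%d, plain NF_DROP err=%d\n",
	       NF_DROP_GETERR(verdict), NF_DROP_GETERR(NF_DROP)); /* -13, 0 */
	return 0;
}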
index 91bc8df..385a5f4 100644 (file)
@@ -22,26 +22,7 @@ static bool nf_ct_acct __read_mostly;
 module_param_named(acct, nf_ct_acct, bool, 0644);
 MODULE_PARM_DESC(acct, "Enable connection tracking flow accounting.");
 
-static const struct nf_ct_ext_type acct_extend = {
-       .len    = sizeof(struct nf_conn_acct),
-       .align  = __alignof__(struct nf_conn_acct),
-       .id     = NF_CT_EXT_ACCT,
-};
-
 void nf_conntrack_acct_pernet_init(struct net *net)
 {
        net->ct.sysctl_acct = nf_ct_acct;
 }
-
-int nf_conntrack_acct_init(void)
-{
-       int ret = nf_ct_extend_register(&acct_extend);
-       if (ret < 0)
-               pr_err("Unable to register extension\n");
-       return ret;
-}
-
-void nf_conntrack_acct_fini(void)
-{
-       nf_ct_extend_unregister(&acct_extend);
-}
diff --git a/net/netfilter/nf_conntrack_bpf.c b/net/netfilter/nf_conntrack_bpf.c
new file mode 100644 (file)
index 0000000..8ad3f52
--- /dev/null
@@ -0,0 +1,257 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Unstable Conntrack Helpers for XDP and TC-BPF hook
+ *
+ * These are called from the XDP and SCHED_CLS BPF programs. Note that it is
+ * allowed to break compatibility for these functions since the interface they
+ * are exposed through to BPF programs is explicitly unstable.
+ */
+
+#include <linux/bpf.h>
+#include <linux/btf.h>
+#include <linux/types.h>
+#include <linux/btf_ids.h>
+#include <linux/net_namespace.h>
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_core.h>
+
+/* bpf_ct_opts - Options for CT lookup helpers
+ *
+ * Members:
+ * @netns_id   - Specify the network namespace for lookup
+ *              Values:
+ *                BPF_F_CURRENT_NETNS (-1)
+ *                  Use namespace associated with ctx (xdp_md, __sk_buff)
+ *                [0, S32_MAX]
+ *                  Network Namespace ID
+ * @error      - Out parameter, set for any errors encountered
+ *              Values:
+ *                -EINVAL - Passed NULL for bpf_tuple pointer
+ *                -EINVAL - opts->reserved is not 0
+ *                -EINVAL - netns_id is less than -1
+ *                -EINVAL - opts__sz isn't NF_BPF_CT_OPTS_SZ (12)
+ *                -EPROTO - l4proto isn't one of IPPROTO_TCP or IPPROTO_UDP
+ *                -ENONET - No network namespace found for netns_id
+ *                -ENOENT - Conntrack lookup could not find entry for tuple
+ *                -EAFNOSUPPORT - tuple__sz isn't one of sizeof(tuple->ipv4)
+ *                                or sizeof(tuple->ipv6)
+ * @l4proto    - Layer 4 protocol
+ *              Values:
+ *                IPPROTO_TCP, IPPROTO_UDP
+ * @reserved   - Reserved member, will be reused for more options in future
+ *              Values:
+ *                0
+ */
+struct bpf_ct_opts {
+       s32 netns_id;
+       s32 error;
+       u8 l4proto;
+       u8 reserved[3];
+};
+
+enum {
+       NF_BPF_CT_OPTS_SZ = 12,
+};
+
+static struct nf_conn *__bpf_nf_ct_lookup(struct net *net,
+                                         struct bpf_sock_tuple *bpf_tuple,
+                                         u32 tuple_len, u8 protonum,
+                                         s32 netns_id)
+{
+       struct nf_conntrack_tuple_hash *hash;
+       struct nf_conntrack_tuple tuple;
+
+       if (unlikely(protonum != IPPROTO_TCP && protonum != IPPROTO_UDP))
+               return ERR_PTR(-EPROTO);
+       if (unlikely(netns_id < BPF_F_CURRENT_NETNS))
+               return ERR_PTR(-EINVAL);
+
+       memset(&tuple, 0, sizeof(tuple));
+       switch (tuple_len) {
+       case sizeof(bpf_tuple->ipv4):
+               tuple.src.l3num = AF_INET;
+               tuple.src.u3.ip = bpf_tuple->ipv4.saddr;
+               tuple.src.u.tcp.port = bpf_tuple->ipv4.sport;
+               tuple.dst.u3.ip = bpf_tuple->ipv4.daddr;
+               tuple.dst.u.tcp.port = bpf_tuple->ipv4.dport;
+               break;
+       case sizeof(bpf_tuple->ipv6):
+               tuple.src.l3num = AF_INET6;
+               memcpy(tuple.src.u3.ip6, bpf_tuple->ipv6.saddr, sizeof(bpf_tuple->ipv6.saddr));
+               tuple.src.u.tcp.port = bpf_tuple->ipv6.sport;
+               memcpy(tuple.dst.u3.ip6, bpf_tuple->ipv6.daddr, sizeof(bpf_tuple->ipv6.daddr));
+               tuple.dst.u.tcp.port = bpf_tuple->ipv6.dport;
+               break;
+       default:
+               return ERR_PTR(-EAFNOSUPPORT);
+       }
+
+       tuple.dst.protonum = protonum;
+
+       if (netns_id >= 0) {
+               net = get_net_ns_by_id(net, netns_id);
+               if (unlikely(!net))
+                       return ERR_PTR(-ENONET);
+       }
+
+       hash = nf_conntrack_find_get(net, &nf_ct_zone_dflt, &tuple);
+       if (netns_id >= 0)
+               put_net(net);
+       if (!hash)
+               return ERR_PTR(-ENOENT);
+       return nf_ct_tuplehash_to_ctrack(hash);
+}
+
+__diag_push();
+__diag_ignore(GCC, 8, "-Wmissing-prototypes",
+             "Global functions as their definitions will be in nf_conntrack BTF");
+
+/* bpf_xdp_ct_lookup - Lookup CT entry for the given tuple, and acquire a
+ *                    reference to it
+ *
+ * Parameters:
+ * @xdp_ctx    - Pointer to ctx (xdp_md) in XDP program
+ *                 Cannot be NULL
+ * @bpf_tuple  - Pointer to memory representing the tuple to look up
+ *                 Cannot be NULL
+ * @tuple__sz  - Length of the tuple structure
+ *                 Must be one of sizeof(bpf_tuple->ipv4) or
+ *                 sizeof(bpf_tuple->ipv6)
+ * @opts       - Additional options for lookup (documented above)
+ *                 Cannot be NULL
+ * @opts__sz   - Length of the bpf_ct_opts structure
+ *                 Must be NF_BPF_CT_OPTS_SZ (12)
+ */
+struct nf_conn *
+bpf_xdp_ct_lookup(struct xdp_md *xdp_ctx, struct bpf_sock_tuple *bpf_tuple,
+                 u32 tuple__sz, struct bpf_ct_opts *opts, u32 opts__sz)
+{
+       struct xdp_buff *ctx = (struct xdp_buff *)xdp_ctx;
+       struct net *caller_net;
+       struct nf_conn *nfct;
+
+       BUILD_BUG_ON(sizeof(struct bpf_ct_opts) != NF_BPF_CT_OPTS_SZ);
+
+       if (!opts)
+               return NULL;
+       if (!bpf_tuple || opts->reserved[0] || opts->reserved[1] ||
+           opts->reserved[2] || opts__sz != NF_BPF_CT_OPTS_SZ) {
+               opts->error = -EINVAL;
+               return NULL;
+       }
+       caller_net = dev_net(ctx->rxq->dev);
+       nfct = __bpf_nf_ct_lookup(caller_net, bpf_tuple, tuple__sz, opts->l4proto,
+                                 opts->netns_id);
+       if (IS_ERR(nfct)) {
+               opts->error = PTR_ERR(nfct);
+               return NULL;
+       }
+       return nfct;
+}
+
+/* bpf_skb_ct_lookup - Lookup CT entry for the given tuple, and acquire a
+ *                    reference to it
+ *
+ * Parameters:
+ * @skb_ctx    - Pointer to ctx (__sk_buff) in TC program
+ *                 Cannot be NULL
+ * @bpf_tuple  - Pointer to memory representing the tuple to look up
+ *                 Cannot be NULL
+ * @tuple__sz  - Length of the tuple structure
+ *                 Must be one of sizeof(bpf_tuple->ipv4) or
+ *                 sizeof(bpf_tuple->ipv6)
+ * @opts       - Additional options for lookup (documented above)
+ *                 Cannot be NULL
+ * @opts__sz   - Length of the bpf_ct_opts structure
+ *                 Must be NF_BPF_CT_OPTS_SZ (12)
+ */
+struct nf_conn *
+bpf_skb_ct_lookup(struct __sk_buff *skb_ctx, struct bpf_sock_tuple *bpf_tuple,
+                 u32 tuple__sz, struct bpf_ct_opts *opts, u32 opts__sz)
+{
+       struct sk_buff *skb = (struct sk_buff *)skb_ctx;
+       struct net *caller_net;
+       struct nf_conn *nfct;
+
+       BUILD_BUG_ON(sizeof(struct bpf_ct_opts) != NF_BPF_CT_OPTS_SZ);
+
+       if (!opts)
+               return NULL;
+       if (!bpf_tuple || opts->reserved[0] || opts->reserved[1] ||
+           opts->reserved[2] || opts__sz != NF_BPF_CT_OPTS_SZ) {
+               opts->error = -EINVAL;
+               return NULL;
+       }
+       caller_net = skb->dev ? dev_net(skb->dev) : sock_net(skb->sk);
+       nfct = __bpf_nf_ct_lookup(caller_net, bpf_tuple, tuple__sz, opts->l4proto,
+                                 opts->netns_id);
+       if (IS_ERR(nfct)) {
+               opts->error = PTR_ERR(nfct);
+               return NULL;
+       }
+       return nfct;
+}
+
+/* bpf_ct_release - Release acquired nf_conn object
+ *
+ * This must be invoked for referenced PTR_TO_BTF_ID, and the verifier rejects
+ * the program if any references remain in the program in all of the explored
+ * states.
+ *
+ * Parameters:
+ * @nf_conn     - Pointer to referenced nf_conn object, obtained using
+ *                bpf_xdp_ct_lookup or bpf_skb_ct_lookup.
+ */
+void bpf_ct_release(struct nf_conn *nfct)
+{
+       if (!nfct)
+               return;
+       nf_ct_put(nfct);
+}
+
+__diag_pop()
+
+BTF_SET_START(nf_ct_xdp_check_kfunc_ids)
+BTF_ID(func, bpf_xdp_ct_lookup)
+BTF_ID(func, bpf_ct_release)
+BTF_SET_END(nf_ct_xdp_check_kfunc_ids)
+
+BTF_SET_START(nf_ct_tc_check_kfunc_ids)
+BTF_ID(func, bpf_skb_ct_lookup)
+BTF_ID(func, bpf_ct_release)
+BTF_SET_END(nf_ct_tc_check_kfunc_ids)
+
+BTF_SET_START(nf_ct_acquire_kfunc_ids)
+BTF_ID(func, bpf_xdp_ct_lookup)
+BTF_ID(func, bpf_skb_ct_lookup)
+BTF_SET_END(nf_ct_acquire_kfunc_ids)
+
+BTF_SET_START(nf_ct_release_kfunc_ids)
+BTF_ID(func, bpf_ct_release)
+BTF_SET_END(nf_ct_release_kfunc_ids)
+
+/* Both sets are identical */
+#define nf_ct_ret_null_kfunc_ids nf_ct_acquire_kfunc_ids
+
+static const struct btf_kfunc_id_set nf_conntrack_xdp_kfunc_set = {
+       .owner        = THIS_MODULE,
+       .check_set    = &nf_ct_xdp_check_kfunc_ids,
+       .acquire_set  = &nf_ct_acquire_kfunc_ids,
+       .release_set  = &nf_ct_release_kfunc_ids,
+       .ret_null_set = &nf_ct_ret_null_kfunc_ids,
+};
+
+static const struct btf_kfunc_id_set nf_conntrack_tc_kfunc_set = {
+       .owner        = THIS_MODULE,
+       .check_set    = &nf_ct_tc_check_kfunc_ids,
+       .acquire_set  = &nf_ct_acquire_kfunc_ids,
+       .release_set  = &nf_ct_release_kfunc_ids,
+       .ret_null_set = &nf_ct_ret_null_kfunc_ids,
+};
+
+int register_nf_conntrack_bpf(void)
+{
+       int ret;
+
+       ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_XDP, &nf_conntrack_xdp_kfunc_set);
+       return ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, &nf_conntrack_tc_kfunc_set);
+}
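Annotation: from the BPF side, the two lookup kfuncs and bpf_ct_release() are meant to be used as a pair; the verifier enforces that every acquired nf_conn is released on all paths. A sketch of a SCHED_CLS caller, assuming the selftests-style extern/__ksym declaration convention and a vmlinux.h generated from a kernel that built this file — none of the declarations below come from the diff itself:

#include <vmlinux.h>
#include <bpf/bpf_helpers.h>

extern struct nf_conn *bpf_skb_ct_lookup(struct __sk_buff *skb_ctx,
					 struct bpf_sock_tuple *bpf_tuple,
					 u32 tuple__sz, struct bpf_ct_opts *opts,
					 u32 opts__sz) __ksym;
extern void bpf_ct_release(struct nf_conn *ct) __ksym;

SEC("tc")
int ct_probe(struct __sk_buff *skb)
{
	struct bpf_sock_tuple tup = {};
	struct bpf_ct_opts opts = {
		.netns_id = -1,		/* BPF_F_CURRENT_NETNS */
		.l4proto  = IPPROTO_TCP,
	};
	struct nf_conn *ct;

	/* a real program would fill tup.ipv4 from the parsed packet */
	ct = bpf_skb_ct_lookup(skb, &tup, sizeof(tup.ipv4), &opts, sizeof(opts));
	if (!ct)	/* opts.error now holds one of the -E* codes above */
		return 0;
	bpf_ct_release(ct);	/* mandatory: the reference is verifier-tracked */
	return 0;		/* TC_ACT_OK */
}

char _license[] SEC("license") = "GPL";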
index d6aa5b4..9b7f9c9 100644 (file)
 #include <linux/rculist_nulls.h>
 
 #include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_bpf.h>
 #include <net/netfilter/nf_conntrack_l4proto.h>
 #include <net/netfilter/nf_conntrack_expect.h>
 #include <net/netfilter/nf_conntrack_helper.h>
-#include <net/netfilter/nf_conntrack_seqadj.h>
 #include <net/netfilter/nf_conntrack_core.h>
 #include <net/netfilter/nf_conntrack_extend.h>
 #include <net/netfilter/nf_conntrack_acct.h>
@@ -47,7 +47,6 @@
 #include <net/netfilter/nf_conntrack_timeout.h>
 #include <net/netfilter/nf_conntrack_labels.h>
 #include <net/netfilter/nf_conntrack_synproxy.h>
-#include <net/netfilter/nf_conntrack_act_ct.h>
 #include <net/netfilter/nf_nat.h>
 #include <net/netfilter/nf_nat_helper.h>
 #include <net/netns/hash.h>
@@ -594,7 +593,7 @@ EXPORT_SYMBOL_GPL(nf_ct_tmpl_alloc);
 
 void nf_ct_tmpl_free(struct nf_conn *tmpl)
 {
-       nf_ct_ext_destroy(tmpl);
+       kfree(tmpl->ext);
 
        if (ARCH_KMALLOC_MINALIGN <= NFCT_INFOMASK)
                kfree((char *)tmpl - tmpl->proto.tmpl_padto);
@@ -1597,7 +1596,17 @@ void nf_conntrack_free(struct nf_conn *ct)
         */
        WARN_ON(refcount_read(&ct->ct_general.use) != 0);
 
-       nf_ct_ext_destroy(ct);
+       if (ct->status & IPS_SRC_NAT_DONE) {
+               const struct nf_nat_hook *nat_hook;
+
+               rcu_read_lock();
+               nat_hook = rcu_dereference(nf_nat_hook);
+               if (nat_hook)
+                       nat_hook->remove_nat_bysrc(ct);
+               rcu_read_unlock();
+       }
+
+       kfree(ct->ext);
        kmem_cache_free(nf_conntrack_cachep, ct);
        cnet = nf_ct_pernet(net);
 
@@ -2467,13 +2476,7 @@ void nf_conntrack_cleanup_end(void)
        kvfree(nf_conntrack_hash);
 
        nf_conntrack_proto_fini();
-       nf_conntrack_seqadj_fini();
-       nf_conntrack_labels_fini();
        nf_conntrack_helper_fini();
-       nf_conntrack_timeout_fini();
-       nf_conntrack_ecache_fini();
-       nf_conntrack_tstamp_fini();
-       nf_conntrack_acct_fini();
        nf_conntrack_expect_fini();
 
        kmem_cache_destroy(nf_conntrack_cachep);
@@ -2628,39 +2631,6 @@ int nf_conntrack_set_hashsize(const char *val, const struct kernel_param *kp)
        return nf_conntrack_hash_resize(hashsize);
 }
 
-static __always_inline unsigned int total_extension_size(void)
-{
-       /* remember to add new extensions below */
-       BUILD_BUG_ON(NF_CT_EXT_NUM > 10);
-
-       return sizeof(struct nf_ct_ext) +
-              sizeof(struct nf_conn_help)
-#if IS_ENABLED(CONFIG_NF_NAT)
-               + sizeof(struct nf_conn_nat)
-#endif
-               + sizeof(struct nf_conn_seqadj)
-               + sizeof(struct nf_conn_acct)
-#ifdef CONFIG_NF_CONNTRACK_EVENTS
-               + sizeof(struct nf_conntrack_ecache)
-#endif
-#ifdef CONFIG_NF_CONNTRACK_TIMESTAMP
-               + sizeof(struct nf_conn_tstamp)
-#endif
-#ifdef CONFIG_NF_CONNTRACK_TIMEOUT
-               + sizeof(struct nf_conn_timeout)
-#endif
-#ifdef CONFIG_NF_CONNTRACK_LABELS
-               + sizeof(struct nf_conn_labels)
-#endif
-#if IS_ENABLED(CONFIG_NETFILTER_SYNPROXY)
-               + sizeof(struct nf_conn_synproxy)
-#endif
-#if IS_ENABLED(CONFIG_NET_ACT_CT)
-               + sizeof(struct nf_conn_act_ct_ext)
-#endif
-       ;
-};
-
 int nf_conntrack_init_start(void)
 {
        unsigned long nr_pages = totalram_pages();
@@ -2668,9 +2638,6 @@ int nf_conntrack_init_start(void)
        int ret = -ENOMEM;
        int i;
 
-       /* struct nf_ct_ext uses u8 to store offsets/size */
-       BUILD_BUG_ON(total_extension_size() > 255u);
-
        seqcount_spinlock_init(&nf_conntrack_generation,
                               &nf_conntrack_locks_all_lock);
 
@@ -2715,34 +2682,10 @@ int nf_conntrack_init_start(void)
        if (ret < 0)
                goto err_expect;
 
-       ret = nf_conntrack_acct_init();
-       if (ret < 0)
-               goto err_acct;
-
-       ret = nf_conntrack_tstamp_init();
-       if (ret < 0)
-               goto err_tstamp;
-
-       ret = nf_conntrack_ecache_init();
-       if (ret < 0)
-               goto err_ecache;
-
-       ret = nf_conntrack_timeout_init();
-       if (ret < 0)
-               goto err_timeout;
-
        ret = nf_conntrack_helper_init();
        if (ret < 0)
                goto err_helper;
 
-       ret = nf_conntrack_labels_init();
-       if (ret < 0)
-               goto err_labels;
-
-       ret = nf_conntrack_seqadj_init();
-       if (ret < 0)
-               goto err_seqadj;
-
        ret = nf_conntrack_proto_init();
        if (ret < 0)
                goto err_proto;
@@ -2750,23 +2693,18 @@ int nf_conntrack_init_start(void)
        conntrack_gc_work_init(&conntrack_gc_work);
        queue_delayed_work(system_power_efficient_wq, &conntrack_gc_work.dwork, HZ);
 
+       ret = register_nf_conntrack_bpf();
+       if (ret < 0)
+               goto err_kfunc;
+
        return 0;
 
+err_kfunc:
+       cancel_delayed_work_sync(&conntrack_gc_work.dwork);
+       nf_conntrack_proto_fini();
 err_proto:
-       nf_conntrack_seqadj_fini();
-err_seqadj:
-       nf_conntrack_labels_fini();
-err_labels:
        nf_conntrack_helper_fini();
 err_helper:
-       nf_conntrack_timeout_fini();
-err_timeout:
-       nf_conntrack_ecache_fini();
-err_ecache:
-       nf_conntrack_tstamp_fini();
-err_tstamp:
-       nf_conntrack_acct_fini();
-err_acct:
        nf_conntrack_expect_fini();
 err_expect:
        kmem_cache_destroy(nf_conntrack_cachep);
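Annotation: with the per-extension ->destroy callbacks gone, NAT source-hash cleanup is the one case that still needs work at free time; nf_conntrack_free() now reaches it through the RCU-published nf_nat_hook, and only when IPS_SRC_NAT_DONE was set. A runnable userspace approximation of that shape, with C11 atomics standing in for RCU publish/consume (names and the status bit are illustrative):

#include <stdatomic.h>
#include <stdio.h>

#define STATUS_SRC_NAT_DONE 0x1

struct conn { unsigned long status; };
struct nat_ops { void (*remove_nat_bysrc)(struct conn *ct); };

static void nat_cleanup(struct conn *ct) { (void)ct; puts("nat cleanup"); }
static const struct nat_ops nat_hook_ops = { .remove_nat_bysrc = nat_cleanup };
static _Atomic(const struct nat_ops *) nat_hook;	/* NULL until NAT loads */

static void conn_free(struct conn *ct)
{
	if (ct->status & STATUS_SRC_NAT_DONE) {
		const struct nat_ops *h = atomic_load(&nat_hook);

		if (h)
			h->remove_nat_bysrc(ct);
	}
}

int main(void)
{
	struct conn ct = { .status = STATUS_SRC_NAT_DONE };

	atomic_store(&nat_hook, &nat_hook_ops);	/* module registration */
	conn_free(&ct);				/* prints "nat cleanup" */
	return 0;
}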
index 41768ff..07e65b4 100644 (file)
@@ -131,13 +131,13 @@ static void ecache_work(struct work_struct *work)
 }
 
 static int __nf_conntrack_eventmask_report(struct nf_conntrack_ecache *e,
-                                          const unsigned int events,
-                                          const unsigned long missed,
+                                          const u32 events,
+                                          const u32 missed,
                                           const struct nf_ct_event *item)
 {
-       struct nf_conn *ct = item->ct;
        struct net *net = nf_ct_net(item->ct);
        struct nf_ct_event_notifier *notify;
+       u32 old, want;
        int ret;
 
        if (!((events | missed) & e->ctmask))
@@ -157,12 +157,13 @@ static int __nf_conntrack_eventmask_report(struct nf_conntrack_ecache *e,
        if (likely(ret >= 0 && missed == 0))
                return 0;
 
-       spin_lock_bh(&ct->lock);
-       if (ret < 0)
-               e->missed |= events;
-       else
-               e->missed &= ~missed;
-       spin_unlock_bh(&ct->lock);
+       do {
+               old = READ_ONCE(e->missed);
+               if (ret < 0)
+                       want = old | events;
+               else
+                       want = old & ~missed;
+       } while (cmpxchg(&e->missed, old, want) != old);
 
        return ret;
 }
@@ -172,7 +173,7 @@ int nf_conntrack_eventmask_report(unsigned int events, struct nf_conn *ct,
 {
        struct nf_conntrack_ecache *e;
        struct nf_ct_event item;
-       unsigned long missed;
+       unsigned int missed;
        int ret;
 
        if (!nf_ct_is_confirmed(ct))
@@ -211,7 +212,7 @@ void nf_ct_deliver_cached_events(struct nf_conn *ct)
 {
        struct nf_conntrack_ecache *e;
        struct nf_ct_event item;
-       unsigned long events;
+       unsigned int events;
 
        if (!nf_ct_is_confirmed(ct) || nf_ct_is_dying(ct))
                return;
@@ -304,12 +305,6 @@ void nf_conntrack_ecache_work(struct net *net, enum nf_ct_ecache_state state)
 #define NF_CT_EVENTS_DEFAULT 1
 static int nf_ct_events __read_mostly = NF_CT_EVENTS_DEFAULT;
 
-static const struct nf_ct_ext_type event_extend = {
-       .len    = sizeof(struct nf_conntrack_ecache),
-       .align  = __alignof__(struct nf_conntrack_ecache),
-       .id     = NF_CT_EXT_ECACHE,
-};
-
 void nf_conntrack_ecache_pernet_init(struct net *net)
 {
        struct nf_conntrack_net *cnet = nf_ct_pernet(net);
@@ -317,6 +312,8 @@ void nf_conntrack_ecache_pernet_init(struct net *net)
        net->ct.sysctl_events = nf_ct_events;
        cnet->ct_net = &net->ct;
        INIT_DELAYED_WORK(&cnet->ecache_dwork, ecache_work);
+
+       BUILD_BUG_ON(__IPCT_MAX >= 16); /* e->ctmask is u16 */
 }
 
 void nf_conntrack_ecache_pernet_fini(struct net *net)
@@ -325,19 +322,3 @@ void nf_conntrack_ecache_pernet_fini(struct net *net)
 
        cancel_delayed_work_sync(&cnet->ecache_dwork);
 }
-
-int nf_conntrack_ecache_init(void)
-{
-       int ret = nf_ct_extend_register(&event_extend);
-       if (ret < 0)
-               pr_err("Unable to register event extension\n");
-
-       BUILD_BUG_ON(__IPCT_MAX >= 16); /* ctmask, missed use u16 */
-
-       return ret;
-}
-
-void nf_conntrack_ecache_fini(void)
-{
-       nf_ct_extend_unregister(&event_extend);
-}
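Annotation: the ecache hunk swaps a spinlock-protected read-modify-write of e->missed for an open-coded compare-and-swap retry loop, which is safe once the mask fits a single machine word. The same loop rendered with C11 atomics, runnable in userspace:

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

static _Atomic uint32_t missed;

static void update_missed(int ret, uint32_t events, uint32_t done)
{
	uint32_t old = atomic_load(&missed), want;

	do {	/* retried only if another thread raced in between */
		want = ret < 0 ? (old | events) : (old & ~done);
	} while (!atomic_compare_exchange_weak(&missed, &old, want));
}

int main(void)
{
	update_missed(-1, 0x3, 0);	/* delivery failed: remember events */
	update_missed(0, 0, 0x1);	/* redelivered: clear that bit      */
	printf("missed=%#x\n", atomic_load(&missed));	/* 0x2 */
	return 0;
}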
index 3dbe232..1296fda 100644 (file)
 #include <linux/skbuff.h>
 #include <net/netfilter/nf_conntrack_extend.h>
 
-static struct nf_ct_ext_type __rcu *nf_ct_ext_types[NF_CT_EXT_NUM];
-static DEFINE_MUTEX(nf_ct_ext_type_mutex);
+#include <net/netfilter/nf_conntrack_helper.h>
+#include <net/netfilter/nf_conntrack_acct.h>
+#include <net/netfilter/nf_conntrack_seqadj.h>
+#include <net/netfilter/nf_conntrack_ecache.h>
+#include <net/netfilter/nf_conntrack_zones.h>
+#include <net/netfilter/nf_conntrack_timestamp.h>
+#include <net/netfilter/nf_conntrack_timeout.h>
+#include <net/netfilter/nf_conntrack_labels.h>
+#include <net/netfilter/nf_conntrack_synproxy.h>
+#include <net/netfilter/nf_conntrack_act_ct.h>
+#include <net/netfilter/nf_nat.h>
+
 #define NF_CT_EXT_PREALLOC     128u /* conntrack events are on by default */
 
-void nf_ct_ext_destroy(struct nf_conn *ct)
+static const u8 nf_ct_ext_type_len[NF_CT_EXT_NUM] = {
+       [NF_CT_EXT_HELPER] = sizeof(struct nf_conn_help),
+#if IS_ENABLED(CONFIG_NF_NAT)
+       [NF_CT_EXT_NAT] = sizeof(struct nf_conn_nat),
+#endif
+       [NF_CT_EXT_SEQADJ] = sizeof(struct nf_conn_seqadj),
+       [NF_CT_EXT_ACCT] = sizeof(struct nf_conn_acct),
+#ifdef CONFIG_NF_CONNTRACK_EVENTS
+       [NF_CT_EXT_ECACHE] = sizeof(struct nf_conntrack_ecache),
+#endif
+#ifdef CONFIG_NF_CONNTRACK_TIMESTAMP
+       [NF_CT_EXT_TSTAMP] = sizeof(struct nf_conn_tstamp),
+#endif
+#ifdef CONFIG_NF_CONNTRACK_TIMEOUT
+       [NF_CT_EXT_TIMEOUT] = sizeof(struct nf_conn_timeout),
+#endif
+#ifdef CONFIG_NF_CONNTRACK_LABELS
+       [NF_CT_EXT_LABELS] = sizeof(struct nf_conn_labels),
+#endif
+#if IS_ENABLED(CONFIG_NETFILTER_SYNPROXY)
+       [NF_CT_EXT_SYNPROXY] = sizeof(struct nf_conn_synproxy),
+#endif
+#if IS_ENABLED(CONFIG_NET_ACT_CT)
+       [NF_CT_EXT_ACT_CT] = sizeof(struct nf_conn_act_ct_ext),
+#endif
+};
+
+static __always_inline unsigned int total_extension_size(void)
 {
-       unsigned int i;
-       struct nf_ct_ext_type *t;
-
-       for (i = 0; i < NF_CT_EXT_NUM; i++) {
-               rcu_read_lock();
-               t = rcu_dereference(nf_ct_ext_types[i]);
-
-               /* Here the nf_ct_ext_type might have been unregisterd.
-                * I.e., it has responsible to cleanup private
-                * area in all conntracks when it is unregisterd.
-                */
-               if (t && t->destroy)
-                       t->destroy(ct);
-               rcu_read_unlock();
-       }
-
-       kfree(ct->ext);
+       /* remember to add new extensions below */
+       BUILD_BUG_ON(NF_CT_EXT_NUM > 10);
+
+       return sizeof(struct nf_ct_ext) +
+              sizeof(struct nf_conn_help)
+#if IS_ENABLED(CONFIG_NF_NAT)
+               + sizeof(struct nf_conn_nat)
+#endif
+               + sizeof(struct nf_conn_seqadj)
+               + sizeof(struct nf_conn_acct)
+#ifdef CONFIG_NF_CONNTRACK_EVENTS
+               + sizeof(struct nf_conntrack_ecache)
+#endif
+#ifdef CONFIG_NF_CONNTRACK_TIMESTAMP
+               + sizeof(struct nf_conn_tstamp)
+#endif
+#ifdef CONFIG_NF_CONNTRACK_TIMEOUT
+               + sizeof(struct nf_conn_timeout)
+#endif
+#ifdef CONFIG_NF_CONNTRACK_LABELS
+               + sizeof(struct nf_conn_labels)
+#endif
+#if IS_ENABLED(CONFIG_NETFILTER_SYNPROXY)
+               + sizeof(struct nf_conn_synproxy)
+#endif
+#if IS_ENABLED(CONFIG_NET_ACT_CT)
+               + sizeof(struct nf_conn_act_ct_ext)
+#endif
+       ;
 }
 
 void *nf_ct_ext_add(struct nf_conn *ct, enum nf_ct_ext_id id, gfp_t gfp)
 {
        unsigned int newlen, newoff, oldlen, alloc;
-       struct nf_ct_ext_type *t;
        struct nf_ct_ext *new;
 
        /* Conntrack must not be confirmed to avoid races on reallocation. */
        WARN_ON(nf_ct_is_confirmed(ct));
 
+       /* struct nf_ct_ext uses u8 to store offsets/size */
+       BUILD_BUG_ON(total_extension_size() > 255u);
 
        if (ct->ext) {
                const struct nf_ct_ext *old = ct->ext;
@@ -58,16 +108,8 @@ void *nf_ct_ext_add(struct nf_conn *ct, enum nf_ct_ext_id id, gfp_t gfp)
                oldlen = sizeof(*new);
        }
 
-       rcu_read_lock();
-       t = rcu_dereference(nf_ct_ext_types[id]);
-       if (!t) {
-               rcu_read_unlock();
-               return NULL;
-       }
-
-       newoff = ALIGN(oldlen, t->align);
-       newlen = newoff + t->len;
-       rcu_read_unlock();
+       newoff = ALIGN(oldlen, __alignof__(struct nf_ct_ext));
+       newlen = newoff + nf_ct_ext_type_len[id];
 
        alloc = max(newlen, NF_CT_EXT_PREALLOC);
        new = krealloc(ct->ext, alloc, gfp);
@@ -85,31 +127,3 @@ void *nf_ct_ext_add(struct nf_conn *ct, enum nf_ct_ext_id id, gfp_t gfp)
        return (void *)new + newoff;
 }
 EXPORT_SYMBOL(nf_ct_ext_add);
-
-/* This MUST be called in process context. */
-int nf_ct_extend_register(const struct nf_ct_ext_type *type)
-{
-       int ret = 0;
-
-       mutex_lock(&nf_ct_ext_type_mutex);
-       if (nf_ct_ext_types[type->id]) {
-               ret = -EBUSY;
-               goto out;
-       }
-
-       rcu_assign_pointer(nf_ct_ext_types[type->id], type);
-out:
-       mutex_unlock(&nf_ct_ext_type_mutex);
-       return ret;
-}
-EXPORT_SYMBOL_GPL(nf_ct_extend_register);
-
-/* This MUST be called in process context. */
-void nf_ct_extend_unregister(const struct nf_ct_ext_type *type)
-{
-       mutex_lock(&nf_ct_ext_type_mutex);
-       RCU_INIT_POINTER(nf_ct_ext_types[type->id], NULL);
-       mutex_unlock(&nf_ct_ext_type_mutex);
-       synchronize_rcu();
-}
-EXPORT_SYMBOL_GPL(nf_ct_extend_unregister);
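Annotation: nf_ct_ext_add() now derives sizes from the static table above and aligns every extension to __alignof__(struct nf_ct_ext) instead of per-type alignment, which is what lets the whole register/unregister machinery go. A toy rendering of the offset arithmetic, with made-up sizes:

#include <stdio.h>

#define ALIGN(x, a) (((x) + (a) - 1) & ~((a) - 1))

enum { EXT_HELPER, EXT_ACCT, EXT_NUM };

static const unsigned char ext_len[EXT_NUM] = {
	[EXT_HELPER] = 24,	/* illustrative sizes only */
	[EXT_ACCT]   = 32,
};

int main(void)
{
	unsigned int oldlen = 8;	/* stand-in for sizeof(struct nf_ct_ext) */
	unsigned int newoff = ALIGN(oldlen, 8);
	unsigned int newlen = newoff + ext_len[EXT_ACCT];

	printf("EXT_ACCT: offset %u, new length %u\n", newoff, newlen); /* 8, 40 */
	return 0;
}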
index ae4488a..a97ddb1 100644 (file)
@@ -550,12 +550,6 @@ void nf_nat_helper_unregister(struct nf_conntrack_nat_helper *nat)
 }
 EXPORT_SYMBOL_GPL(nf_nat_helper_unregister);
 
-static const struct nf_ct_ext_type helper_extend = {
-       .len    = sizeof(struct nf_conn_help),
-       .align  = __alignof__(struct nf_conn_help),
-       .id     = NF_CT_EXT_HELPER,
-};
-
 void nf_conntrack_helper_pernet_init(struct net *net)
 {
        struct nf_conntrack_net *cnet = nf_ct_pernet(net);
@@ -565,28 +559,17 @@ void nf_conntrack_helper_pernet_init(struct net *net)
 
 int nf_conntrack_helper_init(void)
 {
-       int ret;
        nf_ct_helper_hsize = 1; /* gets rounded up to use one page */
        nf_ct_helper_hash =
                nf_ct_alloc_hashtable(&nf_ct_helper_hsize, 0);
        if (!nf_ct_helper_hash)
                return -ENOMEM;
 
-       ret = nf_ct_extend_register(&helper_extend);
-       if (ret < 0) {
-               pr_err("nf_ct_helper: Unable to register helper extension.\n");
-               goto out_extend;
-       }
-
        INIT_LIST_HEAD(&nf_ct_nat_helpers);
        return 0;
-out_extend:
-       kvfree(nf_ct_helper_hash);
-       return ret;
 }
 
 void nf_conntrack_helper_fini(void)
 {
-       nf_ct_extend_unregister(&helper_extend);
        kvfree(nf_ct_helper_hash);
 }
index 5227925..6e70e13 100644 (file)
@@ -67,6 +67,8 @@ int nf_connlabels_get(struct net *net, unsigned int bits)
        net->ct.labels_used++;
        spin_unlock(&nf_connlabels_lock);
 
+       BUILD_BUG_ON(NF_CT_LABELS_MAX_SIZE / sizeof(long) >= U8_MAX);
+
        return 0;
 }
 EXPORT_SYMBOL_GPL(nf_connlabels_get);
@@ -78,21 +80,3 @@ void nf_connlabels_put(struct net *net)
        spin_unlock(&nf_connlabels_lock);
 }
 EXPORT_SYMBOL_GPL(nf_connlabels_put);
-
-static const struct nf_ct_ext_type labels_extend = {
-       .len    = sizeof(struct nf_conn_labels),
-       .align  = __alignof__(struct nf_conn_labels),
-       .id     = NF_CT_EXT_LABELS,
-};
-
-int nf_conntrack_labels_init(void)
-{
-       BUILD_BUG_ON(NF_CT_LABELS_MAX_SIZE / sizeof(long) >= U8_MAX);
-
-       return nf_ct_extend_register(&labels_extend);
-}
-
-void nf_conntrack_labels_fini(void)
-{
-       nf_ct_extend_unregister(&labels_extend);
-}
index 7032402..1ea2ad7 100644 (file)
 
 MODULE_LICENSE("GPL");
 
+struct ctnetlink_list_dump_ctx {
+       struct nf_conn *last;
+       unsigned int cpu;
+       bool done;
+};
+
 static int ctnetlink_dump_tuples_proto(struct sk_buff *skb,
                                const struct nf_conntrack_tuple *tuple,
                                const struct nf_conntrack_l4proto *l4proto)
@@ -1694,14 +1700,18 @@ static int ctnetlink_get_conntrack(struct sk_buff *skb,
 
 static int ctnetlink_done_list(struct netlink_callback *cb)
 {
-       if (cb->args[1])
-               nf_ct_put((struct nf_conn *)cb->args[1]);
+       struct ctnetlink_list_dump_ctx *ctx = (void *)cb->ctx;
+
+       if (ctx->last)
+               nf_ct_put(ctx->last);
+
        return 0;
 }
 
 static int
 ctnetlink_dump_list(struct sk_buff *skb, struct netlink_callback *cb, bool dying)
 {
+       struct ctnetlink_list_dump_ctx *ctx = (void *)cb->ctx;
        struct nf_conn *ct, *last;
        struct nf_conntrack_tuple_hash *h;
        struct hlist_nulls_node *n;
@@ -1712,12 +1722,12 @@ ctnetlink_dump_list(struct sk_buff *skb, struct netlink_callback *cb, bool dying
        struct hlist_nulls_head *list;
        struct net *net = sock_net(skb->sk);
 
-       if (cb->args[2])
+       if (ctx->done)
                return 0;
 
-       last = (struct nf_conn *)cb->args[1];
+       last = ctx->last;
 
-       for (cpu = cb->args[0]; cpu < nr_cpu_ids; cpu++) {
+       for (cpu = ctx->cpu; cpu < nr_cpu_ids; cpu++) {
                struct ct_pcpu *pcpu;
 
                if (!cpu_possible(cpu))
@@ -1731,10 +1741,10 @@ restart:
                        ct = nf_ct_tuplehash_to_ctrack(h);
                        if (l3proto && nf_ct_l3num(ct) != l3proto)
                                continue;
-                       if (cb->args[1]) {
+                       if (ctx->last) {
                                if (ct != last)
                                        continue;
-                               cb->args[1] = 0;
+                               ctx->last = NULL;
                        }
 
                        /* We can't dump extension info for the unconfirmed
@@ -1751,19 +1761,19 @@ restart:
                        if (res < 0) {
                                if (!refcount_inc_not_zero(&ct->ct_general.use))
                                        continue;
-                               cb->args[0] = cpu;
-                               cb->args[1] = (unsigned long)ct;
+                               ctx->cpu = cpu;
+                               ctx->last = ct;
                                spin_unlock_bh(&pcpu->lock);
                                goto out;
                        }
                }
-               if (cb->args[1]) {
-                       cb->args[1] = 0;
+               if (ctx->last) {
+                       ctx->last = NULL;
                        goto restart;
                }
                spin_unlock_bh(&pcpu->lock);
        }
-       cb->args[2] = 1;
+       ctx->done = true;
 out:
        if (last)
                nf_ct_put(last);
@@ -3878,6 +3888,8 @@ static int __init ctnetlink_init(void)
 {
        int ret;
 
+       BUILD_BUG_ON(sizeof(struct ctnetlink_list_dump_ctx) > sizeof_field(struct netlink_callback, ctx));
+
        ret = nfnetlink_subsys_register(&ctnl_subsys);
        if (ret < 0) {
                pr_err("ctnetlink_init: cannot register with nfnetlink.\n");
index 7d5708b..f3fa367 100644 (file)
@@ -45,30 +45,8 @@ MODULE_ALIAS_NFCT_HELPER("pptp");
 
 static DEFINE_SPINLOCK(nf_pptp_lock);
 
-int
-(*nf_nat_pptp_hook_outbound)(struct sk_buff *skb,
-                            struct nf_conn *ct, enum ip_conntrack_info ctinfo,
-                            unsigned int protoff, struct PptpControlHeader *ctlh,
-                            union pptp_ctrl_union *pptpReq) __read_mostly;
-EXPORT_SYMBOL_GPL(nf_nat_pptp_hook_outbound);
-
-int
-(*nf_nat_pptp_hook_inbound)(struct sk_buff *skb,
-                           struct nf_conn *ct, enum ip_conntrack_info ctinfo,
-                           unsigned int protoff, struct PptpControlHeader *ctlh,
-                           union pptp_ctrl_union *pptpReq) __read_mostly;
-EXPORT_SYMBOL_GPL(nf_nat_pptp_hook_inbound);
-
-void
-(*nf_nat_pptp_hook_exp_gre)(struct nf_conntrack_expect *expect_orig,
-                           struct nf_conntrack_expect *expect_reply)
-                           __read_mostly;
-EXPORT_SYMBOL_GPL(nf_nat_pptp_hook_exp_gre);
-
-void
-(*nf_nat_pptp_hook_expectfn)(struct nf_conn *ct,
-                            struct nf_conntrack_expect *exp) __read_mostly;
-EXPORT_SYMBOL_GPL(nf_nat_pptp_hook_expectfn);
+const struct nf_nat_pptp_hook *nf_nat_pptp_hook;
+EXPORT_SYMBOL_GPL(nf_nat_pptp_hook);
 
 #if defined(DEBUG) || defined(CONFIG_DYNAMIC_DEBUG)
 /* PptpControlMessageType names */
@@ -111,8 +89,8 @@ EXPORT_SYMBOL(pptp_msg_name);
 static void pptp_expectfn(struct nf_conn *ct,
                         struct nf_conntrack_expect *exp)
 {
+       const struct nf_nat_pptp_hook *hook;
        struct net *net = nf_ct_net(ct);
-       typeof(nf_nat_pptp_hook_expectfn) nf_nat_pptp_expectfn;
        pr_debug("increasing timeouts\n");
 
        /* increase timeout of GRE data channel conntrack entry */
@@ -122,9 +100,9 @@ static void pptp_expectfn(struct nf_conn *ct,
        /* Can you see how rusty this code is, compared with the pre-2.6.11
         * one? That's what happened to my shiny newnat of 2002 ;( -HW */
 
-       nf_nat_pptp_expectfn = rcu_dereference(nf_nat_pptp_hook_expectfn);
-       if (nf_nat_pptp_expectfn && ct->master->status & IPS_NAT_MASK)
-               nf_nat_pptp_expectfn(ct, exp);
+       hook = rcu_dereference(nf_nat_pptp_hook);
+       if (hook && ct->master->status & IPS_NAT_MASK)
+               hook->expectfn(ct, exp);
        else {
                struct nf_conntrack_tuple inv_t;
                struct nf_conntrack_expect *exp_other;
@@ -209,9 +187,9 @@ static void pptp_destroy_siblings(struct nf_conn *ct)
 static int exp_gre(struct nf_conn *ct, __be16 callid, __be16 peer_callid)
 {
        struct nf_conntrack_expect *exp_orig, *exp_reply;
+       const struct nf_nat_pptp_hook *hook;
        enum ip_conntrack_dir dir;
        int ret = 1;
-       typeof(nf_nat_pptp_hook_exp_gre) nf_nat_pptp_exp_gre;
 
        exp_orig = nf_ct_expect_alloc(ct);
        if (exp_orig == NULL)
@@ -239,9 +217,9 @@ static int exp_gre(struct nf_conn *ct, __be16 callid, __be16 peer_callid)
                          IPPROTO_GRE, &callid, &peer_callid);
        exp_reply->expectfn = pptp_expectfn;
 
-       nf_nat_pptp_exp_gre = rcu_dereference(nf_nat_pptp_hook_exp_gre);
-       if (nf_nat_pptp_exp_gre && ct->status & IPS_NAT_MASK)
-               nf_nat_pptp_exp_gre(exp_orig, exp_reply);
+       hook = rcu_dereference(nf_nat_pptp_hook);
+       if (hook && ct->status & IPS_NAT_MASK)
+               hook->exp_gre(exp_orig, exp_reply);
        if (nf_ct_expect_related(exp_orig, 0) != 0)
                goto out_put_both;
        if (nf_ct_expect_related(exp_reply, 0) != 0)
@@ -279,9 +257,9 @@ pptp_inbound_pkt(struct sk_buff *skb, unsigned int protoff,
                 enum ip_conntrack_info ctinfo)
 {
        struct nf_ct_pptp_master *info = nfct_help_data(ct);
+       const struct nf_nat_pptp_hook *hook;
        u_int16_t msg;
        __be16 cid = 0, pcid = 0;
-       typeof(nf_nat_pptp_hook_inbound) nf_nat_pptp_inbound;
 
        msg = ntohs(ctlh->messageType);
        pr_debug("inbound control message %s\n", pptp_msg_name(msg));
@@ -383,10 +361,9 @@ pptp_inbound_pkt(struct sk_buff *skb, unsigned int protoff,
                goto invalid;
        }
 
-       nf_nat_pptp_inbound = rcu_dereference(nf_nat_pptp_hook_inbound);
-       if (nf_nat_pptp_inbound && ct->status & IPS_NAT_MASK)
-               return nf_nat_pptp_inbound(skb, ct, ctinfo,
-                                          protoff, ctlh, pptpReq);
+       hook = rcu_dereference(nf_nat_pptp_hook);
+       if (hook && ct->status & IPS_NAT_MASK)
+               return hook->inbound(skb, ct, ctinfo, protoff, ctlh, pptpReq);
        return NF_ACCEPT;
 
 invalid:
@@ -407,9 +384,9 @@ pptp_outbound_pkt(struct sk_buff *skb, unsigned int protoff,
                  enum ip_conntrack_info ctinfo)
 {
        struct nf_ct_pptp_master *info = nfct_help_data(ct);
+       const struct nf_nat_pptp_hook *hook;
        u_int16_t msg;
        __be16 cid = 0, pcid = 0;
-       typeof(nf_nat_pptp_hook_outbound) nf_nat_pptp_outbound;
 
        msg = ntohs(ctlh->messageType);
        pr_debug("outbound control message %s\n", pptp_msg_name(msg));
@@ -479,10 +456,9 @@ pptp_outbound_pkt(struct sk_buff *skb, unsigned int protoff,
                goto invalid;
        }
 
-       nf_nat_pptp_outbound = rcu_dereference(nf_nat_pptp_hook_outbound);
-       if (nf_nat_pptp_outbound && ct->status & IPS_NAT_MASK)
-               return nf_nat_pptp_outbound(skb, ct, ctinfo,
-                                           protoff, ctlh, pptpReq);
+       hook = rcu_dereference(nf_nat_pptp_hook);
+       if (hook && ct->status & IPS_NAT_MASK)
+               return hook->outbound(skb, ct, ctinfo, protoff, ctlh, pptpReq);
        return NF_ACCEPT;
 
 invalid:
index 3b516cf..12f793d 100644 (file)
@@ -63,8 +63,10 @@ static bool udp_error(struct sk_buff *skb,
        }
 
        /* Packet with no checksum */
-       if (!hdr->check)
+       if (!hdr->check) {
+               skb->ip_summed = CHECKSUM_UNNECESSARY;
                return false;
+       }
 
        /* Checksum invalid? Ignore.
         * We skip checking packets on the outgoing path
index 3066449..7ab2b25 100644 (file)
@@ -232,19 +232,3 @@ s32 nf_ct_seq_offset(const struct nf_conn *ct,
                 this_way->offset_after : this_way->offset_before;
 }
 EXPORT_SYMBOL_GPL(nf_ct_seq_offset);
-
-static const struct nf_ct_ext_type nf_ct_seqadj_extend = {
-       .len    = sizeof(struct nf_conn_seqadj),
-       .align  = __alignof__(struct nf_conn_seqadj),
-       .id     = NF_CT_EXT_SEQADJ,
-};
-
-int nf_conntrack_seqadj_init(void)
-{
-       return nf_ct_extend_register(&nf_ct_seqadj_extend);
-}
-
-void nf_conntrack_seqadj_fini(void)
-{
-       nf_ct_extend_unregister(&nf_ct_seqadj_extend);
-}
index 14387e0..cec166e 100644 (file)
 #include <net/netfilter/nf_conntrack_l4proto.h>
 #include <net/netfilter/nf_conntrack_timeout.h>
 
-struct nf_ct_timeout *
-(*nf_ct_timeout_find_get_hook)(struct net *net, const char *name) __read_mostly;
-EXPORT_SYMBOL_GPL(nf_ct_timeout_find_get_hook);
-
-void (*nf_ct_timeout_put_hook)(struct nf_ct_timeout *timeout) __read_mostly;
-EXPORT_SYMBOL_GPL(nf_ct_timeout_put_hook);
+const struct nf_ct_timeout_hooks *nf_ct_timeout_hook __read_mostly;
+EXPORT_SYMBOL_GPL(nf_ct_timeout_hook);
 
 static int untimeout(struct nf_conn *ct, void *timeout)
 {
@@ -48,31 +44,30 @@ EXPORT_SYMBOL_GPL(nf_ct_untimeout);
 
 static void __nf_ct_timeout_put(struct nf_ct_timeout *timeout)
 {
-       typeof(nf_ct_timeout_put_hook) timeout_put;
+       const struct nf_ct_timeout_hooks *h = rcu_dereference(nf_ct_timeout_hook);
 
-       timeout_put = rcu_dereference(nf_ct_timeout_put_hook);
-       if (timeout_put)
-               timeout_put(timeout);
+       if (h)
+               h->timeout_put(timeout);
 }
 
 int nf_ct_set_timeout(struct net *net, struct nf_conn *ct,
                      u8 l3num, u8 l4num, const char *timeout_name)
 {
-       typeof(nf_ct_timeout_find_get_hook) timeout_find_get;
+       const struct nf_ct_timeout_hooks *h;
        struct nf_ct_timeout *timeout;
        struct nf_conn_timeout *timeout_ext;
        const char *errmsg = NULL;
        int ret = 0;
 
        rcu_read_lock();
-       timeout_find_get = rcu_dereference(nf_ct_timeout_find_get_hook);
-       if (!timeout_find_get) {
+       h = rcu_dereference(nf_ct_timeout_hook);
+       if (!h) {
                ret = -ENOENT;
                errmsg = "Timeout policy base is empty";
                goto out;
        }
 
-       timeout = timeout_find_get(net, timeout_name);
+       timeout = h->timeout_find_get(net, timeout_name);
        if (!timeout) {
                ret = -ENOENT;
                pr_info_ratelimited("No such timeout policy \"%s\"\n",
@@ -119,37 +114,18 @@ EXPORT_SYMBOL_GPL(nf_ct_set_timeout);
 void nf_ct_destroy_timeout(struct nf_conn *ct)
 {
        struct nf_conn_timeout *timeout_ext;
-       typeof(nf_ct_timeout_put_hook) timeout_put;
+       const struct nf_ct_timeout_hooks *h;
 
        rcu_read_lock();
-       timeout_put = rcu_dereference(nf_ct_timeout_put_hook);
+       h = rcu_dereference(nf_ct_timeout_hook);
 
-       if (timeout_put) {
+       if (h) {
                timeout_ext = nf_ct_timeout_find(ct);
                if (timeout_ext) {
-                       timeout_put(timeout_ext->timeout);
+                       h->timeout_put(timeout_ext->timeout);
                        RCU_INIT_POINTER(timeout_ext->timeout, NULL);
                }
        }
        rcu_read_unlock();
 }
 EXPORT_SYMBOL_GPL(nf_ct_destroy_timeout);
-
-static const struct nf_ct_ext_type timeout_extend = {
-       .len    = sizeof(struct nf_conn_timeout),
-       .align  = __alignof__(struct nf_conn_timeout),
-       .id     = NF_CT_EXT_TIMEOUT,
-};
-
-int nf_conntrack_timeout_init(void)
-{
-       int ret = nf_ct_extend_register(&timeout_extend);
-       if (ret < 0)
-               pr_err("nf_ct_timeout: Unable to register timeout extension.\n");
-       return ret;
-}
-
-void nf_conntrack_timeout_fini(void)
-{
-       nf_ct_extend_unregister(&timeout_extend);
-}
index f656d39..9e43a0a 100644 (file)
@@ -19,27 +19,7 @@ static bool nf_ct_tstamp __read_mostly;
 module_param_named(tstamp, nf_ct_tstamp, bool, 0644);
 MODULE_PARM_DESC(tstamp, "Enable connection tracking flow timestamping.");
 
-static const struct nf_ct_ext_type tstamp_extend = {
-       .len    = sizeof(struct nf_conn_tstamp),
-       .align  = __alignof__(struct nf_conn_tstamp),
-       .id     = NF_CT_EXT_TSTAMP,
-};
-
 void nf_conntrack_tstamp_pernet_init(struct net *net)
 {
        net->ct.sysctl_tstamp = nf_ct_tstamp;
 }
-
-int nf_conntrack_tstamp_init(void)
-{
-       int ret;
-       ret = nf_ct_extend_register(&tstamp_extend);
-       if (ret < 0)
-               pr_err("Unable to register extension\n");
-       return ret;
-}
-
-void nf_conntrack_tstamp_fini(void)
-{
-       nf_ct_extend_unregister(&tstamp_extend);
-}
index 2d06a66..58c06ac 100644 (file)
@@ -838,7 +838,7 @@ static int nf_nat_proto_remove(struct nf_conn *i, void *data)
        return i->status & IPS_NAT_MASK ? 1 : 0;
 }
 
-static void __nf_nat_cleanup_conntrack(struct nf_conn *ct)
+static void nf_nat_cleanup_conntrack(struct nf_conn *ct)
 {
        unsigned int h;
 
@@ -860,7 +860,7 @@ static int nf_nat_proto_clean(struct nf_conn *ct, void *data)
         * will delete entry from already-freed table.
         */
        if (test_and_clear_bit(IPS_SRC_NAT_DONE_BIT, &ct->status))
-               __nf_nat_cleanup_conntrack(ct);
+               nf_nat_cleanup_conntrack(ct);
 
        /* don't delete conntrack.  Although that would make things a lot
         * simpler, we'd end up flushing all conntracks on nat rmmod.
@@ -868,20 +868,6 @@ static int nf_nat_proto_clean(struct nf_conn *ct, void *data)
        return 0;
 }
 
-/* No one using conntrack by the time this called. */
-static void nf_nat_cleanup_conntrack(struct nf_conn *ct)
-{
-       if (ct->status & IPS_SRC_NAT_DONE)
-               __nf_nat_cleanup_conntrack(ct);
-}
-
-static struct nf_ct_ext_type nat_extend __read_mostly = {
-       .len            = sizeof(struct nf_conn_nat),
-       .align          = __alignof__(struct nf_conn_nat),
-       .destroy        = nf_nat_cleanup_conntrack,
-       .id             = NF_CT_EXT_NAT,
-};
-
 #if IS_ENABLED(CONFIG_NF_CT_NETLINK)
 
 #include <linux/netfilter/nfnetlink.h>
@@ -1173,6 +1159,7 @@ static const struct nf_nat_hook nat_hook = {
        .decode_session         = __nf_nat_decode_session,
 #endif
        .manip_pkt              = nf_nat_manip_pkt,
+       .remove_nat_bysrc       = nf_nat_cleanup_conntrack,
 };
 
 static int __init nf_nat_init(void)
@@ -1188,19 +1175,11 @@ static int __init nf_nat_init(void)
        if (!nf_nat_bysource)
                return -ENOMEM;
 
-       ret = nf_ct_extend_register(&nat_extend);
-       if (ret < 0) {
-               kvfree(nf_nat_bysource);
-               pr_err("Unable to register extension\n");
-               return ret;
-       }
-
        for (i = 0; i < CONNTRACK_LOCKS; i++)
                spin_lock_init(&nf_nat_locks[i]);
 
        ret = register_pernet_subsys(&nat_net_ops);
        if (ret < 0) {
-               nf_ct_extend_unregister(&nat_extend);
                kvfree(nf_nat_bysource);
                return ret;
        }
@@ -1219,7 +1198,6 @@ static void __exit nf_nat_cleanup(void)
 
        nf_ct_iterate_destroy(nf_nat_proto_clean, &clean);
 
-       nf_ct_extend_unregister(&nat_extend);
        nf_ct_helper_expectfn_unregister(&follow_master_nat);
        RCU_INIT_POINTER(nf_nat_hook, NULL);
 
index 2dfc5da..e479dd0 100644 (file)
@@ -236,12 +236,6 @@ synproxy_tstamp_adjust(struct sk_buff *skb, unsigned int protoff,
        return 1;
 }
 
-static struct nf_ct_ext_type nf_ct_synproxy_extend __read_mostly = {
-       .len            = sizeof(struct nf_conn_synproxy),
-       .align          = __alignof__(struct nf_conn_synproxy),
-       .id             = NF_CT_EXT_SYNPROXY,
-};
-
 #ifdef CONFIG_PROC_FS
 static void *synproxy_cpu_seq_start(struct seq_file *seq, loff_t *pos)
 {
@@ -387,28 +381,12 @@ static struct pernet_operations synproxy_net_ops = {
 
 static int __init synproxy_core_init(void)
 {
-       int err;
-
-       err = nf_ct_extend_register(&nf_ct_synproxy_extend);
-       if (err < 0)
-               goto err1;
-
-       err = register_pernet_subsys(&synproxy_net_ops);
-       if (err < 0)
-               goto err2;
-
-       return 0;
-
-err2:
-       nf_ct_extend_unregister(&nf_ct_synproxy_extend);
-err1:
-       return err;
+       return register_pernet_subsys(&synproxy_net_ops);
 }
 
 static void __exit synproxy_core_exit(void)
 {
        unregister_pernet_subsys(&synproxy_net_ops);
-       nf_ct_extend_unregister(&nf_ct_synproxy_extend);
 }
 
 module_init(synproxy_core_init);
index 36e73f9..c6c05b2 100644 (file)
@@ -67,6 +67,20 @@ static void nft_cmp_fast_eval(const struct nft_expr *expr,
        regs->verdict.code = NFT_BREAK;
 }
 
+static void nft_cmp16_fast_eval(const struct nft_expr *expr,
+                               struct nft_regs *regs)
+{
+       const struct nft_cmp16_fast_expr *priv = nft_expr_priv(expr);
+       const u64 *reg_data = (const u64 *)&regs->data[priv->sreg];
+       const u64 *mask = (const u64 *)&priv->mask;
+       const u64 *data = (const u64 *)&priv->data;
+
+       if (((reg_data[0] & mask[0]) == data[0] &&
+           ((reg_data[1] & mask[1]) == data[1])) ^ priv->inv)
+               return;
+       regs->verdict.code = NFT_BREAK;
+}
+
 static noinline void __nft_trace_verdict(struct nft_traceinfo *info,
                                         const struct nft_chain *chain,
                                         const struct nft_regs *regs)
@@ -225,6 +239,8 @@ next_rule:
                nft_rule_dp_for_each_expr(expr, last, rule) {
                        if (expr->ops == &nft_cmp_fast_ops)
                                nft_cmp_fast_eval(expr, &regs);
+                       else if (expr->ops == &nft_cmp16_fast_ops)
+                               nft_cmp16_fast_eval(expr, &regs);
                        else if (expr->ops == &nft_bitwise_fast_ops)
                                nft_bitwise_fast_eval(expr, &regs);
                        else if (expr->ops != &nft_payload_fast_ops ||
index c57673d..b0d8888 100644 (file)
@@ -605,6 +605,11 @@ static struct pernet_operations cttimeout_ops = {
        .size   = sizeof(struct nfct_timeout_pernet),
 };
 
+static const struct nf_ct_timeout_hooks hooks = {
+       .timeout_find_get = ctnl_timeout_find_get,
+       .timeout_put = ctnl_timeout_put,
+};
+
 static int __init cttimeout_init(void)
 {
        int ret;
@@ -619,8 +624,7 @@ static int __init cttimeout_init(void)
                        "nfnetlink.\n");
                goto err_out;
        }
-       RCU_INIT_POINTER(nf_ct_timeout_find_get_hook, ctnl_timeout_find_get);
-       RCU_INIT_POINTER(nf_ct_timeout_put_hook, ctnl_timeout_put);
+       RCU_INIT_POINTER(nf_ct_timeout_hook, &hooks);
        return 0;
 
 err_out:
@@ -633,8 +637,7 @@ static void __exit cttimeout_exit(void)
        nfnetlink_subsys_unregister(&cttimeout_subsys);
 
        unregister_pernet_subsys(&cttimeout_ops);
-       RCU_INIT_POINTER(nf_ct_timeout_find_get_hook, NULL);
-       RCU_INIT_POINTER(nf_ct_timeout_put_hook, NULL);
+       RCU_INIT_POINTER(nf_ct_timeout_hook, NULL);
        synchronize_rcu();
 }
 
index ea2d9c2..8c15978 100644 (file)
@@ -402,6 +402,7 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue,
                + nla_total_size(sizeof(u_int32_t))     /* ifindex */
 #endif
                + nla_total_size(sizeof(u_int32_t))     /* mark */
+               + nla_total_size(sizeof(u_int32_t))     /* priority */
                + nla_total_size(sizeof(struct nfqnl_msg_packet_hw))
                + nla_total_size(sizeof(u_int32_t))     /* skbinfo */
                + nla_total_size(sizeof(u_int32_t));    /* cap_len */
@@ -559,6 +560,10 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue,
            nla_put_be32(skb, NFQA_MARK, htonl(entskb->mark)))
                goto nla_put_failure;
 
+       if (entskb->priority &&
+           nla_put_be32(skb, NFQA_PRIORITY, htonl(entskb->priority)))
+               goto nla_put_failure;
+
        if (indev && entskb->dev &&
            skb_mac_header_was_set(entskb) &&
            skb_mac_header_len(entskb) != 0) {
@@ -1014,11 +1019,13 @@ static const struct nla_policy nfqa_verdict_policy[NFQA_MAX+1] = {
        [NFQA_CT]               = { .type = NLA_UNSPEC },
        [NFQA_EXP]              = { .type = NLA_UNSPEC },
        [NFQA_VLAN]             = { .type = NLA_NESTED },
+       [NFQA_PRIORITY]         = { .type = NLA_U32 },
 };
 
 static const struct nla_policy nfqa_verdict_batch_policy[NFQA_MAX+1] = {
        [NFQA_VERDICT_HDR]      = { .len = sizeof(struct nfqnl_msg_verdict_hdr) },
        [NFQA_MARK]             = { .type = NLA_U32 },
+       [NFQA_PRIORITY]         = { .type = NLA_U32 },
 };
 
 static struct nfqnl_instance *
@@ -1099,6 +1106,9 @@ static int nfqnl_recv_verdict_batch(struct sk_buff *skb,
                if (nfqa[NFQA_MARK])
                        entry->skb->mark = ntohl(nla_get_be32(nfqa[NFQA_MARK]));
 
+               if (nfqa[NFQA_PRIORITY])
+                       entry->skb->priority = ntohl(nla_get_be32(nfqa[NFQA_PRIORITY]));
+
                nfqnl_reinject(entry, verdict);
        }
        return 0;
@@ -1225,6 +1235,9 @@ static int nfqnl_recv_verdict(struct sk_buff *skb, const struct nfnl_info *info,
        if (nfqa[NFQA_MARK])
                entry->skb->mark = ntohl(nla_get_be32(nfqa[NFQA_MARK]));
 
+       if (nfqa[NFQA_PRIORITY])
+               entry->skb->priority = ntohl(nla_get_be32(nfqa[NFQA_PRIORITY]));
+
        nfqnl_reinject(entry, verdict);
        return 0;
 }
index 47b6d05..917072a 100644 (file)
@@ -272,12 +272,103 @@ const struct nft_expr_ops nft_cmp_fast_ops = {
        .offload        = nft_cmp_fast_offload,
 };
 
+static u32 nft_cmp_mask(u32 bitlen)
+{
+       return (__force u32)cpu_to_le32(~0U >> (sizeof(u32) * BITS_PER_BYTE - bitlen));
+}
+
+static void nft_cmp16_fast_mask(struct nft_data *data, unsigned int bitlen)
+{
+       int len = bitlen / BITS_PER_BYTE;
+       int i, words = len / sizeof(u32);
+
+       for (i = 0; i < words; i++) {
+               data->data[i] = 0xffffffff;
+               bitlen -= sizeof(u32) * BITS_PER_BYTE;
+       }
+
+       if (len % sizeof(u32))
+               data->data[i++] = nft_cmp_mask(bitlen);
+
+       for (; i < 4; i++)
+               data->data[i] = 0;
+}
+
+static int nft_cmp16_fast_init(const struct nft_ctx *ctx,
+                              const struct nft_expr *expr,
+                              const struct nlattr * const tb[])
+{
+       struct nft_cmp16_fast_expr *priv = nft_expr_priv(expr);
+       struct nft_data_desc desc;
+       int err;
+
+       err = nft_data_init(NULL, &priv->data, sizeof(priv->data), &desc,
+                           tb[NFTA_CMP_DATA]);
+       if (err < 0)
+               return err;
+
+       err = nft_parse_register_load(tb[NFTA_CMP_SREG], &priv->sreg, desc.len);
+       if (err < 0)
+               return err;
+
+       nft_cmp16_fast_mask(&priv->mask, desc.len * BITS_PER_BYTE);
+       priv->inv = ntohl(nla_get_be32(tb[NFTA_CMP_OP])) != NFT_CMP_EQ;
+       priv->len = desc.len;
+
+       return 0;
+}
+
+static int nft_cmp16_fast_offload(struct nft_offload_ctx *ctx,
+                                 struct nft_flow_rule *flow,
+                                 const struct nft_expr *expr)
+{
+       const struct nft_cmp16_fast_expr *priv = nft_expr_priv(expr);
+       struct nft_cmp_expr cmp = {
+               .data   = priv->data,
+               .sreg   = priv->sreg,
+               .len    = priv->len,
+               .op     = priv->inv ? NFT_CMP_NEQ : NFT_CMP_EQ,
+       };
+
+       return __nft_cmp_offload(ctx, flow, &cmp);
+}
+
+static int nft_cmp16_fast_dump(struct sk_buff *skb, const struct nft_expr *expr)
+{
+       const struct nft_cmp16_fast_expr *priv = nft_expr_priv(expr);
+       enum nft_cmp_ops op = priv->inv ? NFT_CMP_NEQ : NFT_CMP_EQ;
+
+       if (nft_dump_register(skb, NFTA_CMP_SREG, priv->sreg))
+               goto nla_put_failure;
+       if (nla_put_be32(skb, NFTA_CMP_OP, htonl(op)))
+               goto nla_put_failure;
+
+       if (nft_data_dump(skb, NFTA_CMP_DATA, &priv->data,
+                         NFT_DATA_VALUE, priv->len) < 0)
+               goto nla_put_failure;
+       return 0;
+
+nla_put_failure:
+       return -1;
+}
+
+const struct nft_expr_ops nft_cmp16_fast_ops = {
+       .type           = &nft_cmp_type,
+       .size           = NFT_EXPR_SIZE(sizeof(struct nft_cmp16_fast_expr)),
+       .eval           = NULL, /* inlined */
+       .init           = nft_cmp16_fast_init,
+       .dump           = nft_cmp16_fast_dump,
+       .offload        = nft_cmp16_fast_offload,
+};
+
 static const struct nft_expr_ops *
 nft_cmp_select_ops(const struct nft_ctx *ctx, const struct nlattr * const tb[])
 {
        struct nft_data_desc desc;
        struct nft_data data;
        enum nft_cmp_ops op;
+       u8 sreg;
        int err;
 
        if (tb[NFTA_CMP_SREG] == NULL ||
@@ -306,9 +397,16 @@ nft_cmp_select_ops(const struct nft_ctx *ctx, const struct nlattr * const tb[])
        if (desc.type != NFT_DATA_VALUE)
                goto err1;
 
-       if (desc.len <= sizeof(u32) && (op == NFT_CMP_EQ || op == NFT_CMP_NEQ))
-               return &nft_cmp_fast_ops;
+       sreg = ntohl(nla_get_be32(tb[NFTA_CMP_SREG]));
 
+       if (op == NFT_CMP_EQ || op == NFT_CMP_NEQ) {
+               if (desc.len <= sizeof(u32))
+                       return &nft_cmp_fast_ops;
+               else if (desc.len <= sizeof(data) &&
+                        ((sreg >= NFT_REG_1 && sreg <= NFT_REG_4) ||
+                         (sreg >= NFT_REG32_00 && sreg <= NFT_REG32_12 && sreg % 2 == 0)))
+                       return &nft_cmp16_fast_ops;
+       }
        return &nft_cmp_ops;
 err1:
        nft_data_release(&data, desc.type);
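
For reference, nft_cmp16_fast_mask() above builds a byte-granular all-ones prefix over the 16-byte comparison value: whole 32-bit words are set to ~0, one trailing partial word gets a little-endian mask of the remaining bits, and the rest is zeroed. A self-contained userspace rendition (hypothetical names; assumes a little-endian host and a bit length that is a multiple of 8):

    #include <stdint.h>
    #include <stdio.h>

    static uint32_t cmp_mask(uint32_t bitlen)        /* 1..31 */
    {
            return ~0U >> (32 - bitlen);             /* low-order bits set */
    }

    static void cmp16_mask(uint32_t data[4], unsigned int bitlen)
    {
            unsigned int i, words = bitlen / 32;

            for (i = 0; i < words; i++, bitlen -= 32)
                    data[i] = 0xffffffff;
            if (bitlen)
                    data[i++] = cmp_mask(bitlen);
            for (; i < 4; i++)
                    data[i] = 0;
    }

    int main(void)
    {
            uint32_t m[4];

            cmp16_mask(m, 48);      /* e.g. a 6-byte Ethernet address */
            printf("%08x %08x %08x %08x\n", m[0], m[1], m[2], m[3]);
            return 0;               /* ffffffff 0000ffff 00000000 00000000 */
    }
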
index f69cc73..5a46d82 100644 (file)
@@ -731,6 +731,14 @@ static const struct nfnetlink_subsystem nfnl_compat_subsys = {
 
 static struct nft_expr_type nft_match_type;
 
+static bool nft_match_reduce(struct nft_regs_track *track,
+                            const struct nft_expr *expr)
+{
+       const struct xt_match *match = expr->ops->data;
+
+       return strcmp(match->name, "comment") == 0;
+}
+
 static const struct nft_expr_ops *
 nft_match_select_ops(const struct nft_ctx *ctx,
                     const struct nlattr * const tb[])
@@ -773,6 +781,7 @@ nft_match_select_ops(const struct nft_ctx *ctx,
        ops->dump = nft_match_dump;
        ops->validate = nft_match_validate;
        ops->data = match;
+       ops->reduce = nft_match_reduce;
 
        matchsize = NFT_EXPR_SIZE(XT_ALIGN(match->matchsize));
        if (matchsize > NFT_MATCH_LARGE_THRESH) {
index 9e927ab..d2b9378 100644 (file)
@@ -308,6 +308,63 @@ err:
        regs->verdict.code = NFT_BREAK;
 }
 
+static void nft_exthdr_tcp_strip_eval(const struct nft_expr *expr,
+                                     struct nft_regs *regs,
+                                     const struct nft_pktinfo *pkt)
+{
+       u8 buff[sizeof(struct tcphdr) + MAX_TCP_OPTION_SPACE];
+       struct nft_exthdr *priv = nft_expr_priv(expr);
+       unsigned int i, tcphdr_len, optl;
+       struct tcphdr *tcph;
+       u8 *opt;
+
+       tcph = nft_tcp_header_pointer(pkt, sizeof(buff), buff, &tcphdr_len);
+       if (!tcph)
+               goto err;
+
+       if (skb_ensure_writable(pkt->skb, nft_thoff(pkt) + tcphdr_len))
+               goto drop;
+
+       opt = (u8 *)nft_tcp_header_pointer(pkt, sizeof(buff), buff, &tcphdr_len);
+       if (!opt)
+               goto err;
+       for (i = sizeof(*tcph); i < tcphdr_len - 1; i += optl) {
+               unsigned int j;
+
+               optl = optlen(opt, i);
+               if (priv->type != opt[i])
+                       continue;
+
+               if (i + optl > tcphdr_len)
+                       goto drop;
+
+               for (j = 0; j < optl; ++j) {
+                       u16 n = TCPOPT_NOP;
+                       u16 o = opt[i+j];
+
+                       if ((i + j) % 2 == 0) {
+                               o <<= 8;
+                               n <<= 8;
+                       }
+                       inet_proto_csum_replace2(&tcph->check, pkt->skb, htons(o),
+                                                htons(n), false);
+               }
+               memset(opt + i, TCPOPT_NOP, optl);
+               return;
+       }
+
+       /* option not found, continue. This allows multiple
+        * option removals per rule.
+        */
+       return;
+err:
+       regs->verdict.code = NFT_BREAK;
+       return;
+drop:
+       /* can't remove, no choice but to drop */
+       regs->verdict.code = NF_DROP;
+}
+
 static void nft_exthdr_sctp_eval(const struct nft_expr *expr,
                                 struct nft_regs *regs,
                                 const struct nft_pktinfo *pkt)
@@ -457,6 +514,28 @@ static int nft_exthdr_tcp_set_init(const struct nft_ctx *ctx,
                                       priv->len);
 }
 
+static int nft_exthdr_tcp_strip_init(const struct nft_ctx *ctx,
+                                    const struct nft_expr *expr,
+                                    const struct nlattr * const tb[])
+{
+       struct nft_exthdr *priv = nft_expr_priv(expr);
+
+       if (tb[NFTA_EXTHDR_SREG] ||
+           tb[NFTA_EXTHDR_DREG] ||
+           tb[NFTA_EXTHDR_FLAGS] ||
+           tb[NFTA_EXTHDR_OFFSET] ||
+           tb[NFTA_EXTHDR_LEN])
+               return -EINVAL;
+
+       if (!tb[NFTA_EXTHDR_TYPE])
+               return -EINVAL;
+
+       priv->type = nla_get_u8(tb[NFTA_EXTHDR_TYPE]);
+       priv->op = NFT_EXTHDR_OP_TCPOPT;
+
+       return 0;
+}
+
 static int nft_exthdr_ipv4_init(const struct nft_ctx *ctx,
                                const struct nft_expr *expr,
                                const struct nlattr * const tb[])
@@ -517,6 +596,13 @@ static int nft_exthdr_dump_set(struct sk_buff *skb, const struct nft_expr *expr)
        return nft_exthdr_dump_common(skb, priv);
 }
 
+static int nft_exthdr_dump_strip(struct sk_buff *skb, const struct nft_expr *expr)
+{
+       const struct nft_exthdr *priv = nft_expr_priv(expr);
+
+       return nft_exthdr_dump_common(skb, priv);
+}
+
 static const struct nft_expr_ops nft_exthdr_ipv6_ops = {
        .type           = &nft_exthdr_type,
        .size           = NFT_EXPR_SIZE(sizeof(struct nft_exthdr)),
@@ -549,6 +635,14 @@ static const struct nft_expr_ops nft_exthdr_tcp_set_ops = {
        .dump           = nft_exthdr_dump_set,
 };
 
+static const struct nft_expr_ops nft_exthdr_tcp_strip_ops = {
+       .type           = &nft_exthdr_type,
+       .size           = NFT_EXPR_SIZE(sizeof(struct nft_exthdr)),
+       .eval           = nft_exthdr_tcp_strip_eval,
+       .init           = nft_exthdr_tcp_strip_init,
+       .dump           = nft_exthdr_dump_strip,
+};
+
 static const struct nft_expr_ops nft_exthdr_sctp_ops = {
        .type           = &nft_exthdr_type,
        .size           = NFT_EXPR_SIZE(sizeof(struct nft_exthdr)),
@@ -576,7 +670,7 @@ nft_exthdr_select_ops(const struct nft_ctx *ctx,
                        return &nft_exthdr_tcp_set_ops;
                if (tb[NFTA_EXTHDR_DREG])
                        return &nft_exthdr_tcp_ops;
-               break;
+               return &nft_exthdr_tcp_strip_ops;
        case NFT_EXTHDR_OP_IPV6:
                if (tb[NFTA_EXTHDR_DREG])
                        return &nft_exthdr_ipv6_ops;
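
The new strip operation rewrites a matched TCP option to TCPOPT_NOP bytes in place and patches the checksum 16 bits at a time, shifting the old/new byte into the high half when it sits at an even offset. The option walk itself is plain TCP option encoding; a self-contained sketch of just the rewrite step, with the checksum fixup omitted and helper names invented for illustration:

    #include <stdint.h>
    #include <string.h>

    #define TCPOPT_EOL 0
    #define TCPOPT_NOP 1

    /* EOL and NOP are single bytes; other options carry their length
     * in the second byte.
     */
    static unsigned int tcp_optlen(const uint8_t *opt, unsigned int i)
    {
            return opt[i] <= TCPOPT_NOP ? 1 : opt[i + 1];
    }

    static int strip_tcp_option(uint8_t *opt, unsigned int space, uint8_t kind)
    {
            unsigned int i, l;

            for (i = 0; i < space - 1; i += l) {
                    l = tcp_optlen(opt, i);
                    if (!l || i + l > space)
                            return -1;      /* malformed option area */
                    if (opt[i] == kind) {
                            memset(opt + i, TCPOPT_NOP, l);
                            return 0;
                    }
            }
            return 1;                       /* not found: not an error */
    }
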
index 67ad083..7e8a39a 100644 (file)
@@ -37,6 +37,7 @@
 #include <net/genetlink.h>
 #include <net/net_namespace.h>
 #include <net/netns/generic.h>
+#include <net/pkt_cls.h>
 
 #include "datapath.h"
 #include "flow.h"
@@ -1601,8 +1602,6 @@ static void ovs_dp_reset_user_features(struct sk_buff *skb,
        dp->user_features = 0;
 }
 
-DEFINE_STATIC_KEY_FALSE(tc_recirc_sharing_support);
-
 static int ovs_dp_set_upcall_portids(struct datapath *dp,
                              const struct nlattr *ids)
 {
@@ -1657,7 +1656,7 @@ u32 ovs_dp_get_upcall_portid(const struct datapath *dp, uint32_t cpu_id)
 
 static int ovs_dp_change(struct datapath *dp, struct nlattr *a[])
 {
-       u32 user_features = 0;
+       u32 user_features = 0, old_features = dp->user_features;
        int err;
 
        if (a[OVS_DP_ATTR_USER_FEATURES]) {
@@ -1696,10 +1695,12 @@ static int ovs_dp_change(struct datapath *dp, struct nlattr *a[])
                        return err;
        }
 
-       if (dp->user_features & OVS_DP_F_TC_RECIRC_SHARING)
-               static_branch_enable(&tc_recirc_sharing_support);
-       else
-               static_branch_disable(&tc_recirc_sharing_support);
+       if ((dp->user_features & OVS_DP_F_TC_RECIRC_SHARING) &&
+           !(old_features & OVS_DP_F_TC_RECIRC_SHARING))
+               tc_skb_ext_tc_enable();
+       else if (!(dp->user_features & OVS_DP_F_TC_RECIRC_SHARING) &&
+                (old_features & OVS_DP_F_TC_RECIRC_SHARING))
+               tc_skb_ext_tc_disable();
 
        return 0;
 }
@@ -1839,6 +1840,9 @@ static void __dp_destroy(struct datapath *dp)
        struct flow_table *table = &dp->table;
        int i;
 
+       if (dp->user_features & OVS_DP_F_TC_RECIRC_SHARING)
+               tc_skb_ext_tc_disable();
+
        for (i = 0; i < DP_VPORT_HASH_BUCKETS; i++) {
                struct vport *vport;
                struct hlist_node *n;
index fcfe6cb..0cd2997 100644 (file)
@@ -253,8 +253,6 @@ static inline struct datapath *get_dp(struct net *net, int dp_ifindex)
 extern struct notifier_block ovs_dp_device_notifier;
 extern struct genl_family dp_vport_genl_family;
 
-DECLARE_STATIC_KEY_FALSE(tc_recirc_sharing_support);
-
 void ovs_dp_process_packet(struct sk_buff *skb, struct sw_flow_key *key);
 void ovs_dp_detach_port(struct vport *);
 int ovs_dp_upcall(struct datapath *, struct sk_buff *,
index 02096f2..f6cd24f 100644 (file)
@@ -34,6 +34,7 @@
 #include <net/mpls.h>
 #include <net/ndisc.h>
 #include <net/nsh.h>
+#include <net/pkt_cls.h>
 #include <net/netfilter/nf_conntrack_zones.h>
 
 #include "conntrack.h"
@@ -895,7 +896,7 @@ int ovs_flow_key_extract(const struct ip_tunnel_info *tun_info,
        key->mac_proto = res;
 
 #if IS_ENABLED(CONFIG_NET_TC_SKB_EXT)
-       if (static_branch_unlikely(&tc_recirc_sharing_support)) {
+       if (tc_skb_ext_tc_enabled()) {
                tc_ext = skb_ext_find(skb, TC_SKB_EXT);
                key->recirc_id = tc_ext ? tc_ext->chain : 0;
                OVS_CB(skb)->mru = tc_ext ? tc_ext->mru : 0;
index f99247f..7108e71 100644 (file)
@@ -57,12 +57,6 @@ static const struct rhashtable_params zones_params = {
        .automatic_shrinking = true,
 };
 
-static struct nf_ct_ext_type act_ct_extend __read_mostly = {
-       .len            = sizeof(struct nf_conn_act_ct_ext),
-       .align          = __alignof__(struct nf_conn_act_ct_ext),
-       .id             = NF_CT_EXT_ACT_CT,
-};
-
 static struct flow_action_entry *
 tcf_ct_flow_table_flow_action_get_next(struct flow_action *flow_action)
 {
@@ -1608,16 +1602,10 @@ static int __init ct_init_module(void)
        if (err)
                goto err_register;
 
-       err = nf_ct_extend_register(&act_ct_extend);
-       if (err)
-               goto err_register_extend;
-
        static_branch_inc(&tcf_frag_xmit_count);
 
        return 0;
 
-err_register_extend:
-       tcf_unregister_action(&act_ct_ops, &ct_net_ops);
 err_register:
        tcf_ct_flow_tables_uninit();
 err_tbl_init:
@@ -1628,7 +1616,6 @@ err_tbl_init:
 static void __exit ct_cleanup_module(void)
 {
        static_branch_dec(&tcf_frag_xmit_count);
-       nf_ct_extend_unregister(&act_ct_extend);
        tcf_unregister_action(&act_ct_ops, &ct_net_ops);
        tcf_ct_flow_tables_uninit();
        destroy_workqueue(act_ct_wq);
index 5f0f346..ff1e6b4 100644 (file)
@@ -49,6 +49,23 @@ static LIST_HEAD(tcf_proto_base);
 /* Protects list of registered TC modules. It is pure SMP lock. */
 static DEFINE_RWLOCK(cls_mod_lock);
 
+#ifdef CONFIG_NET_CLS_ACT
+DEFINE_STATIC_KEY_FALSE(tc_skb_ext_tc);
+EXPORT_SYMBOL(tc_skb_ext_tc);
+
+void tc_skb_ext_tc_enable(void)
+{
+       static_branch_inc(&tc_skb_ext_tc);
+}
+EXPORT_SYMBOL(tc_skb_ext_tc_enable);
+
+void tc_skb_ext_tc_disable(void)
+{
+       static_branch_dec(&tc_skb_ext_tc);
+}
+EXPORT_SYMBOL(tc_skb_ext_tc_disable);
+#endif
+
 static u32 destroy_obj_hashfn(const struct tcf_proto *tp)
 {
        return jhash_3words(tp->chain->index, tp->prio,
@@ -1615,19 +1632,21 @@ int tcf_classify(struct sk_buff *skb,
        ret = __tcf_classify(skb, tp, orig_tp, res, compat_mode,
                             &last_executed_chain);
 
-       /* If we missed on some chain */
-       if (ret == TC_ACT_UNSPEC && last_executed_chain) {
-               struct tc_skb_cb *cb = tc_skb_cb(skb);
-
-               ext = tc_skb_ext_alloc(skb);
-               if (WARN_ON_ONCE(!ext))
-                       return TC_ACT_SHOT;
-               ext->chain = last_executed_chain;
-               ext->mru = cb->mru;
-               ext->post_ct = cb->post_ct;
-               ext->post_ct_snat = cb->post_ct_snat;
-               ext->post_ct_dnat = cb->post_ct_dnat;
-               ext->zone = cb->zone;
+       if (tc_skb_ext_tc_enabled()) {
+               /* If we missed on some chain */
+               if (ret == TC_ACT_UNSPEC && last_executed_chain) {
+                       struct tc_skb_cb *cb = tc_skb_cb(skb);
+
+                       ext = tc_skb_ext_alloc(skb);
+                       if (WARN_ON_ONCE(!ext))
+                               return TC_ACT_SHOT;
+                       ext->chain = last_executed_chain;
+                       ext->mru = cb->mru;
+                       ext->post_ct = cb->post_ct;
+                       ext->post_ct_snat = cb->post_ct_snat;
+                       ext->post_ct_dnat = cb->post_ct_dnat;
+                       ext->zone = cb->zone;
+               }
        }
 
        return ret;
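
The conversion above replaces a single on/off static key, flipped directly by openvswitch, with a counted one behind tc_skb_ext_tc_enable()/tc_skb_ext_tc_disable(): static_branch_inc()/static_branch_dec() act as a refcount, so several datapaths (or future users) can hold the feature without one user's disable switching it off under another. A kernel-style sketch of the pattern with hypothetical names:

    DEFINE_STATIC_KEY_FALSE(feature_key);

    void feature_get(void)          /* one call per user */
    {
            static_branch_inc(&feature_key);
    }

    void feature_put(void)          /* paired with feature_get() */
    {
            static_branch_dec(&feature_key);
    }

    static void rx_fast_path(struct sk_buff *skb)
    {
            if (static_branch_unlikely(&feature_key))
                    attach_feature_ext(skb);    /* hypothetical slow path */
    }
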
index 8c89d0b..00b2e9d 100644 (file)
@@ -2626,8 +2626,8 @@ static int smc_setsockopt(struct socket *sock, int level, int optname,
                    sk->sk_state != SMC_CLOSED) {
                        if (!val) {
                                SMC_STAT_INC(smc, cork_cnt);
-                               mod_delayed_work(smc->conn.lgr->tx_wq,
-                                                &smc->conn.tx_work, 0);
+                               smc_tx_pending(&smc->conn);
+                               cancel_delayed_work(&smc->conn.tx_work);
                        }
                }
                break;
@@ -2765,8 +2765,10 @@ static ssize_t smc_sendpage(struct socket *sock, struct page *page,
                rc = kernel_sendpage(smc->clcsock, page, offset,
                                     size, flags);
        } else {
+               lock_sock(sk);
+               rc = smc_tx_sendpage(smc, page, offset, size, flags);
+               release_sock(sk);
                SMC_STAT_INC(smc, sendpage_cnt);
-               rc = sock_no_sendpage(sock, page, offset, size, flags);
        }
 
 out:
index be241d5..a96ce16 100644 (file)
@@ -31,7 +31,6 @@
 #include "smc_tracepoint.h"
 
 #define SMC_TX_WORK_DELAY      0
-#define SMC_TX_CORK_DELAY      (HZ >> 2)       /* 250 ms */
 
 /***************************** sndbuf producer *******************************/
 
@@ -236,16 +235,15 @@ int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len)
                 */
                if ((msg->msg_flags & MSG_OOB) && !send_remaining)
                        conn->urg_tx_pend = true;
-               if ((msg->msg_flags & MSG_MORE || smc_tx_is_corked(smc)) &&
-                   (atomic_read(&conn->sndbuf_space) >
-                                               (conn->sndbuf_desc->len >> 1)))
-                       /* for a corked socket defer the RDMA writes if there
-                        * is still sufficient sndbuf_space available
+               if ((msg->msg_flags & MSG_MORE || smc_tx_is_corked(smc) ||
+                    msg->msg_flags & MSG_SENDPAGE_NOTLAST) &&
+                   (atomic_read(&conn->sndbuf_space)))
+                       /* for a corked socket defer the RDMA writes if
+                        * sndbuf_space is still available. The applications
+                        * should know how/when to uncork it.
                         */
-                       queue_delayed_work(conn->lgr->tx_wq, &conn->tx_work,
-                                          SMC_TX_CORK_DELAY);
-               else
-                       smc_tx_sndbuf_nonempty(conn);
+                       continue;
+               smc_tx_sndbuf_nonempty(conn);
 
                trace_smc_tx_sendmsg(smc, copylen);
        } /* while (msg_data_left(msg)) */
@@ -260,6 +258,22 @@ out_err:
        return rc;
 }
 
+int smc_tx_sendpage(struct smc_sock *smc, struct page *page, int offset,
+                   size_t size, int flags)
+{
+       struct msghdr msg = {.msg_flags = flags};
+       char *kaddr = kmap(page);
+       struct kvec iov;
+       int rc;
+
+       iov.iov_base = kaddr + offset;
+       iov.iov_len = size;
+       iov_iter_kvec(&msg.msg_iter, WRITE, &iov, 1, size);
+       rc = smc_tx_sendmsg(smc, &msg, size);
+       kunmap(page);
+       return rc;
+}
+
 /***************************** sndbuf consumer *******************************/
 
 /* sndbuf consumer: actual data transfer of one target chunk with ISM write */
@@ -597,27 +611,32 @@ int smc_tx_sndbuf_nonempty(struct smc_connection *conn)
        return rc;
 }
 
-/* Wakeup sndbuf consumers from process context
- * since there is more data to transmit
- */
-void smc_tx_work(struct work_struct *work)
+void smc_tx_pending(struct smc_connection *conn)
 {
-       struct smc_connection *conn = container_of(to_delayed_work(work),
-                                                  struct smc_connection,
-                                                  tx_work);
        struct smc_sock *smc = container_of(conn, struct smc_sock, conn);
        int rc;
 
-       lock_sock(&smc->sk);
        if (smc->sk.sk_err)
-               goto out;
+               return;
 
        rc = smc_tx_sndbuf_nonempty(conn);
        if (!rc && conn->local_rx_ctrl.prod_flags.write_blocked &&
            !atomic_read(&conn->bytes_to_rcv))
                conn->local_rx_ctrl.prod_flags.write_blocked = 0;
+}
+
+/* Wakeup sndbuf consumers from process context
+ * since there is more data to transmit
+ */
+void smc_tx_work(struct work_struct *work)
+{
+       struct smc_connection *conn = container_of(to_delayed_work(work),
+                                                  struct smc_connection,
+                                                  tx_work);
+       struct smc_sock *smc = container_of(conn, struct smc_sock, conn);
 
-out:
+       lock_sock(&smc->sk);
+       smc_tx_pending(conn);
        release_sock(&smc->sk);
 }
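
The sendmsg hunk above defers the RDMA write while MSG_MORE, TCP_CORK or MSG_SENDPAGE_NOTLAST is in effect and any send buffer space remains, trusting the application to uncork. From the application side this is the ordinary corking idiom, nothing SMC-specific; a minimal example:

    #include <sys/socket.h>

    /* Batch a header with its body: the first send() is deferred by
     * MSG_MORE, the second (without it) flushes both.
     */
    static int send_batched(int fd, const void *hdr, size_t hlen,
                            const void *body, size_t blen)
    {
            if (send(fd, hdr, hlen, MSG_MORE) < 0)
                    return -1;
            if (send(fd, body, blen, 0) < 0)
                    return -1;
            return 0;
    }
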
 
index 07e6ad7..34b5784 100644 (file)
@@ -27,9 +27,12 @@ static inline int smc_tx_prepared_sends(struct smc_connection *conn)
        return smc_curs_diff(conn->sndbuf_desc->len, &sent, &prep);
 }
 
+void smc_tx_pending(struct smc_connection *conn);
 void smc_tx_work(struct work_struct *work);
 void smc_tx_init(struct smc_sock *smc);
 int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len);
+int smc_tx_sendpage(struct smc_sock *smc, struct page *page, int offset,
+                   size_t size, int flags);
 int smc_tx_sndbuf_nonempty(struct smc_connection *conn);
 void smc_tx_sndbuf_nonfull(struct smc_sock *smc);
 void smc_tx_consumer_update(struct smc_connection *conn, bool force);
index 5f42aa5..8eb7e85 100644 (file)
@@ -72,7 +72,8 @@ struct gss_auth {
        struct gss_api_mech *mech;
        enum rpc_gss_svc service;
        struct rpc_clnt *client;
-       struct net *net;
+       struct net      *net;
+       netns_tracker   ns_tracker;
        /*
         * There are two upcall pipes; dentry[1], named "gssd", is used
         * for the new text-based upcall; dentry[0] is named after the
@@ -1013,7 +1014,8 @@ gss_create_new(const struct rpc_auth_create_args *args, struct rpc_clnt *clnt)
                        goto err_free;
        }
        gss_auth->client = clnt;
-       gss_auth->net = get_net(rpc_net_ns(clnt));
+       gss_auth->net = get_net_track(rpc_net_ns(clnt), &gss_auth->ns_tracker,
+                                     GFP_KERNEL);
        err = -EINVAL;
        gss_auth->mech = gss_mech_get_by_pseudoflavor(flavor);
        if (!gss_auth->mech)
@@ -1068,7 +1070,7 @@ err_destroy_credcache:
 err_put_mech:
        gss_mech_put(gss_auth->mech);
 err_put_net:
-       put_net(gss_auth->net);
+       put_net_track(gss_auth->net, &gss_auth->ns_tracker);
 err_free:
        kfree(gss_auth->target_name);
        kfree(gss_auth);
@@ -1084,7 +1086,7 @@ gss_free(struct gss_auth *gss_auth)
        gss_pipe_free(gss_auth->gss_pipe[0]);
        gss_pipe_free(gss_auth->gss_pipe[1]);
        gss_mech_put(gss_auth->mech);
-       put_net(gss_auth->net);
+       put_net_track(gss_auth->net, &gss_auth->ns_tracker);
        kfree(gss_auth->target_name);
 
        kfree(gss_auth);
index b21ad79..db878e8 100644 (file)
@@ -162,7 +162,7 @@ static void svc_xprt_free(struct kref *kref)
        if (test_bit(XPT_CACHE_AUTH, &xprt->xpt_flags))
                svcauth_unix_info_release(xprt);
        put_cred(xprt->xpt_cred);
-       put_net(xprt->xpt_net);
+       put_net_track(xprt->xpt_net, &xprt->ns_tracker);
        /* See comment on corresponding get in xs_setup_bc_tcp(): */
        if (xprt->xpt_bc_xprt)
                xprt_put(xprt->xpt_bc_xprt);
@@ -198,7 +198,7 @@ void svc_xprt_init(struct net *net, struct svc_xprt_class *xcl,
        mutex_init(&xprt->xpt_mutex);
        spin_lock_init(&xprt->xpt_lock);
        set_bit(XPT_BUSY, &xprt->xpt_flags);
-       xprt->xpt_net = get_net(net);
+       xprt->xpt_net = get_net_track(net, &xprt->ns_tracker, GFP_ATOMIC);
        strcpy(xprt->xpt_remotebuf, "uninitialized");
 }
 EXPORT_SYMBOL_GPL(svc_xprt_init);
index a02de2b..5af484d 100644 (file)
@@ -1835,7 +1835,7 @@ EXPORT_SYMBOL_GPL(xprt_alloc);
 
 void xprt_free(struct rpc_xprt *xprt)
 {
-       put_net(xprt->xprt_net);
+       put_net_track(xprt->xprt_net, &xprt->ns_tracker);
        xprt_free_all_slots(xprt);
        xprt_free_id(xprt);
        rpc_sysfs_xprt_destroy(xprt);
@@ -2027,7 +2027,7 @@ static void xprt_init(struct rpc_xprt *xprt, struct net *net)
 
        xprt_init_xid(xprt);
 
-       xprt->xprt_net = get_net(net);
+       xprt->xprt_net = get_net_track(net, &xprt->ns_tracker, GFP_KERNEL);
 }
 
 /**
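
The three sunrpc hunks above are mechanical conversions to the tracked netns refcount helpers: a netns_tracker is embedded next to the struct net pointer and handed to the paired get/put, so the reference-tracker infrastructure can attribute leaks to this particular user. The shape of the conversion, with hypothetical names:

    struct foo {
            struct net      *net;
            netns_tracker   ns_tracker;
    };

    static void foo_init(struct foo *f, struct net *net)
    {
            f->net = get_net_track(net, &f->ns_tracker, GFP_KERNEL);
    }

    static void foo_release(struct foo *f)
    {
            put_net_track(f->net, &f->ns_tracker);
    }
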
index b625652..12e6b41 100644 (file)
@@ -85,7 +85,7 @@ static int switchdev_deferred_enqueue(struct net_device *dev,
 {
        struct switchdev_deferred_item *dfitem;
 
-       dfitem = kmalloc(sizeof(*dfitem) + data_len, GFP_ATOMIC);
+       dfitem = kmalloc(struct_size(dfitem, data, data_len), GFP_ATOMIC);
        if (!dfitem)
                return -ENOMEM;
        dfitem->dev = dev;
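
struct_size() computes sizeof(*p) plus n elements of the trailing flexible array with overflow checking, which the open-coded sizeof(*dfitem) + data_len addition lacked. A generic kernel-style illustration with a hypothetical struct:

    struct item {
            size_t  len;
            u8      data[];         /* flexible array member */
    };

    static struct item *item_alloc(size_t n, gfp_t gfp)
    {
            struct item *it;

            /* saturates on overflow, so the allocation simply fails */
            it = kmalloc(struct_size(it, data, n), gfp);
            if (it)
                    it->len = n;
            return it;
    }
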
index 64ae4c4..c5eec16 100644 (file)
@@ -226,14 +226,6 @@ static inline void msg_set_bits(struct tipc_msg *m, u32 w,
        m->hdr[w] |= htonl(val);
 }
 
-static inline void msg_swap_words(struct tipc_msg *msg, u32 a, u32 b)
-{
-       u32 temp = msg->hdr[a];
-
-       msg->hdr[a] = msg->hdr[b];
-       msg->hdr[b] = temp;
-}
-
 /*
  * Word 0
  */
@@ -480,11 +472,6 @@ static inline void msg_incr_reroute_cnt(struct tipc_msg *m)
        msg_set_bits(m, 1, 21, 0xf, msg_reroute_cnt(m) + 1);
 }
 
-static inline void msg_reset_reroute_cnt(struct tipc_msg *m)
-{
-       msg_set_bits(m, 1, 21, 0xf, 0);
-}
-
 static inline u32 msg_lookup_scope(struct tipc_msg *m)
 {
        return msg_bits(m, 1, 19, 0x3);
@@ -800,11 +787,6 @@ static inline void msg_set_dest_domain(struct tipc_msg *m, u32 n)
        msg_set_word(m, 2, n);
 }
 
-static inline u32 msg_bcgap_after(struct tipc_msg *m)
-{
-       return msg_bits(m, 2, 16, 0xffff);
-}
-
 static inline void msg_set_bcgap_after(struct tipc_msg *m, u32 n)
 {
        msg_set_bits(m, 2, 16, 0xffff, n);
@@ -868,11 +850,6 @@ static inline void msg_set_next_sent(struct tipc_msg *m, u16 n)
        msg_set_bits(m, 4, 0, 0xffff, n);
 }
 
-static inline void msg_set_long_msgno(struct tipc_msg *m, u32 n)
-{
-       msg_set_bits(m, 4, 0, 0xffff, n);
-}
-
 static inline u32 msg_bc_netid(struct tipc_msg *m)
 {
        return msg_word(m, 4);
index efc8484..0024a69 100644 (file)
@@ -1433,7 +1433,8 @@ static int decrypt_internal(struct sock *sk, struct sk_buff *skb,
 
        if (*zc && (out_iov || out_sg)) {
                if (out_iov)
-                       n_sgout = iov_iter_npages(out_iov, INT_MAX) + 1;
+                       n_sgout = 1 +
+                               iov_iter_npages_cap(out_iov, INT_MAX, data_len);
                else
                        n_sgout = sg_nents(out_sg);
                n_sgin = skb_nsg(skb, rxm->offset + prot->prepend_size,
index c195698..3e0d628 100644 (file)
@@ -3240,49 +3240,58 @@ static struct sock *unix_from_bucket(struct seq_file *seq, loff_t *pos)
        return sk;
 }
 
-static struct sock *unix_next_socket(struct seq_file *seq,
-                                    struct sock *sk,
-                                    loff_t *pos)
+static struct sock *unix_get_first(struct seq_file *seq, loff_t *pos)
 {
        unsigned long bucket = get_bucket(*pos);
+       struct sock *sk;
 
-       while (sk > (struct sock *)SEQ_START_TOKEN) {
-               sk = sk_next(sk);
-               if (!sk)
-                       goto next_bucket;
-               if (sock_net(sk) == seq_file_net(seq))
-                       return sk;
-       }
-
-       do {
+       while (bucket < ARRAY_SIZE(unix_socket_table)) {
                spin_lock(&unix_table_locks[bucket]);
+
                sk = unix_from_bucket(seq, pos);
                if (sk)
                        return sk;
 
-next_bucket:
-               spin_unlock(&unix_table_locks[bucket++]);
-               *pos = set_bucket_offset(bucket, 1);
-       } while (bucket < ARRAY_SIZE(unix_socket_table));
+               spin_unlock(&unix_table_locks[bucket]);
+
+               *pos = set_bucket_offset(++bucket, 1);
+       }
 
        return NULL;
 }
 
+static struct sock *unix_get_next(struct seq_file *seq, struct sock *sk,
+                                 loff_t *pos)
+{
+       unsigned long bucket = get_bucket(*pos);
+
+       for (sk = sk_next(sk); sk; sk = sk_next(sk))
+               if (sock_net(sk) == seq_file_net(seq))
+                       return sk;
+
+       spin_unlock(&unix_table_locks[bucket]);
+
+       *pos = set_bucket_offset(++bucket, 1);
+
+       return unix_get_first(seq, pos);
+}
+
 static void *unix_seq_start(struct seq_file *seq, loff_t *pos)
 {
        if (!*pos)
                return SEQ_START_TOKEN;
 
-       if (get_bucket(*pos) >= ARRAY_SIZE(unix_socket_table))
-               return NULL;
-
-       return unix_next_socket(seq, NULL, pos);
+       return unix_get_first(seq, pos);
 }
 
 static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 {
        ++*pos;
-       return unix_next_socket(seq, v, pos);
+
+       if (v == SEQ_START_TOKEN)
+               return unix_get_first(seq, pos);
+
+       return unix_get_next(seq, v, pos);
 }
 
 static void unix_seq_stop(struct seq_file *seq, void *v)
@@ -3347,6 +3356,15 @@ static const struct seq_operations unix_seq_ops = {
 };
 
 #if IS_BUILTIN(CONFIG_UNIX) && defined(CONFIG_BPF_SYSCALL)
+struct bpf_unix_iter_state {
+       struct seq_net_private p;
+       unsigned int cur_sk;
+       unsigned int end_sk;
+       unsigned int max_sk;
+       struct sock **batch;
+       bool st_bucket_done;
+};
+
 struct bpf_iter__unix {
        __bpf_md_ptr(struct bpf_iter_meta *, meta);
        __bpf_md_ptr(struct unix_sock *, unix_sk);
@@ -3365,24 +3383,156 @@ static int unix_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
        return bpf_iter_run_prog(prog, &ctx);
 }
 
+static int bpf_iter_unix_hold_batch(struct seq_file *seq, struct sock *start_sk)
+{
+       struct bpf_unix_iter_state *iter = seq->private;
+       unsigned int expected = 1;
+       struct sock *sk;
+
+       sock_hold(start_sk);
+       iter->batch[iter->end_sk++] = start_sk;
+
+       for (sk = sk_next(start_sk); sk; sk = sk_next(sk)) {
+               if (sock_net(sk) != seq_file_net(seq))
+                       continue;
+
+               if (iter->end_sk < iter->max_sk) {
+                       sock_hold(sk);
+                       iter->batch[iter->end_sk++] = sk;
+               }
+
+               expected++;
+       }
+
+       spin_unlock(&unix_table_locks[start_sk->sk_hash]);
+
+       return expected;
+}
+
+static void bpf_iter_unix_put_batch(struct bpf_unix_iter_state *iter)
+{
+       while (iter->cur_sk < iter->end_sk)
+               sock_put(iter->batch[iter->cur_sk++]);
+}
+
+static int bpf_iter_unix_realloc_batch(struct bpf_unix_iter_state *iter,
+                                      unsigned int new_batch_sz)
+{
+       struct sock **new_batch;
+
+       new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
+                            GFP_USER | __GFP_NOWARN);
+       if (!new_batch)
+               return -ENOMEM;
+
+       bpf_iter_unix_put_batch(iter);
+       kvfree(iter->batch);
+       iter->batch = new_batch;
+       iter->max_sk = new_batch_sz;
+
+       return 0;
+}
+
+static struct sock *bpf_iter_unix_batch(struct seq_file *seq,
+                                       loff_t *pos)
+{
+       struct bpf_unix_iter_state *iter = seq->private;
+       unsigned int expected;
+       bool resized = false;
+       struct sock *sk;
+
+       if (iter->st_bucket_done)
+               *pos = set_bucket_offset(get_bucket(*pos) + 1, 1);
+
+again:
+       /* Get a new batch */
+       iter->cur_sk = 0;
+       iter->end_sk = 0;
+
+       sk = unix_get_first(seq, pos);
+       if (!sk)
+               return NULL; /* Done */
+
+       expected = bpf_iter_unix_hold_batch(seq, sk);
+
+       if (iter->end_sk == expected) {
+               iter->st_bucket_done = true;
+               return sk;
+       }
+
+       if (!resized && !bpf_iter_unix_realloc_batch(iter, expected * 3 / 2)) {
+               resized = true;
+               goto again;
+       }
+
+       return sk;
+}
+
+static void *bpf_iter_unix_seq_start(struct seq_file *seq, loff_t *pos)
+{
+       if (!*pos)
+               return SEQ_START_TOKEN;
+
+       /* bpf iter does not support lseek, so it always
+        * continues from where it was stop()-ped.
+        */
+       return bpf_iter_unix_batch(seq, pos);
+}
+
+static void *bpf_iter_unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+       struct bpf_unix_iter_state *iter = seq->private;
+       struct sock *sk;
+
+       /* Whenever seq_next() is called, the iter->cur_sk is
+        * done with seq_show(), so advance to the next sk in
+        * the batch.
+        */
+       if (iter->cur_sk < iter->end_sk)
+               sock_put(iter->batch[iter->cur_sk++]);
+
+       ++*pos;
+
+       if (iter->cur_sk < iter->end_sk)
+               sk = iter->batch[iter->cur_sk];
+       else
+               sk = bpf_iter_unix_batch(seq, pos);
+
+       return sk;
+}
+
 static int bpf_iter_unix_seq_show(struct seq_file *seq, void *v)
 {
        struct bpf_iter_meta meta;
        struct bpf_prog *prog;
        struct sock *sk = v;
        uid_t uid;
+       bool slow;
+       int ret;
 
        if (v == SEQ_START_TOKEN)
                return 0;
 
+       slow = lock_sock_fast(sk);
+
+       if (unlikely(sk_unhashed(sk))) {
+               ret = SEQ_SKIP;
+               goto unlock;
+       }
+
        uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
        meta.seq = seq;
        prog = bpf_iter_get_info(&meta, false);
-       return unix_prog_seq_show(prog, &meta, v, uid);
+       ret = unix_prog_seq_show(prog, &meta, v, uid);
+unlock:
+       unlock_sock_fast(sk, slow);
+       return ret;
 }
 
 static void bpf_iter_unix_seq_stop(struct seq_file *seq, void *v)
 {
+       struct bpf_unix_iter_state *iter = seq->private;
        struct bpf_iter_meta meta;
        struct bpf_prog *prog;
 
@@ -3393,12 +3543,13 @@ static void bpf_iter_unix_seq_stop(struct seq_file *seq, void *v)
                        (void)unix_prog_seq_show(prog, &meta, v, 0);
        }
 
-       unix_seq_stop(seq, v);
+       if (iter->cur_sk < iter->end_sk)
+               bpf_iter_unix_put_batch(iter);
 }
 
 static const struct seq_operations bpf_iter_unix_seq_ops = {
-       .start  = unix_seq_start,
-       .next   = unix_seq_next,
+       .start  = bpf_iter_unix_seq_start,
+       .next   = bpf_iter_unix_seq_next,
        .stop   = bpf_iter_unix_seq_stop,
        .show   = bpf_iter_unix_seq_show,
 };
@@ -3447,13 +3598,55 @@ static struct pernet_operations unix_net_ops = {
 DEFINE_BPF_ITER_FUNC(unix, struct bpf_iter_meta *meta,
                     struct unix_sock *unix_sk, uid_t uid)
 
+#define INIT_BATCH_SZ 16
+
+static int bpf_iter_init_unix(void *priv_data, struct bpf_iter_aux_info *aux)
+{
+       struct bpf_unix_iter_state *iter = priv_data;
+       int err;
+
+       err = bpf_iter_init_seq_net(priv_data, aux);
+       if (err)
+               return err;
+
+       err = bpf_iter_unix_realloc_batch(iter, INIT_BATCH_SZ);
+       if (err) {
+               bpf_iter_fini_seq_net(priv_data);
+               return err;
+       }
+
+       return 0;
+}
+
+static void bpf_iter_fini_unix(void *priv_data)
+{
+       struct bpf_unix_iter_state *iter = priv_data;
+
+       bpf_iter_fini_seq_net(priv_data);
+       kvfree(iter->batch);
+}
+
 static const struct bpf_iter_seq_info unix_seq_info = {
        .seq_ops                = &bpf_iter_unix_seq_ops,
-       .init_seq_private       = bpf_iter_init_seq_net,
-       .fini_seq_private       = bpf_iter_fini_seq_net,
-       .seq_priv_size          = sizeof(struct seq_net_private),
+       .init_seq_private       = bpf_iter_init_unix,
+       .fini_seq_private       = bpf_iter_fini_unix,
+       .seq_priv_size          = sizeof(struct bpf_unix_iter_state),
 };
 
+static const struct bpf_func_proto *
+bpf_iter_unix_get_func_proto(enum bpf_func_id func_id,
+                            const struct bpf_prog *prog)
+{
+       switch (func_id) {
+       case BPF_FUNC_setsockopt:
+               return &bpf_sk_setsockopt_proto;
+       case BPF_FUNC_getsockopt:
+               return &bpf_sk_getsockopt_proto;
+       default:
+               return NULL;
+       }
+}
+
 static struct bpf_iter_reg unix_reg_info = {
        .target                 = "unix",
        .ctx_arg_info_size      = 1,
@@ -3461,6 +3654,7 @@ static struct bpf_iter_reg unix_reg_info = {
                { offsetof(struct bpf_iter__unix, unix_sk),
                  PTR_TO_BTF_ID_OR_NULL },
        },
+       .get_func_proto         = bpf_iter_unix_get_func_proto,
        .seq_info               = &unix_seq_info,
 };
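
With batching in place, the iterator can be consumed from userspace like any other bpf_iter target: attach a program from an SEC("iter/unix") object, create an iterator fd from the link, and read it to completion. A sketch with a hypothetical object file name (the libbpf calls themselves are standard):

    #include <unistd.h>
    #include <bpf/libbpf.h>

    static int dump_unix_sockets(void)
    {
            struct bpf_object *obj;
            struct bpf_program *prog;
            struct bpf_link *link;
            char buf[4096];
            int iter_fd;
            ssize_t n;

            obj = bpf_object__open_file("unix_iter.bpf.o", NULL);
            if (libbpf_get_error(obj) || bpf_object__load(obj))
                    return -1;

            prog = bpf_object__next_program(obj, NULL);
            link = bpf_program__attach_iter(prog, NULL);
            if (libbpf_get_error(link))
                    return -1;

            iter_fd = bpf_iter_create(bpf_link__fd(link));
            if (iter_fd < 0)
                    return -1;

            while ((n = read(iter_fd, buf, sizeof(buf))) > 0)
                    write(STDOUT_FILENO, buf, n);

            close(iter_fd);
            bpf_link__destroy(link);
            bpf_object__close(obj);
            return 0;
    }
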
 
index 28ef3f4..2abd64e 100644 (file)
@@ -343,9 +343,9 @@ out:
 }
 EXPORT_SYMBOL(xsk_tx_peek_desc);
 
-static u32 xsk_tx_peek_release_fallback(struct xsk_buff_pool *pool, struct xdp_desc *descs,
-                                       u32 max_entries)
+static u32 xsk_tx_peek_release_fallback(struct xsk_buff_pool *pool, u32 max_entries)
 {
+       struct xdp_desc *descs = pool->tx_descs;
        u32 nb_pkts = 0;
 
        while (nb_pkts < max_entries && xsk_tx_peek_desc(pool, &descs[nb_pkts]))
@@ -355,8 +355,7 @@ static u32 xsk_tx_peek_release_fallback(struct xsk_buff_pool *pool, struct xdp_d
        return nb_pkts;
 }
 
-u32 xsk_tx_peek_release_desc_batch(struct xsk_buff_pool *pool, struct xdp_desc *descs,
-                                  u32 max_entries)
+u32 xsk_tx_peek_release_desc_batch(struct xsk_buff_pool *pool, u32 max_entries)
 {
        struct xdp_sock *xs;
        u32 nb_pkts;
@@ -365,7 +364,7 @@ u32 xsk_tx_peek_release_desc_batch(struct xsk_buff_pool *pool, struct xdp_desc *
        if (!list_is_singular(&pool->xsk_tx_list)) {
                /* Fallback to the non-batched version */
                rcu_read_unlock();
-               return xsk_tx_peek_release_fallback(pool, descs, max_entries);
+               return xsk_tx_peek_release_fallback(pool, max_entries);
        }
 
        xs = list_first_or_null_rcu(&pool->xsk_tx_list, struct xdp_sock, tx_list);
@@ -374,7 +373,7 @@ u32 xsk_tx_peek_release_desc_batch(struct xsk_buff_pool *pool, struct xdp_desc *
                goto out;
        }
 
-       nb_pkts = xskq_cons_peek_desc_batch(xs->tx, descs, pool, max_entries);
+       nb_pkts = xskq_cons_peek_desc_batch(xs->tx, pool, max_entries);
        if (!nb_pkts) {
                xs->tx->queue_empty_descs++;
                goto out;
@@ -386,7 +385,7 @@ u32 xsk_tx_peek_release_desc_batch(struct xsk_buff_pool *pool, struct xdp_desc *
         * packets. This avoids having to implement any buffering in
         * the Tx path.
         */
-       nb_pkts = xskq_prod_reserve_addr_batch(pool->cq, descs, nb_pkts);
+       nb_pkts = xskq_prod_reserve_addr_batch(pool->cq, pool->tx_descs, nb_pkts);
        if (!nb_pkts)
                goto out;
 
index fd39bb6..b34fca6 100644 (file)
@@ -37,6 +37,7 @@ void xp_destroy(struct xsk_buff_pool *pool)
        if (!pool)
                return;
 
+       kvfree(pool->tx_descs);
        kvfree(pool->heads);
        kvfree(pool);
 }
@@ -58,6 +59,12 @@ struct xsk_buff_pool *xp_create_and_assign_umem(struct xdp_sock *xs,
        if (!pool->heads)
                goto out;
 
+       if (xs->tx) {
+               pool->tx_descs = kcalloc(xs->tx->nentries, sizeof(*pool->tx_descs), GFP_KERNEL);
+               if (!pool->tx_descs)
+                       goto out;
+       }
+
        pool->chunk_mask = ~((u64)umem->chunk_size - 1);
        pool->addrs_cnt = umem->size;
        pool->heads_cnt = umem->chunks;
index e9aa2c2..801cda5 100644 (file)
@@ -205,11 +205,11 @@ static inline bool xskq_cons_read_desc(struct xsk_queue *q,
        return false;
 }
 
-static inline u32 xskq_cons_read_desc_batch(struct xsk_queue *q,
-                                           struct xdp_desc *descs,
-                                           struct xsk_buff_pool *pool, u32 max)
+static inline u32 xskq_cons_read_desc_batch(struct xsk_queue *q, struct xsk_buff_pool *pool,
+                                           u32 max)
 {
        u32 cached_cons = q->cached_cons, nb_entries = 0;
+       struct xdp_desc *descs = pool->tx_descs;
 
        while (cached_cons != q->cached_prod && nb_entries < max) {
                struct xdp_rxtx_ring *ring = (struct xdp_rxtx_ring *)q->ring;
@@ -282,12 +282,12 @@ static inline bool xskq_cons_peek_desc(struct xsk_queue *q,
        return xskq_cons_read_desc(q, desc, pool);
 }
 
-static inline u32 xskq_cons_peek_desc_batch(struct xsk_queue *q, struct xdp_desc *descs,
-                                           struct xsk_buff_pool *pool, u32 max)
+static inline u32 xskq_cons_peek_desc_batch(struct xsk_queue *q, struct xsk_buff_pool *pool,
+                                           u32 max)
 {
        u32 entries = xskq_cons_nb_entries(q, max);
 
-       return xskq_cons_read_desc_batch(q, descs, pool, entries);
+       return xskq_cons_read_desc_batch(q, pool, entries);
 }
 
 /* To improve performance in the xskq_cons_release functions, only update local state here.
@@ -304,13 +304,6 @@ static inline void xskq_cons_release_n(struct xsk_queue *q, u32 cnt)
        q->cached_cons += cnt;
 }
 
-static inline bool xskq_cons_is_full(struct xsk_queue *q)
-{
-       /* No barriers needed since data is not accessed */
-       return READ_ONCE(q->ring->producer) - READ_ONCE(q->ring->consumer) ==
-               q->nentries;
-}
-
 static inline u32 xskq_cons_present_entries(struct xsk_queue *q)
 {
        /* No barriers needed since data is not accessed */
index 319fd31..e69651a 100644 (file)
@@ -413,7 +413,7 @@ static void fixup_map(struct bpf_object *obj)
                for (i = 0; i < NR_TESTS; i++) {
                        if (!strcmp(test_map_names[i], name) &&
                            (check_test_flags(i))) {
-                               bpf_map__resize(map, num_map_entries);
+                               bpf_map__set_max_entries(map, num_map_entries);
                                continue;
                        }
                }
index 8675fa5..631f0ca 100644 (file)
@@ -26,12 +26,12 @@ static void int_exit(int sig)
 {
        __u32 curr_prog_id = 0;
 
-       if (bpf_get_link_xdp_id(ifindex, &curr_prog_id, xdp_flags)) {
-               printf("bpf_get_link_xdp_id failed\n");
+       if (bpf_xdp_query_id(ifindex, xdp_flags, &curr_prog_id)) {
+               printf("bpf_xdp_query_id failed\n");
                exit(1);
        }
        if (prog_id == curr_prog_id)
-               bpf_set_link_xdp_fd(ifindex, -1, xdp_flags);
+               bpf_xdp_detach(ifindex, xdp_flags, NULL);
        else if (!curr_prog_id)
                printf("couldn't find a prog id on a given interface\n");
        else
@@ -79,13 +79,11 @@ static void usage(const char *prog)
 
 int main(int argc, char **argv)
 {
-       struct bpf_prog_load_attr prog_load_attr = {
-               .prog_type      = BPF_PROG_TYPE_XDP,
-       };
        struct bpf_prog_info info = {};
        __u32 info_len = sizeof(info);
        const char *optstr = "FSN";
        int prog_fd, map_fd, opt;
+       struct bpf_program *prog;
        struct bpf_object *obj;
        struct bpf_map *map;
        char filename[256];
@@ -123,11 +121,19 @@ int main(int argc, char **argv)
        }
 
        snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
-       prog_load_attr.file = filename;
+       obj = bpf_object__open_file(filename, NULL);
+       if (libbpf_get_error(obj))
+               return 1;
+
+       prog = bpf_object__next_program(obj, NULL);
+       bpf_program__set_type(prog, BPF_PROG_TYPE_XDP);
 
-       if (bpf_prog_load_xattr(&prog_load_attr, &obj, &prog_fd))
+       err = bpf_object__load(obj);
+       if (err)
                return 1;
 
+       prog_fd = bpf_program__fd(prog);
+
        map = bpf_object__next_map(obj, NULL);
        if (!map) {
                printf("finding a map in obj file failed\n");
@@ -143,7 +149,7 @@ int main(int argc, char **argv)
        signal(SIGINT, int_exit);
        signal(SIGTERM, int_exit);
 
-       if (bpf_set_link_xdp_fd(ifindex, prog_fd, xdp_flags) < 0) {
+       if (bpf_xdp_attach(ifindex, prog_fd, xdp_flags, NULL) < 0) {
                printf("link set xdp fd failed\n");
                return 1;
        }
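
These sample conversions all follow the same shape: bpf_prog_load_xattr() and bpf_set_link_xdp_fd(), deprecated in libbpf and removed in 1.0, give way to an explicit open/load plus bpf_xdp_attach()/bpf_xdp_detach()/bpf_xdp_query_id(). A condensed version of the new flow (hypothetical object file name):

    #include <bpf/libbpf.h>
    #include <linux/if_link.h>

    static int load_and_attach_xdp(int ifindex)
    {
            struct bpf_object *obj;
            struct bpf_program *prog;

            obj = bpf_object__open_file("prog_kern.o", NULL);
            if (libbpf_get_error(obj))
                    return -1;

            prog = bpf_object__next_program(obj, NULL);
            bpf_program__set_type(prog, BPF_PROG_TYPE_XDP);

            if (bpf_object__load(obj))
                    return -1;

            return bpf_xdp_attach(ifindex, bpf_program__fd(prog),
                                  XDP_FLAGS_UPDATE_IF_NOEXIST, NULL);
    }
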
index a70b094..b3f6e49 100644 (file)
@@ -34,12 +34,12 @@ static void int_exit(int sig)
        __u32 curr_prog_id = 0;
 
        if (ifindex > -1) {
-               if (bpf_get_link_xdp_id(ifindex, &curr_prog_id, xdp_flags)) {
-                       printf("bpf_get_link_xdp_id failed\n");
+               if (bpf_xdp_query_id(ifindex, xdp_flags, &curr_prog_id)) {
+                       printf("bpf_xdp_query_id failed\n");
                        exit(1);
                }
                if (prog_id == curr_prog_id)
-                       bpf_set_link_xdp_fd(ifindex, -1, xdp_flags);
+                       bpf_xdp_detach(ifindex, xdp_flags, NULL);
                else if (!curr_prog_id)
                        printf("couldn't find a prog id on a given iface\n");
                else
@@ -82,15 +82,13 @@ static void usage(const char *cmd)
 
 int main(int argc, char **argv)
 {
-       struct bpf_prog_load_attr prog_load_attr = {
-               .prog_type      = BPF_PROG_TYPE_XDP,
-       };
        unsigned char opt_flags[256] = {};
        const char *optstr = "i:T:P:SNFh";
        struct bpf_prog_info info = {};
        __u32 info_len = sizeof(info);
        unsigned int kill_after_s = 0;
        int i, prog_fd, map_fd, opt;
+       struct bpf_program *prog;
        struct bpf_object *obj;
        __u32 max_pckt_size = 0;
        __u32 key = 0;
@@ -148,11 +146,20 @@ int main(int argc, char **argv)
        }
 
        snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
-       prog_load_attr.file = filename;
 
-       if (bpf_prog_load_xattr(&prog_load_attr, &obj, &prog_fd))
+       obj = bpf_object__open_file(filename, NULL);
+       if (libbpf_get_error(obj))
                return 1;
 
+       prog = bpf_object__next_program(obj, NULL);
+       bpf_program__set_type(prog, BPF_PROG_TYPE_XDP);
+
+       err = bpf_object__load(obj);
+       if (err)
+               return 1;
+
+       prog_fd = bpf_program__fd(prog);
+
        /* static global var 'max_pcktsz' is accessible from .data section */
        if (max_pckt_size) {
                map_fd = bpf_object__find_map_fd_by_name(obj, "xdp_adju.data");
@@ -173,7 +180,7 @@ int main(int argc, char **argv)
        signal(SIGINT, int_exit);
        signal(SIGTERM, int_exit);
 
-       if (bpf_set_link_xdp_fd(ifindex, prog_fd, xdp_flags) < 0) {
+       if (bpf_xdp_attach(ifindex, prog_fd, xdp_flags, NULL) < 0) {
                printf("link set xdp fd failed\n");
                return 1;
        }
index 4ad8967..1828487 100644 (file)
@@ -33,7 +33,7 @@ static int do_attach(int idx, int prog_fd, int map_fd, const char *name)
 {
        int err;
 
-       err = bpf_set_link_xdp_fd(idx, prog_fd, xdp_flags);
+       err = bpf_xdp_attach(idx, prog_fd, xdp_flags, NULL);
        if (err < 0) {
                printf("ERROR: failed to attach program to %s\n", name);
                return err;
@@ -51,7 +51,7 @@ static int do_detach(int idx, const char *name)
 {
        int err;
 
-       err = bpf_set_link_xdp_fd(idx, -1, xdp_flags);
+       err = bpf_xdp_detach(idx, xdp_flags, NULL);
        if (err < 0)
                printf("ERROR: failed to detach program from %s\n", name);
 
@@ -75,14 +75,11 @@ static void usage(const char *prog)
 
 int main(int argc, char **argv)
 {
-       struct bpf_prog_load_attr prog_load_attr = {
-               .prog_type      = BPF_PROG_TYPE_XDP,
-       };
        const char *prog_name = "xdp_fwd";
        struct bpf_program *prog = NULL;
        struct bpf_program *pos;
        const char *sec_name;
-       int prog_fd, map_fd = -1;
+       int prog_fd = -1, map_fd = -1;
        char filename[PATH_MAX];
        struct bpf_object *obj;
        int opt, i, idx, err;
@@ -119,7 +116,6 @@ int main(int argc, char **argv)
 
        if (attach) {
                snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
-               prog_load_attr.file = filename;
 
                if (access(filename, O_RDONLY) < 0) {
                        printf("error accessing file %s: %s\n",
@@ -127,7 +123,14 @@ int main(int argc, char **argv)
                        return 1;
                }
 
-               err = bpf_prog_load_xattr(&prog_load_attr, &obj, &prog_fd);
+               obj = bpf_object__open_file(filename, NULL);
+               if (libbpf_get_error(obj))
+                       return 1;
+
+               prog = bpf_object__next_program(obj, NULL);
+               bpf_program__set_type(prog, BPF_PROG_TYPE_XDP);
+
+               err = bpf_object__load(obj);
                if (err) {
                        printf("Does kernel support devmap lookup?\n");
                        /* If not, the error message will be:
index 25e3a40..87c54bf 100644 (file)
@@ -491,7 +491,7 @@ int  xdp_prognum5_lb_hash_ip_pairs(struct xdp_md *ctx)
        return bpf_redirect_map(&cpu_map, cpu_dest, 0);
 }
 
-SEC("xdp_cpumap/redirect")
+SEC("xdp/cpumap")
 int xdp_redirect_cpu_devmap(struct xdp_md *ctx)
 {
        void *data_end = (void *)(long)ctx->data_end;
@@ -507,19 +507,19 @@ int xdp_redirect_cpu_devmap(struct xdp_md *ctx)
        return bpf_redirect_map(&tx_port, 0, 0);
 }
 
-SEC("xdp_cpumap/pass")
+SEC("xdp/cpumap")
 int xdp_redirect_cpu_pass(struct xdp_md *ctx)
 {
        return XDP_PASS;
 }
 
-SEC("xdp_cpumap/drop")
+SEC("xdp/cpumap")
 int xdp_redirect_cpu_drop(struct xdp_md *ctx)
 {
        return XDP_DROP;
 }
 
-SEC("xdp_devmap/egress")
+SEC("xdp/devmap")
 int xdp_redirect_egress_prog(struct xdp_md *ctx)
 {
        void *data_end = (void *)(long)ctx->data_end;
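
The section renames track libbpf's updated naming conventions: "xdp/cpumap" and "xdp/devmap" are recognized prefixes that set the expected attach type, replacing the older ad-hoc "xdp_cpumap/..." and "xdp_devmap/..." spellings. Minimal BPF-side shape:

    #include <linux/bpf.h>
    #include <bpf/bpf_helpers.h>

    SEC("xdp/devmap")
    int xdp_egress(struct xdp_md *ctx)
    {
            return XDP_PASS;        /* runs on the devmap egress path */
    }

    char _license[] SEC("license") = "GPL";
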
index a81704d..5f74a70 100644 (file)
@@ -70,7 +70,7 @@ static void print_avail_progs(struct bpf_object *obj)
 
        printf(" Programs to be used for -p/--progname:\n");
        bpf_object__for_each_program(pos, obj) {
-               if (bpf_program__is_xdp(pos)) {
+               if (bpf_program__type(pos) == BPF_PROG_TYPE_XDP) {
                        if (!strncmp(bpf_program__name(pos), "xdp_prognum",
                                     sizeof("xdp_prognum") - 1))
                                printf(" %s\n", bpf_program__name(pos));
index 59efd65..415bac1 100644 (file)
@@ -68,7 +68,7 @@ int xdp_redirect_map_native(struct xdp_md *ctx)
        return xdp_redirect_map(ctx, &tx_port_native);
 }
 
-SEC("xdp_devmap/egress")
+SEC("xdp/devmap")
 int xdp_redirect_map_egress(struct xdp_md *ctx)
 {
        void *data_end = (void *)(long)ctx->data_end;
index bb0a5a3..8b2fd4e 100644 (file)
@@ -53,7 +53,7 @@ int xdp_redirect_map_native(struct xdp_md *ctx)
        return xdp_redirect_map(ctx, &forward_map_native);
 }
 
-SEC("xdp_devmap/egress")
+SEC("xdp/devmap")
 int xdp_devmap_prog(struct xdp_md *ctx)
 {
        void *data_end = (void *)(long)ctx->data_end;
index cfaf7e5..6dae87d 100644 (file)
@@ -43,13 +43,13 @@ static void int_exit(int sig)
        int i = 0;
 
        for (i = 0; i < total_ifindex; i++) {
-               if (bpf_get_link_xdp_id(ifindex_list[i], &prog_id, flags)) {
-                       printf("bpf_get_link_xdp_id on iface %d failed\n",
+               if (bpf_xdp_query_id(ifindex_list[i], flags, &prog_id)) {
+                       printf("bpf_xdp_query_id on iface %d failed\n",
                               ifindex_list[i]);
                        exit(1);
                }
                if (prog_id_list[i] == prog_id)
-                       bpf_set_link_xdp_fd(ifindex_list[i], -1, flags);
+                       bpf_xdp_detach(ifindex_list[i], flags, NULL);
                else if (!prog_id)
                        printf("couldn't find a prog id on iface %d\n",
                               ifindex_list[i]);
@@ -640,12 +640,10 @@ static void usage(const char *prog)
 
 int main(int ac, char **argv)
 {
-       struct bpf_prog_load_attr prog_load_attr = {
-               .prog_type      = BPF_PROG_TYPE_XDP,
-       };
        struct bpf_prog_info info = {};
        __u32 info_len = sizeof(info);
        const char *optstr = "SF";
+       struct bpf_program *prog;
        struct bpf_object *obj;
        char filename[256];
        char **ifname_list;
@@ -653,7 +651,6 @@ int main(int ac, char **argv)
        int err, i = 1;
 
        snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
-       prog_load_attr.file = filename;
 
        total_ifindex = ac - 1;
        ifname_list = (argv + 1);
@@ -684,14 +681,20 @@ int main(int ac, char **argv)
                return 1;
        }
 
-       if (bpf_prog_load_xattr(&prog_load_attr, &obj, &prog_fd))
+       obj = bpf_object__open_file(filename, NULL);
+       if (libbpf_get_error(obj))
                return 1;
 
+       prog = bpf_object__next_program(obj, NULL);
+       bpf_program__set_type(prog, BPF_PROG_TYPE_XDP);
+
        printf("\n******************loading bpf file*********************\n");
-       if (!prog_fd) {
-               printf("bpf_prog_load_xattr: %s\n", strerror(errno));
+       err = bpf_object__load(obj);
+       if (err) {
+               printf("bpf_object__load(): %s\n", strerror(errno));
                return 1;
        }
+       prog_fd = bpf_program__fd(prog);
 
        lpm_map_fd = bpf_object__find_map_fd_by_name(obj, "lpm_map");
        rxcnt_map_fd = bpf_object__find_map_fd_by_name(obj, "rxcnt");
@@ -716,12 +719,12 @@ int main(int ac, char **argv)
        }
        prog_id_list = (__u32 *)calloc(total_ifindex, sizeof(__u32 *));
        for (i = 0; i < total_ifindex; i++) {
-               if (bpf_set_link_xdp_fd(ifindex_list[i], prog_fd, flags) < 0) {
+               if (bpf_xdp_attach(ifindex_list[i], prog_fd, flags, NULL) < 0) {
                        printf("link set xdp fd failed\n");
                        int recovery_index = i;
 
                        for (i = 0; i < recovery_index; i++)
-                               bpf_set_link_xdp_fd(ifindex_list[i], -1, flags);
+                               bpf_xdp_detach(ifindex_list[i], flags, NULL);
 
                        return 1;
                }
index 74a2926..f2d90cb 100644 (file)
@@ -62,15 +62,15 @@ static void int_exit(int sig)
        __u32 curr_prog_id = 0;
 
        if (ifindex > -1) {
-               if (bpf_get_link_xdp_id(ifindex, &curr_prog_id, xdp_flags)) {
-                       printf("bpf_get_link_xdp_id failed\n");
+               if (bpf_xdp_query_id(ifindex, xdp_flags, &curr_prog_id)) {
+                       printf("bpf_xdp_query_id failed\n");
                        exit(EXIT_FAIL);
                }
                if (prog_id == curr_prog_id) {
                        fprintf(stderr,
                                "Interrupted: Removing XDP program on ifindex:%d device:%s\n",
                                ifindex, ifname);
-                       bpf_set_link_xdp_fd(ifindex, -1, xdp_flags);
+                       bpf_xdp_detach(ifindex, xdp_flags, NULL);
                } else if (!curr_prog_id) {
                        printf("couldn't find a prog id on a given iface\n");
                } else {
@@ -209,7 +209,7 @@ static struct datarec *alloc_record_per_cpu(void)
 
 static struct record *alloc_record_per_rxq(void)
 {
-       unsigned int nr_rxqs = bpf_map__def(rx_queue_index_map)->max_entries;
+       unsigned int nr_rxqs = bpf_map__max_entries(rx_queue_index_map);
        struct record *array;
 
        array = calloc(nr_rxqs, sizeof(struct record));
@@ -222,7 +222,7 @@ static struct record *alloc_record_per_rxq(void)
 
 static struct stats_record *alloc_stats_record(void)
 {
-       unsigned int nr_rxqs = bpf_map__def(rx_queue_index_map)->max_entries;
+       unsigned int nr_rxqs = bpf_map__max_entries(rx_queue_index_map);
        struct stats_record *rec;
        int i;
 
@@ -241,7 +241,7 @@ static struct stats_record *alloc_stats_record(void)
 
 static void free_stats_record(struct stats_record *r)
 {
-       unsigned int nr_rxqs = bpf_map__def(rx_queue_index_map)->max_entries;
+       unsigned int nr_rxqs = bpf_map__max_entries(rx_queue_index_map);
        int i;
 
        for (i = 0; i < nr_rxqs; i++)
@@ -289,7 +289,7 @@ static void stats_collect(struct stats_record *rec)
        map_collect_percpu(fd, 0, &rec->stats);
 
        fd = bpf_map__fd(rx_queue_index_map);
-       max_rxqs = bpf_map__def(rx_queue_index_map)->max_entries;
+       max_rxqs = bpf_map__max_entries(rx_queue_index_map);
        for (i = 0; i < max_rxqs; i++)
                map_collect_percpu(fd, i, &rec->rxq[i]);
 }
@@ -335,7 +335,7 @@ static void stats_print(struct stats_record *stats_rec,
                        struct stats_record *stats_prev,
                        int action, __u32 cfg_opt)
 {
-       unsigned int nr_rxqs = bpf_map__def(rx_queue_index_map)->max_entries;
+       unsigned int nr_rxqs = bpf_map__max_entries(rx_queue_index_map);
        unsigned int nr_cpus = bpf_num_possible_cpus();
        double pps = 0, err = 0;
        struct record *rec, *prev;
@@ -450,14 +450,12 @@ static void stats_poll(int interval, int action, __u32 cfg_opt)
 int main(int argc, char **argv)
 {
        __u32 cfg_options= NO_TOUCH ; /* Default: Don't touch packet memory */
-       struct bpf_prog_load_attr prog_load_attr = {
-               .prog_type      = BPF_PROG_TYPE_XDP,
-       };
        struct bpf_prog_info info = {};
        __u32 info_len = sizeof(info);
        int prog_fd, map_fd, opt, err;
        bool use_separators = true;
        struct config cfg = { 0 };
+       struct bpf_program *prog;
        struct bpf_object *obj;
        struct bpf_map *map;
        char filename[256];
@@ -471,11 +469,19 @@ int main(int argc, char **argv)
        char *action_str = NULL;
 
        snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
-       prog_load_attr.file = filename;
 
-       if (bpf_prog_load_xattr(&prog_load_attr, &obj, &prog_fd))
+       obj = bpf_object__open_file(filename, NULL);
+       if (libbpf_get_error(obj))
                return EXIT_FAIL;
 
+       prog = bpf_object__next_program(obj, NULL);
+       bpf_program__set_type(prog, BPF_PROG_TYPE_XDP);
+
+       err = bpf_object__load(obj);
+       if (err)
+               return EXIT_FAIL;
+       prog_fd = bpf_program__fd(prog);
+
        map =  bpf_object__find_map_by_name(obj, "config_map");
        stats_global_map = bpf_object__find_map_by_name(obj, "stats_global_map");
        rx_queue_index_map = bpf_object__find_map_by_name(obj, "rx_queue_index_map");
@@ -582,7 +588,7 @@ int main(int argc, char **argv)
        signal(SIGINT, int_exit);
        signal(SIGTERM, int_exit);
 
-       if (bpf_set_link_xdp_fd(ifindex, prog_fd, xdp_flags) < 0) {
+       if (bpf_xdp_attach(ifindex, prog_fd, xdp_flags, NULL) < 0) {
                fprintf(stderr, "link set xdp fd failed\n");
                return EXIT_FAIL_XDP;
        }
index 587eacb..0a2b3e9 100644 (file)
@@ -30,7 +30,7 @@ static int do_attach(int idx, int fd, const char *name)
        __u32 info_len = sizeof(info);
        int err;
 
-       err = bpf_set_link_xdp_fd(idx, fd, xdp_flags);
+       err = bpf_xdp_attach(idx, fd, xdp_flags, NULL);
        if (err < 0) {
                printf("ERROR: failed to attach program to %s\n", name);
                return err;
@@ -51,13 +51,13 @@ static int do_detach(int idx, const char *name)
        __u32 curr_prog_id = 0;
        int err = 0;
 
-       err = bpf_get_link_xdp_id(idx, &curr_prog_id, xdp_flags);
+       err = bpf_xdp_query_id(idx, xdp_flags, &curr_prog_id);
        if (err) {
-               printf("bpf_get_link_xdp_id failed\n");
+               printf("bpf_xdp_query_id failed\n");
                return err;
        }
        if (prog_id == curr_prog_id) {
-               err = bpf_set_link_xdp_fd(idx, -1, xdp_flags);
+               err = bpf_xdp_detach(idx, xdp_flags, NULL);
                if (err < 0)
                        printf("ERROR: failed to detach prog from %s\n", name);
        } else if (!curr_prog_id) {
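
The detach side follows the same shape in every converted sample: query which program id currently owns the hook, and only detach when it matches the one recorded at attach time. Condensed, assuming prog_id was saved earlier:

	__u32 curr_prog_id = 0;

	if (bpf_xdp_query_id(ifindex, xdp_flags, &curr_prog_id))
		return -1;					/* query failed */
	if (curr_prog_id && curr_prog_id == prog_id)
		bpf_xdp_detach(ifindex, xdp_flags, NULL);	/* ours, remove it */
	/* otherwise another program owns the hook; leave it attached */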
index 8740838..c4332d0 100644 (file)
@@ -1218,7 +1218,7 @@ int sample_setup_maps(struct bpf_map **maps)
                default:
                        return -EINVAL;
                }
-               if (bpf_map__resize(sample_map[i], sample_map_count[i]) < 0)
+               if (bpf_map__set_max_entries(sample_map[i], sample_map_count[i]) < 0)
                        return -errno;
        }
        sample_map[MAP_DEVMAP_XMIT_MULTI] = maps[MAP_DEVMAP_XMIT_MULTI];
@@ -1265,7 +1265,7 @@ static int __sample_remove_xdp(int ifindex, __u32 prog_id, int xdp_flags)
        int ret;
 
        if (prog_id) {
-               ret = bpf_get_link_xdp_id(ifindex, &cur_prog_id, xdp_flags);
+               ret = bpf_xdp_query_id(ifindex, xdp_flags, &cur_prog_id);
                if (ret < 0)
                        return -errno;
 
@@ -1278,7 +1278,7 @@ static int __sample_remove_xdp(int ifindex, __u32 prog_id, int xdp_flags)
                }
        }
 
-       return bpf_set_link_xdp_fd(ifindex, -1, xdp_flags);
+       return bpf_xdp_detach(ifindex, xdp_flags, NULL);
 }
 
 int sample_install_xdp(struct bpf_program *xdp_prog, int ifindex, bool generic,
@@ -1295,8 +1295,7 @@ int sample_install_xdp(struct bpf_program *xdp_prog, int ifindex, bool generic,
 
        xdp_flags |= !force ? XDP_FLAGS_UPDATE_IF_NOEXIST : 0;
        xdp_flags |= generic ? XDP_FLAGS_SKB_MODE : XDP_FLAGS_DRV_MODE;
-       ret = bpf_set_link_xdp_fd(ifindex, bpf_program__fd(xdp_prog),
-                                 xdp_flags);
+       ret = bpf_xdp_attach(ifindex, bpf_program__fd(xdp_prog), xdp_flags, NULL);
        if (ret < 0) {
                ret = -errno;
                fprintf(stderr,
@@ -1308,7 +1307,7 @@ int sample_install_xdp(struct bpf_program *xdp_prog, int ifindex, bool generic,
                return ret;
        }
 
-       ret = bpf_get_link_xdp_id(ifindex, &prog_id, xdp_flags);
+       ret = bpf_xdp_query_id(ifindex, xdp_flags, &prog_id);
        if (ret < 0) {
                ret = -errno;
                fprintf(stderr,
index 5f44b87..f450516 100644 (file)
@@ -61,7 +61,7 @@ static inline char *safe_strncpy(char *dst, const char *src, size_t size)
 
 #define __attach_tp(name)                                                      \
        ({                                                                     \
-               if (!bpf_program__is_tracing(skel->progs.name))                \
+               if (bpf_program__type(skel->progs.name) != BPF_PROG_TYPE_TRACING)\
                        return -EINVAL;                                        \
                skel->links.name = bpf_program__attach(skel->progs.name);      \
                if (!skel->links.name)                                         \
index 1d4f305..2e811e4 100644 (file)
@@ -32,12 +32,12 @@ static void int_exit(int sig)
        __u32 curr_prog_id = 0;
 
        if (ifindex > -1) {
-               if (bpf_get_link_xdp_id(ifindex, &curr_prog_id, xdp_flags)) {
-                       printf("bpf_get_link_xdp_id failed\n");
+               if (bpf_xdp_query_id(ifindex, xdp_flags, &curr_prog_id)) {
+                       printf("bpf_xdp_query_id failed\n");
                        exit(1);
                }
                if (prog_id == curr_prog_id)
-                       bpf_set_link_xdp_fd(ifindex, -1, xdp_flags);
+                       bpf_xdp_detach(ifindex, xdp_flags, NULL);
                else if (!curr_prog_id)
                        printf("couldn't find a prog id on a given iface\n");
                else
@@ -152,9 +152,6 @@ static int parse_ports(const char *port_str, int *min_port, int *max_port)
 
 int main(int argc, char **argv)
 {
-       struct bpf_prog_load_attr prog_load_attr = {
-               .prog_type      = BPF_PROG_TYPE_XDP,
-       };
        int min_port = 0, max_port = 0, vip2tnl_map_fd;
        const char *optstr = "i:a:p:s:d:m:T:P:FSNh";
        unsigned char opt_flags[256] = {};
@@ -162,6 +159,7 @@ int main(int argc, char **argv)
        __u32 info_len = sizeof(info);
        unsigned int kill_after_s = 0;
        struct iptnl_info tnl = {};
+       struct bpf_program *prog;
        struct bpf_object *obj;
        struct vip vip = {};
        char filename[256];
@@ -259,15 +257,20 @@ int main(int argc, char **argv)
        }
 
        snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
-       prog_load_attr.file = filename;
 
-       if (bpf_prog_load_xattr(&prog_load_attr, &obj, &prog_fd))
+       obj = bpf_object__open_file(filename, NULL);
+       if (libbpf_get_error(obj))
                return 1;
 
-       if (!prog_fd) {
-               printf("bpf_prog_load_xattr: %s\n", strerror(errno));
+       prog = bpf_object__next_program(obj, NULL);
+       bpf_program__set_type(prog, BPF_PROG_TYPE_XDP);
+
+       err = bpf_object__load(obj);
+       if (err) {
+               printf("bpf_object__load(): %s\n", strerror(errno));
                return 1;
        }
+       prog_fd = bpf_program__fd(prog);
 
        rxcnt_map_fd = bpf_object__find_map_fd_by_name(obj, "rxcnt");
        vip2tnl_map_fd = bpf_object__find_map_fd_by_name(obj, "vip2tnl");
@@ -288,7 +291,7 @@ int main(int argc, char **argv)
                }
        }
 
-       if (bpf_set_link_xdp_fd(ifindex, prog_fd, xdp_flags) < 0) {
+       if (bpf_xdp_attach(ifindex, prog_fd, xdp_flags, NULL) < 0) {
                printf("link set xdp fd failed\n");
                return 1;
        }
@@ -302,7 +305,7 @@ int main(int argc, char **argv)
 
        poll_stats(kill_after_s);
 
-       bpf_set_link_xdp_fd(ifindex, -1, xdp_flags);
+       bpf_xdp_detach(ifindex, xdp_flags, NULL);
 
        return 0;
 }
index cc44087..28b5f2a 100644 (file)
@@ -173,7 +173,7 @@ main(int argc, char **argv)
        unlink(SOCKET_NAME);
 
        /* Unset fd for given ifindex */
-       err = bpf_set_link_xdp_fd(ifindex, -1, 0);
+       err = bpf_xdp_detach(ifindex, 0, NULL);
        if (err) {
                fprintf(stderr, "Error when unsetting bpf prog_fd for ifindex(%d)\n", ifindex);
                return err;
index aa50864..19288a2 100644 (file)
@@ -571,13 +571,13 @@ static void remove_xdp_program(void)
 {
        u32 curr_prog_id = 0;
 
-       if (bpf_get_link_xdp_id(opt_ifindex, &curr_prog_id, opt_xdp_flags)) {
-               printf("bpf_get_link_xdp_id failed\n");
+       if (bpf_xdp_query_id(opt_ifindex, opt_xdp_flags, &curr_prog_id)) {
+               printf("bpf_xdp_query_id failed\n");
                exit(EXIT_FAILURE);
        }
 
        if (prog_id == curr_prog_id)
-               bpf_set_link_xdp_fd(opt_ifindex, -1, opt_xdp_flags);
+               bpf_xdp_detach(opt_ifindex, opt_xdp_flags, NULL);
        else if (!curr_prog_id)
                printf("couldn't find a prog id on a given interface\n");
        else
@@ -1027,7 +1027,7 @@ static struct xsk_socket_info *xsk_configure_socket(struct xsk_umem_info *umem,
        if (ret)
                exit_with_error(-ret);
 
-       ret = bpf_get_link_xdp_id(opt_ifindex, &prog_id, opt_xdp_flags);
+       ret = bpf_xdp_query_id(opt_ifindex, opt_xdp_flags, &prog_id);
        if (ret)
                exit_with_error(-ret);
 
@@ -1760,7 +1760,7 @@ static void load_xdp_program(char **argv, struct bpf_object **obj)
                exit(EXIT_FAILURE);
        }
 
-       if (bpf_set_link_xdp_fd(opt_ifindex, prog_fd, opt_xdp_flags) < 0) {
+       if (bpf_xdp_attach(opt_ifindex, prog_fd, opt_xdp_flags, NULL) < 0) {
                fprintf(stderr, "ERROR: link set xdp fd failed\n");
                exit(EXIT_FAILURE);
        }
index 52e7c4f..2220509 100644 (file)
@@ -974,8 +974,8 @@ static void remove_xdp_program(void)
        int i;
 
        for (i = 0 ; i < n_ports; i++)
-               bpf_set_link_xdp_fd(if_nametoindex(port_params[i].iface), -1,
-                                   port_params[i].xsk_cfg.xdp_flags);
+               bpf_xdp_detach(if_nametoindex(port_params[i].iface),
+                              port_params[i].xsk_cfg.xdp_flags, NULL);
 }
 
 int main(int argc, char **argv)
index a6403dd..0966252 100755 (executable)
@@ -87,21 +87,25 @@ class HeaderParser(object):
         self.line = ''
         self.helpers = []
         self.commands = []
+        self.desc_unique_helpers = set()
+        self.define_unique_helpers = []
+        self.desc_syscalls = []
+        self.enum_syscalls = []
 
     def parse_element(self):
         proto    = self.parse_symbol()
-        desc     = self.parse_desc()
-        ret      = self.parse_ret()
+        desc     = self.parse_desc(proto)
+        ret      = self.parse_ret(proto)
         return APIElement(proto=proto, desc=desc, ret=ret)
 
     def parse_helper(self):
         proto    = self.parse_proto()
-        desc     = self.parse_desc()
-        ret      = self.parse_ret()
+        desc     = self.parse_desc(proto)
+        ret      = self.parse_ret(proto)
         return Helper(proto=proto, desc=desc, ret=ret)
 
     def parse_symbol(self):
-        p = re.compile(' \* ?(.+)$')
+        p = re.compile(' \* ?(BPF\w+)$')
         capture = p.match(self.line)
         if not capture:
             raise NoSyscallCommandFound
@@ -127,16 +131,15 @@ class HeaderParser(object):
         self.line = self.reader.readline()
         return capture.group(1)
 
-    def parse_desc(self):
+    def parse_desc(self, proto):
         p = re.compile(' \* ?(?:\t| {5,8})Description$')
         capture = p.match(self.line)
         if not capture:
-            # Helper can have empty description and we might be parsing another
-            # attribute: return but do not consume.
-            return ''
+            raise Exception("No description section found for " + proto)
         # Description can be several lines, some of them possibly empty, and it
         # stops when another subsection title is met.
         desc = ''
+        desc_present = False
         while True:
             self.line = self.reader.readline()
             if self.line == ' *\n':
@@ -145,21 +148,24 @@ class HeaderParser(object):
                 p = re.compile(' \* ?(?:\t| {5,8})(?:\t| {8})(.*)')
                 capture = p.match(self.line)
                 if capture:
+                    desc_present = True
                     desc += capture.group(1) + '\n'
                 else:
                     break
+
+        if not desc_present:
+            raise Exception("No description found for " + proto)
         return desc
 
-    def parse_ret(self):
+    def parse_ret(self, proto):
         p = re.compile(' \* ?(?:\t| {5,8})Return$')
         capture = p.match(self.line)
         if not capture:
-            # Helper can have empty retval and we might be parsing another
-            # attribute: return but do not consume.
-            return ''
+            raise Exception("No return section found for " + proto)
         # Return value description can be several lines, some of them possibly
         # empty, and it stops when another subsection title is met.
         ret = ''
+        ret_present = False
         while True:
             self.line = self.reader.readline()
             if self.line == ' *\n':
@@ -168,44 +174,101 @@ class HeaderParser(object):
                 p = re.compile(' \* ?(?:\t| {5,8})(?:\t| {8})(.*)')
                 capture = p.match(self.line)
                 if capture:
+                    ret_present = True
                     ret += capture.group(1) + '\n'
                 else:
                     break
+
+        if not ret_present:
+            raise Exception("No return found for " + proto)
         return ret
 
-    def seek_to(self, target, help_message):
+    def seek_to(self, target, help_message, discard_lines = 1):
         self.reader.seek(0)
         offset = self.reader.read().find(target)
         if offset == -1:
             raise Exception(help_message)
         self.reader.seek(offset)
         self.reader.readline()
-        self.reader.readline()
+        for _ in range(discard_lines):
+            self.reader.readline()
         self.line = self.reader.readline()
 
-    def parse_syscall(self):
+    def parse_desc_syscall(self):
         self.seek_to('* DOC: eBPF Syscall Commands',
                      'Could not find start of eBPF syscall descriptions list')
         while True:
             try:
                 command = self.parse_element()
                 self.commands.append(command)
+                self.desc_syscalls.append(command.proto)
+
             except NoSyscallCommandFound:
                 break
 
-    def parse_helpers(self):
+    def parse_enum_syscall(self):
+        self.seek_to('enum bpf_cmd {',
+                     'Could not find start of bpf_cmd enum', 0)
+        # Searches for one or more BPF\w+ enum entries
+        bpf_p = re.compile('\s*(BPF\w+)+')
+        # Searches for an enum entry assigned to another entry,
+        # e.g. BPF_PROG_RUN = BPF_PROG_TEST_RUN. Such an alias is
+        # not documented, so it is skipped when checking whether
+        # the right number of syscalls is documented
+        assign_p = re.compile('\s*(BPF\w+)\s*=\s*(BPF\w+)')
+        bpf_cmd_str = ''
+        while True:
+            capture = assign_p.match(self.line)
+            if capture:
+                # Skip line if an enum entry is assigned to another entry
+                self.line = self.reader.readline()
+                continue
+            capture = bpf_p.match(self.line)
+            if capture:
+                bpf_cmd_str += self.line
+            else:
+                break
+            self.line = self.reader.readline()
+        # Find the number of occurrences of BPF\w+
+        self.enum_syscalls = re.findall('(BPF\w+)+', bpf_cmd_str)
+
+    def parse_desc_helpers(self):
         self.seek_to('* Start of BPF helper function descriptions:',
                      'Could not find start of eBPF helper descriptions list')
         while True:
             try:
                 helper = self.parse_helper()
                 self.helpers.append(helper)
+                proto = helper.proto_break_down()
+                self.desc_unique_helpers.add(proto['name'])
             except NoHelperFound:
                 break
 
+    def parse_define_helpers(self):
+        # Parse the number of FN(...) in #define __BPF_FUNC_MAPPER to compare
+        # later with the number of unique function names present in description.
+        # Note: seek_to(..) discards the first line below the target search text,
+        # resulting in FN(unspec) being skipped and not added to self.define_unique_helpers.
+        self.seek_to('#define __BPF_FUNC_MAPPER(FN)',
+                     'Could not find start of eBPF helper definition list')
+        # Searches for either one or more FN(\w+) defines or a backslash for newline
+        p = re.compile('\s*(FN\(\w+\))+|\\\\')
+        fn_defines_str = ''
+        while True:
+            capture = p.match(self.line)
+            if capture:
+                fn_defines_str += self.line
+            else:
+                break
+            self.line = self.reader.readline()
+        # Find the number of occurrences of FN(\w+)
+        self.define_unique_helpers = re.findall('FN\(\w+\)', fn_defines_str)
+
     def run(self):
-        self.parse_syscall()
-        self.parse_helpers()
+        self.parse_desc_syscall()
+        self.parse_enum_syscall()
+        self.parse_desc_helpers()
+        self.parse_define_helpers()
         self.reader.close()
 
 ###############################################################################
@@ -235,6 +298,25 @@ class Printer(object):
             self.print_one(elem)
         self.print_footer()
 
+    def elem_number_check(self, desc_unique_elem, define_unique_elem, type, instance):
+        """
+        Compares the number of helpers/syscalls documented in the header file
+        description with the number defined in the corresponding enum/macro,
+        and raises an Exception if they don't match.
+        """
+        nr_desc_unique_elem = len(desc_unique_elem)
+        nr_define_unique_elem = len(define_unique_elem)
+        if nr_desc_unique_elem != nr_define_unique_elem:
+            exception_msg = '''
+The number of unique %s in description (%d) doesn\'t match the number of unique %s defined in %s (%d)
+''' % (type, nr_desc_unique_elem, type, instance, nr_define_unique_elem)
+            if nr_desc_unique_elem < nr_define_unique_elem:
+                # Function description is parsed until no helper is found (which can be due to
+                # misformatting). Hence, only print the first missing/misformatted helper/enum.
+                exception_msg += '''
+The description for %s is not present or formatted correctly.
+''' % (define_unique_elem[nr_desc_unique_elem])
+            raise Exception(exception_msg)
 
 class PrinterRST(Printer):
     """
@@ -295,7 +377,6 @@ class PrinterRST(Printer):
 
         print('')
 
-
 class PrinterHelpersRST(PrinterRST):
     """
     A printer for dumping collected information about helpers as a ReStructured
@@ -305,6 +386,7 @@ class PrinterHelpersRST(PrinterRST):
     """
     def __init__(self, parser):
         self.elements = parser.helpers
+        self.elem_number_check(parser.desc_unique_helpers, parser.define_unique_helpers, 'helper', '__BPF_FUNC_MAPPER')
 
     def print_header(self):
         header = '''\
@@ -478,6 +560,7 @@ class PrinterSyscallRST(PrinterRST):
     """
     def __init__(self, parser):
         self.elements = parser.commands
+        self.elem_number_check(parser.desc_syscalls, parser.enum_syscalls, 'syscall', 'bpf_cmd')
 
     def print_header(self):
         header = '''\
@@ -509,6 +592,7 @@ class PrinterHelpers(Printer):
     """
     def __init__(self, parser):
         self.elements = parser.helpers
+        self.elem_number_check(parser.desc_unique_helpers, parser.define_unique_helpers, 'helper', '__BPF_FUNC_MAPPER')
 
     type_fwds = [
             'struct bpf_fib_lookup',
index e6093ad..c293941 100755 (executable)
@@ -7,7 +7,7 @@ if ! [ -x "$(command -v ${PAHOLE})" ]; then
        exit 0
 fi
 
-pahole_ver=$(${PAHOLE} --version | sed -E 's/v([0-9]+)\.([0-9]+)/\1\2/')
+pahole_ver=$($(dirname $0)/pahole-version.sh ${PAHOLE})
 
 if [ "${pahole_ver}" -ge "118" ] && [ "${pahole_ver}" -le "121" ]; then
        # pahole 1.18 through 1.21 can't handle zero-sized per-CPU vars
diff --git a/scripts/pahole-version.sh b/scripts/pahole-version.sh
new file mode 100755 (executable)
index 0000000..f8a32ab
--- /dev/null
@@ -0,0 +1,13 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+#
+# Usage: $ ./pahole-version.sh pahole
+#
+# Prints pahole's version in a 3-digit form, such as 119 for v1.19.
+
+if [ ! -x "$(command -v "$@")" ]; then
+       echo 0
+       exit 1
+fi
+
+"$@" --version | sed -E 's/v([0-9]+)\.([0-9]+)/\1\2/'
index 842889f..a9f8c63 100644 (file)
@@ -838,7 +838,7 @@ int devcgroup_check_permission(short type, u32 major, u32 minor, short access)
        int rc = BPF_CGROUP_RUN_PROG_DEVICE_CGROUP(type, major, minor, access);
 
        if (rc)
-               return -EPERM;
+               return rc;
 
        #ifdef CONFIG_CGROUP_DEVICE
        return devcgroup_legacy_check_permission(type, major, minor, access);
index 5983312..a2c665b 100644 (file)
@@ -902,7 +902,7 @@ static int do_show(int argc, char **argv)
                                      equal_fn_for_key_as_id, NULL);
        btf_map_table = hashmap__new(hash_fn_for_key_as_id,
                                     equal_fn_for_key_as_id, NULL);
-       if (!btf_prog_table || !btf_map_table) {
+       if (IS_ERR(btf_prog_table) || IS_ERR(btf_map_table)) {
                hashmap__free(btf_prog_table);
                hashmap__free(btf_map_table);
                if (fd >= 0)
index 3571a28..effe136 100644 (file)
@@ -50,6 +50,7 @@ static int show_bpf_prog(int id, enum bpf_attach_type attach_type,
                         const char *attach_flags_str,
                         int level)
 {
+       char prog_name[MAX_PROG_FULL_NAME];
        struct bpf_prog_info info = {};
        __u32 info_len = sizeof(info);
        int prog_fd;
@@ -63,6 +64,7 @@ static int show_bpf_prog(int id, enum bpf_attach_type attach_type,
                return -1;
        }
 
+       get_prog_full_name(&info, prog_fd, prog_name, sizeof(prog_name));
        if (json_output) {
                jsonw_start_object(json_wtr);
                jsonw_uint_field(json_wtr, "id", info.id);
@@ -73,7 +75,7 @@ static int show_bpf_prog(int id, enum bpf_attach_type attach_type,
                        jsonw_uint_field(json_wtr, "attach_type", attach_type);
                jsonw_string_field(json_wtr, "attach_flags",
                                   attach_flags_str);
-               jsonw_string_field(json_wtr, "name", info.name);
+               jsonw_string_field(json_wtr, "name", prog_name);
                jsonw_end_object(json_wtr);
        } else {
                printf("%s%-8u ", level ? "    " : "", info.id);
@@ -81,7 +83,7 @@ static int show_bpf_prog(int id, enum bpf_attach_type attach_type,
                        printf("%-15s", attach_type_name[attach_type]);
                else
                        printf("type %-10u", attach_type);
-               printf(" %-15s %-15s\n", attach_flags_str, info.name);
+               printf(" %-15s %-15s\n", attach_flags_str, prog_name);
        }
 
        close(prog_fd);
index fa8eb81..606743c 100644 (file)
@@ -24,6 +24,7 @@
 #include <bpf/bpf.h>
 #include <bpf/hashmap.h>
 #include <bpf/libbpf.h> /* libbpf_num_possible_cpus */
+#include <bpf/btf.h>
 
 #include "main.h"
 
@@ -304,6 +305,49 @@ const char *get_fd_type_name(enum bpf_obj_type type)
        return names[type];
 }
 
+void get_prog_full_name(const struct bpf_prog_info *prog_info, int prog_fd,
+                       char *name_buff, size_t buff_len)
+{
+       const char *prog_name = prog_info->name;
+       const struct btf_type *func_type;
+       const struct bpf_func_info finfo = {};
+       struct bpf_prog_info info = {};
+       __u32 info_len = sizeof(info);
+       struct btf *prog_btf = NULL;
+
+       if (buff_len <= BPF_OBJ_NAME_LEN ||
+           strlen(prog_info->name) < BPF_OBJ_NAME_LEN - 1)
+               goto copy_name;
+
+       if (!prog_info->btf_id || prog_info->nr_func_info == 0)
+               goto copy_name;
+
+       info.nr_func_info = 1;
+       info.func_info_rec_size = prog_info->func_info_rec_size;
+       if (info.func_info_rec_size > sizeof(finfo))
+               info.func_info_rec_size = sizeof(finfo);
+       info.func_info = ptr_to_u64(&finfo);
+
+       if (bpf_obj_get_info_by_fd(prog_fd, &info, &info_len))
+               goto copy_name;
+
+       prog_btf = btf__load_from_kernel_by_id(info.btf_id);
+       if (!prog_btf)
+               goto copy_name;
+
+       func_type = btf__type_by_id(prog_btf, finfo.type_id);
+       if (!func_type || !btf_is_func(func_type))
+               goto copy_name;
+
+       prog_name = btf__name_by_offset(prog_btf, func_type->name_off);
+
+copy_name:
+       snprintf(name_buff, buff_len, "%s", prog_name);
+
+       if (prog_btf)
+               btf__free(prog_btf);
+}
+
 int get_fd_type(int fd)
 {
        char path[PATH_MAX];
index e999159..9c894b1 100644 (file)
@@ -487,17 +487,12 @@ probe_prog_type(enum bpf_prog_type prog_type, bool *supported_types,
        size_t maxlen;
        bool res;
 
-       if (ifindex)
-               /* Only test offload-able program types */
-               switch (prog_type) {
-               case BPF_PROG_TYPE_SCHED_CLS:
-               case BPF_PROG_TYPE_XDP:
-                       break;
-               default:
-                       return;
-               }
+       if (ifindex) {
+               p_info("BPF offload feature probing is not supported");
+               return;
+       }
 
-       res = bpf_probe_prog_type(prog_type, ifindex);
+       res = libbpf_probe_bpf_prog_type(prog_type, NULL);
 #ifdef USE_LIBCAP
        /* Probe may succeed even if program load fails, for unprivileged users
         * check that we did not fail because of insufficient permissions
@@ -535,7 +530,12 @@ probe_map_type(enum bpf_map_type map_type, const char *define_prefix,
        size_t maxlen;
        bool res;
 
-       res = bpf_probe_map_type(map_type, ifindex);
+       if (ifindex) {
+               p_info("BPF offload feature probing is not supported");
+               return;
+       }
+
+       res = libbpf_probe_bpf_map_type(map_type, NULL);
 
        /* Probe result depends on the success of map creation, no additional
         * check required for unprivileged users
@@ -567,7 +567,12 @@ probe_helper_for_progtype(enum bpf_prog_type prog_type, bool supported_type,
        bool res = false;
 
        if (supported_type) {
-               res = bpf_probe_helper(id, prog_type, ifindex);
+               if (ifindex) {
+                       p_info("BPF offload feature probing is not supported");
+                       return;
+               }
+
+               res = libbpf_probe_bpf_helper(prog_type, id, NULL);
 #ifdef USE_LIBCAP
                /* Probe may succeed even if program load fails, for
                 * unprivileged users check that we did not fail because of
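
The libbpf probe API that replaces the bpf_probe_*() calls here returns 1 when the kernel supports the feature, 0 when it does not, and a negative error if the probe itself fails. A hedged sketch of the calling convention (the function below is illustrative, not part of bpftool):

	#include <stdio.h>
	#include <bpf/libbpf.h>

	static void probe_features(void)
	{
		/* 1 = supported, 0 = not supported, < 0 = probe failed */
		int ret = libbpf_probe_bpf_prog_type(BPF_PROG_TYPE_XDP, NULL);

		if (ret < 0)
			fprintf(stderr, "probe failed: %d\n", ret);
		else
			printf("XDP programs: %s\n", ret ? "supported" : "not supported");

		/* same contract for map types and helpers */
		libbpf_probe_bpf_map_type(BPF_MAP_TYPE_RINGBUF, NULL);
		libbpf_probe_bpf_helper(BPF_PROG_TYPE_XDP, BPF_FUNC_xdp_adjust_tail, NULL);
	}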
index b4695df..eacfc6a 100644 (file)
@@ -227,7 +227,7 @@ static int codegen_datasecs(struct bpf_object *obj, const char *obj_name)
                /* only generate definitions for memory-mapped internal maps */
                if (!bpf_map__is_internal(map))
                        continue;
-               if (!(bpf_map__def(map)->map_flags & BPF_F_MMAPABLE))
+               if (!(bpf_map__map_flags(map) & BPF_F_MMAPABLE))
                        continue;
 
                if (!get_map_ident(map, map_ident, sizeof(map_ident)))
@@ -378,13 +378,16 @@ static void codegen_attach_detach(struct bpf_object *obj, const char *obj_name)
                                int prog_fd = skel->progs.%2$s.prog_fd;             \n\
                        ", obj_name, bpf_program__name(prog));
 
-               switch (bpf_program__get_type(prog)) {
+               switch (bpf_program__type(prog)) {
                case BPF_PROG_TYPE_RAW_TRACEPOINT:
                        tp_name = strchr(bpf_program__section_name(prog), '/') + 1;
-                       printf("\tint fd = bpf_raw_tracepoint_open(\"%s\", prog_fd);\n", tp_name);
+                       printf("\tint fd = skel_raw_tracepoint_open(\"%s\", prog_fd);\n", tp_name);
                        break;
                case BPF_PROG_TYPE_TRACING:
-                       printf("\tint fd = bpf_raw_tracepoint_open(NULL, prog_fd);\n");
+                       if (bpf_program__expected_attach_type(prog) == BPF_TRACE_ITER)
+                               printf("\tint fd = skel_link_create(prog_fd, 0, BPF_TRACE_ITER);\n");
+                       else
+                               printf("\tint fd = skel_raw_tracepoint_open(NULL, prog_fd);\n");
                        break;
                default:
                        printf("\tint fd = ((void)prog_fd, 0); /* auto-attach not supported */\n");
@@ -468,7 +471,7 @@ static void codegen_destroy(struct bpf_object *obj, const char *obj_name)
                if (!get_map_ident(map, ident, sizeof(ident)))
                        continue;
                if (bpf_map__is_internal(map) &&
-                   (bpf_map__def(map)->map_flags & BPF_F_MMAPABLE))
+                   (bpf_map__map_flags(map) & BPF_F_MMAPABLE))
                        printf("\tmunmap(skel->%1$s, %2$zd);\n",
                               ident, bpf_map_mmap_sz(map));
                codegen("\
@@ -536,7 +539,7 @@ static int gen_trace(struct bpf_object *obj, const char *obj_name, const char *h
                        continue;
 
                if (!bpf_map__is_internal(map) ||
-                   !(bpf_map__def(map)->map_flags & BPF_F_MMAPABLE))
+                   !(bpf_map__map_flags(map) & BPF_F_MMAPABLE))
                        continue;
 
                codegen("\
@@ -600,10 +603,10 @@ static int gen_trace(struct bpf_object *obj, const char *obj_name, const char *h
                        continue;
 
                if (!bpf_map__is_internal(map) ||
-                   !(bpf_map__def(map)->map_flags & BPF_F_MMAPABLE))
+                   !(bpf_map__map_flags(map) & BPF_F_MMAPABLE))
                        continue;
 
-               if (bpf_map__def(map)->map_flags & BPF_F_RDONLY_PROG)
+               if (bpf_map__map_flags(map) & BPF_F_RDONLY_PROG)
                        mmap_flags = "PROT_READ";
                else
                        mmap_flags = "PROT_READ | PROT_WRITE";
@@ -927,7 +930,6 @@ static int do_skeleton(int argc, char **argv)
                        s = (struct bpf_object_skeleton *)calloc(1, sizeof(*s));\n\
                        if (!s)                                             \n\
                                goto err;                                   \n\
-                       obj->skeleton = s;                                  \n\
                                                                            \n\
                        s->sz = sizeof(*s);                                 \n\
                        s->name = \"%1$s\";                                 \n\
@@ -962,7 +964,7 @@ static int do_skeleton(int argc, char **argv)
                                i, bpf_map__name(map), i, ident);
                        /* memory-mapped internal maps */
                        if (bpf_map__is_internal(map) &&
-                           (bpf_map__def(map)->map_flags & BPF_F_MMAPABLE)) {
+                           (bpf_map__map_flags(map) & BPF_F_MMAPABLE)) {
                                printf("\ts->maps[%zu].mmaped = (void **)&obj->%s;\n",
                                       i, ident);
                        }
@@ -1000,6 +1002,7 @@ static int do_skeleton(int argc, char **argv)
                                                                            \n\
                        s->data = (void *)%2$s__elf_bytes(&s->data_sz);     \n\
                                                                            \n\
+                       obj->skeleton = s;                                  \n\
                        return 0;                                           \n\
                err:                                                        \n\
                        bpf_object__destroy_skeleton(s);                    \n\
index 2c258db..97dec81 100644 (file)
@@ -2,6 +2,7 @@
 /* Copyright (C) 2020 Facebook */
 
 #include <errno.h>
+#include <linux/err.h>
 #include <net/if.h>
 #include <stdio.h>
 #include <unistd.h>
@@ -306,7 +307,7 @@ static int do_show(int argc, char **argv)
        if (show_pinned) {
                link_table = hashmap__new(hash_fn_for_key_as_id,
                                          equal_fn_for_key_as_id, NULL);
-               if (!link_table) {
+               if (IS_ERR(link_table)) {
                        p_err("failed to create hashmap for pinned paths");
                        return -1;
                }
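
These IS_ERR() conversions reflect that libbpf's internal hashmap__new() signals failure with an ERR_PTR()-encoded pointer rather than NULL, so a plain NULL check silently misses the error. The recurring pattern, sketched with bpftool's existing hash/equality helpers and its p_err() printer:

	#include <linux/err.h>
	#include <bpf/hashmap.h>

	struct hashmap *table;

	table = hashmap__new(hash_fn_for_key_as_id, equal_fn_for_key_as_id, NULL);
	if (IS_ERR(table)) {		/* a NULL test would not catch this */
		p_err("failed to create hashmap");
		return -1;
	}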
index 020e91a..490f7bd 100644 (file)
@@ -478,7 +478,11 @@ int main(int argc, char **argv)
        }
 
        if (!legacy_libbpf) {
-               ret = libbpf_set_strict_mode(LIBBPF_STRICT_ALL);
+               /* Allow legacy map definitions for skeleton generation.
+                * Such definitions will still be rejected if users load
+                * the generated skeleton with LIBBPF_STRICT_ALL mode.
+                */
+               ret = libbpf_set_strict_mode(LIBBPF_STRICT_ALL & ~LIBBPF_STRICT_MAP_DEFINITIONS);
                if (ret)
                        p_err("failed to enable libbpf strict mode: %d", ret);
        }
index 8d76d93..0c38405 100644 (file)
@@ -140,6 +140,10 @@ struct cmd {
 int cmd_select(const struct cmd *cmds, int argc, char **argv,
               int (*help)(int argc, char **argv));
 
+#define MAX_PROG_FULL_NAME 128
+void get_prog_full_name(const struct bpf_prog_info *prog_info, int prog_fd,
+                       char *name_buff, size_t buff_len);
+
 int get_fd_type(int fd);
 const char *get_fd_type_name(enum bpf_obj_type type);
 char *get_fdinfo(int fd, const char *key);
index cc530a2..c66a3c9 100644 (file)
@@ -699,7 +699,7 @@ static int do_show(int argc, char **argv)
        if (show_pinned) {
                map_table = hashmap__new(hash_fn_for_key_as_id,
                                         equal_fn_for_key_as_id, NULL);
-               if (!map_table) {
+               if (IS_ERR(map_table)) {
                        p_err("failed to create hashmap for pinned paths");
                        return -1;
                }
index 6490537..526a332 100644 (file)
@@ -551,7 +551,7 @@ static int do_attach_detach_xdp(int progfd, enum net_attach_type attach_type,
        if (attach_type == NET_ATTACH_TYPE_XDP_OFFLOAD)
                flags |= XDP_FLAGS_HW_MODE;
 
-       return bpf_set_link_xdp_fd(ifindex, progfd, flags);
+       return bpf_xdp_attach(ifindex, progfd, flags, NULL);
 }
 
 static int do_attach(int argc, char **argv)
index 56b598e..7c384d1 100644 (file)
@@ -1,6 +1,7 @@
 // SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
 /* Copyright (C) 2020 Facebook */
 #include <errno.h>
+#include <linux/err.h>
 #include <stdbool.h>
 #include <stdio.h>
 #include <stdlib.h>
@@ -101,7 +102,7 @@ int build_obj_refs_table(struct hashmap **map, enum bpf_obj_type type)
        libbpf_print_fn_t default_print;
 
        *map = hashmap__new(hash_fn_for_key_as_id, equal_fn_for_key_as_id, NULL);
-       if (!*map) {
+       if (IS_ERR(*map)) {
                p_err("failed to create hashmap for PID references");
                return -1;
        }
index 2a21d50..92a6f67 100644 (file)
@@ -424,8 +424,10 @@ out_free:
        free(value);
 }
 
-static void print_prog_header_json(struct bpf_prog_info *info)
+static void print_prog_header_json(struct bpf_prog_info *info, int fd)
 {
+       char prog_name[MAX_PROG_FULL_NAME];
+
        jsonw_uint_field(json_wtr, "id", info->id);
        if (info->type < ARRAY_SIZE(prog_type_name))
                jsonw_string_field(json_wtr, "type",
@@ -433,8 +435,10 @@ static void print_prog_header_json(struct bpf_prog_info *info)
        else
                jsonw_uint_field(json_wtr, "type", info->type);
 
-       if (*info->name)
-               jsonw_string_field(json_wtr, "name", info->name);
+       if (*info->name) {
+               get_prog_full_name(info, fd, prog_name, sizeof(prog_name));
+               jsonw_string_field(json_wtr, "name", prog_name);
+       }
 
        jsonw_name(json_wtr, "tag");
        jsonw_printf(json_wtr, "\"" BPF_TAG_FMT "\"",
@@ -455,7 +459,7 @@ static void print_prog_json(struct bpf_prog_info *info, int fd)
        char *memlock;
 
        jsonw_start_object(json_wtr);
-       print_prog_header_json(info);
+       print_prog_header_json(info, fd);
        print_dev_json(info->ifindex, info->netns_dev, info->netns_ino);
 
        if (info->load_time) {
@@ -507,16 +511,20 @@ static void print_prog_json(struct bpf_prog_info *info, int fd)
        jsonw_end_object(json_wtr);
 }
 
-static void print_prog_header_plain(struct bpf_prog_info *info)
+static void print_prog_header_plain(struct bpf_prog_info *info, int fd)
 {
+       char prog_name[MAX_PROG_FULL_NAME];
+
        printf("%u: ", info->id);
        if (info->type < ARRAY_SIZE(prog_type_name))
                printf("%s  ", prog_type_name[info->type]);
        else
                printf("type %u  ", info->type);
 
-       if (*info->name)
-               printf("name %s  ", info->name);
+       if (*info->name) {
+               get_prog_full_name(info, fd, prog_name, sizeof(prog_name));
+               printf("name %s  ", prog_name);
+       }
 
        printf("tag ");
        fprint_hex(stdout, info->tag, BPF_TAG_SIZE, "");
@@ -534,7 +542,7 @@ static void print_prog_plain(struct bpf_prog_info *info, int fd)
 {
        char *memlock;
 
-       print_prog_header_plain(info);
+       print_prog_header_plain(info, fd);
 
        if (info->load_time) {
                char buf[32];
@@ -641,7 +649,7 @@ static int do_show(int argc, char **argv)
        if (show_pinned) {
                prog_table = hashmap__new(hash_fn_for_key_as_id,
                                          equal_fn_for_key_as_id, NULL);
-               if (!prog_table) {
+               if (IS_ERR(prog_table)) {
                        p_err("failed to create hashmap for pinned paths");
                        return -1;
                }
@@ -972,10 +980,10 @@ static int do_dump(int argc, char **argv)
 
                if (json_output && nb_fds > 1) {
                        jsonw_start_object(json_wtr);   /* prog object */
-                       print_prog_header_json(&info);
+                       print_prog_header_json(&info, fds[i]);
                        jsonw_name(json_wtr, "insns");
                } else if (nb_fds > 1) {
-                       print_prog_header_plain(&info);
+                       print_prog_header_plain(&info, fds[i]);
                }
 
                err = prog_dump(&info, mode, filepath, opcodes, visual, linum);
@@ -1264,12 +1272,12 @@ static int do_run(int argc, char **argv)
 {
        char *data_fname_in = NULL, *data_fname_out = NULL;
        char *ctx_fname_in = NULL, *ctx_fname_out = NULL;
-       struct bpf_prog_test_run_attr test_attr = {0};
        const unsigned int default_size = SZ_32K;
        void *data_in = NULL, *data_out = NULL;
        void *ctx_in = NULL, *ctx_out = NULL;
        unsigned int repeat = 1;
        int fd, err;
+       LIBBPF_OPTS(bpf_test_run_opts, test_attr);
 
        if (!REQ_ARGS(4))
                return -1;
@@ -1387,14 +1395,13 @@ static int do_run(int argc, char **argv)
                        goto free_ctx_in;
        }
 
-       test_attr.prog_fd       = fd;
        test_attr.repeat        = repeat;
        test_attr.data_in       = data_in;
        test_attr.data_out      = data_out;
        test_attr.ctx_in        = ctx_in;
        test_attr.ctx_out       = ctx_out;
 
-       err = bpf_prog_test_run_xattr(&test_attr);
+       err = bpf_prog_test_run_opts(fd, &test_attr);
        if (err) {
                p_err("failed to run program: %s", strerror(errno));
                goto free_ctx_out;
@@ -2275,10 +2282,10 @@ static int do_profile(int argc, char **argv)
        profile_obj->rodata->num_metric = num_metric;
 
        /* adjust map sizes */
-       bpf_map__resize(profile_obj->maps.events, num_metric * num_cpu);
-       bpf_map__resize(profile_obj->maps.fentry_readings, num_metric);
-       bpf_map__resize(profile_obj->maps.accum_readings, num_metric);
-       bpf_map__resize(profile_obj->maps.counts, 1);
+       bpf_map__set_max_entries(profile_obj->maps.events, num_metric * num_cpu);
+       bpf_map__set_max_entries(profile_obj->maps.fentry_readings, num_metric);
+       bpf_map__set_max_entries(profile_obj->maps.accum_readings, num_metric);
+       bpf_map__set_max_entries(profile_obj->maps.counts, 1);
 
        /* change target name */
        profile_tgt_name = profile_target_name(profile_tgt_fd);
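
The do_run() conversion above shows the shape of the replacement API: the program fd moves out of the attribute struct into the call itself, and the options struct is declared through LIBBPF_OPTS() so its size field is initialized for compatibility checking. A trimmed sketch, assuming prog_fd, data_in and data_size are already prepared:

	LIBBPF_OPTS(bpf_test_run_opts, topts,
		.data_in = data_in,
		.data_size_in = data_size,
		.repeat = 1,
	);

	err = bpf_prog_test_run_opts(prog_fd, &topts);
	if (!err)
		printf("retval=%u duration=%uns\n", topts.retval, topts.duration);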
index 2f693b0..e08a6ff 100644 (file)
@@ -480,7 +480,6 @@ static int do_unregister(int argc, char **argv)
 static int do_register(int argc, char **argv)
 {
        LIBBPF_OPTS(bpf_object_open_opts, open_opts);
-       const struct bpf_map_def *def;
        struct bpf_map_info info = {};
        __u32 info_len = sizeof(info);
        int nr_errs = 0, nr_maps = 0;
@@ -510,8 +509,7 @@ static int do_register(int argc, char **argv)
        }
 
        bpf_object__for_each_map(map, obj) {
-               def = bpf_map__def(map);
-               if (def->type != BPF_MAP_TYPE_STRUCT_OPS)
+               if (bpf_map__type(map) != BPF_MAP_TYPE_STRUCT_OPS)
                        continue;
 
                link = bpf_map__attach_struct_ops(map);
index 320a88a..19a3112 100644 (file)
@@ -24,6 +24,8 @@ LD       = $(HOSTLD)
 ARCH     = $(HOSTARCH)
 RM      ?= rm
 CROSS_COMPILE =
+CFLAGS  := $(KBUILD_HOSTCFLAGS)
+LDFLAGS := $(KBUILD_HOSTLDFLAGS)
 
 OUTPUT ?= $(srctree)/tools/bpf/resolve_btfids/
 
@@ -51,10 +53,10 @@ $(SUBCMDOBJ): fixdep FORCE | $(OUTPUT)/libsubcmd
 
 $(BPFOBJ): $(wildcard $(LIBBPF_SRC)/*.[ch] $(LIBBPF_SRC)/Makefile) | $(LIBBPF_OUT)
        $(Q)$(MAKE) $(submake_extras) -C $(LIBBPF_SRC) OUTPUT=$(LIBBPF_OUT)    \
-                   DESTDIR=$(LIBBPF_DESTDIR) prefix=                          \
+                   DESTDIR=$(LIBBPF_DESTDIR) prefix= EXTRA_CFLAGS="$(CFLAGS)" \
                    $(abspath $@) install_headers
 
-CFLAGS := -g \
+CFLAGS += -g \
           -I$(srctree)/tools/include \
           -I$(srctree)/tools/include/uapi \
           -I$(LIBBPF_INCLUDE) \
index b0383d3..afe3d0d 100644 (file)
@@ -330,6 +330,8 @@ union bpf_iter_link_info {
  *                     *ctx_out*, *data_in* and *data_out* must be NULL.
  *                     *repeat* must be zero.
  *
+ *             BPF_PROG_RUN is an alias for BPF_PROG_TEST_RUN.
+ *
  *     Return
  *             Returns zero on success. On error, -1 is returned and *errno*
  *             is set appropriately.
@@ -1111,6 +1113,11 @@ enum bpf_link_type {
  */
 #define BPF_F_SLEEPABLE                (1U << 4)
 
+/* If BPF_F_XDP_HAS_FRAGS is used in BPF_PROG_LOAD command, the loaded program
+ * fully supports xdp frags.
+ */
+#define BPF_F_XDP_HAS_FRAGS    (1U << 5)
+
 /* When BPF ldimm64's insn[0].src_reg != 0 then this can have
  * the following extensions:
  *
@@ -1775,6 +1782,8 @@ union bpf_attr {
  *             0 on success, or a negative error in case of failure.
  *
  * u64 bpf_get_current_pid_tgid(void)
+ *     Description
+ *             Get the current pid and tgid.
  *     Return
  *             A 64-bit integer containing the current tgid and pid, and
  *             created as such:
@@ -1782,6 +1791,8 @@ union bpf_attr {
  *             *current_task*\ **->pid**.
  *
  * u64 bpf_get_current_uid_gid(void)
+ *     Description
+ *             Get the current uid and gid.
  *     Return
  *             A 64-bit integer containing the current GID and UID, and
  *             created as such: *current_gid* **<< 32 \|** *current_uid*.
@@ -2256,6 +2267,8 @@ union bpf_attr {
  *             The 32-bit hash.
  *
  * u64 bpf_get_current_task(void)
+ *     Description
+ *             Get the current task.
  *     Return
  *             A pointer to the current task struct.
  *
@@ -2369,6 +2382,8 @@ union bpf_attr {
  *             indicate that the hash is outdated and to trigger a
  *             recalculation the next time the kernel tries to access this
  *             hash or when the **bpf_get_hash_recalc**\ () helper is called.
+ *     Return
+ *             void.
  *
  * long bpf_get_numa_node_id(void)
  *     Description
@@ -2466,6 +2481,8 @@ union bpf_attr {
  *             A 8-byte long unique number or 0 if *sk* is NULL.
  *
  * u32 bpf_get_socket_uid(struct sk_buff *skb)
+ *     Description
 *             Get the owner UID of the socket associated to *skb*.
  *     Return
  *             The owner UID of the socket associated to *skb*. If the socket
  *             is **NULL**, or if it is not a full socket (i.e. if it is a
@@ -3240,6 +3257,9 @@ union bpf_attr {
  *             The id is returned or 0 in case the id could not be retrieved.
  *
  * u64 bpf_get_current_cgroup_id(void)
+ *     Description
+ *             Get the current cgroup id based on the cgroup within which
+ *             the current task is running.
  *     Return
  *             A 64-bit integer containing the current cgroup id based
  *             on the cgroup within which the current task is running.
@@ -5018,6 +5038,54 @@ union bpf_attr {
  *
  *     Return
  *             The number of arguments of the traced function.
+ *
+ * int bpf_get_retval(void)
+ *     Description
+ *             Get the syscall's return value that will be returned to userspace.
+ *
+ *             This helper is currently supported by cgroup programs only.
+ *     Return
+ *             The syscall's return value.
+ *
+ * int bpf_set_retval(int retval)
+ *     Description
+ *             Set the syscall's return value that will be returned to userspace.
+ *
+ *             This helper is currently supported by cgroup programs only.
+ *     Return
+ *             0 on success, or a negative error in case of failure.
+ *
+ * u64 bpf_xdp_get_buff_len(struct xdp_buff *xdp_md)
+ *     Description
+ *             Get the total size of a given xdp buff (linear and paged area).
+ *     Return
+ *             The total size of a given xdp buffer.
+ *
+ * long bpf_xdp_load_bytes(struct xdp_buff *xdp_md, u32 offset, void *buf, u32 len)
+ *     Description
+ *             This helper is provided as an easy way to load data from an
+ *             xdp buffer. It can be used to load *len* bytes from *offset*
+ *             of the frame associated to *xdp_md* into the buffer pointed
+ *             to by *buf*.
+ *     Return
+ *             0 on success, or a negative error in case of failure.
+ *
+ * long bpf_xdp_store_bytes(struct xdp_buff *xdp_md, u32 offset, void *buf, u32 len)
+ *     Description
+ *             Store *len* bytes from buffer *buf* into the frame
+ *             associated to *xdp_md*, at *offset*.
+ *     Return
+ *             0 on success, or a negative error in case of failure.
+ *
+ * long bpf_copy_from_user_task(void *dst, u32 size, const void *user_ptr, struct task_struct *tsk, u64 flags)
+ *     Description
+ *             Read *size* bytes from user space address *user_ptr* in *tsk*'s
+ *             address space, and stores the data in *dst*. *flags* is not
+ *             used yet and is provided for future extensibility. This helper
+ *             can only be used by sleepable programs.
+ *     Return
+ *             0 on success, or a negative error in case of failure. On error
+ *             *dst* buffer is zeroed out.
  */
 #define __BPF_FUNC_MAPPER(FN)          \
        FN(unspec),                     \
@@ -5206,6 +5274,12 @@ union bpf_attr {
        FN(get_func_arg),               \
        FN(get_func_ret),               \
        FN(get_func_arg_cnt),           \
+       FN(get_retval),                 \
+       FN(set_retval),                 \
+       FN(xdp_get_buff_len),           \
+       FN(xdp_load_bytes),             \
+       FN(xdp_store_bytes),            \
+       FN(copy_from_user_task),        \
        /* */
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
@@ -5500,7 +5574,8 @@ struct bpf_sock {
        __u32 src_ip4;
        __u32 src_ip6[4];
        __u32 src_port;         /* host byte order */
-       __u32 dst_port;         /* network byte order */
+       __be16 dst_port;        /* network byte order */
+       __u16 :16;              /* zero padding */
        __u32 dst_ip4;
        __u32 dst_ip6[4];
        __u32 state;
@@ -6378,7 +6453,8 @@ struct bpf_sk_lookup {
        __u32 protocol;         /* IP protocol (IPPROTO_TCP, IPPROTO_UDP) */
        __u32 remote_ip4;       /* Network byte order */
        __u32 remote_ip6[4];    /* Network byte order */
-       __u32 remote_port;      /* Network byte order */
+       __be16 remote_port;     /* Network byte order */
+       __u16 :16;              /* Zero padding */
        __u32 local_ip4;        /* Network byte order */
        __u32 local_ip6[4];     /* Network byte order */
        __u32 local_port;       /* Host byte order */
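
Narrowing dst_port and remote_port from __u32 to __be16 (with explicit zero padding) encodes the byte-order contract in the type itself. On the BPF side the value is still network byte order; a minimal sk_lookup sketch of reading it, with the program name hypothetical:

	#include <linux/bpf.h>
	#include <bpf/bpf_helpers.h>
	#include <bpf/bpf_endian.h>

	SEC("sk_lookup")
	int log_remote_port(struct bpf_sk_lookup *ctx)
	{
		/* remote_port is now a __be16 followed by explicit zero padding */
		__u16 rport = bpf_ntohs(ctx->remote_port);

		bpf_printk("lookup from remote port %u", rport);
		return SK_PASS;		/* fall through to the normal lookup */
	}

	char LICENSE[] SEC("license") = "GPL";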
index f947b61..b8b37fe 100644 (file)
@@ -131,7 +131,7 @@ GLOBAL_SYM_COUNT = $(shell readelf -s --wide $(BPF_IN_SHARED) | \
                           sort -u | wc -l)
 VERSIONED_SYM_COUNT = $(shell readelf --dyn-syms --wide $(OUTPUT)libbpf.so | \
                              sed 's/\[.*\]//' | \
-                             awk '/GLOBAL/ && /DEFAULT/ && !/UND/ {print $$NF}' | \
+                             awk '/GLOBAL/ && /DEFAULT/ && !/UND|ABS/ {print $$NF}' | \
                              grep -Eo '[^ ]+@LIBBPF_' | cut -d@ -f1 | sort -u | wc -l)
 
 CMD_TARGETS = $(LIB_TARGET) $(PC_FILE)
@@ -194,7 +194,7 @@ check_abi: $(OUTPUT)libbpf.so $(VERSION_SCRIPT)
                    sort -u > $(OUTPUT)libbpf_global_syms.tmp;           \
                readelf --dyn-syms --wide $(OUTPUT)libbpf.so |           \
                    sed 's/\[.*\]//' |                                   \
-                   awk '/GLOBAL/ && /DEFAULT/ && !/UND/ {print $$NF}'|  \
+                   awk '/GLOBAL/ && /DEFAULT/ && !/UND|ABS/ {print $$NF}'|  \
                    grep -Eo '[^ ]+@LIBBPF_' | cut -d@ -f1 |             \
                    sort -u > $(OUTPUT)libbpf_versioned_syms.tmp;        \
                diff -u $(OUTPUT)libbpf_global_syms.tmp                  \
index 550b4cb..418b259 100644 (file)
@@ -754,10 +754,10 @@ int bpf_prog_attach(int prog_fd, int target_fd, enum bpf_attach_type type,
                .flags = flags,
        );
 
-       return bpf_prog_attach_xattr(prog_fd, target_fd, type, &opts);
+       return bpf_prog_attach_opts(prog_fd, target_fd, type, &opts);
 }
 
-int bpf_prog_attach_xattr(int prog_fd, int target_fd,
+int bpf_prog_attach_opts(int prog_fd, int target_fd,
                          enum bpf_attach_type type,
                          const struct bpf_prog_attach_opts *opts)
 {
@@ -778,6 +778,11 @@ int bpf_prog_attach_xattr(int prog_fd, int target_fd,
        return libbpf_err_errno(ret);
 }
 
+__attribute__((alias("bpf_prog_attach_opts")))
+int bpf_prog_attach_xattr(int prog_fd, int target_fd,
+                         enum bpf_attach_type type,
+                         const struct bpf_prog_attach_opts *opts);
+
 int bpf_prog_detach(int target_fd, enum bpf_attach_type type)
 {
        union bpf_attr attr;
index 14e0d97..16b2175 100644 (file)
@@ -391,6 +391,10 @@ struct bpf_prog_attach_opts {
 
 LIBBPF_API int bpf_prog_attach(int prog_fd, int attachable_fd,
                               enum bpf_attach_type type, unsigned int flags);
+LIBBPF_API int bpf_prog_attach_opts(int prog_fd, int attachable_fd,
+                                    enum bpf_attach_type type,
+                                    const struct bpf_prog_attach_opts *opts);
+LIBBPF_DEPRECATED_SINCE(0, 8, "use bpf_prog_attach_opts() instead")
 LIBBPF_API int bpf_prog_attach_xattr(int prog_fd, int attachable_fd,
                                     enum bpf_attach_type type,
                                     const struct bpf_prog_attach_opts *opts);
@@ -449,12 +453,14 @@ struct bpf_prog_test_run_attr {
                             * out: length of cxt_out */
 };
 
+LIBBPF_DEPRECATED_SINCE(0, 7, "use bpf_prog_test_run_opts() instead")
 LIBBPF_API int bpf_prog_test_run_xattr(struct bpf_prog_test_run_attr *test_attr);
 
 /*
  * bpf_prog_test_run does not check that data_out is large enough. Consider
- * using bpf_prog_test_run_xattr instead.
+ * using bpf_prog_test_run_opts instead.
  */
+LIBBPF_DEPRECATED_SINCE(0, 7, "use bpf_prog_test_run_opts() instead")
 LIBBPF_API int bpf_prog_test_run(int prog_fd, int repeat, void *data,
                                 __u32 size, void *data_out, __u32 *size_out,
                                 __u32 *retval, __u32 *duration);
index 963b106..44df982 100644 (file)
@@ -133,7 +133,7 @@ struct bpf_map_def {
        unsigned int value_size;
        unsigned int max_entries;
        unsigned int map_flags;
-};
+} __attribute__((deprecated("use BTF-defined maps in .maps section")));
 
 enum libbpf_pin_type {
        LIBBPF_PIN_NONE,
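
With struct bpf_map_def deprecated, map declarations move to the BTF-defined form in the .maps section, where every attribute is expressed through a type rather than a struct field. The two styles side by side, as a sketch:

	/* legacy definition, now flagged by the deprecation attribute */
	struct bpf_map_def SEC("maps") counters_legacy = {
		.type		= BPF_MAP_TYPE_ARRAY,
		.key_size	= sizeof(__u32),
		.value_size	= sizeof(__u64),
		.max_entries	= 64,
	};

	/* BTF-defined replacement */
	struct {
		__uint(type, BPF_MAP_TYPE_ARRAY);
		__uint(max_entries, 64);
		__type(key, __u32);
		__type(value, __u64);
	} counters SEC(".maps");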
index 90f56b0..e3a8c94 100644 (file)
@@ -76,6 +76,9 @@
 #define __PT_RC_REG ax
 #define __PT_SP_REG sp
 #define __PT_IP_REG ip
+/* syscall uses r10 for PARM4 */
+#define PT_REGS_PARM4_SYSCALL(x) ((x)->r10)
+#define PT_REGS_PARM4_CORE_SYSCALL(x) BPF_CORE_READ(x, r10)
 
 #else
 
 #define __PT_RC_REG rax
 #define __PT_SP_REG rsp
 #define __PT_IP_REG rip
+/* syscall uses r10 for PARM4 */
+#define PT_REGS_PARM4_SYSCALL(x) ((x)->r10)
+#define PT_REGS_PARM4_CORE_SYSCALL(x) BPF_CORE_READ(x, r10)
 
 #endif /* __i386__ */
 
 
 #elif defined(bpf_target_s390)
 
+struct pt_regs___s390 {
+       unsigned long orig_gpr2;
+};
+
 /* s390 provides user_pt_regs instead of struct pt_regs to userspace */
 #define __PT_REGS_CAST(x) ((const user_pt_regs *)(x))
 #define __PT_PARM1_REG gprs[2]
 #define __PT_RC_REG gprs[2]
 #define __PT_SP_REG gprs[15]
 #define __PT_IP_REG psw.addr
+#define PT_REGS_PARM1_SYSCALL(x) ({ _Pragma("GCC error \"use PT_REGS_PARM1_CORE_SYSCALL() instead\""); 0l; })
+#define PT_REGS_PARM1_CORE_SYSCALL(x) BPF_CORE_READ((const struct pt_regs___s390 *)(x), orig_gpr2)
 
 #elif defined(bpf_target_arm)
 
 
 #elif defined(bpf_target_arm64)
 
+struct pt_regs___arm64 {
+       unsigned long orig_x0;
+};
+
 /* arm64 provides struct user_pt_regs instead of struct pt_regs to userspace */
 #define __PT_REGS_CAST(x) ((const struct user_pt_regs *)(x))
 #define __PT_PARM1_REG regs[0]
 #define __PT_RC_REG regs[0]
 #define __PT_SP_REG sp
 #define __PT_IP_REG pc
+#define PT_REGS_PARM1_SYSCALL(x) ({ _Pragma("GCC error \"use PT_REGS_PARM1_CORE_SYSCALL() instead\""); 0l; })
+#define PT_REGS_PARM1_CORE_SYSCALL(x) BPF_CORE_READ((const struct pt_regs___arm64 *)(x), orig_x0)
 
 #elif defined(bpf_target_mips)
 
 #define __PT_RC_REG gpr[3]
 #define __PT_SP_REG sp
 #define __PT_IP_REG nip
+/* powerpc does not select ARCH_HAS_SYSCALL_WRAPPER. */
+#define PT_REGS_SYSCALL_REGS(ctx) ctx
 
 #elif defined(bpf_target_sparc)
 
 #define __PT_PARM4_REG a3
 #define __PT_PARM5_REG a4
 #define __PT_RET_REG ra
-#define __PT_FP_REG fp
+#define __PT_FP_REG s0
 #define __PT_RC_REG a5
 #define __PT_SP_REG sp
-#define __PT_IP_REG epc
+#define __PT_IP_REG pc
+/* riscv does not select ARCH_HAS_SYSCALL_WRAPPER. */
+#define PT_REGS_SYSCALL_REGS(ctx) ctx
 
 #endif
 
@@ -263,6 +285,26 @@ struct pt_regs;
 
 #endif
 
+#ifndef PT_REGS_PARM1_SYSCALL
+#define PT_REGS_PARM1_SYSCALL(x) PT_REGS_PARM1(x)
+#endif
+#define PT_REGS_PARM2_SYSCALL(x) PT_REGS_PARM2(x)
+#define PT_REGS_PARM3_SYSCALL(x) PT_REGS_PARM3(x)
+#ifndef PT_REGS_PARM4_SYSCALL
+#define PT_REGS_PARM4_SYSCALL(x) PT_REGS_PARM4(x)
+#endif
+#define PT_REGS_PARM5_SYSCALL(x) PT_REGS_PARM5(x)
+
+#ifndef PT_REGS_PARM1_CORE_SYSCALL
+#define PT_REGS_PARM1_CORE_SYSCALL(x) PT_REGS_PARM1_CORE(x)
+#endif
+#define PT_REGS_PARM2_CORE_SYSCALL(x) PT_REGS_PARM2_CORE(x)
+#define PT_REGS_PARM3_CORE_SYSCALL(x) PT_REGS_PARM3_CORE(x)
+#ifndef PT_REGS_PARM4_CORE_SYSCALL
+#define PT_REGS_PARM4_CORE_SYSCALL(x) PT_REGS_PARM4_CORE(x)
+#endif
+#define PT_REGS_PARM5_CORE_SYSCALL(x) PT_REGS_PARM5_CORE(x)
+
 #else /* defined(bpf_target_defined) */
 
 #define PT_REGS_PARM1(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; })
@@ -290,8 +332,30 @@ struct pt_regs;
 #define BPF_KPROBE_READ_RET_IP(ip, ctx) ({ _Pragma(__BPF_TARGET_MISSING); 0l; })
 #define BPF_KRETPROBE_READ_RET_IP(ip, ctx) ({ _Pragma(__BPF_TARGET_MISSING); 0l; })
 
+#define PT_REGS_PARM1_SYSCALL(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; })
+#define PT_REGS_PARM2_SYSCALL(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; })
+#define PT_REGS_PARM3_SYSCALL(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; })
+#define PT_REGS_PARM4_SYSCALL(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; })
+#define PT_REGS_PARM5_SYSCALL(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; })
+
+#define PT_REGS_PARM1_CORE_SYSCALL(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; })
+#define PT_REGS_PARM2_CORE_SYSCALL(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; })
+#define PT_REGS_PARM3_CORE_SYSCALL(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; })
+#define PT_REGS_PARM4_CORE_SYSCALL(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; })
+#define PT_REGS_PARM5_CORE_SYSCALL(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; })
+
 #endif /* defined(bpf_target_defined) */
 
+/*
+ * When invoked from a syscall handler kprobe, returns a pointer to a
+ * struct pt_regs containing syscall arguments and suitable for passing to
+ * PT_REGS_PARMn_SYSCALL() and PT_REGS_PARMn_CORE_SYSCALL().
+ */
+#ifndef PT_REGS_SYSCALL_REGS
+/* By default, assume that the arch selects ARCH_HAS_SYSCALL_WRAPPER. */
+#define PT_REGS_SYSCALL_REGS(ctx) ((struct pt_regs *)PT_REGS_PARM1(ctx))
+#endif
+
 #ifndef ___bpf_concat
 #define ___bpf_concat(a, b) a ## b
 #endif
@@ -406,4 +470,39 @@ typeof(name(0)) name(struct pt_regs *ctx)                              \
 }                                                                          \
 static __always_inline typeof(name(0)) ____##name(struct pt_regs *ctx, ##args)
 
+#define ___bpf_syscall_args0()           ctx
+#define ___bpf_syscall_args1(x)          ___bpf_syscall_args0(), (void *)PT_REGS_PARM1_CORE_SYSCALL(regs)
+#define ___bpf_syscall_args2(x, args...) ___bpf_syscall_args1(args), (void *)PT_REGS_PARM2_CORE_SYSCALL(regs)
+#define ___bpf_syscall_args3(x, args...) ___bpf_syscall_args2(args), (void *)PT_REGS_PARM3_CORE_SYSCALL(regs)
+#define ___bpf_syscall_args4(x, args...) ___bpf_syscall_args3(args), (void *)PT_REGS_PARM4_CORE_SYSCALL(regs)
+#define ___bpf_syscall_args5(x, args...) ___bpf_syscall_args4(args), (void *)PT_REGS_PARM5_CORE_SYSCALL(regs)
+#define ___bpf_syscall_args(args...)     ___bpf_apply(___bpf_syscall_args, ___bpf_narg(args))(args)
+
+/*
+ * BPF_KPROBE_SYSCALL is a variant of BPF_KPROBE intended for tracing
+ * syscall wrapper functions, like __x64_sys_close. It hides the
+ * platform-specific, low-level details of fetching syscall input
+ * arguments from struct pt_regs, and provides the familiar typed and
+ * named function argument syntax for accessing them.
+ *
+ * Original struct pt_regs* context is preserved as 'ctx' argument. This might
+ * be necessary when using BPF helpers like bpf_perf_event_output().
+ *
+ * This macro relies on BPF CO-RE support.
+ */
+#define BPF_KPROBE_SYSCALL(name, args...)                                  \
+name(struct pt_regs *ctx);                                                 \
+static __always_inline typeof(name(0))                                     \
+____##name(struct pt_regs *ctx, ##args);                                   \
+typeof(name(0)) name(struct pt_regs *ctx)                                  \
+{                                                                          \
+       struct pt_regs *regs = PT_REGS_SYSCALL_REGS(ctx);                   \
+       _Pragma("GCC diagnostic push")                                      \
+       _Pragma("GCC diagnostic ignored \"-Wint-conversion\"")              \
+       return ____##name(___bpf_syscall_args(args));                       \
+       _Pragma("GCC diagnostic pop")                                       \
+}                                                                          \
+static __always_inline typeof(name(0))                                     \
+____##name(struct pt_regs *ctx, ##args)
+
 #endif
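
For context, the new macro is used like BPF_KPROBE. A minimal sketch, assuming
the usual vmlinux.h/bpf_helpers.h/bpf_tracing.h includes (the x86-specific
symbol and the program name are illustrative only):

	SEC("kprobe/__x64_sys_close")
	int BPF_KPROBE_SYSCALL(probe_close, int fd)
	{
		/* 'fd' is fetched via PT_REGS_PARM1_CORE_SYSCALL(); the raw
		 * pt_regs context remains available as 'ctx'
		 */
		bpf_printk("close(%d)", fd);
		return 0;
	}
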
index 9aa19c8..1383e26 100644 (file)
@@ -1620,20 +1620,37 @@ static int btf_commit_type(struct btf *btf, int data_sz)
 struct btf_pipe {
        const struct btf *src;
        struct btf *dst;
+       struct hashmap *str_off_map; /* map string offsets from src to dst */
 };
 
 static int btf_rewrite_str(__u32 *str_off, void *ctx)
 {
        struct btf_pipe *p = ctx;
-       int off;
+       void *mapped_off;
+       int off, err;
 
        if (!*str_off) /* nothing to do for empty strings */
                return 0;
 
+       if (p->str_off_map &&
+           hashmap__find(p->str_off_map, (void *)(long)*str_off, &mapped_off)) {
+               *str_off = (__u32)(long)mapped_off;
+               return 0;
+       }
+
        off = btf__add_str(p->dst, btf__str_by_offset(p->src, *str_off));
        if (off < 0)
                return off;
 
+       /* Remember string mapping from src to dst.  It avoids
+        * performing expensive string comparisons.
+        */
+       if (p->str_off_map) {
+               err = hashmap__append(p->str_off_map, (void *)(long)*str_off, (void *)(long)off);
+               if (err)
+                       return err;
+       }
+
        *str_off = off;
        return 0;
 }
@@ -1680,6 +1697,9 @@ static int btf_rewrite_type_ids(__u32 *type_id, void *ctx)
        return 0;
 }
 
+static size_t btf_dedup_identity_hash_fn(const void *key, void *ctx);
+static bool btf_dedup_equal_fn(const void *k1, const void *k2, void *ctx);
+
 int btf__add_btf(struct btf *btf, const struct btf *src_btf)
 {
        struct btf_pipe p = { .src = src_btf, .dst = btf };
@@ -1713,6 +1733,11 @@ int btf__add_btf(struct btf *btf, const struct btf *src_btf)
        if (!off)
                return libbpf_err(-ENOMEM);
 
+       /* Map the string offsets from src_btf to the offsets from btf to improve performance */
+       p.str_off_map = hashmap__new(btf_dedup_identity_hash_fn, btf_dedup_equal_fn, NULL);
+       if (IS_ERR(p.str_off_map))
+               return libbpf_err(-ENOMEM);
+
        /* bulk copy types data for all types from src_btf */
        memcpy(t, src_btf->types_data, data_sz);
 
@@ -1754,6 +1779,8 @@ int btf__add_btf(struct btf *btf, const struct btf *src_btf)
        btf->hdr->str_off += data_sz;
        btf->nr_types += cnt;
 
+       hashmap__free(p.str_off_map);
+
        /* return type ID of the first added BTF type */
        return btf->start_id + btf->nr_types - cnt;
 err_out:
@@ -1767,6 +1794,8 @@ err_out:
         * wasn't modified, so doesn't need restoring, see big comment above */
        btf->hdr->str_len = old_strs_len;
 
+       hashmap__free(p.str_off_map);
+
        return libbpf_err(err);
 }
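
The cache keys above are plain integer string offsets smuggled into pointers,
which is why the dedup identity callbacks can be reused as-is. Conceptually
they reduce to the following (a sketch of the idea; the actual bodies live
elsewhere in btf.c):

	static size_t identity_hash_fn(const void *key, void *ctx)
	{
		return (size_t)key;	/* the offset itself is the hash */
	}

	static bool identity_equal_fn(const void *k1, const void *k2, void *ctx)
	{
		return k1 == k2;	/* offsets match iff keys match */
	}
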
 
index 061839f..951ac74 100644 (file)
@@ -147,11 +147,10 @@ LIBBPF_API int btf__resolve_type(const struct btf *btf, __u32 type_id);
 LIBBPF_API int btf__align_of(const struct btf *btf, __u32 id);
 LIBBPF_API int btf__fd(const struct btf *btf);
 LIBBPF_API void btf__set_fd(struct btf *btf, int fd);
-LIBBPF_DEPRECATED_SINCE(0, 7, "use btf__raw_data() instead")
-LIBBPF_API const void *btf__get_raw_data(const struct btf *btf, __u32 *size);
 LIBBPF_API const void *btf__raw_data(const struct btf *btf, __u32 *size);
 LIBBPF_API const char *btf__name_by_offset(const struct btf *btf, __u32 offset);
 LIBBPF_API const char *btf__str_by_offset(const struct btf *btf, __u32 offset);
+LIBBPF_DEPRECATED_SINCE(0, 7, "this API is not necessary when BTF-defined maps are used")
 LIBBPF_API int btf__get_map_kv_tids(const struct btf *btf, const char *map_name,
                                    __u32 expected_key_size,
                                    __u32 expected_value_size,
@@ -159,8 +158,7 @@ LIBBPF_API int btf__get_map_kv_tids(const struct btf *btf, const char *map_name,
 
 LIBBPF_API struct btf_ext *btf_ext__new(const __u8 *data, __u32 size);
 LIBBPF_API void btf_ext__free(struct btf_ext *btf_ext);
-LIBBPF_API const void *btf_ext__get_raw_data(const struct btf_ext *btf_ext,
-                                            __u32 *size);
+LIBBPF_API const void *btf_ext__raw_data(const struct btf_ext *btf_ext, __u32 *size);
 LIBBPF_API LIBBPF_DEPRECATED("btf_ext__reloc_func_info was never meant as a public API and has wrong assumptions embedded in it; it will be removed in the future libbpf versions")
 int btf_ext__reloc_func_info(const struct btf *btf,
                             const struct btf_ext *btf_ext,
@@ -171,8 +169,10 @@ int btf_ext__reloc_line_info(const struct btf *btf,
                             const struct btf_ext *btf_ext,
                             const char *sec_name, __u32 insns_cnt,
                             void **line_info, __u32 *cnt);
-LIBBPF_API __u32 btf_ext__func_info_rec_size(const struct btf_ext *btf_ext);
-LIBBPF_API __u32 btf_ext__line_info_rec_size(const struct btf_ext *btf_ext);
+LIBBPF_API LIBBPF_DEPRECATED("btf_ext__reloc_func_info is deprecated; write custom func_info parsing to fetch rec_size")
+__u32 btf_ext__func_info_rec_size(const struct btf_ext *btf_ext);
+LIBBPF_API LIBBPF_DEPRECATED("btf_ext__reloc_line_info is deprecated; write custom line_info parsing to fetch rec_size")
+__u32 btf_ext__line_info_rec_size(const struct btf_ext *btf_ext);
 
 LIBBPF_API int btf__find_str(struct btf *btf, const char *s);
 LIBBPF_API int btf__add_str(struct btf *btf, const char *s);
@@ -375,8 +375,28 @@ btf_dump__dump_type_data(struct btf_dump *d, __u32 id,
                         const struct btf_dump_type_data_opts *opts);
 
 /*
- * A set of helpers for easier BTF types handling
+ * A set of helpers for easier BTF types handling.
+ *
+ * The inline functions below rely on constants from the kernel headers which
+ * may not be available for applications including this header file. To avoid
+ * compilation errors, we define all the constants here that were added after
+ * the initial introduction of the BTF_KIND* constants.
  */
+#ifndef BTF_KIND_FUNC
+#define BTF_KIND_FUNC          12      /* Function     */
+#define BTF_KIND_FUNC_PROTO    13      /* Function Proto       */
+#endif
+#ifndef BTF_KIND_VAR
+#define BTF_KIND_VAR           14      /* Variable     */
+#define BTF_KIND_DATASEC       15      /* Section      */
+#endif
+#ifndef BTF_KIND_FLOAT
+#define BTF_KIND_FLOAT         16      /* Floating point       */
+#endif
+/* The kernel header switched to enums, so these two were never #defined */
+#define BTF_KIND_DECL_TAG      17      /* Decl Tag */
+#define BTF_KIND_TYPE_TAG      18      /* Type Tag */
+
 static inline __u16 btf_kind(const struct btf_type *t)
 {
        return BTF_INFO_KIND(t->info);
index b9a3260..07ebe70 100644 (file)
@@ -1861,14 +1861,16 @@ static int btf_dump_array_data(struct btf_dump *d,
 {
        const struct btf_array *array = btf_array(t);
        const struct btf_type *elem_type;
-       __u32 i, elem_size = 0, elem_type_id;
+       __u32 i, elem_type_id;
+       __s64 elem_size;
        bool is_array_member;
 
        elem_type_id = array->type;
        elem_type = skip_mods_and_typedefs(d->btf, elem_type_id, NULL);
        elem_size = btf__resolve_size(d->btf, elem_type_id);
        if (elem_size <= 0) {
-               pr_warn("unexpected elem size %d for array type [%u]\n", elem_size, id);
+               pr_warn("unexpected elem size %zd for array type [%u]\n",
+                       (ssize_t)elem_size, id);
                return -EINVAL;
        }
 
index 3c20b12..aeb09c2 100644 (file)
@@ -75,7 +75,7 @@ void hashmap__clear(struct hashmap *map)
 
 void hashmap__free(struct hashmap *map)
 {
-       if (!map)
+       if (IS_ERR_OR_NULL(map))
                return;
 
        hashmap__clear(map);
@@ -238,4 +238,3 @@ bool hashmap__delete(struct hashmap *map, const void *key,
 
        return true;
 }
-
index 7f10dd5..2262bcd 100644 (file)
@@ -156,14 +156,6 @@ enum libbpf_strict_mode libbpf_mode = LIBBPF_STRICT_NONE;
 
 int libbpf_set_strict_mode(enum libbpf_strict_mode mode)
 {
-       /* __LIBBPF_STRICT_LAST is the last power-of-2 value used + 1, so to
-        * get all possible values we compensate last +1, and then (2*x - 1)
-        * to get the bit mask
-        */
-       if (mode != LIBBPF_STRICT_ALL
-           && (mode & ~((__LIBBPF_STRICT_LAST - 1) * 2 - 1)))
-               return errno = EINVAL, -EINVAL;
-
        libbpf_mode = mode;
        return 0;
 }
@@ -235,6 +227,10 @@ enum sec_def_flags {
        SEC_SLEEPABLE = 8,
        /* allow non-strict prefix matching */
        SEC_SLOPPY_PFX = 16,
+       /* BPF program supports non-linear XDP buffers */
+       SEC_XDP_FRAGS = 32,
+       /* deprecated SEC definitions, not supposed to be used */
+       SEC_DEPRECATED = 64,
 };
 
 struct bpf_sec_def {
@@ -1937,6 +1933,11 @@ static int bpf_object__init_user_maps(struct bpf_object *obj, bool strict)
        if (obj->efile.maps_shndx < 0)
                return 0;
 
+       if (libbpf_mode & LIBBPF_STRICT_MAP_DEFINITIONS) {
+               pr_warn("legacy map definitions in SEC(\"maps\") are not supported\n");
+               return -EOPNOTSUPP;
+       }
+
        if (!symbols)
                return -EINVAL;
 
@@ -1999,6 +2000,8 @@ static int bpf_object__init_user_maps(struct bpf_object *obj, bool strict)
                        return -LIBBPF_ERRNO__FORMAT;
                }
 
+               pr_warn("map '%s' (legacy): legacy map definitions are deprecated, use BTF-defined maps instead\n", map_name);
+
                if (ELF64_ST_BIND(sym->st_info) == STB_LOCAL) {
                        pr_warn("map '%s' (legacy): static maps are not supported\n", map_name);
                        return -ENOTSUP;
@@ -4190,9 +4193,13 @@ static int bpf_map_find_btf_info(struct bpf_object *obj, struct bpf_map *map)
                return 0;
 
        if (!bpf_map__is_internal(map)) {
+               pr_warn("Use of BPF_ANNOTATE_KV_PAIR is deprecated, use BTF-defined maps in .maps section instead\n");
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
                ret = btf__get_map_kv_tids(obj->btf, map->name, def->key_size,
                                           def->value_size, &key_type_id,
                                           &value_type_id);
+#pragma GCC diagnostic pop
        } else {
                /*
                 * LLVM annotates global data differently in BTF, that is,
@@ -6562,6 +6569,13 @@ static int libbpf_preload_prog(struct bpf_program *prog,
        if (def & SEC_SLEEPABLE)
                opts->prog_flags |= BPF_F_SLEEPABLE;
 
+       if (prog->type == BPF_PROG_TYPE_XDP && (def & SEC_XDP_FRAGS))
+               opts->prog_flags |= BPF_F_XDP_HAS_FRAGS;
+
+       if (def & SEC_DEPRECATED)
+               pr_warn("SEC(\"%s\") is deprecated, please see https://github.com/libbpf/libbpf/wiki/Libbpf-1.0-migration-guide#bpf-program-sec-annotation-deprecations for details\n",
+                       prog->sec_name);
+
        if ((prog->type == BPF_PROG_TYPE_TRACING ||
             prog->type == BPF_PROG_TYPE_LSM ||
             prog->type == BPF_PROG_TYPE_EXT) && !prog->attach_btf_id) {
@@ -7883,10 +7897,8 @@ int bpf_map__set_pin_path(struct bpf_map *map, const char *path)
        return 0;
 }
 
-const char *bpf_map__get_pin_path(const struct bpf_map *map)
-{
-       return map->pin_path;
-}
+__alias(bpf_map__pin_path)
+const char *bpf_map__get_pin_path(const struct bpf_map *map);
 
 const char *bpf_map__pin_path(const struct bpf_map *map)
 {
@@ -8451,7 +8463,10 @@ static int bpf_program_nth_fd(const struct bpf_program *prog, int n)
        return fd;
 }
 
-enum bpf_prog_type bpf_program__get_type(const struct bpf_program *prog)
+__alias(bpf_program__type)
+enum bpf_prog_type bpf_program__get_type(const struct bpf_program *prog);
+
+enum bpf_prog_type bpf_program__type(const struct bpf_program *prog)
 {
        return prog->type;
 }
@@ -8495,8 +8510,10 @@ BPF_PROG_TYPE_FNS(struct_ops, BPF_PROG_TYPE_STRUCT_OPS);
 BPF_PROG_TYPE_FNS(extension, BPF_PROG_TYPE_EXT);
 BPF_PROG_TYPE_FNS(sk_lookup, BPF_PROG_TYPE_SK_LOOKUP);
 
-enum bpf_attach_type
-bpf_program__get_expected_attach_type(const struct bpf_program *prog)
+__alias(bpf_program__expected_attach_type)
+enum bpf_attach_type bpf_program__get_expected_attach_type(const struct bpf_program *prog);
+
+enum bpf_attach_type bpf_program__expected_attach_type(const struct bpf_program *prog)
 {
        return prog->expected_attach_type;
 }
@@ -8580,7 +8597,7 @@ static const struct bpf_sec_def section_defs[] = {
        SEC_DEF("kretprobe/",           KPROBE, 0, SEC_NONE, attach_kprobe),
        SEC_DEF("uretprobe/",           KPROBE, 0, SEC_NONE),
        SEC_DEF("tc",                   SCHED_CLS, 0, SEC_NONE),
-       SEC_DEF("classifier",           SCHED_CLS, 0, SEC_NONE | SEC_SLOPPY_PFX),
+       SEC_DEF("classifier",           SCHED_CLS, 0, SEC_NONE | SEC_SLOPPY_PFX | SEC_DEPRECATED),
        SEC_DEF("action",               SCHED_ACT, 0, SEC_NONE | SEC_SLOPPY_PFX),
        SEC_DEF("tracepoint/",          TRACEPOINT, 0, SEC_NONE, attach_tp),
        SEC_DEF("tp/",                  TRACEPOINT, 0, SEC_NONE, attach_tp),
@@ -8599,9 +8616,15 @@ static const struct bpf_sec_def section_defs[] = {
        SEC_DEF("lsm/",                 LSM, BPF_LSM_MAC, SEC_ATTACH_BTF, attach_lsm),
        SEC_DEF("lsm.s/",               LSM, BPF_LSM_MAC, SEC_ATTACH_BTF | SEC_SLEEPABLE, attach_lsm),
        SEC_DEF("iter/",                TRACING, BPF_TRACE_ITER, SEC_ATTACH_BTF, attach_iter),
+       SEC_DEF("iter.s/",              TRACING, BPF_TRACE_ITER, SEC_ATTACH_BTF | SEC_SLEEPABLE, attach_iter),
        SEC_DEF("syscall",              SYSCALL, 0, SEC_SLEEPABLE),
-       SEC_DEF("xdp_devmap/",          XDP, BPF_XDP_DEVMAP, SEC_ATTACHABLE),
-       SEC_DEF("xdp_cpumap/",          XDP, BPF_XDP_CPUMAP, SEC_ATTACHABLE),
+       SEC_DEF("xdp.frags/devmap",     XDP, BPF_XDP_DEVMAP, SEC_XDP_FRAGS),
+       SEC_DEF("xdp/devmap",           XDP, BPF_XDP_DEVMAP, SEC_ATTACHABLE),
+       SEC_DEF("xdp_devmap/",          XDP, BPF_XDP_DEVMAP, SEC_ATTACHABLE | SEC_DEPRECATED),
+       SEC_DEF("xdp.frags/cpumap",     XDP, BPF_XDP_CPUMAP, SEC_XDP_FRAGS),
+       SEC_DEF("xdp/cpumap",           XDP, BPF_XDP_CPUMAP, SEC_ATTACHABLE),
+       SEC_DEF("xdp_cpumap/",          XDP, BPF_XDP_CPUMAP, SEC_ATTACHABLE | SEC_DEPRECATED),
+       SEC_DEF("xdp.frags",            XDP, BPF_XDP, SEC_XDP_FRAGS),
        SEC_DEF("xdp",                  XDP, BPF_XDP, SEC_ATTACHABLE_OPT | SEC_SLOPPY_PFX),
        SEC_DEF("perf_event",           PERF_EVENT, 0, SEC_NONE | SEC_SLOPPY_PFX),
        SEC_DEF("lwt_in",               LWT_IN, 0, SEC_NONE | SEC_SLOPPY_PFX),
@@ -9443,7 +9466,7 @@ static int bpf_prog_load_xattr2(const struct bpf_prog_load_attr *attr,
        open_attr.file = attr->file;
        open_attr.prog_type = attr->prog_type;
 
-       obj = bpf_object__open_xattr(&open_attr);
+       obj = __bpf_object__open_xattr(&open_attr, 0);
        err = libbpf_get_error(obj);
        if (err)
                return libbpf_err(-ENOENT);
@@ -9460,7 +9483,7 @@ static int bpf_prog_load_xattr2(const struct bpf_prog_load_attr *attr,
                        bpf_program__set_expected_attach_type(prog,
                                                              attach_type);
                }
-               if (bpf_program__get_type(prog) == BPF_PROG_TYPE_UNSPEC) {
+               if (bpf_program__type(prog) == BPF_PROG_TYPE_UNSPEC) {
                        /*
                         * we haven't guessed from section name and user
                         * didn't provide a fallback type, too bad...
@@ -9477,7 +9500,7 @@ static int bpf_prog_load_xattr2(const struct bpf_prog_load_attr *attr,
        }
 
        bpf_object__for_each_map(map, obj) {
-               if (!bpf_map__is_offload_neutral(map))
+               if (map->def.type != BPF_MAP_TYPE_PERF_EVENT_ARRAY)
                        map->map_ifindex = attr->ifindex;
        }
 
@@ -10511,7 +10534,7 @@ bpf_program__attach_fd(const struct bpf_program *prog, int target_fd, int btf_id
                return libbpf_err_ptr(-ENOMEM);
        link->detach = &bpf_link__detach_fd;
 
-       attach_type = bpf_program__get_expected_attach_type(prog);
+       attach_type = bpf_program__expected_attach_type(prog);
        link_fd = bpf_link_create(prog_fd, target_fd, attach_type, &opts);
        if (link_fd < 0) {
                link_fd = -errno;
@@ -11795,6 +11818,9 @@ void bpf_object__detach_skeleton(struct bpf_object_skeleton *s)
 
 void bpf_object__destroy_skeleton(struct bpf_object_skeleton *s)
 {
+       if (!s)
+               return;
+
        if (s->progs)
                bpf_object__detach_skeleton(s);
        if (s->obj)
index 8b9bc5e..c8d8daa 100644 (file)
@@ -180,9 +180,11 @@ bpf_object__open_mem(const void *obj_buf, size_t obj_buf_sz,
                     const struct bpf_object_open_opts *opts);
 
 /* deprecated bpf_object__open variants */
+LIBBPF_DEPRECATED_SINCE(0, 8, "use bpf_object__open_mem() instead")
 LIBBPF_API struct bpf_object *
 bpf_object__open_buffer(const void *obj_buf, size_t obj_buf_sz,
                        const char *name);
+LIBBPF_DEPRECATED_SINCE(0, 7, "use bpf_object__open_file() instead")
 LIBBPF_API struct bpf_object *
 bpf_object__open_xattr(struct bpf_object_open_attr *attr);
 
@@ -244,8 +246,10 @@ struct bpf_object *bpf_object__next(struct bpf_object *prev);
             (pos) = (tmp), (tmp) = bpf_object__next(tmp))
 
 typedef void (*bpf_object_clear_priv_t)(struct bpf_object *, void *);
+LIBBPF_DEPRECATED_SINCE(0, 7, "storage via set_priv/priv is deprecated")
 LIBBPF_API int bpf_object__set_priv(struct bpf_object *obj, void *priv,
                                    bpf_object_clear_priv_t clear_priv);
+LIBBPF_DEPRECATED_SINCE(0, 7, "storage via set_priv/priv is deprecated")
 LIBBPF_API void *bpf_object__priv(const struct bpf_object *prog);
 
 LIBBPF_API int
@@ -277,9 +281,10 @@ bpf_object__prev_program(const struct bpf_object *obj, struct bpf_program *prog)
 
 typedef void (*bpf_program_clear_priv_t)(struct bpf_program *, void *);
 
+LIBBPF_DEPRECATED_SINCE(0, 7, "storage via set_priv/priv is deprecated")
 LIBBPF_API int bpf_program__set_priv(struct bpf_program *prog, void *priv,
                                     bpf_program_clear_priv_t clear_priv);
-
+LIBBPF_DEPRECATED_SINCE(0, 7, "storage via set_priv/priv is deprecated")
 LIBBPF_API void *bpf_program__priv(const struct bpf_program *prog);
 LIBBPF_API void bpf_program__set_ifindex(struct bpf_program *prog,
                                         __u32 ifindex);
@@ -591,26 +596,39 @@ LIBBPF_API int bpf_program__nth_fd(const struct bpf_program *prog, int n);
 /*
  * Adjust type of BPF program. Default is kprobe.
  */
+LIBBPF_DEPRECATED_SINCE(0, 8, "use bpf_program__set_type() instead")
 LIBBPF_API int bpf_program__set_socket_filter(struct bpf_program *prog);
+LIBBPF_DEPRECATED_SINCE(0, 8, "use bpf_program__set_type() instead")
 LIBBPF_API int bpf_program__set_tracepoint(struct bpf_program *prog);
+LIBBPF_DEPRECATED_SINCE(0, 8, "use bpf_program__set_type() instead")
 LIBBPF_API int bpf_program__set_raw_tracepoint(struct bpf_program *prog);
+LIBBPF_DEPRECATED_SINCE(0, 8, "use bpf_program__set_type() instead")
 LIBBPF_API int bpf_program__set_kprobe(struct bpf_program *prog);
+LIBBPF_DEPRECATED_SINCE(0, 8, "use bpf_program__set_type() instead")
 LIBBPF_API int bpf_program__set_lsm(struct bpf_program *prog);
+LIBBPF_DEPRECATED_SINCE(0, 8, "use bpf_program__set_type() instead")
 LIBBPF_API int bpf_program__set_sched_cls(struct bpf_program *prog);
+LIBBPF_DEPRECATED_SINCE(0, 8, "use bpf_program__set_type() instead")
 LIBBPF_API int bpf_program__set_sched_act(struct bpf_program *prog);
+LIBBPF_DEPRECATED_SINCE(0, 8, "use bpf_program__set_type() instead")
 LIBBPF_API int bpf_program__set_xdp(struct bpf_program *prog);
+LIBBPF_DEPRECATED_SINCE(0, 8, "use bpf_program__set_type() instead")
 LIBBPF_API int bpf_program__set_perf_event(struct bpf_program *prog);
+LIBBPF_DEPRECATED_SINCE(0, 8, "use bpf_program__set_type() instead")
 LIBBPF_API int bpf_program__set_tracing(struct bpf_program *prog);
+LIBBPF_DEPRECATED_SINCE(0, 8, "use bpf_program__set_type() instead")
 LIBBPF_API int bpf_program__set_struct_ops(struct bpf_program *prog);
+LIBBPF_DEPRECATED_SINCE(0, 8, "use bpf_program__set_type() instead")
 LIBBPF_API int bpf_program__set_extension(struct bpf_program *prog);
+LIBBPF_DEPRECATED_SINCE(0, 8, "use bpf_program__set_type() instead")
 LIBBPF_API int bpf_program__set_sk_lookup(struct bpf_program *prog);
 
-LIBBPF_API enum bpf_prog_type bpf_program__get_type(const struct bpf_program *prog);
+LIBBPF_API enum bpf_prog_type bpf_program__type(const struct bpf_program *prog);
 LIBBPF_API void bpf_program__set_type(struct bpf_program *prog,
                                      enum bpf_prog_type type);
 
 LIBBPF_API enum bpf_attach_type
-bpf_program__get_expected_attach_type(const struct bpf_program *prog);
+bpf_program__expected_attach_type(const struct bpf_program *prog);
 LIBBPF_API void
 bpf_program__set_expected_attach_type(struct bpf_program *prog,
                                      enum bpf_attach_type type);
@@ -631,18 +649,31 @@ LIBBPF_API int
 bpf_program__set_attach_target(struct bpf_program *prog, int attach_prog_fd,
                               const char *attach_func_name);
 
+LIBBPF_DEPRECATED_SINCE(0, 8, "use bpf_program__type() instead")
 LIBBPF_API bool bpf_program__is_socket_filter(const struct bpf_program *prog);
+LIBBPF_DEPRECATED_SINCE(0, 8, "use bpf_program__type() instead")
 LIBBPF_API bool bpf_program__is_tracepoint(const struct bpf_program *prog);
+LIBBPF_DEPRECATED_SINCE(0, 8, "use bpf_program__type() instead")
 LIBBPF_API bool bpf_program__is_raw_tracepoint(const struct bpf_program *prog);
+LIBBPF_DEPRECATED_SINCE(0, 8, "use bpf_program__type() instead")
 LIBBPF_API bool bpf_program__is_kprobe(const struct bpf_program *prog);
+LIBBPF_DEPRECATED_SINCE(0, 8, "use bpf_program__type() instead")
 LIBBPF_API bool bpf_program__is_lsm(const struct bpf_program *prog);
+LIBBPF_DEPRECATED_SINCE(0, 8, "use bpf_program__type() instead")
 LIBBPF_API bool bpf_program__is_sched_cls(const struct bpf_program *prog);
+LIBBPF_DEPRECATED_SINCE(0, 8, "use bpf_program__type() instead")
 LIBBPF_API bool bpf_program__is_sched_act(const struct bpf_program *prog);
+LIBBPF_DEPRECATED_SINCE(0, 8, "use bpf_program__type() instead")
 LIBBPF_API bool bpf_program__is_xdp(const struct bpf_program *prog);
+LIBBPF_DEPRECATED_SINCE(0, 8, "use bpf_program__type() instead")
 LIBBPF_API bool bpf_program__is_perf_event(const struct bpf_program *prog);
+LIBBPF_DEPRECATED_SINCE(0, 8, "use bpf_program__type() instead")
 LIBBPF_API bool bpf_program__is_tracing(const struct bpf_program *prog);
+LIBBPF_DEPRECATED_SINCE(0, 8, "use bpf_program__type() instead")
 LIBBPF_API bool bpf_program__is_struct_ops(const struct bpf_program *prog);
+LIBBPF_DEPRECATED_SINCE(0, 8, "use bpf_program__type() instead")
 LIBBPF_API bool bpf_program__is_extension(const struct bpf_program *prog);
+LIBBPF_DEPRECATED_SINCE(0, 8, "use bpf_program__type() instead")
 LIBBPF_API bool bpf_program__is_sk_lookup(const struct bpf_program *prog);
 
 /*
@@ -706,7 +737,8 @@ bpf_object__prev_map(const struct bpf_object *obj, const struct bpf_map *map);
 LIBBPF_API int bpf_map__fd(const struct bpf_map *map);
 LIBBPF_API int bpf_map__reuse_fd(struct bpf_map *map, int fd);
 /* get map definition */
-LIBBPF_API const struct bpf_map_def *bpf_map__def(const struct bpf_map *map);
+LIBBPF_API LIBBPF_DEPRECATED_SINCE(0, 8, "use appropriate getters or setters instead")
+const struct bpf_map_def *bpf_map__def(const struct bpf_map *map);
 /* get map name */
 LIBBPF_API const char *bpf_map__name(const struct bpf_map *map);
 /* get/set map type */
@@ -715,6 +747,7 @@ LIBBPF_API int bpf_map__set_type(struct bpf_map *map, enum bpf_map_type type);
 /* get/set map size (max_entries) */
 LIBBPF_API __u32 bpf_map__max_entries(const struct bpf_map *map);
 LIBBPF_API int bpf_map__set_max_entries(struct bpf_map *map, __u32 max_entries);
+LIBBPF_DEPRECATED_SINCE(0, 8, "use bpf_map__set_max_entries() instead")
 LIBBPF_API int bpf_map__resize(struct bpf_map *map, __u32 max_entries);
 /* get/set map flags */
 LIBBPF_API __u32 bpf_map__map_flags(const struct bpf_map *map);
@@ -739,8 +772,10 @@ LIBBPF_API __u64 bpf_map__map_extra(const struct bpf_map *map);
 LIBBPF_API int bpf_map__set_map_extra(struct bpf_map *map, __u64 map_extra);
 
 typedef void (*bpf_map_clear_priv_t)(struct bpf_map *, void *);
+LIBBPF_DEPRECATED_SINCE(0, 7, "storage via set_priv/priv is deprecated")
 LIBBPF_API int bpf_map__set_priv(struct bpf_map *map, void *priv,
                                 bpf_map_clear_priv_t clear_priv);
+LIBBPF_DEPRECATED_SINCE(0, 7, "storage via set_priv/priv is deprecated")
 LIBBPF_API void *bpf_map__priv(const struct bpf_map *map);
 LIBBPF_API int bpf_map__set_initial_value(struct bpf_map *map,
                                          const void *data, size_t size);
@@ -757,7 +792,6 @@ LIBBPF_API bool bpf_map__is_offload_neutral(const struct bpf_map *map);
  */
 LIBBPF_API bool bpf_map__is_internal(const struct bpf_map *map);
 LIBBPF_API int bpf_map__set_pin_path(struct bpf_map *map, const char *path);
-LIBBPF_API const char *bpf_map__get_pin_path(const struct bpf_map *map);
 LIBBPF_API const char *bpf_map__pin_path(const struct bpf_map *map);
 LIBBPF_API bool bpf_map__is_pinned(const struct bpf_map *map);
 LIBBPF_API int bpf_map__pin(struct bpf_map *map, const char *path);
@@ -832,13 +866,42 @@ struct bpf_xdp_set_link_opts {
 };
 #define bpf_xdp_set_link_opts__last_field old_fd
 
+LIBBPF_DEPRECATED_SINCE(0, 8, "use bpf_xdp_attach() instead")
 LIBBPF_API int bpf_set_link_xdp_fd(int ifindex, int fd, __u32 flags);
+LIBBPF_DEPRECATED_SINCE(0, 8, "use bpf_xdp_attach() instead")
 LIBBPF_API int bpf_set_link_xdp_fd_opts(int ifindex, int fd, __u32 flags,
                                        const struct bpf_xdp_set_link_opts *opts);
+LIBBPF_DEPRECATED_SINCE(0, 8, "use bpf_xdp_query_id() instead")
 LIBBPF_API int bpf_get_link_xdp_id(int ifindex, __u32 *prog_id, __u32 flags);
+LIBBPF_DEPRECATED_SINCE(0, 8, "use bpf_xdp_query() instead")
 LIBBPF_API int bpf_get_link_xdp_info(int ifindex, struct xdp_link_info *info,
                                     size_t info_size, __u32 flags);
 
+struct bpf_xdp_attach_opts {
+       size_t sz;
+       int old_prog_fd;
+       size_t :0;
+};
+#define bpf_xdp_attach_opts__last_field old_prog_fd
+
+struct bpf_xdp_query_opts {
+       size_t sz;
+       __u32 prog_id;          /* output */
+       __u32 drv_prog_id;      /* output */
+       __u32 hw_prog_id;       /* output */
+       __u32 skb_prog_id;      /* output */
+       __u8 attach_mode;       /* output */
+       size_t :0;
+};
+#define bpf_xdp_query_opts__last_field attach_mode
+
+LIBBPF_API int bpf_xdp_attach(int ifindex, int prog_fd, __u32 flags,
+                             const struct bpf_xdp_attach_opts *opts);
+LIBBPF_API int bpf_xdp_detach(int ifindex, __u32 flags,
+                             const struct bpf_xdp_attach_opts *opts);
+LIBBPF_API int bpf_xdp_query(int ifindex, int flags, struct bpf_xdp_query_opts *opts);
+LIBBPF_API int bpf_xdp_query_id(int ifindex, int flags, __u32 *prog_id);
+
 /* TC related API */
 enum bpf_tc_attach_point {
        BPF_TC_INGRESS = 1 << 0,
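
Typical round-trip usage of the XDP APIs added above, sketched (obtaining
ifindex and prog_fd is omitted):

	LIBBPF_OPTS(bpf_xdp_attach_opts, opts);
	__u32 prog_id = 0;
	int err;

	err = bpf_xdp_attach(ifindex, prog_fd, XDP_FLAGS_DRV_MODE, &opts);
	if (!err)
		err = bpf_xdp_query_id(ifindex, XDP_FLAGS_DRV_MODE, &prog_id);

	/* with old_prog_fd set, detach succeeds only if our program is
	 * still the one attached
	 */
	opts.old_prog_fd = prog_fd;
	err = bpf_xdp_detach(ifindex, XDP_FLAGS_DRV_MODE, &opts);
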
index 5297839..aef6253 100644 (file)
@@ -247,6 +247,7 @@ LIBBPF_0.0.8 {
                bpf_link_create;
                bpf_link_update;
                bpf_map__set_initial_value;
+               bpf_prog_attach_opts;
                bpf_program__attach_cgroup;
                bpf_program__attach_lsm;
                bpf_program__is_lsm;
@@ -423,10 +424,16 @@ LIBBPF_0.6.0 {
 LIBBPF_0.7.0 {
        global:
                bpf_btf_load;
+               bpf_program__expected_attach_type;
                bpf_program__log_buf;
                bpf_program__log_level;
                bpf_program__set_log_buf;
                bpf_program__set_log_level;
+               bpf_program__type;
+               bpf_xdp_attach;
+               bpf_xdp_detach;
+               bpf_xdp_query;
+               bpf_xdp_query_id;
                libbpf_probe_bpf_helper;
                libbpf_probe_bpf_map_type;
                libbpf_probe_bpf_prog_type;
index 1565679..bc86b82 100644 (file)
@@ -92,6 +92,9 @@
 # define offsetofend(TYPE, FIELD) \
        (offsetof(TYPE, FIELD) + sizeof(((TYPE *)0)->FIELD))
 #endif
+#ifndef __alias
+#define __alias(symbol) __attribute__((alias(#symbol)))
+#endif
 
 /* Check whether a string `str` has prefix `pfx`, regardless if `pfx` is
  * a string literal known at compilation time or char * pointer known only at
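
This helper is what lets the libbpf.c hunks above keep deprecated getter names
as true symbol aliases rather than duplicate bodies; the pattern, using the
bpf_map__pin_path() pair from this merge:

	const char *bpf_map__pin_path(const struct bpf_map *map)
	{
		return map->pin_path;
	}

	/* the old name resolves to the exact same code */
	__alias(bpf_map__pin_path)
	const char *bpf_map__get_pin_path(const struct bpf_map *map);
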
index 79131f7..a283cf0 100644 (file)
@@ -73,6 +73,11 @@ enum libbpf_strict_mode {
         * operation.
         */
        LIBBPF_STRICT_AUTO_RLIMIT_MEMLOCK = 0x10,
+       /*
+        * Error out on any SEC("maps") map definition, which are deprecated
+        * in favor of BTF-defined map definitions in SEC(".maps").
+        */
+       LIBBPF_STRICT_MAP_DEFINITIONS = 0x20,
 
        __LIBBPF_STRICT_LAST,
 };
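
An application opts in before opening any objects; e.g. (a sketch):

	/* reject legacy SEC("maps") definitions at open time */
	libbpf_set_strict_mode(LIBBPF_STRICT_MAP_DEFINITIONS);

Note that the libbpf.c hunk earlier in this merge removes the mode value
sanity check, so combined flags now pass through libbpf_set_strict_mode()
unmodified.
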
@@ -81,6 +86,23 @@ LIBBPF_API int libbpf_set_strict_mode(enum libbpf_strict_mode mode);
 
 #define DECLARE_LIBBPF_OPTS LIBBPF_OPTS
 
+/* "Discouraged" APIs which don't follow consistent libbpf naming patterns.
+ * They are normally trivial aliases or wrappers for proper APIs and are
+ * left to minimize unnecessary disruption for users of libbpf. But they
+ * shouldn't be used going forward.
+ */
+
+struct bpf_program;
+struct bpf_map;
+struct btf;
+struct btf_ext;
+
+LIBBPF_API enum bpf_prog_type bpf_program__get_type(const struct bpf_program *prog);
+LIBBPF_API enum bpf_attach_type bpf_program__get_expected_attach_type(const struct bpf_program *prog);
+LIBBPF_API const char *bpf_map__get_pin_path(const struct bpf_map *map);
+LIBBPF_API const void *btf__get_raw_data(const struct btf *btf, __u32 *size);
+LIBBPF_API const void *btf_ext__get_raw_data(const struct btf_ext *btf_ext, __u32 *size);
+
 #ifdef __cplusplus
 } /* extern "C" */
 #endif
index 39f25e0..c39c37f 100644 (file)
@@ -217,6 +217,28 @@ static int __bpf_set_link_xdp_fd_replace(int ifindex, int fd, int old_fd,
        return libbpf_netlink_send_recv(&req, NULL, NULL, NULL);
 }
 
+int bpf_xdp_attach(int ifindex, int prog_fd, __u32 flags, const struct bpf_xdp_attach_opts *opts)
+{
+       int old_prog_fd, err;
+
+       if (!OPTS_VALID(opts, bpf_xdp_attach_opts))
+               return libbpf_err(-EINVAL);
+
+       old_prog_fd = OPTS_GET(opts, old_prog_fd, 0);
+       if (old_prog_fd)
+               flags |= XDP_FLAGS_REPLACE;
+       else
+               old_prog_fd = -1;
+
+       err = __bpf_set_link_xdp_fd_replace(ifindex, prog_fd, old_prog_fd, flags);
+       return libbpf_err(err);
+}
+
+int bpf_xdp_detach(int ifindex, __u32 flags, const struct bpf_xdp_attach_opts *opts)
+{
+       return bpf_xdp_attach(ifindex, -1, flags, opts);
+}
+
 int bpf_set_link_xdp_fd_opts(int ifindex, int fd, __u32 flags,
                             const struct bpf_xdp_set_link_opts *opts)
 {
@@ -303,69 +325,98 @@ static int get_xdp_info(void *cookie, void *msg, struct nlattr **tb)
        return 0;
 }
 
-int bpf_get_link_xdp_info(int ifindex, struct xdp_link_info *info,
-                         size_t info_size, __u32 flags)
+int bpf_xdp_query(int ifindex, int xdp_flags, struct bpf_xdp_query_opts *opts)
 {
-       struct xdp_id_md xdp_id = {};
-       __u32 mask;
-       int ret;
        struct libbpf_nla_req req = {
                .nh.nlmsg_len      = NLMSG_LENGTH(sizeof(struct ifinfomsg)),
                .nh.nlmsg_type     = RTM_GETLINK,
                .nh.nlmsg_flags    = NLM_F_DUMP | NLM_F_REQUEST,
                .ifinfo.ifi_family = AF_PACKET,
        };
+       struct xdp_id_md xdp_id = {};
+       int err;
 
-       if (flags & ~XDP_FLAGS_MASK || !info_size)
+       if (!OPTS_VALID(opts, bpf_xdp_query_opts))
+               return libbpf_err(-EINVAL);
+
+       if (xdp_flags & ~XDP_FLAGS_MASK)
                return libbpf_err(-EINVAL);
 
        /* Check whether the single {HW,DRV,SKB} mode is set */
-       flags &= (XDP_FLAGS_SKB_MODE | XDP_FLAGS_DRV_MODE | XDP_FLAGS_HW_MODE);
-       mask = flags - 1;
-       if (flags && flags & mask)
+       xdp_flags &= XDP_FLAGS_SKB_MODE | XDP_FLAGS_DRV_MODE | XDP_FLAGS_HW_MODE;
+       if (xdp_flags & (xdp_flags - 1))
                return libbpf_err(-EINVAL);
 
        xdp_id.ifindex = ifindex;
-       xdp_id.flags = flags;
+       xdp_id.flags = xdp_flags;
 
-       ret = libbpf_netlink_send_recv(&req, __dump_link_nlmsg,
+       err = libbpf_netlink_send_recv(&req, __dump_link_nlmsg,
                                       get_xdp_info, &xdp_id);
-       if (!ret) {
-               size_t sz = min(info_size, sizeof(xdp_id.info));
+       if (err)
+               return libbpf_err(err);
 
-               memcpy(info, &xdp_id.info, sz);
-               memset((void *) info + sz, 0, info_size - sz);
-       }
+       OPTS_SET(opts, prog_id, xdp_id.info.prog_id);
+       OPTS_SET(opts, drv_prog_id, xdp_id.info.drv_prog_id);
+       OPTS_SET(opts, hw_prog_id, xdp_id.info.hw_prog_id);
+       OPTS_SET(opts, skb_prog_id, xdp_id.info.skb_prog_id);
+       OPTS_SET(opts, attach_mode, xdp_id.info.attach_mode);
 
-       return libbpf_err(ret);
+       return 0;
 }
 
-static __u32 get_xdp_id(struct xdp_link_info *info, __u32 flags)
+int bpf_get_link_xdp_info(int ifindex, struct xdp_link_info *info,
+                         size_t info_size, __u32 flags)
 {
-       flags &= XDP_FLAGS_MODES;
+       LIBBPF_OPTS(bpf_xdp_query_opts, opts);
+       size_t sz;
+       int err;
+
+       if (!info_size)
+               return libbpf_err(-EINVAL);
 
-       if (info->attach_mode != XDP_ATTACHED_MULTI && !flags)
-               return info->prog_id;
-       if (flags & XDP_FLAGS_DRV_MODE)
-               return info->drv_prog_id;
-       if (flags & XDP_FLAGS_HW_MODE)
-               return info->hw_prog_id;
-       if (flags & XDP_FLAGS_SKB_MODE)
-               return info->skb_prog_id;
+       err = bpf_xdp_query(ifindex, flags, &opts);
+       if (err)
+               return libbpf_err(err);
+
+       /* struct xdp_link_info field layout matches struct bpf_xdp_query_opts
+        * layout after sz field
+        */
+       sz = min(info_size, offsetofend(struct xdp_link_info, attach_mode));
+       memcpy(info, &opts.prog_id, sz);
+       memset((void *)info + sz, 0, info_size - sz);
 
        return 0;
 }
 
-int bpf_get_link_xdp_id(int ifindex, __u32 *prog_id, __u32 flags)
+int bpf_xdp_query_id(int ifindex, int flags, __u32 *prog_id)
 {
-       struct xdp_link_info info;
+       LIBBPF_OPTS(bpf_xdp_query_opts, opts);
        int ret;
 
-       ret = bpf_get_link_xdp_info(ifindex, &info, sizeof(info), flags);
-       if (!ret)
-               *prog_id = get_xdp_id(&info, flags);
+       ret = bpf_xdp_query(ifindex, flags, &opts);
+       if (ret)
+               return libbpf_err(ret);
+
+       flags &= XDP_FLAGS_MODES;
 
-       return libbpf_err(ret);
+       if (opts.attach_mode != XDP_ATTACHED_MULTI && !flags)
+               *prog_id = opts.prog_id;
+       else if (flags & XDP_FLAGS_DRV_MODE)
+               *prog_id = opts.drv_prog_id;
+       else if (flags & XDP_FLAGS_HW_MODE)
+               *prog_id = opts.hw_prog_id;
+       else if (flags & XDP_FLAGS_SKB_MODE)
+               *prog_id = opts.skb_prog_id;
+       else
+               *prog_id = 0;
+
+       return 0;
+}
+
+
+int bpf_get_link_xdp_id(int ifindex, __u32 *prog_id, __u32 flags)
+{
+       return bpf_xdp_query_id(ifindex, flags, prog_id);
 }
 
 typedef int (*qdisc_config_t)(struct libbpf_nla_req *req);
index 0b84d8e..dcd3336 100644 (file)
@@ -70,19 +70,85 @@ static inline int skel_closenz(int fd)
        return -EINVAL;
 }
 
+#ifndef offsetofend
+#define offsetofend(TYPE, MEMBER) \
+       (offsetof(TYPE, MEMBER) + sizeof((((TYPE *)0)->MEMBER)))
+#endif
+
+static inline int skel_map_create(enum bpf_map_type map_type,
+                                 const char *map_name,
+                                 __u32 key_size,
+                                 __u32 value_size,
+                                 __u32 max_entries)
+{
+       const size_t attr_sz = offsetofend(union bpf_attr, map_extra);
+       union bpf_attr attr;
+
+       memset(&attr, 0, attr_sz);
+
+       attr.map_type = map_type;
+       strncpy(attr.map_name, map_name, sizeof(attr.map_name));
+       attr.key_size = key_size;
+       attr.value_size = value_size;
+       attr.max_entries = max_entries;
+
+       return skel_sys_bpf(BPF_MAP_CREATE, &attr, attr_sz);
+}
+
+static inline int skel_map_update_elem(int fd, const void *key,
+                                      const void *value, __u64 flags)
+{
+       const size_t attr_sz = offsetofend(union bpf_attr, flags);
+       union bpf_attr attr;
+
+       memset(&attr, 0, attr_sz);
+       attr.map_fd = fd;
+       attr.key = (long) key;
+       attr.value = (long) value;
+       attr.flags = flags;
+
+       return skel_sys_bpf(BPF_MAP_UPDATE_ELEM, &attr, attr_sz);
+}
+
+static inline int skel_raw_tracepoint_open(const char *name, int prog_fd)
+{
+       const size_t attr_sz = offsetofend(union bpf_attr, raw_tracepoint.prog_fd);
+       union bpf_attr attr;
+
+       memset(&attr, 0, attr_sz);
+       attr.raw_tracepoint.name = (long) name;
+       attr.raw_tracepoint.prog_fd = prog_fd;
+
+       return skel_sys_bpf(BPF_RAW_TRACEPOINT_OPEN, &attr, attr_sz);
+}
+
+static inline int skel_link_create(int prog_fd, int target_fd,
+                                  enum bpf_attach_type attach_type)
+{
+       const size_t attr_sz = offsetofend(union bpf_attr, link_create.iter_info_len);
+       union bpf_attr attr;
+
+       memset(&attr, 0, attr_sz);
+       attr.link_create.prog_fd = prog_fd;
+       attr.link_create.target_fd = target_fd;
+       attr.link_create.attach_type = attach_type;
+
+       return skel_sys_bpf(BPF_LINK_CREATE, &attr, attr_sz);
+}
+
 static inline int bpf_load_and_run(struct bpf_load_and_run_opts *opts)
 {
        int map_fd = -1, prog_fd = -1, key = 0, err;
        union bpf_attr attr;
 
-       map_fd = bpf_map_create(BPF_MAP_TYPE_ARRAY, "__loader.map", 4, opts->data_sz, 1, NULL);
+       map_fd = skel_map_create(BPF_MAP_TYPE_ARRAY, "__loader.map", 4, opts->data_sz, 1);
        if (map_fd < 0) {
                opts->errstr = "failed to create loader map";
                err = -errno;
                goto out;
        }
 
-       err = bpf_map_update_elem(map_fd, &key, opts->data, 0);
+       err = skel_map_update_elem(map_fd, &key, opts->data, 0);
        if (err < 0) {
                opts->errstr = "failed to update loader map";
                err = -errno;
index 8ac0a3a..0bc25a5 100644 (file)
@@ -13,7 +13,7 @@ static int test__bpf_parsing(void *obj_buf, size_t obj_buf_sz)
 {
        struct bpf_object *obj;
 
-       obj = bpf_object__open_buffer(obj_buf, obj_buf_sz, NULL);
+       obj = bpf_object__open_mem(obj_buf, obj_buf_sz, NULL);
        if (libbpf_get_error(obj))
                return TEST_FAIL;
        bpf_object__close(obj);
index 7ecfaac..3cd5ae2 100644 (file)
@@ -54,6 +54,7 @@ static bool libbpf_initialized;
 struct bpf_object *
 bpf__prepare_load_buffer(void *obj_buf, size_t obj_buf_sz, const char *name)
 {
+       LIBBPF_OPTS(bpf_object_open_opts, opts, .object_name = name);
        struct bpf_object *obj;
 
        if (!libbpf_initialized) {
@@ -61,7 +62,7 @@ bpf__prepare_load_buffer(void *obj_buf, size_t obj_buf_sz, const char *name)
                libbpf_initialized = true;
        }
 
-       obj = bpf_object__open_buffer(obj_buf, obj_buf_sz, name);
+       obj = bpf_object__open_mem(obj_buf, obj_buf_sz, &opts);
        if (IS_ERR_OR_NULL(obj)) {
                pr_debug("bpf: failed to load buffer\n");
                return ERR_PTR(-EINVAL);
@@ -72,6 +73,7 @@ bpf__prepare_load_buffer(void *obj_buf, size_t obj_buf_sz, const char *name)
 
 struct bpf_object *bpf__prepare_load(const char *filename, bool source)
 {
+       LIBBPF_OPTS(bpf_object_open_opts, opts, .object_name = filename);
        struct bpf_object *obj;
 
        if (!libbpf_initialized) {
@@ -94,7 +96,7 @@ struct bpf_object *bpf__prepare_load(const char *filename, bool source)
                                return ERR_PTR(-BPF_LOADER_ERRNO__COMPILE);
                } else
                        pr_debug("bpf: successful builtin compilation\n");
-               obj = bpf_object__open_buffer(obj_buf, obj_buf_sz, filename);
+               obj = bpf_object__open_mem(obj_buf, obj_buf_sz, &opts);
 
                if (!IS_ERR_OR_NULL(obj) && llvm_param.dump_obj)
                        llvm__dump_obj(filename, obj_buf, obj_buf_sz);
@@ -654,11 +656,11 @@ int bpf__probe(struct bpf_object *obj)
                }
 
                if (priv->is_tp) {
-                       bpf_program__set_tracepoint(prog);
+                       bpf_program__set_type(prog, BPF_PROG_TYPE_TRACEPOINT);
                        continue;
                }
 
-               bpf_program__set_kprobe(prog);
+               bpf_program__set_type(prog, BPF_PROG_TYPE_KPROBE);
                pev = &priv->pev;
 
                err = convert_perf_probe_events(pev, 1);
@@ -1005,24 +1007,22 @@ __bpf_map__config_value(struct bpf_map *map,
 {
        struct bpf_map_op *op;
        const char *map_name = bpf_map__name(map);
-       const struct bpf_map_def *def = bpf_map__def(map);
 
-       if (IS_ERR(def)) {
-               pr_debug("Unable to get map definition from '%s'\n",
-                        map_name);
+       if (!map) {
+               pr_debug("Map '%s' is invalid\n", map_name);
                return -BPF_LOADER_ERRNO__INTERNAL;
        }
 
-       if (def->type != BPF_MAP_TYPE_ARRAY) {
+       if (bpf_map__type(map) != BPF_MAP_TYPE_ARRAY) {
                pr_debug("Map %s type is not BPF_MAP_TYPE_ARRAY\n",
                         map_name);
                return -BPF_LOADER_ERRNO__OBJCONF_MAP_TYPE;
        }
-       if (def->key_size < sizeof(unsigned int)) {
+       if (bpf_map__key_size(map) < sizeof(unsigned int)) {
                pr_debug("Map %s has incorrect key size\n", map_name);
                return -BPF_LOADER_ERRNO__OBJCONF_MAP_KEYSIZE;
        }
-       switch (def->value_size) {
+       switch (bpf_map__value_size(map)) {
        case 1:
        case 2:
        case 4:
@@ -1064,7 +1064,6 @@ __bpf_map__config_event(struct bpf_map *map,
                        struct parse_events_term *term,
                        struct evlist *evlist)
 {
-       const struct bpf_map_def *def;
        struct bpf_map_op *op;
        const char *map_name = bpf_map__name(map);
        struct evsel *evsel = evlist__find_evsel_by_str(evlist, term->val.str);
@@ -1075,18 +1074,16 @@ __bpf_map__config_event(struct bpf_map *map,
                return -BPF_LOADER_ERRNO__OBJCONF_MAP_NOEVT;
        }
 
-       def = bpf_map__def(map);
-       if (IS_ERR(def)) {
-               pr_debug("Unable to get map definition from '%s'\n",
-                        map_name);
-               return PTR_ERR(def);
+       if (!map) {
+               pr_debug("Map '%s' is invalid\n", map_name);
+               return PTR_ERR(map);
        }
 
        /*
         * No need to check key_size and value_size:
         * kernel has already checked them.
         */
-       if (def->type != BPF_MAP_TYPE_PERF_EVENT_ARRAY) {
+       if (bpf_map__type(map) != BPF_MAP_TYPE_PERF_EVENT_ARRAY) {
                pr_debug("Map %s type is not BPF_MAP_TYPE_PERF_EVENT_ARRAY\n",
                         map_name);
                return -BPF_LOADER_ERRNO__OBJCONF_MAP_TYPE;
@@ -1135,7 +1132,6 @@ config_map_indices_range_check(struct parse_events_term *term,
                               const char *map_name)
 {
        struct parse_events_array *array = &term->array;
-       const struct bpf_map_def *def;
        unsigned int i;
 
        if (!array->nr_ranges)
@@ -1146,10 +1142,8 @@ config_map_indices_range_check(struct parse_events_term *term,
                return -BPF_LOADER_ERRNO__INTERNAL;
        }
 
-       def = bpf_map__def(map);
-       if (IS_ERR(def)) {
-               pr_debug("ERROR: Unable to get map definition from '%s'\n",
-                        map_name);
+       if (!map) {
+               pr_debug("Map '%s' is invalid\n", map_name);
                return -BPF_LOADER_ERRNO__INTERNAL;
        }
 
@@ -1158,7 +1152,7 @@ config_map_indices_range_check(struct parse_events_term *term,
                size_t length = array->ranges[i].length;
                unsigned int idx = start + length - 1;
 
-               if (idx >= def->max_entries) {
+               if (idx >= bpf_map__max_entries(map)) {
                        pr_debug("ERROR: index %d too large\n", idx);
                        return -BPF_LOADER_ERRNO__OBJCONF_MAP_IDX2BIG;
                }
@@ -1251,21 +1245,21 @@ out:
 }
 
 typedef int (*map_config_func_t)(const char *name, int map_fd,
-                                const struct bpf_map_def *pdef,
+                                const struct bpf_map *map,
                                 struct bpf_map_op *op,
                                 void *pkey, void *arg);
 
 static int
 foreach_key_array_all(map_config_func_t func,
                      void *arg, const char *name,
-                     int map_fd, const struct bpf_map_def *pdef,
+                     int map_fd, const struct bpf_map *map,
                      struct bpf_map_op *op)
 {
        unsigned int i;
        int err;
 
-       for (i = 0; i < pdef->max_entries; i++) {
-               err = func(name, map_fd, pdef, op, &i, arg);
+       for (i = 0; i < bpf_map__max_entries(map); i++) {
+               err = func(name, map_fd, map, op, &i, arg);
                if (err) {
                        pr_debug("ERROR: failed to insert value to %s[%u]\n",
                                 name, i);
@@ -1278,7 +1272,7 @@ foreach_key_array_all(map_config_func_t func,
 static int
 foreach_key_array_ranges(map_config_func_t func, void *arg,
                         const char *name, int map_fd,
-                        const struct bpf_map_def *pdef,
+                        const struct bpf_map *map,
                         struct bpf_map_op *op)
 {
        unsigned int i, j;
@@ -1291,7 +1285,7 @@ foreach_key_array_ranges(map_config_func_t func, void *arg,
                for (j = 0; j < length; j++) {
                        unsigned int idx = start + j;
 
-                       err = func(name, map_fd, pdef, op, &idx, arg);
+                       err = func(name, map_fd, map, op, &idx, arg);
                        if (err) {
                                pr_debug("ERROR: failed to insert value to %s[%u]\n",
                                         name, idx);
@@ -1307,9 +1301,8 @@ bpf_map_config_foreach_key(struct bpf_map *map,
                           map_config_func_t func,
                           void *arg)
 {
-       int err, map_fd;
+       int err, map_fd, type;
        struct bpf_map_op *op;
-       const struct bpf_map_def *def;
        const char *name = bpf_map__name(map);
        struct bpf_map_priv *priv = bpf_map__priv(map);
 
@@ -1322,9 +1315,8 @@ bpf_map_config_foreach_key(struct bpf_map *map,
                return 0;
        }
 
-       def = bpf_map__def(map);
-       if (IS_ERR(def)) {
-               pr_debug("ERROR: failed to get definition from map %s\n", name);
+       if (!map) {
+               pr_debug("Map '%s' is invalid\n", name);
                return -BPF_LOADER_ERRNO__INTERNAL;
        }
        map_fd = bpf_map__fd(map);
@@ -1333,19 +1325,19 @@ bpf_map_config_foreach_key(struct bpf_map *map,
                return map_fd;
        }
 
+       type = bpf_map__type(map);
        list_for_each_entry(op, &priv->ops_list, list) {
-               switch (def->type) {
+               switch (type) {
                case BPF_MAP_TYPE_ARRAY:
                case BPF_MAP_TYPE_PERF_EVENT_ARRAY:
                        switch (op->key_type) {
                        case BPF_MAP_KEY_ALL:
                                err = foreach_key_array_all(func, arg, name,
-                                                           map_fd, def, op);
+                                                           map_fd, map, op);
                                break;
                        case BPF_MAP_KEY_RANGES:
                                err = foreach_key_array_ranges(func, arg, name,
-                                                              map_fd, def,
-                                                              op);
+                                                              map_fd, map, op);
                                break;
                        default:
                                pr_debug("ERROR: keytype for map '%s' invalid\n",
@@ -1454,7 +1446,7 @@ apply_config_evsel_for_key(const char *name, int map_fd, void *pkey,
 
 static int
 apply_obj_config_map_for_key(const char *name, int map_fd,
-                            const struct bpf_map_def *pdef,
+                            const struct bpf_map *map,
                             struct bpf_map_op *op,
                             void *pkey, void *arg __maybe_unused)
 {
@@ -1463,7 +1455,7 @@ apply_obj_config_map_for_key(const char *name, int map_fd,
        switch (op->op_type) {
        case BPF_MAP_OP_SET_VALUE:
                err = apply_config_value_for_key(map_fd, pkey,
-                                                pdef->value_size,
+                                                bpf_map__value_size(map),
                                                 op->v.value);
                break;
        case BPF_MAP_OP_SET_EVSEL:
index eb853ca..c863ae0 100644 (file)
@@ -9,25 +9,25 @@
 #include <stdlib.h>
 #include <unistd.h>
 
-static bool bpf_map_def__is_per_cpu(const struct bpf_map_def *def)
+static bool bpf_map__is_per_cpu(enum bpf_map_type type)
 {
-       return def->type == BPF_MAP_TYPE_PERCPU_HASH ||
-              def->type == BPF_MAP_TYPE_PERCPU_ARRAY ||
-              def->type == BPF_MAP_TYPE_LRU_PERCPU_HASH ||
-              def->type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE;
+       return type == BPF_MAP_TYPE_PERCPU_HASH ||
+              type == BPF_MAP_TYPE_PERCPU_ARRAY ||
+              type == BPF_MAP_TYPE_LRU_PERCPU_HASH ||
+              type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE;
 }
 
-static void *bpf_map_def__alloc_value(const struct bpf_map_def *def)
+static void *bpf_map__alloc_value(const struct bpf_map *map)
 {
-       if (bpf_map_def__is_per_cpu(def))
-               return malloc(round_up(def->value_size, 8) * sysconf(_SC_NPROCESSORS_CONF));
+       if (bpf_map__is_per_cpu(bpf_map__type(map)))
+               return malloc(round_up(bpf_map__value_size(map), 8) *
+                             sysconf(_SC_NPROCESSORS_CONF));
 
-       return malloc(def->value_size);
+       return malloc(bpf_map__value_size(map));
 }
 
 int bpf_map__fprintf(struct bpf_map *map, FILE *fp)
 {
-       const struct bpf_map_def *def = bpf_map__def(map);
        void *prev_key = NULL, *key, *value;
        int fd = bpf_map__fd(map), err;
        int printed = 0;
@@ -35,15 +35,15 @@ int bpf_map__fprintf(struct bpf_map *map, FILE *fp)
        if (fd < 0)
                return fd;
 
-       if (IS_ERR(def))
-               return PTR_ERR(def);
+       if (!map)
+               return PTR_ERR(map);
 
        err = -ENOMEM;
-       key = malloc(def->key_size);
+       key = malloc(bpf_map__key_size(map));
        if (key == NULL)
                goto out;
 
-       value = bpf_map_def__alloc_value(def);
+       value = bpf_map__alloc_value(map);
        if (value == NULL)
                goto out_free_key;
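
Per-CPU maps return one 8-byte-aligned value slot per possible CPU, which is
why bpf_map__alloc_value() multiplies by _SC_NPROCESSORS_CONF: a 4-byte value
on an 8-CPU machine needs round_up(4, 8) * 8 = 64 bytes. On the caller side
this amounts to (a sketch; round_up comes from tools headers, as above):

	long ncpus = sysconf(_SC_NPROCESSORS_CONF);
	void *values = malloc(round_up(bpf_map__value_size(map), 8) * ncpus);
	/* a lookup then fills one slot per possible CPU */
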
 
index 42ffc24..91ea729 100644 (file)
@@ -21,7 +21,7 @@ endif
 
 BPF_GCC                ?= $(shell command -v bpf-gcc;)
 SAN_CFLAGS     ?=
-CFLAGS += -g -O0 -rdynamic -Wall $(GENFLAGS) $(SAN_CFLAGS)             \
+CFLAGS += -g -O0 -rdynamic -Wall -Werror $(GENFLAGS) $(SAN_CFLAGS)     \
          -I$(CURDIR) -I$(INCLUDE_DIR) -I$(GENDIR) -I$(LIBDIR)          \
          -I$(TOOLSINCDIR) -I$(APIDIR) -I$(OUTPUT)
 LDFLAGS += $(SAN_CFLAGS)
@@ -292,7 +292,7 @@ IS_LITTLE_ENDIAN = $(shell $(CC) -dM -E - </dev/null | \
 MENDIAN=$(if $(IS_LITTLE_ENDIAN),-mlittle-endian,-mbig-endian)
 
 CLANG_SYS_INCLUDES = $(call get_sys_includes,$(CLANG))
-BPF_CFLAGS = -g -D__TARGET_ARCH_$(SRCARCH) $(MENDIAN)                  \
+BPF_CFLAGS = -g -Werror -D__TARGET_ARCH_$(SRCARCH) $(MENDIAN)          \
             -I$(INCLUDE_DIR) -I$(CURDIR) -I$(APIDIR)                   \
             -I$(abspath $(OUTPUT)/../usr/include)
 
@@ -330,7 +330,7 @@ LINKED_SKELS := test_static_linked.skel.h linked_funcs.skel.h               \
 
 LSKELS := kfunc_call_test.c fentry_test.c fexit_test.c fexit_sleep.c \
        test_ringbuf.c atomics.c trace_printk.c trace_vprintk.c \
-       map_ptr_kern.c core_kern.c
+       map_ptr_kern.c core_kern.c core_kern_overflow.c
 # Generate both light skeleton and libbpf skeleton for these
 LSKELS_EXTRA := test_ksyms_module.c test_ksyms_weak.c kfunc_call_test_subprog.c
 SKEL_BLACKLIST += $$(LSKELS)
index 42ef250..d099d91 100644 (file)
@@ -206,6 +206,8 @@ btf_tag test and Clang version
 
 The btf_tag selftest requires LLVM support to recognize the btf_decl_tag and
 btf_type_tag attributes. They are introduced in `Clang 14` [0_, 1_].
+The subtests ``btf_type_tag_user_{mod1, mod2, vmlinux}`` also require
+pahole version ``1.23``.
 
 Without them, the btf_tag selftest will be skipped and you will observe:
 
index da8593b..c2554f9 100644 (file)
@@ -151,7 +151,7 @@ static struct ringbuf_bench *ringbuf_setup_skeleton(void)
                /* record data + header take 16 bytes */
                skel->rodata->wakeup_data_size = args.sample_rate * 16;
 
-       bpf_map__resize(skel->maps.ringbuf, args.ringbuf_sz);
+       bpf_map__set_max_entries(skel->maps.ringbuf, args.ringbuf_sz);
 
        if (ringbuf_bench__load(skel)) {
                fprintf(stderr, "failed to load skeleton\n");
index 7f957c5..0c481de 100644 (file)
@@ -154,7 +154,6 @@ static void *uprobe_producer_without_nop(void *input)
 static void usetup(bool use_retprobe, bool use_nop)
 {
        size_t uprobe_offset;
-       ssize_t base_addr;
        struct bpf_link *link;
 
        setup_libbpf();
@@ -165,11 +164,10 @@ static void usetup(bool use_retprobe, bool use_nop)
                exit(1);
        }
 
-       base_addr = get_base_addr();
        if (use_nop)
-               uprobe_offset = get_uprobe_offset(&uprobe_target_with_nop, base_addr);
+               uprobe_offset = get_uprobe_offset(&uprobe_target_with_nop);
        else
-               uprobe_offset = get_uprobe_offset(&uprobe_target_without_nop, base_addr);
+               uprobe_offset = get_uprobe_offset(&uprobe_target_without_nop);
 
        link = bpf_program__attach_uprobe(ctx.skel->progs.bench_trigger_uprobe,
                                          use_retprobe,
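
With this refactor, get_uprobe_offset() resolves the executable's base address internally, so callers pass only the function pointer. A sketch of how the resulting offset is consumed; `prog` stands for any uprobe-typed program in a skeleton:

    struct bpf_link *link;
    ssize_t off;

    off = get_uprobe_offset(&uprobe_target_without_nop);
    if (off < 0)
            return;

    /* pid -1: attach across all processes; path is our own binary */
    link = bpf_program__attach_uprobe(prog, false /* not a retprobe */,
                                      -1, "/proc/self/exe", off);
    if (libbpf_get_error(link))
            fprintf(stderr, "failed to attach uprobe\n");
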
index df3b292..27d63be 100644 (file)
 #define CREATE_TRACE_POINTS
 #include "bpf_testmod-events.h"
 
+typedef int (*func_proto_typedef)(long);
+typedef int (*func_proto_typedef_nested1)(func_proto_typedef);
+typedef int (*func_proto_typedef_nested2)(func_proto_typedef_nested1);
+
 DEFINE_PER_CPU(int, bpf_testmod_ksym_percpu) = 123;
 
 noinline void
@@ -21,6 +25,27 @@ bpf_testmod_test_mod_kfunc(int i)
        *(int *)this_cpu_ptr(&bpf_testmod_ksym_percpu) = i;
 }
 
+struct bpf_testmod_btf_type_tag_1 {
+       int a;
+};
+
+struct bpf_testmod_btf_type_tag_2 {
+       struct bpf_testmod_btf_type_tag_1 __user *p;
+};
+
+noinline int
+bpf_testmod_test_btf_type_tag_user_1(struct bpf_testmod_btf_type_tag_1 __user *arg) {
+       BTF_TYPE_EMIT(func_proto_typedef);
+       BTF_TYPE_EMIT(func_proto_typedef_nested1);
+       BTF_TYPE_EMIT(func_proto_typedef_nested2);
+       return arg->a;
+}
+
+noinline int
+bpf_testmod_test_btf_type_tag_user_2(struct bpf_testmod_btf_type_tag_2 *arg) {
+       return arg->p->a;
+}
+
 noinline int bpf_testmod_loop_test(int n)
 {
        int i, sum = 0;
@@ -109,26 +134,31 @@ static struct bin_attribute bin_attr_bpf_testmod_file __ro_after_init = {
        .write = bpf_testmod_test_write,
 };
 
-BTF_SET_START(bpf_testmod_kfunc_ids)
+BTF_SET_START(bpf_testmod_check_kfunc_ids)
 BTF_ID(func, bpf_testmod_test_mod_kfunc)
-BTF_SET_END(bpf_testmod_kfunc_ids)
+BTF_SET_END(bpf_testmod_check_kfunc_ids)
+
+static const struct btf_kfunc_id_set bpf_testmod_kfunc_set = {
+       .owner     = THIS_MODULE,
+       .check_set = &bpf_testmod_check_kfunc_ids,
+};
 
-static DEFINE_KFUNC_BTF_ID_SET(&bpf_testmod_kfunc_ids, bpf_testmod_kfunc_btf_set);
+extern int bpf_fentry_test1(int a);
 
 static int bpf_testmod_init(void)
 {
        int ret;
 
-       ret = sysfs_create_bin_file(kernel_kobj, &bin_attr_bpf_testmod_file);
-       if (ret)
+       ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, &bpf_testmod_kfunc_set);
+       if (ret < 0)
                return ret;
-       register_kfunc_btf_id_set(&prog_test_kfunc_list, &bpf_testmod_kfunc_btf_set);
-       return 0;
+       if (bpf_fentry_test1(0) < 0)
+               return -EINVAL;
+       return sysfs_create_bin_file(kernel_kobj, &bin_attr_bpf_testmod_file);
 }
 
 static void bpf_testmod_exit(void)
 {
-       unregister_kfunc_btf_id_set(&prog_test_kfunc_list, &bpf_testmod_kfunc_btf_set);
        return sysfs_remove_bin_file(kernel_kobj, &bin_attr_bpf_testmod_file);
 }
 
index f628713..763db63 100644 (file)
@@ -48,3 +48,8 @@ CONFIG_IMA_READ_POLICY=y
 CONFIG_BLK_DEV_LOOP=y
 CONFIG_FUNCTION_TRACER=y
 CONFIG_DYNAMIC_FTRACE=y
+CONFIG_NETFILTER=y
+CONFIG_NF_DEFRAG_IPV4=y
+CONFIG_NF_DEFRAG_IPV6=y
+CONFIG_NF_CONNTRACK=y
+CONFIG_USERFAULTFD=y
index 86b7d5d..ab62aba 100644 (file)
@@ -7,18 +7,18 @@
 static void test_add(struct atomics_lskel *skel)
 {
        int err, prog_fd;
-       __u32 duration = 0, retval;
        int link_fd;
+       LIBBPF_OPTS(bpf_test_run_opts, topts);
 
        link_fd = atomics_lskel__add__attach(skel);
        if (!ASSERT_GT(link_fd, 0, "attach(add)"))
                return;
 
        prog_fd = skel->progs.add.prog_fd;
-       err = bpf_prog_test_run(prog_fd, 1, NULL, 0,
-                               NULL, NULL, &retval, &duration);
-       if (CHECK(err || retval, "test_run add",
-                 "err %d errno %d retval %d duration %d\n", err, errno, retval, duration))
+       err = bpf_prog_test_run_opts(prog_fd, &topts);
+       if (!ASSERT_OK(err, "test_run_opts err"))
+               goto cleanup;
+       if (!ASSERT_OK(topts.retval, "test_run_opts retval"))
                goto cleanup;
 
        ASSERT_EQ(skel->data->add64_value, 3, "add64_value");
@@ -39,19 +39,18 @@ cleanup:
 static void test_sub(struct atomics_lskel *skel)
 {
        int err, prog_fd;
-       __u32 duration = 0, retval;
        int link_fd;
+       LIBBPF_OPTS(bpf_test_run_opts, topts);
 
        link_fd = atomics_lskel__sub__attach(skel);
        if (!ASSERT_GT(link_fd, 0, "attach(sub)"))
                return;
 
        prog_fd = skel->progs.sub.prog_fd;
-       err = bpf_prog_test_run(prog_fd, 1, NULL, 0,
-                               NULL, NULL, &retval, &duration);
-       if (CHECK(err || retval, "test_run sub",
-                 "err %d errno %d retval %d duration %d\n",
-                 err, errno, retval, duration))
+       err = bpf_prog_test_run_opts(prog_fd, &topts);
+       if (!ASSERT_OK(err, "test_run_opts err"))
+               goto cleanup;
+       if (!ASSERT_OK(topts.retval, "test_run_opts retval"))
                goto cleanup;
 
        ASSERT_EQ(skel->data->sub64_value, -1, "sub64_value");
@@ -72,18 +71,18 @@ cleanup:
 static void test_and(struct atomics_lskel *skel)
 {
        int err, prog_fd;
-       __u32 duration = 0, retval;
        int link_fd;
+       LIBBPF_OPTS(bpf_test_run_opts, topts);
 
        link_fd = atomics_lskel__and__attach(skel);
        if (!ASSERT_GT(link_fd, 0, "attach(and)"))
                return;
 
        prog_fd = skel->progs.and.prog_fd;
-       err = bpf_prog_test_run(prog_fd, 1, NULL, 0,
-                               NULL, NULL, &retval, &duration);
-       if (CHECK(err || retval, "test_run and",
-                 "err %d errno %d retval %d duration %d\n", err, errno, retval, duration))
+       err = bpf_prog_test_run_opts(prog_fd, &topts);
+       if (!ASSERT_OK(err, "test_run_opts err"))
+               goto cleanup;
+       if (!ASSERT_OK(topts.retval, "test_run_opts retval"))
                goto cleanup;
 
        ASSERT_EQ(skel->data->and64_value, 0x010ull << 32, "and64_value");
@@ -100,19 +99,18 @@ cleanup:
 static void test_or(struct atomics_lskel *skel)
 {
        int err, prog_fd;
-       __u32 duration = 0, retval;
        int link_fd;
+       LIBBPF_OPTS(bpf_test_run_opts, topts);
 
        link_fd = atomics_lskel__or__attach(skel);
        if (!ASSERT_GT(link_fd, 0, "attach(or)"))
                return;
 
        prog_fd = skel->progs.or.prog_fd;
-       err = bpf_prog_test_run(prog_fd, 1, NULL, 0,
-                               NULL, NULL, &retval, &duration);
-       if (CHECK(err || retval, "test_run or",
-                 "err %d errno %d retval %d duration %d\n",
-                 err, errno, retval, duration))
+       err = bpf_prog_test_run_opts(prog_fd, &topts);
+       if (!ASSERT_OK(err, "test_run_opts err"))
+               goto cleanup;
+       if (!ASSERT_OK(topts.retval, "test_run_opts retval"))
                goto cleanup;
 
        ASSERT_EQ(skel->data->or64_value, 0x111ull << 32, "or64_value");
@@ -129,18 +127,18 @@ cleanup:
 static void test_xor(struct atomics_lskel *skel)
 {
        int err, prog_fd;
-       __u32 duration = 0, retval;
        int link_fd;
+       LIBBPF_OPTS(bpf_test_run_opts, topts);
 
        link_fd = atomics_lskel__xor__attach(skel);
        if (!ASSERT_GT(link_fd, 0, "attach(xor)"))
                return;
 
        prog_fd = skel->progs.xor.prog_fd;
-       err = bpf_prog_test_run(prog_fd, 1, NULL, 0,
-                               NULL, NULL, &retval, &duration);
-       if (CHECK(err || retval, "test_run xor",
-                 "err %d errno %d retval %d duration %d\n", err, errno, retval, duration))
+       err = bpf_prog_test_run_opts(prog_fd, &topts);
+       if (!ASSERT_OK(err, "test_run_opts err"))
+               goto cleanup;
+       if (!ASSERT_OK(topts.retval, "test_run_opts retval"))
                goto cleanup;
 
        ASSERT_EQ(skel->data->xor64_value, 0x101ull << 32, "xor64_value");
@@ -157,18 +155,18 @@ cleanup:
 static void test_cmpxchg(struct atomics_lskel *skel)
 {
        int err, prog_fd;
-       __u32 duration = 0, retval;
        int link_fd;
+       LIBBPF_OPTS(bpf_test_run_opts, topts);
 
        link_fd = atomics_lskel__cmpxchg__attach(skel);
        if (!ASSERT_GT(link_fd, 0, "attach(cmpxchg)"))
                return;
 
        prog_fd = skel->progs.cmpxchg.prog_fd;
-       err = bpf_prog_test_run(prog_fd, 1, NULL, 0,
-                               NULL, NULL, &retval, &duration);
-       if (CHECK(err || retval, "test_run cmpxchg",
-                 "err %d errno %d retval %d duration %d\n", err, errno, retval, duration))
+       err = bpf_prog_test_run_opts(prog_fd, &topts);
+       if (!ASSERT_OK(err, "test_run_opts err"))
+               goto cleanup;
+       if (!ASSERT_OK(topts.retval, "test_run_opts retval"))
                goto cleanup;
 
        ASSERT_EQ(skel->data->cmpxchg64_value, 2, "cmpxchg64_value");
@@ -186,18 +184,18 @@ cleanup:
 static void test_xchg(struct atomics_lskel *skel)
 {
        int err, prog_fd;
-       __u32 duration = 0, retval;
        int link_fd;
+       LIBBPF_OPTS(bpf_test_run_opts, topts);
 
        link_fd = atomics_lskel__xchg__attach(skel);
        if (!ASSERT_GT(link_fd, 0, "attach(xchg)"))
                return;
 
        prog_fd = skel->progs.xchg.prog_fd;
-       err = bpf_prog_test_run(prog_fd, 1, NULL, 0,
-                               NULL, NULL, &retval, &duration);
-       if (CHECK(err || retval, "test_run xchg",
-                 "err %d errno %d retval %d duration %d\n", err, errno, retval, duration))
+       err = bpf_prog_test_run_opts(prog_fd, &topts);
+       if (!ASSERT_OK(err, "test_run_opts err"))
+               goto cleanup;
+       if (!ASSERT_OK(topts.retval, "test_run_opts retval"))
                goto cleanup;
 
        ASSERT_EQ(skel->data->xchg64_value, 2, "xchg64_value");
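
Every atomics subtest above applies the same mechanical conversion from the deprecated bpf_prog_test_run() to the opts-based API. The distilled pattern, as a fragment rather than a full function:

    LIBBPF_OPTS(bpf_test_run_opts, topts,
            .repeat = 1,            /* run the program once */
    );
    int err;

    err = bpf_prog_test_run_opts(prog_fd, &topts);
    if (err)                /* negative error from the syscall wrapper */
            goto cleanup;
    if (topts.retval)       /* the program's own return code */
            goto cleanup;
    /* topts.duration now carries the runtime that the old API returned
     * through its duration out-parameter */
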
index d0bd51e..d48f6e5 100644 (file)
@@ -5,9 +5,10 @@
 /* this is how a USDT semaphore is actually defined, except for the volatile modifier */
 volatile unsigned short uprobe_ref_ctr __attribute__((unused)) __attribute((section(".probes")));
 
-/* attach point */
-static void method(void) {
-       return ;
+/* uprobe attach point */
+static void trigger_func(void)
+{
+       asm volatile ("");
 }
 
 void test_attach_probe(void)
@@ -17,8 +18,7 @@ void test_attach_probe(void)
        struct bpf_link *kprobe_link, *kretprobe_link;
        struct bpf_link *uprobe_link, *uretprobe_link;
        struct test_attach_probe* skel;
-       size_t uprobe_offset;
-       ssize_t base_addr, ref_ctr_offset;
+       ssize_t uprobe_offset, ref_ctr_offset;
        bool legacy;
 
        /* Check if new-style kprobe/uprobe API is supported.
@@ -34,11 +34,9 @@ void test_attach_probe(void)
         */
        legacy = access("/sys/bus/event_source/devices/kprobe/type", F_OK) != 0;
 
-       base_addr = get_base_addr();
-       if (CHECK(base_addr < 0, "get_base_addr",
-                 "failed to find base addr: %zd", base_addr))
+       uprobe_offset = get_uprobe_offset(&trigger_func);
+       if (!ASSERT_GE(uprobe_offset, 0, "uprobe_offset"))
                return;
-       uprobe_offset = get_uprobe_offset(&method, base_addr);
 
        ref_ctr_offset = get_rel_offset((uintptr_t)&uprobe_ref_ctr);
        if (!ASSERT_GE(ref_ctr_offset, 0, "ref_ctr_offset"))
@@ -103,7 +101,7 @@ void test_attach_probe(void)
                goto cleanup;
 
        /* trigger & validate uprobe & uretprobe */
-       method();
+       trigger_func();
 
        if (CHECK(skel->bss->uprobe_res != 3, "check_uprobe_res",
                  "wrong uprobe res: %d\n", skel->bss->uprobe_res))
index d0f06e4..eac71fb 100644 (file)
@@ -1,13 +1,24 @@
 // SPDX-License-Identifier: GPL-2.0
-#include <test_progs.h>
-#include "bind_perm.skel.h"
-
+#define _GNU_SOURCE
+#include <sched.h>
+#include <stdlib.h>
 #include <sys/types.h>
 #include <sys/socket.h>
 #include <sys/capability.h>
 
+#include "test_progs.h"
+#include "bind_perm.skel.h"
+
 static int duration;
 
+static int create_netns(void)
+{
+       if (!ASSERT_OK(unshare(CLONE_NEWNET), "create netns"))
+               return -1;
+
+       return 0;
+}
+
 void try_bind(int family, int port, int expected_errno)
 {
        struct sockaddr_storage addr = {};
@@ -75,6 +86,9 @@ void test_bind_perm(void)
        struct bind_perm *skel;
        int cgroup_fd;
 
+       if (create_netns())
+               return;
+
        cgroup_fd = test__join_cgroup("/bind_perm");
        if (CHECK(cgroup_fd < 0, "cg-join", "errno %d", errno))
                return;
index 5eea3c3..cd10df6 100644 (file)
@@ -8,6 +8,12 @@
 #include <test_progs.h>
 #include "test_bpf_cookie.skel.h"
 
+/* uprobe attach point */
+static void trigger_func(void)
+{
+       asm volatile ("");
+}
+
 static void kprobe_subtest(struct test_bpf_cookie *skel)
 {
        DECLARE_LIBBPF_OPTS(bpf_kprobe_opts, opts);
@@ -62,11 +68,11 @@ static void uprobe_subtest(struct test_bpf_cookie *skel)
        DECLARE_LIBBPF_OPTS(bpf_uprobe_opts, opts);
        struct bpf_link *link1 = NULL, *link2 = NULL;
        struct bpf_link *retlink1 = NULL, *retlink2 = NULL;
-       size_t uprobe_offset;
-       ssize_t base_addr;
+       ssize_t uprobe_offset;
 
-       base_addr = get_base_addr();
-       uprobe_offset = get_uprobe_offset(&get_base_addr, base_addr);
+       uprobe_offset = get_uprobe_offset(&trigger_func);
+       if (!ASSERT_GE(uprobe_offset, 0, "uprobe_offset"))
+               goto cleanup;
 
        /* attach two uprobes */
        opts.bpf_cookie = 0x100;
@@ -99,7 +105,7 @@ static void uprobe_subtest(struct test_bpf_cookie *skel)
                goto cleanup;
 
        /* trigger uprobe && uretprobe */
-       get_base_addr();
+       trigger_func();
 
        ASSERT_EQ(skel->bss->uprobe_res, 0x100 | 0x200, "uprobe_res");
        ASSERT_EQ(skel->bss->uretprobe_res, 0x1000 | 0x2000, "uretprobe_res");
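
The cookie subtest relies on the opts-based attach API so each attachment carries a distinct bpf_cookie, which the BPF program reads back with bpf_get_attach_cookie(). The attach side in isolation, as a sketch with placeholder program and offset names:

    DECLARE_LIBBPF_OPTS(bpf_uprobe_opts, opts,
            .bpf_cookie = 0x100,
            .retprobe = false,
    );
    struct bpf_link *link;

    link = bpf_program__attach_uprobe_opts(skel->progs.handle_uprobe,
                                           0 /* this process */,
                                           "/proc/self/exe",
                                           uprobe_offset, &opts);
    if (!ASSERT_OK_PTR(link, "attach_uprobe_opts"))
            return;
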
index b84f859..5142a7d 100644 (file)
@@ -138,6 +138,24 @@ static void test_task(void)
        bpf_iter_task__destroy(skel);
 }
 
+static void test_task_sleepable(void)
+{
+       struct bpf_iter_task *skel;
+
+       skel = bpf_iter_task__open_and_load();
+       if (!ASSERT_OK_PTR(skel, "bpf_iter_task__open_and_load"))
+               return;
+
+       do_dummy_read(skel->progs.dump_task_sleepable);
+
+       ASSERT_GT(skel->bss->num_expected_failure_copy_from_user_task, 0,
+                 "num_expected_failure_copy_from_user_task");
+       ASSERT_GT(skel->bss->num_success_copy_from_user_task, 0,
+                 "num_success_copy_from_user_task");
+
+       bpf_iter_task__destroy(skel);
+}
+
 static void test_task_stack(void)
 {
        struct bpf_iter_task_stack *skel;
@@ -1252,6 +1270,8 @@ void test_bpf_iter(void)
                test_bpf_map();
        if (test__start_subtest("task"))
                test_task();
+       if (test__start_subtest("task_sleepable"))
+               test_task_sleepable();
        if (test__start_subtest("task_stack"))
                test_task_stack();
        if (test__start_subtest("task_file"))
diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_iter_setsockopt_unix.c b/tools/testing/selftests/bpf/prog_tests/bpf_iter_setsockopt_unix.c
new file mode 100644 (file)
index 0000000..ee725d4
--- /dev/null
@@ -0,0 +1,100 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright Amazon.com Inc. or its affiliates. */
+#include <sys/socket.h>
+#include <sys/un.h>
+#include <test_progs.h>
+#include "bpf_iter_setsockopt_unix.skel.h"
+
+#define NR_CASES 5
+
+static int create_unix_socket(struct bpf_iter_setsockopt_unix *skel)
+{
+       struct sockaddr_un addr = {
+               .sun_family = AF_UNIX,
+               .sun_path = "",
+       };
+       socklen_t len;
+       int fd, err;
+
+       fd = socket(AF_UNIX, SOCK_STREAM, 0);
+       if (!ASSERT_NEQ(fd, -1, "socket"))
+               return -1;
+
+       len = offsetof(struct sockaddr_un, sun_path);
+       err = bind(fd, (struct sockaddr *)&addr, len);
+       if (!ASSERT_OK(err, "bind"))
+               return -1;
+
+       len = sizeof(addr);
+       err = getsockname(fd, (struct sockaddr *)&addr, &len);
+       if (!ASSERT_OK(err, "getsockname"))
+               return -1;
+
+       memcpy(&skel->bss->sun_path, &addr.sun_path,
+              len - offsetof(struct sockaddr_un, sun_path));
+
+       return fd;
+}
+
+static void test_sndbuf(struct bpf_iter_setsockopt_unix *skel, int fd)
+{
+       socklen_t optlen;
+       int i, err;
+
+       for (i = 0; i < NR_CASES; i++) {
+               if (!ASSERT_NEQ(skel->data->sndbuf_getsockopt[i], -1,
+                               "bpf_(get|set)sockopt"))
+                       return;
+
+               err = setsockopt(fd, SOL_SOCKET, SO_SNDBUF,
+                                &(skel->data->sndbuf_setsockopt[i]),
+                                sizeof(skel->data->sndbuf_setsockopt[i]));
+               if (!ASSERT_OK(err, "setsockopt"))
+                       return;
+
+               optlen = sizeof(skel->bss->sndbuf_getsockopt_expected[i]);
+               err = getsockopt(fd, SOL_SOCKET, SO_SNDBUF,
+                                &(skel->bss->sndbuf_getsockopt_expected[i]),
+                                &optlen);
+               if (!ASSERT_OK(err, "getsockopt"))
+                       return;
+
+               if (!ASSERT_EQ(skel->data->sndbuf_getsockopt[i],
+                              skel->bss->sndbuf_getsockopt_expected[i],
+                              "bpf_(get|set)sockopt"))
+                       return;
+       }
+}
+
+void test_bpf_iter_setsockopt_unix(void)
+{
+       struct bpf_iter_setsockopt_unix *skel;
+       int err, unix_fd, iter_fd;
+       char buf;
+
+       skel = bpf_iter_setsockopt_unix__open_and_load();
+       if (!ASSERT_OK_PTR(skel, "open_and_load"))
+               return;
+
+       unix_fd = create_unix_socket(skel);
+       if (!ASSERT_NEQ(unix_fd, -1, "create_unix_server"))
+               goto destroy;
+
+       skel->links.change_sndbuf = bpf_program__attach_iter(skel->progs.change_sndbuf, NULL);
+       if (!ASSERT_OK_PTR(skel->links.change_sndbuf, "bpf_program__attach_iter"))
+               goto destroy;
+
+       iter_fd = bpf_iter_create(bpf_link__fd(skel->links.change_sndbuf));
+       if (!ASSERT_GE(iter_fd, 0, "bpf_iter_create"))
+               goto destroy;
+
+       while ((err = read(iter_fd, &buf, sizeof(buf))) == -1 &&
+              errno == EAGAIN)
+               ;
+       if (!ASSERT_OK(err, "read iter error"))
+               goto destroy;
+
+       test_sndbuf(skel, unix_fd);
+destroy:
+       bpf_iter_setsockopt_unix__destroy(skel);
+}
diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_mod_race.c b/tools/testing/selftests/bpf/prog_tests/bpf_mod_race.c
new file mode 100644 (file)
index 0000000..d43f548
--- /dev/null
@@ -0,0 +1,230 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <unistd.h>
+#include <pthread.h>
+#include <sys/mman.h>
+#include <stdatomic.h>
+#include <test_progs.h>
+#include <sys/syscall.h>
+#include <linux/module.h>
+#include <linux/userfaultfd.h>
+
+#include "ksym_race.skel.h"
+#include "bpf_mod_race.skel.h"
+#include "kfunc_call_race.skel.h"
+
+/* This test crafts a race between btf_try_get_module and do_init_module, and
+ * checks whether btf_try_get_module handles the invocation for a well-formed
+ * but uninitialized module correctly. Unless the module has completed its
+ * initcalls, the verifier should fail the program load and return ENXIO.
+ *
+ * userfaultfd is used to trigger a fault in an fmod_ret program and make it
+ * sleep; then the BPF program is loaded and the return value from the
+ * verifier is inspected. After this, the userfaultfd is closed so that the
+ * module loading thread makes forward progress, and fmod_ret injects an
+ * error so that the module load fails and it is freed.
+ *
+ * If the verifier succeeds in loading the supplied program, it will end up
+ * taking a reference to the freed module and trigger a crash when the
+ * program fd is closed later. This is true for both kfuncs and ksyms. In
+ * both cases, the crash is triggered inside bpf_prog_free_deferred, when
+ * the module reference is finally released.
+ */
+
+struct test_config {
+       const char *str_open;
+       void *(*bpf_open_and_load)();
+       void (*bpf_destroy)(void *);
+};
+
+enum test_state {
+       _TS_INVALID,
+       TS_MODULE_LOAD,
+       TS_MODULE_LOAD_FAIL,
+};
+
+static _Atomic enum test_state state = _TS_INVALID;
+
+static int sys_finit_module(int fd, const char *param_values, int flags)
+{
+       return syscall(__NR_finit_module, fd, param_values, flags);
+}
+
+static int sys_delete_module(const char *name, unsigned int flags)
+{
+       return syscall(__NR_delete_module, name, flags);
+}
+
+static int load_module(const char *mod)
+{
+       int ret, fd;
+
+       fd = open("bpf_testmod.ko", O_RDONLY);
+       if (fd < 0)
+               return fd;
+
+       ret = sys_finit_module(fd, "", 0);
+       close(fd);
+       if (ret < 0)
+               return ret;
+       return 0;
+}
+
+static void *load_module_thread(void *p)
+{
+       if (!ASSERT_NEQ(load_module("bpf_testmod.ko"), 0, "load_module_thread must fail"))
+               atomic_store(&state, TS_MODULE_LOAD);
+       else
+               atomic_store(&state, TS_MODULE_LOAD_FAIL);
+       return p;
+}
+
+static int sys_userfaultfd(int flags)
+{
+       return syscall(__NR_userfaultfd, flags);
+}
+
+static int test_setup_uffd(void *fault_addr)
+{
+       struct uffdio_register uffd_register = {};
+       struct uffdio_api uffd_api = {};
+       int uffd;
+
+       uffd = sys_userfaultfd(O_CLOEXEC);
+       if (uffd < 0)
+               return -errno;
+
+       uffd_api.api = UFFD_API;
+       uffd_api.features = 0;
+       if (ioctl(uffd, UFFDIO_API, &uffd_api)) {
+               close(uffd);
+               return -1;
+       }
+
+       uffd_register.range.start = (unsigned long)fault_addr;
+       uffd_register.range.len = 4096;
+       uffd_register.mode = UFFDIO_REGISTER_MODE_MISSING;
+       if (ioctl(uffd, UFFDIO_REGISTER, &uffd_register)) {
+               close(uffd);
+               return -1;
+       }
+       return uffd;
+}
+
+static void test_bpf_mod_race_config(const struct test_config *config)
+{
+       void *fault_addr, *skel_fail;
+       struct bpf_mod_race *skel;
+       struct uffd_msg uffd_msg;
+       pthread_t load_mod_thrd;
+       _Atomic int *blockingp;
+       int uffd, ret;
+
+       fault_addr = mmap(0, 4096, PROT_READ, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+       if (!ASSERT_NEQ(fault_addr, MAP_FAILED, "mmap for uffd registration"))
+               return;
+
+       if (!ASSERT_OK(sys_delete_module("bpf_testmod", 0), "unload bpf_testmod"))
+               goto end_mmap;
+
+       skel = bpf_mod_race__open();
+       if (!ASSERT_OK_PTR(skel, "bpf_mod_kfunc_race__open"))
+               goto end_module;
+
+       skel->rodata->bpf_mod_race_config.tgid = getpid();
+       skel->rodata->bpf_mod_race_config.inject_error = -4242;
+       skel->rodata->bpf_mod_race_config.fault_addr = fault_addr;
+       if (!ASSERT_OK(bpf_mod_race__load(skel), "bpf_mod___load"))
+               goto end_destroy;
+       blockingp = (_Atomic int *)&skel->bss->bpf_blocking;
+
+       if (!ASSERT_OK(bpf_mod_race__attach(skel), "bpf_mod_kfunc_race__attach"))
+               goto end_destroy;
+
+       uffd = test_setup_uffd(fault_addr);
+       if (!ASSERT_GE(uffd, 0, "userfaultfd open + register address"))
+               goto end_destroy;
+
+       if (!ASSERT_OK(pthread_create(&load_mod_thrd, NULL, load_module_thread, NULL),
+                      "load module thread"))
+               goto end_uffd;
+
+       /* Now we either fail loading the module or block in the BPF prog; spin to find out */
+       while (!atomic_load(&state) && !atomic_load(blockingp))
+               ;
+       if (!ASSERT_EQ(state, _TS_INVALID, "module load should block"))
+               goto end_join;
+       if (!ASSERT_EQ(*blockingp, 1, "module load blocked")) {
+               pthread_kill(load_mod_thrd, SIGKILL);
+               goto end_uffd;
+       }
+
+       /* We might have set bpf_blocking to 1, but may not have blocked in
+        * bpf_copy_from_user yet. Read the userfaultfd descriptor to verify
+        * that.
+        */
+       if (!ASSERT_EQ(read(uffd, &uffd_msg, sizeof(uffd_msg)), sizeof(uffd_msg),
+                      "read uffd block event"))
+               goto end_join;
+       if (!ASSERT_EQ(uffd_msg.event, UFFD_EVENT_PAGEFAULT, "read uffd event is pagefault"))
+               goto end_join;
+
+       /* We know that load_mod_thrd is blocked in the fmod_ret program and
+        * that the module state is still MODULE_STATE_COMING because
+        * mod->init hasn't returned. This is the moment to try loading a
+        * program that calls the kfunc and check that the verifier returns
+        * ENXIO.
+        */
+       skel_fail = config->bpf_open_and_load();
+       ret = errno;
+       if (!ASSERT_EQ(skel_fail, NULL, config->str_open)) {
+               /* Close uffd to unblock load_mod_thrd */
+               close(uffd);
+               uffd = -1;
+               while (atomic_load(blockingp) != 2)
+                       ;
+               ASSERT_OK(kern_sync_rcu(), "kern_sync_rcu");
+               config->bpf_destroy(skel_fail);
+               goto end_join;
+       }
+       ASSERT_EQ(ret, ENXIO, "verifier returns ENXIO");
+       ASSERT_EQ(skel->data->res_try_get_module, false, "btf_try_get_module == false");
+
+       close(uffd);
+       uffd = -1;
+end_join:
+       pthread_join(load_mod_thrd, NULL);
+       if (uffd < 0)
+               ASSERT_EQ(atomic_load(&state), TS_MODULE_LOAD_FAIL, "load_mod_thrd success");
+end_uffd:
+       if (uffd >= 0)
+               close(uffd);
+end_destroy:
+       bpf_mod_race__destroy(skel);
+       ASSERT_OK(kern_sync_rcu(), "kern_sync_rcu");
+end_module:
+       sys_delete_module("bpf_testmod", 0);
+       ASSERT_OK(load_module("bpf_testmod.ko"), "restore bpf_testmod");
+end_mmap:
+       munmap(fault_addr, 4096);
+       atomic_store(&state, _TS_INVALID);
+}
+
+static const struct test_config ksym_config = {
+       .str_open = "ksym_race__open_and_load",
+       .bpf_open_and_load = (void *)ksym_race__open_and_load,
+       .bpf_destroy = (void *)ksym_race__destroy,
+};
+
+static const struct test_config kfunc_config = {
+       .str_open = "kfunc_call_race__open_and_load",
+       .bpf_open_and_load = (void *)kfunc_call_race__open_and_load,
+       .bpf_destroy = (void *)kfunc_call_race__destroy,
+};
+
+void serial_test_bpf_mod_race(void)
+{
+       if (test__start_subtest("ksym (used_btfs UAF)"))
+               test_bpf_mod_race_config(&ksym_config);
+       if (test__start_subtest("kfunc (kfunc_btf_tab UAF)"))
+               test_bpf_mod_race_config(&kfunc_config);
+}
diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_nf.c b/tools/testing/selftests/bpf/prog_tests/bpf_nf.c
new file mode 100644 (file)
index 0000000..dd30b1e
--- /dev/null
@@ -0,0 +1,52 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <test_progs.h>
+#include <network_helpers.h>
+#include "test_bpf_nf.skel.h"
+
+enum {
+       TEST_XDP,
+       TEST_TC_BPF,
+};
+
+void test_bpf_nf_ct(int mode)
+{
+       struct test_bpf_nf *skel;
+       int prog_fd, err;
+       LIBBPF_OPTS(bpf_test_run_opts, topts,
+               .data_in = &pkt_v4,
+               .data_size_in = sizeof(pkt_v4),
+               .repeat = 1,
+       );
+
+       skel = test_bpf_nf__open_and_load();
+       if (!ASSERT_OK_PTR(skel, "test_bpf_nf__open_and_load"))
+               return;
+
+       if (mode == TEST_XDP)
+               prog_fd = bpf_program__fd(skel->progs.nf_xdp_ct_test);
+       else
+               prog_fd = bpf_program__fd(skel->progs.nf_skb_ct_test);
+
+       err = bpf_prog_test_run_opts(prog_fd, &topts);
+       if (!ASSERT_OK(err, "bpf_prog_test_run"))
+               goto end;
+
+       ASSERT_EQ(skel->bss->test_einval_bpf_tuple, -EINVAL, "Test EINVAL for NULL bpf_tuple");
+       ASSERT_EQ(skel->bss->test_einval_reserved, -EINVAL, "Test EINVAL for reserved not set to 0");
+       ASSERT_EQ(skel->bss->test_einval_netns_id, -EINVAL, "Test EINVAL for netns_id < -1");
+       ASSERT_EQ(skel->bss->test_einval_len_opts, -EINVAL, "Test EINVAL for len__opts != NF_BPF_CT_OPTS_SZ");
+       ASSERT_EQ(skel->bss->test_eproto_l4proto, -EPROTO, "Test EPROTO for l4proto != TCP or UDP");
+       ASSERT_EQ(skel->bss->test_enonet_netns_id, -ENONET, "Test ENONET for bad but valid netns_id");
+       ASSERT_EQ(skel->bss->test_enoent_lookup, -ENOENT, "Test ENOENT for failed lookup");
+       ASSERT_EQ(skel->bss->test_eafnosupport, -EAFNOSUPPORT, "Test EAFNOSUPPORT for invalid len__tuple");
+end:
+       test_bpf_nf__destroy(skel);
+}
+
+void test_bpf_nf(void)
+{
+       if (test__start_subtest("xdp-ct"))
+               test_bpf_nf_ct(TEST_XDP);
+       if (test__start_subtest("tc-bpf-ct"))
+               test_bpf_nf_ct(TEST_TC_BPF);
+}
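
The BPF side of this test (progs/test_bpf_nf.c, not part of this excerpt) calls the new conntrack lookup kfuncs. A sketch of the XDP flavor; the bpf_ct_opts field layout here is my assumption of the interface at this point, so treat it as illustrative only:

    #include "vmlinux.h"
    #include <bpf/bpf_helpers.h>

    char _license[] SEC("license") = "GPL";

    struct bpf_ct_opts___local {
            s32 netns_id;   /* -1 selects the current netns */
            s32 error;
            u8 l4proto;
            u8 reserved[3];
    };

    extern struct nf_conn *
    bpf_xdp_ct_lookup(struct xdp_md *xdp_ctx, struct bpf_sock_tuple *bpf_tuple,
                      u32 tuple__sz, struct bpf_ct_opts___local *opts,
                      u32 opts__sz) __ksym;
    extern void bpf_ct_release(struct nf_conn *ct) __ksym;

    SEC("xdp")
    int nf_xdp_ct_test(struct xdp_md *ctx)
    {
            struct bpf_sock_tuple tup = {};
            struct bpf_ct_opts___local opts = {
                    .netns_id = -1,
                    .l4proto = IPPROTO_TCP,
            };
            struct nf_conn *ct;

            ct = bpf_xdp_ct_lookup(ctx, &tup, sizeof(tup.ipv4),
                                   &opts, sizeof(opts));
            if (ct)
                    bpf_ct_release(ct); /* drop the acquired reference */
            return XDP_PASS;
    }

The userspace asserts above then check each error path in turn: -EINVAL for a NULL tuple or bad opts, -EPROTO for an unsupported l4proto, -ENOENT for a failed lookup, and so on.
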
index 8ba53ac..8b652f5 100644 (file)
@@ -3939,6 +3939,25 @@ static struct btf_raw_test raw_tests[] = {
        .err_str = "Invalid component_idx",
 },
 {
+       .descr = "decl_tag test #15, func, invalid func proto",
+       .raw_types = {
+               BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4),  /* [1] */
+               BTF_DECL_TAG_ENC(NAME_TBD, 3, 0),               /* [2] */
+               BTF_FUNC_ENC(NAME_TBD, 8),                      /* [3] */
+               BTF_END_RAW,
+       },
+       BTF_STR_SEC("\0tag\0func"),
+       .map_type = BPF_MAP_TYPE_ARRAY,
+       .map_name = "tag_type_check_btf",
+       .key_size = sizeof(int),
+       .value_size = 4,
+       .key_type_id = 1,
+       .value_type_id = 1,
+       .max_entries = 1,
+       .btf_load_err = true,
+       .err_str = "Invalid type_id",
+},
+{
        .descr = "type_tag test #1",
        .raw_types = {
                BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4),  /* [1] */
@@ -4560,6 +4579,8 @@ static void do_test_file(unsigned int test_num)
        has_btf_ext = btf_ext != NULL;
        btf_ext__free(btf_ext);
 
+       /* temporarily disable LIBBPF_STRICT_MAP_DEFINITIONS to test legacy maps */
+       libbpf_set_strict_mode(LIBBPF_STRICT_ALL & ~LIBBPF_STRICT_MAP_DEFINITIONS);
        obj = bpf_object__open(test->file);
        err = libbpf_get_error(obj);
        if (CHECK(err, "obj: %d", err))
@@ -4684,6 +4705,8 @@ skip:
        fprintf(stderr, "OK");
 
 done:
+       libbpf_set_strict_mode(LIBBPF_STRICT_ALL);
+
        btf__free(btf);
        free(func_info);
        bpf_object__close(obj);
index 88d63e2..f7560b5 100644 (file)
@@ -1,19 +1,21 @@
 // SPDX-License-Identifier: GPL-2.0
 /* Copyright (c) 2021 Facebook */
 #include <test_progs.h>
-#include "btf_decl_tag.skel.h"
+#include <bpf/btf.h>
+#include "test_btf_decl_tag.skel.h"
 
 /* struct btf_type_tag_test is referenced in btf_type_tag.skel.h */
 struct btf_type_tag_test {
         int **p;
 };
 #include "btf_type_tag.skel.h"
+#include "btf_type_tag_user.skel.h"
 
 static void test_btf_decl_tag(void)
 {
-       struct btf_decl_tag *skel;
+       struct test_btf_decl_tag *skel;
 
-       skel = btf_decl_tag__open_and_load();
+       skel = test_btf_decl_tag__open_and_load();
        if (!ASSERT_OK_PTR(skel, "btf_decl_tag"))
                return;
 
@@ -22,7 +24,7 @@ static void test_btf_decl_tag(void)
                test__skip();
        }
 
-       btf_decl_tag__destroy(skel);
+       test_btf_decl_tag__destroy(skel);
 }
 
 static void test_btf_type_tag(void)
@@ -41,10 +43,101 @@ static void test_btf_type_tag(void)
        btf_type_tag__destroy(skel);
 }
 
+static void test_btf_type_tag_mod_user(bool load_test_user1)
+{
+       const char *module_name = "bpf_testmod";
+       struct btf *vmlinux_btf, *module_btf;
+       struct btf_type_tag_user *skel;
+       __s32 type_id;
+       int err;
+
+       if (!env.has_testmod) {
+               test__skip();
+               return;
+       }
+
+       /* skip the test if the module does not have __user tags */
+       vmlinux_btf = btf__load_vmlinux_btf();
+       if (!ASSERT_OK_PTR(vmlinux_btf, "could not load vmlinux BTF"))
+               return;
+
+       module_btf = btf__load_module_btf(module_name, vmlinux_btf);
+       if (!ASSERT_OK_PTR(module_btf, "could not load module BTF"))
+               goto free_vmlinux_btf;
+
+       type_id = btf__find_by_name_kind(module_btf, "user", BTF_KIND_TYPE_TAG);
+       if (type_id <= 0) {
+               printf("%s:SKIP: btf_type_tag attribute not in %s", __func__, module_name);
+               test__skip();
+               goto free_module_btf;
+       }
+
+       skel = btf_type_tag_user__open();
+       if (!ASSERT_OK_PTR(skel, "btf_type_tag_user"))
+               goto free_module_btf;
+
+       bpf_program__set_autoload(skel->progs.test_sys_getsockname, false);
+       if (load_test_user1)
+               bpf_program__set_autoload(skel->progs.test_user2, false);
+       else
+               bpf_program__set_autoload(skel->progs.test_user1, false);
+
+       err = btf_type_tag_user__load(skel);
+       ASSERT_ERR(err, "btf_type_tag_user");
+
+       btf_type_tag_user__destroy(skel);
+
+free_module_btf:
+       btf__free(module_btf);
+free_vmlinux_btf:
+       btf__free(vmlinux_btf);
+}
+
+static void test_btf_type_tag_vmlinux_user(void)
+{
+       struct btf_type_tag_user *skel;
+       struct btf *vmlinux_btf;
+       __s32 type_id;
+       int err;
+
+       /* skip the test if the vmlinux does not have __user tags */
+       vmlinux_btf = btf__load_vmlinux_btf();
+       if (!ASSERT_OK_PTR(vmlinux_btf, "could not load vmlinux BTF"))
+               return;
+
+       type_id = btf__find_by_name_kind(vmlinux_btf, "user", BTF_KIND_TYPE_TAG);
+       if (type_id <= 0) {
+               printf("%s:SKIP: btf_type_tag attribute not in vmlinux btf", __func__);
+               test__skip();
+               goto free_vmlinux_btf;
+       }
+
+       skel = btf_type_tag_user__open();
+       if (!ASSERT_OK_PTR(skel, "btf_type_tag_user"))
+               goto free_vmlinux_btf;
+
+       bpf_program__set_autoload(skel->progs.test_user2, false);
+       bpf_program__set_autoload(skel->progs.test_user1, false);
+
+       err = btf_type_tag_user__load(skel);
+       ASSERT_ERR(err, "btf_type_tag_user");
+
+       btf_type_tag_user__destroy(skel);
+
+free_vmlinux_btf:
+       btf__free(vmlinux_btf);
+}
+
 void test_btf_tag(void)
 {
        if (test__start_subtest("btf_decl_tag"))
                test_btf_decl_tag();
        if (test__start_subtest("btf_type_tag"))
                test_btf_type_tag();
+       if (test__start_subtest("btf_type_tag_user_mod1"))
+               test_btf_type_tag_mod_user(true);
+       if (test__start_subtest("btf_type_tag_user_mod2"))
+               test_btf_type_tag_mod_user(false);
+       if (test__start_subtest("btf_type_tag_sys_user_vmlinux"))
+               test_btf_type_tag_vmlinux_user();
 }
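
The mod1/mod2/vmlinux subtests all assert a *failed* load: a program that directly dereferences a __user-tagged pointer must be rejected by the verifier. A sketch of such a rejected program; this is my reconstruction, the real ones presumably live in progs/btf_type_tag_user.c:

    #include "vmlinux.h"
    #include <bpf/bpf_helpers.h>
    #include <bpf/bpf_tracing.h>

    char _license[] SEC("license") = "GPL";

    struct bpf_testmod_btf_type_tag_1 {
            int a;
    };

    int g;

    SEC("fentry/bpf_testmod_test_btf_type_tag_user_1")
    int BPF_PROG(test_user1, struct bpf_testmod_btf_type_tag_1 *arg)
    {
            /* direct dereference of a __user-tagged pointer: the verifier
             * rejects this; the program would have to use
             * bpf_probe_read_user() instead
             */
            g = arg->a;
            return 0;
    }
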
index d3e8f72..38b3c47 100644 (file)
@@ -194,14 +194,14 @@ void serial_test_cgroup_attach_multi(void)
 
        attach_opts.flags = BPF_F_ALLOW_OVERRIDE | BPF_F_REPLACE;
        attach_opts.replace_prog_fd = allow_prog[0];
-       if (CHECK(!bpf_prog_attach_xattr(allow_prog[6], cg1,
+       if (CHECK(!bpf_prog_attach_opts(allow_prog[6], cg1,
                                         BPF_CGROUP_INET_EGRESS, &attach_opts),
                  "fail_prog_replace_override", "unexpected success\n"))
                goto err;
        CHECK_FAIL(errno != EINVAL);
 
        attach_opts.flags = BPF_F_REPLACE;
-       if (CHECK(!bpf_prog_attach_xattr(allow_prog[6], cg1,
+       if (CHECK(!bpf_prog_attach_opts(allow_prog[6], cg1,
                                         BPF_CGROUP_INET_EGRESS, &attach_opts),
                  "fail_prog_replace_no_multi", "unexpected success\n"))
                goto err;
@@ -209,7 +209,7 @@ void serial_test_cgroup_attach_multi(void)
 
        attach_opts.flags = BPF_F_ALLOW_MULTI | BPF_F_REPLACE;
        attach_opts.replace_prog_fd = -1;
-       if (CHECK(!bpf_prog_attach_xattr(allow_prog[6], cg1,
+       if (CHECK(!bpf_prog_attach_opts(allow_prog[6], cg1,
                                         BPF_CGROUP_INET_EGRESS, &attach_opts),
                  "fail_prog_replace_bad_fd", "unexpected success\n"))
                goto err;
@@ -217,7 +217,7 @@ void serial_test_cgroup_attach_multi(void)
 
        /* replacing a program that is not attached to cgroup should fail  */
        attach_opts.replace_prog_fd = allow_prog[3];
-       if (CHECK(!bpf_prog_attach_xattr(allow_prog[6], cg1,
+       if (CHECK(!bpf_prog_attach_opts(allow_prog[6], cg1,
                                         BPF_CGROUP_INET_EGRESS, &attach_opts),
                  "fail_prog_replace_no_ent", "unexpected success\n"))
                goto err;
@@ -225,14 +225,14 @@ void serial_test_cgroup_attach_multi(void)
 
        /* replace 1st from the top program */
        attach_opts.replace_prog_fd = allow_prog[0];
-       if (CHECK(bpf_prog_attach_xattr(allow_prog[6], cg1,
+       if (CHECK(bpf_prog_attach_opts(allow_prog[6], cg1,
                                        BPF_CGROUP_INET_EGRESS, &attach_opts),
                  "prog_replace", "errno=%d\n", errno))
                goto err;
 
        /* replace program with itself */
        attach_opts.replace_prog_fd = allow_prog[6];
-       if (CHECK(bpf_prog_attach_xattr(allow_prog[6], cg1,
+       if (CHECK(bpf_prog_attach_opts(allow_prog[6], cg1,
                                        BPF_CGROUP_INET_EGRESS, &attach_opts),
                  "prog_replace", "errno=%d\n", errno))
                goto err;
diff --git a/tools/testing/selftests/bpf/prog_tests/cgroup_getset_retval.c b/tools/testing/selftests/bpf/prog_tests/cgroup_getset_retval.c
new file mode 100644 (file)
index 0000000..0b47c3c
--- /dev/null
@@ -0,0 +1,481 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+/*
+ * Copyright 2021 Google LLC.
+ */
+
+#include <test_progs.h>
+#include <cgroup_helpers.h>
+#include <network_helpers.h>
+
+#include "cgroup_getset_retval_setsockopt.skel.h"
+#include "cgroup_getset_retval_getsockopt.skel.h"
+
+#define SOL_CUSTOM     0xdeadbeef
+
+static int zero;
+
+static void test_setsockopt_set(int cgroup_fd, int sock_fd)
+{
+       struct cgroup_getset_retval_setsockopt *obj;
+       struct bpf_link *link_set_eunatch = NULL;
+
+       obj = cgroup_getset_retval_setsockopt__open_and_load();
+       if (!ASSERT_OK_PTR(obj, "skel-load"))
+               return;
+
+       /* Attach setsockopt that sets EUNATCH and assert that
+        * we actually get that error when we run setsockopt()
+        */
+       link_set_eunatch = bpf_program__attach_cgroup(obj->progs.set_eunatch,
+                                                     cgroup_fd);
+       if (!ASSERT_OK_PTR(link_set_eunatch, "cg-attach-set_eunatch"))
+               goto close_bpf_object;
+
+       if (!ASSERT_ERR(setsockopt(sock_fd, SOL_SOCKET, SO_REUSEADDR,
+                                  &zero, sizeof(int)), "setsockopt"))
+               goto close_bpf_object;
+       if (!ASSERT_EQ(errno, EUNATCH, "setsockopt-errno"))
+               goto close_bpf_object;
+
+       if (!ASSERT_EQ(obj->bss->invocations, 1, "invocations"))
+               goto close_bpf_object;
+       if (!ASSERT_FALSE(obj->bss->assertion_error, "assertion_error"))
+               goto close_bpf_object;
+
+close_bpf_object:
+       bpf_link__destroy(link_set_eunatch);
+
+       cgroup_getset_retval_setsockopt__destroy(obj);
+}
+
+static void test_setsockopt_set_and_get(int cgroup_fd, int sock_fd)
+{
+       struct cgroup_getset_retval_setsockopt *obj;
+       struct bpf_link *link_set_eunatch = NULL, *link_get_retval = NULL;
+
+       obj = cgroup_getset_retval_setsockopt__open_and_load();
+       if (!ASSERT_OK_PTR(obj, "skel-load"))
+               return;
+
+       /* Attach setsockopt that sets EUNATCH, and one that gets the
+        * previously set errno. Assert that we get the same errno back.
+        */
+       link_set_eunatch = bpf_program__attach_cgroup(obj->progs.set_eunatch,
+                                                     cgroup_fd);
+       if (!ASSERT_OK_PTR(link_set_eunatch, "cg-attach-set_eunatch"))
+               goto close_bpf_object;
+       link_get_retval = bpf_program__attach_cgroup(obj->progs.get_retval,
+                                                    cgroup_fd);
+       if (!ASSERT_OK_PTR(link_get_retval, "cg-attach-get_retval"))
+               goto close_bpf_object;
+
+       if (!ASSERT_ERR(setsockopt(sock_fd, SOL_SOCKET, SO_REUSEADDR,
+                                  &zero, sizeof(int)), "setsockopt"))
+               goto close_bpf_object;
+       if (!ASSERT_EQ(errno, EUNATCH, "setsockopt-errno"))
+               goto close_bpf_object;
+
+       if (!ASSERT_EQ(obj->bss->invocations, 2, "invocations"))
+               goto close_bpf_object;
+       if (!ASSERT_FALSE(obj->bss->assertion_error, "assertion_error"))
+               goto close_bpf_object;
+       if (!ASSERT_EQ(obj->bss->retval_value, -EUNATCH, "retval_value"))
+               goto close_bpf_object;
+
+close_bpf_object:
+       bpf_link__destroy(link_set_eunatch);
+       bpf_link__destroy(link_get_retval);
+
+       cgroup_getset_retval_setsockopt__destroy(obj);
+}
+
+static void test_setsockopt_default_zero(int cgroup_fd, int sock_fd)
+{
+       struct cgroup_getset_retval_setsockopt *obj;
+       struct bpf_link *link_get_retval = NULL;
+
+       obj = cgroup_getset_retval_setsockopt__open_and_load();
+       if (!ASSERT_OK_PTR(obj, "skel-load"))
+               return;
+
+       /* Attach setsockopt that gets the previously set errno.
+        * Assert that, without anything setting one, we get 0.
+        */
+       link_get_retval = bpf_program__attach_cgroup(obj->progs.get_retval,
+                                                    cgroup_fd);
+       if (!ASSERT_OK_PTR(link_get_retval, "cg-attach-get_retval"))
+               goto close_bpf_object;
+
+       if (!ASSERT_OK(setsockopt(sock_fd, SOL_SOCKET, SO_REUSEADDR,
+                                 &zero, sizeof(int)), "setsockopt"))
+               goto close_bpf_object;
+
+       if (!ASSERT_EQ(obj->bss->invocations, 1, "invocations"))
+               goto close_bpf_object;
+       if (!ASSERT_FALSE(obj->bss->assertion_error, "assertion_error"))
+               goto close_bpf_object;
+       if (!ASSERT_EQ(obj->bss->retval_value, 0, "retval_value"))
+               goto close_bpf_object;
+
+close_bpf_object:
+       bpf_link__destroy(link_get_retval);
+
+       cgroup_getset_retval_setsockopt__destroy(obj);
+}
+
+static void test_setsockopt_default_zero_and_set(int cgroup_fd, int sock_fd)
+{
+       struct cgroup_getset_retval_setsockopt *obj;
+       struct bpf_link *link_get_retval = NULL, *link_set_eunatch = NULL;
+
+       obj = cgroup_getset_retval_setsockopt__open_and_load();
+       if (!ASSERT_OK_PTR(obj, "skel-load"))
+               return;
+
+       /* Attach setsockopt that gets the previously set errno, and then
+        * one that sets the errno to EUNATCH. Assert that the get does not
+        * see EUNATCH set later, and does not prevent EUNATCH from being set.
+        */
+       link_get_retval = bpf_program__attach_cgroup(obj->progs.get_retval,
+                                                    cgroup_fd);
+       if (!ASSERT_OK_PTR(link_get_retval, "cg-attach-get_retval"))
+               goto close_bpf_object;
+       link_set_eunatch = bpf_program__attach_cgroup(obj->progs.set_eunatch,
+                                                     cgroup_fd);
+       if (!ASSERT_OK_PTR(link_set_eunatch, "cg-attach-set_eunatch"))
+               goto close_bpf_object;
+
+       if (!ASSERT_ERR(setsockopt(sock_fd, SOL_SOCKET, SO_REUSEADDR,
+                                  &zero, sizeof(int)), "setsockopt"))
+               goto close_bpf_object;
+       if (!ASSERT_EQ(errno, EUNATCH, "setsockopt-errno"))
+               goto close_bpf_object;
+
+       if (!ASSERT_EQ(obj->bss->invocations, 2, "invocations"))
+               goto close_bpf_object;
+       if (!ASSERT_FALSE(obj->bss->assertion_error, "assertion_error"))
+               goto close_bpf_object;
+       if (!ASSERT_EQ(obj->bss->retval_value, 0, "retval_value"))
+               goto close_bpf_object;
+
+close_bpf_object:
+       bpf_link__destroy(link_get_retval);
+       bpf_link__destroy(link_set_eunatch);
+
+       cgroup_getset_retval_setsockopt__destroy(obj);
+}
+
+static void test_setsockopt_override(int cgroup_fd, int sock_fd)
+{
+       struct cgroup_getset_retval_setsockopt *obj;
+       struct bpf_link *link_set_eunatch = NULL, *link_set_eisconn = NULL;
+       struct bpf_link *link_get_retval = NULL;
+
+       obj = cgroup_getset_retval_setsockopt__open_and_load();
+       if (!ASSERT_OK_PTR(obj, "skel-load"))
+               return;
+
+       /* Attach setsockopt that sets EUNATCH, then one that sets EISCONN,
+        * and then one that gets the exported errno. Assert both the syscall
+        * and the helper sees the last set errno.
+        */
+       link_set_eunatch = bpf_program__attach_cgroup(obj->progs.set_eunatch,
+                                                     cgroup_fd);
+       if (!ASSERT_OK_PTR(link_set_eunatch, "cg-attach-set_eunatch"))
+               goto close_bpf_object;
+       link_set_eisconn = bpf_program__attach_cgroup(obj->progs.set_eisconn,
+                                                     cgroup_fd);
+       if (!ASSERT_OK_PTR(link_set_eisconn, "cg-attach-set_eisconn"))
+               goto close_bpf_object;
+       link_get_retval = bpf_program__attach_cgroup(obj->progs.get_retval,
+                                                    cgroup_fd);
+       if (!ASSERT_OK_PTR(link_get_retval, "cg-attach-get_retval"))
+               goto close_bpf_object;
+
+       if (!ASSERT_ERR(setsockopt(sock_fd, SOL_SOCKET, SO_REUSEADDR,
+                                  &zero, sizeof(int)), "setsockopt"))
+               goto close_bpf_object;
+       if (!ASSERT_EQ(errno, EISCONN, "setsockopt-errno"))
+               goto close_bpf_object;
+
+       if (!ASSERT_EQ(obj->bss->invocations, 3, "invocations"))
+               goto close_bpf_object;
+       if (!ASSERT_FALSE(obj->bss->assertion_error, "assertion_error"))
+               goto close_bpf_object;
+       if (!ASSERT_EQ(obj->bss->retval_value, -EISCONN, "retval_value"))
+               goto close_bpf_object;
+
+close_bpf_object:
+       bpf_link__destroy(link_set_eunatch);
+       bpf_link__destroy(link_set_eisconn);
+       bpf_link__destroy(link_get_retval);
+
+       cgroup_getset_retval_setsockopt__destroy(obj);
+}
+
+static void test_setsockopt_legacy_eperm(int cgroup_fd, int sock_fd)
+{
+       struct cgroup_getset_retval_setsockopt *obj;
+       struct bpf_link *link_legacy_eperm = NULL, *link_get_retval = NULL;
+
+       obj = cgroup_getset_retval_setsockopt__open_and_load();
+       if (!ASSERT_OK_PTR(obj, "skel-load"))
+               return;
+
+       /* Attach setsockopt that returns a reject without setting errno
+        * (legacy reject), and one that gets the errno. Assert that for
+        * backward compatibility the syscall results in EPERM, and this
+        * is also visible to the helper.
+        */
+       link_legacy_eperm = bpf_program__attach_cgroup(obj->progs.legacy_eperm,
+                                                      cgroup_fd);
+       if (!ASSERT_OK_PTR(link_legacy_eperm, "cg-attach-legacy_eperm"))
+               goto close_bpf_object;
+       link_get_retval = bpf_program__attach_cgroup(obj->progs.get_retval,
+                                                    cgroup_fd);
+       if (!ASSERT_OK_PTR(link_get_retval, "cg-attach-get_retval"))
+               goto close_bpf_object;
+
+       if (!ASSERT_ERR(setsockopt(sock_fd, SOL_SOCKET, SO_REUSEADDR,
+                                  &zero, sizeof(int)), "setsockopt"))
+               goto close_bpf_object;
+       if (!ASSERT_EQ(errno, EPERM, "setsockopt-errno"))
+               goto close_bpf_object;
+
+       if (!ASSERT_EQ(obj->bss->invocations, 2, "invocations"))
+               goto close_bpf_object;
+       if (!ASSERT_FALSE(obj->bss->assertion_error, "assertion_error"))
+               goto close_bpf_object;
+       if (!ASSERT_EQ(obj->bss->retval_value, -EPERM, "retval_value"))
+               goto close_bpf_object;
+
+close_bpf_object:
+       bpf_link__destroy(link_legacy_eperm);
+       bpf_link__destroy(link_get_retval);
+
+       cgroup_getset_retval_setsockopt__destroy(obj);
+}
+
+static void test_setsockopt_legacy_no_override(int cgroup_fd, int sock_fd)
+{
+       struct cgroup_getset_retval_setsockopt *obj;
+       struct bpf_link *link_set_eunatch = NULL, *link_legacy_eperm = NULL;
+       struct bpf_link *link_get_retval = NULL;
+
+       obj = cgroup_getset_retval_setsockopt__open_and_load();
+       if (!ASSERT_OK_PTR(obj, "skel-load"))
+               return;
+
+       /* Attach setsockopt that sets EUNATCH, then one that returns a reject
+        * without setting errno, and then one that gets the exported errno.
+        * Assert both the syscall and the helper's errno are unaffected by
+        * the second prog (i.e. a legacy reject does not override the errno
+        * to EPERM).
+        */
+       link_set_eunatch = bpf_program__attach_cgroup(obj->progs.set_eunatch,
+                                                     cgroup_fd);
+       if (!ASSERT_OK_PTR(link_set_eunatch, "cg-attach-set_eunatch"))
+               goto close_bpf_object;
+       link_legacy_eperm = bpf_program__attach_cgroup(obj->progs.legacy_eperm,
+                                                      cgroup_fd);
+       if (!ASSERT_OK_PTR(link_legacy_eperm, "cg-attach-legacy_eperm"))
+               goto close_bpf_object;
+       link_get_retval = bpf_program__attach_cgroup(obj->progs.get_retval,
+                                                    cgroup_fd);
+       if (!ASSERT_OK_PTR(link_get_retval, "cg-attach-get_retval"))
+               goto close_bpf_object;
+
+       if (!ASSERT_ERR(setsockopt(sock_fd, SOL_SOCKET, SO_REUSEADDR,
+                                  &zero, sizeof(int)), "setsockopt"))
+               goto close_bpf_object;
+       if (!ASSERT_EQ(errno, EUNATCH, "setsockopt-errno"))
+               goto close_bpf_object;
+
+       if (!ASSERT_EQ(obj->bss->invocations, 3, "invocations"))
+               goto close_bpf_object;
+       if (!ASSERT_FALSE(obj->bss->assertion_error, "assertion_error"))
+               goto close_bpf_object;
+       if (!ASSERT_EQ(obj->bss->retval_value, -EUNATCH, "retval_value"))
+               goto close_bpf_object;
+
+close_bpf_object:
+       bpf_link__destroy(link_set_eunatch);
+       bpf_link__destroy(link_legacy_eperm);
+       bpf_link__destroy(link_get_retval);
+
+       cgroup_getset_retval_setsockopt__destroy(obj);
+}
+
+static void test_getsockopt_get(int cgroup_fd, int sock_fd)
+{
+       struct cgroup_getset_retval_getsockopt *obj;
+       struct bpf_link *link_get_retval = NULL;
+       int buf;
+       socklen_t optlen = sizeof(buf);
+
+       obj = cgroup_getset_retval_getsockopt__open_and_load();
+       if (!ASSERT_OK_PTR(obj, "skel-load"))
+               return;
+
+       /* Attach getsockopt that gets previously set errno. Assert that the
+        * error from kernel is in both ctx_retval_value and retval_value.
+        */
+       link_get_retval = bpf_program__attach_cgroup(obj->progs.get_retval,
+                                                    cgroup_fd);
+       if (!ASSERT_OK_PTR(link_get_retval, "cg-attach-get_retval"))
+               goto close_bpf_object;
+
+       if (!ASSERT_ERR(getsockopt(sock_fd, SOL_CUSTOM, 0,
+                                  &buf, &optlen), "getsockopt"))
+               goto close_bpf_object;
+       if (!ASSERT_EQ(errno, EOPNOTSUPP, "getsockopt-errno"))
+               goto close_bpf_object;
+
+       if (!ASSERT_EQ(obj->bss->invocations, 1, "invocations"))
+               goto close_bpf_object;
+       if (!ASSERT_FALSE(obj->bss->assertion_error, "assertion_error"))
+               goto close_bpf_object;
+       if (!ASSERT_EQ(obj->bss->retval_value, -EOPNOTSUPP, "retval_value"))
+               goto close_bpf_object;
+       if (!ASSERT_EQ(obj->bss->ctx_retval_value, -EOPNOTSUPP, "ctx_retval_value"))
+               goto close_bpf_object;
+
+close_bpf_object:
+       bpf_link__destroy(link_get_retval);
+
+       cgroup_getset_retval_getsockopt__destroy(obj);
+}
+
+static void test_getsockopt_override(int cgroup_fd, int sock_fd)
+{
+       struct cgroup_getset_retval_getsockopt *obj;
+       struct bpf_link *link_set_eisconn = NULL;
+       int buf;
+       socklen_t optlen = sizeof(buf);
+
+       obj = cgroup_getset_retval_getsockopt__open_and_load();
+       if (!ASSERT_OK_PTR(obj, "skel-load"))
+               return;
+
+       /* Attach getsockopt that sets retval to -EISCONN. Assert that this
+        * overrides the value from kernel.
+        */
+       link_set_eisconn = bpf_program__attach_cgroup(obj->progs.set_eisconn,
+                                                     cgroup_fd);
+       if (!ASSERT_OK_PTR(link_set_eisconn, "cg-attach-set_eisconn"))
+               goto close_bpf_object;
+
+       if (!ASSERT_ERR(getsockopt(sock_fd, SOL_CUSTOM, 0,
+                                  &buf, &optlen), "getsockopt"))
+               goto close_bpf_object;
+       if (!ASSERT_EQ(errno, EISCONN, "getsockopt-errno"))
+               goto close_bpf_object;
+
+       if (!ASSERT_EQ(obj->bss->invocations, 1, "invocations"))
+               goto close_bpf_object;
+       if (!ASSERT_FALSE(obj->bss->assertion_error, "assertion_error"))
+               goto close_bpf_object;
+
+close_bpf_object:
+       bpf_link__destroy(link_set_eisconn);
+
+       cgroup_getset_retval_getsockopt__destroy(obj);
+}
+
+static void test_getsockopt_retval_sync(int cgroup_fd, int sock_fd)
+{
+       struct cgroup_getset_retval_getsockopt *obj;
+       struct bpf_link *link_set_eisconn = NULL, *link_clear_retval = NULL;
+       struct bpf_link *link_get_retval = NULL;
+       int buf;
+       socklen_t optlen = sizeof(buf);
+
+       obj = cgroup_getset_retval_getsockopt__open_and_load();
+       if (!ASSERT_OK_PTR(obj, "skel-load"))
+               return;
+
+       /* Attach getsockopt that sets retval to -EISCONN, and one that clears
+        * ctx retval. Assert that clearing the ctx retval is synced to the
+        * helper and clears any errors both from the kernel and BPF.
+        */
+       link_set_eisconn = bpf_program__attach_cgroup(obj->progs.set_eisconn,
+                                                     cgroup_fd);
+       if (!ASSERT_OK_PTR(link_set_eisconn, "cg-attach-set_eisconn"))
+               goto close_bpf_object;
+       link_clear_retval = bpf_program__attach_cgroup(obj->progs.clear_retval,
+                                                      cgroup_fd);
+       if (!ASSERT_OK_PTR(link_clear_retval, "cg-attach-clear_retval"))
+               goto close_bpf_object;
+       link_get_retval = bpf_program__attach_cgroup(obj->progs.get_retval,
+                                                    cgroup_fd);
+       if (!ASSERT_OK_PTR(link_get_retval, "cg-attach-get_retval"))
+               goto close_bpf_object;
+
+       if (!ASSERT_OK(getsockopt(sock_fd, SOL_CUSTOM, 0,
+                                 &buf, &optlen), "getsockopt"))
+               goto close_bpf_object;
+
+       if (!ASSERT_EQ(obj->bss->invocations, 3, "invocations"))
+               goto close_bpf_object;
+       if (!ASSERT_FALSE(obj->bss->assertion_error, "assertion_error"))
+               goto close_bpf_object;
+       if (!ASSERT_EQ(obj->bss->retval_value, 0, "retval_value"))
+               goto close_bpf_object;
+       if (!ASSERT_EQ(obj->bss->ctx_retval_value, 0, "ctx_retval_value"))
+               goto close_bpf_object;
+
+close_bpf_object:
+       bpf_link__destroy(link_set_eisconn);
+       bpf_link__destroy(link_clear_retval);
+       bpf_link__destroy(link_get_retval);
+
+       cgroup_getset_retval_getsockopt__destroy(obj);
+}
+
+void test_cgroup_getset_retval(void)
+{
+       int cgroup_fd = -1;
+       int sock_fd = -1;
+
+       cgroup_fd = test__join_cgroup("/cgroup_getset_retval");
+       if (!ASSERT_GE(cgroup_fd, 0, "cg-create"))
+               goto close_fd;
+
+       sock_fd = start_server(AF_INET, SOCK_DGRAM, NULL, 0, 0);
+       if (!ASSERT_GE(sock_fd, 0, "start-server"))
+               goto close_fd;
+
+       if (test__start_subtest("setsockopt-set"))
+               test_setsockopt_set(cgroup_fd, sock_fd);
+
+       if (test__start_subtest("setsockopt-set_and_get"))
+               test_setsockopt_set_and_get(cgroup_fd, sock_fd);
+
+       if (test__start_subtest("setsockopt-default_zero"))
+               test_setsockopt_default_zero(cgroup_fd, sock_fd);
+
+       if (test__start_subtest("setsockopt-default_zero_and_set"))
+               test_setsockopt_default_zero_and_set(cgroup_fd, sock_fd);
+
+       if (test__start_subtest("setsockopt-override"))
+               test_setsockopt_override(cgroup_fd, sock_fd);
+
+       if (test__start_subtest("setsockopt-legacy_eperm"))
+               test_setsockopt_legacy_eperm(cgroup_fd, sock_fd);
+
+       if (test__start_subtest("setsockopt-legacy_no_override"))
+               test_setsockopt_legacy_no_override(cgroup_fd, sock_fd);
+
+       if (test__start_subtest("getsockopt-get"))
+               test_getsockopt_get(cgroup_fd, sock_fd);
+
+       if (test__start_subtest("getsockopt-override"))
+               test_getsockopt_override(cgroup_fd, sock_fd);
+
+       if (test__start_subtest("getsockopt-retval_sync"))
+               test_getsockopt_retval_sync(cgroup_fd, sock_fd);
+
+close_fd:
+       close(cgroup_fd);
+}
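
The BPF counterpart (progs/cgroup_getset_retval_getsockopt.c) is not part of
this hunk; a sketch of its plausible shape, pairing the bpf_set_retval() and
bpf_get_retval() helpers with ctx->retval in a way consistent with the
assertions above (the program bodies are an assumption, not the committed
source):

        SEC("cgroup/getsockopt")
        int set_eisconn(struct bpf_sockopt *ctx)
        {
                __sync_fetch_and_add(&invocations, 1);
                /* force the syscall to fail with -EISCONN */
                if (bpf_set_retval(-EISCONN))
                        assertion_error = 1;
                return 1;
        }

        SEC("cgroup/getsockopt")
        int clear_retval(struct bpf_sockopt *ctx)
        {
                __sync_fetch_and_add(&invocations, 1);
                /* a write to ctx->retval is synced to bpf_get_retval() */
                ctx->retval = 0;
                return 1;
        }

        SEC("cgroup/getsockopt")
        int get_retval(struct bpf_sockopt *ctx)
        {
                __sync_fetch_and_add(&invocations, 1);
                retval_value = bpf_get_retval();
                ctx_retval_value = ctx->retval;
                return 1;
        }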
index f73e6e3..12f4395 100644 (file)
@@ -79,28 +79,21 @@ static void test_check_mtu_run_xdp(struct test_check_mtu *skel,
                                   struct bpf_program *prog,
                                   __u32 mtu_expect)
 {
-       const char *prog_name = bpf_program__name(prog);
        int retval_expect = XDP_PASS;
        __u32 mtu_result = 0;
        char buf[256] = {};
-       int err;
-       struct bpf_prog_test_run_attr tattr = {
+       int err, prog_fd = bpf_program__fd(prog);
+       LIBBPF_OPTS(bpf_test_run_opts, topts,
                .repeat = 1,
                .data_in = &pkt_v4,
                .data_size_in = sizeof(pkt_v4),
                .data_out = buf,
                .data_size_out = sizeof(buf),
-               .prog_fd = bpf_program__fd(prog),
-       };
-
-       err = bpf_prog_test_run_xattr(&tattr);
-       CHECK_ATTR(err != 0, "bpf_prog_test_run",
-                  "prog_name:%s (err %d errno %d retval %d)\n",
-                  prog_name, err, errno, tattr.retval);
+       );
 
-       CHECK(tattr.retval != retval_expect, "retval",
-             "progname:%s unexpected retval=%d expected=%d\n",
-             prog_name, tattr.retval, retval_expect);
+       err = bpf_prog_test_run_opts(prog_fd, &topts);
+       ASSERT_OK(err, "test_run");
+       ASSERT_EQ(topts.retval, retval_expect, "retval");
 
        /* Extract MTU that BPF-prog got */
        mtu_result = skel->bss->global_bpf_mtu_xdp;
@@ -139,28 +132,21 @@ static void test_check_mtu_run_tc(struct test_check_mtu *skel,
                                  struct bpf_program *prog,
                                  __u32 mtu_expect)
 {
-       const char *prog_name = bpf_program__name(prog);
        int retval_expect = BPF_OK;
        __u32 mtu_result = 0;
        char buf[256] = {};
-       int err;
-       struct bpf_prog_test_run_attr tattr = {
-               .repeat = 1,
+       int err, prog_fd = bpf_program__fd(prog);
+       LIBBPF_OPTS(bpf_test_run_opts, topts,
                .data_in = &pkt_v4,
                .data_size_in = sizeof(pkt_v4),
                .data_out = buf,
                .data_size_out = sizeof(buf),
-               .prog_fd = bpf_program__fd(prog),
-       };
-
-       err = bpf_prog_test_run_xattr(&tattr);
-       CHECK_ATTR(err != 0, "bpf_prog_test_run",
-                  "prog_name:%s (err %d errno %d retval %d)\n",
-                  prog_name, err, errno, tattr.retval);
+               .repeat = 1,
+       );
 
-       CHECK(tattr.retval != retval_expect, "retval",
-             "progname:%s unexpected retval=%d expected=%d\n",
-             prog_name, tattr.retval, retval_expect);
+       err = bpf_prog_test_run_opts(prog_fd, &topts);
+       ASSERT_OK(err, "test_run");
+       ASSERT_EQ(topts.retval, retval_expect, "retval");
 
        /* Extract MTU that BPF-prog got */
        mtu_result = skel->bss->global_bpf_mtu_tc;
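
Every hunk in this conversion series has the same shape: the deprecated
bpf_prog_test_run()/bpf_prog_test_run_xattr() entry points are replaced by
bpf_prog_test_run_opts(), which takes the program fd as a separate argument
and returns results through the opts struct instead of through output
pointers. A minimal sketch of the new pattern, assuming a loaded prog_fd and
the usual pkt_v4 test packet from network_helpers.h:

        LIBBPF_OPTS(bpf_test_run_opts, topts,
                .data_in = &pkt_v4,
                .data_size_in = sizeof(pkt_v4),
                .repeat = 1,
        );

        err = bpf_prog_test_run_opts(prog_fd, &topts);
        /* outputs come back in the struct: topts.retval, topts.duration,
         * topts.data_size_out
         */
        ASSERT_OK(err, "test_run");
        ASSERT_OK(topts.retval, "test_run retval");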
index e075d03..224f016 100644 (file)
@@ -161,7 +161,7 @@ static socklen_t prepare_addr(struct sockaddr_storage *addr, int family)
        }
 }
 
-static bool was_decapsulated(struct bpf_prog_test_run_attr *tattr)
+static bool was_decapsulated(struct bpf_test_run_opts *tattr)
 {
        return tattr->data_size_out < tattr->data_size_in;
 }
@@ -367,12 +367,12 @@ static void close_fds(int *fds, int n)
 
 static void test_cls_redirect_common(struct bpf_program *prog)
 {
-       struct bpf_prog_test_run_attr tattr = {};
+       LIBBPF_OPTS(bpf_test_run_opts, tattr);
        int families[] = { AF_INET, AF_INET6 };
        struct sockaddr_storage ss;
        struct sockaddr *addr;
        socklen_t slen;
-       int i, j, err;
+       int i, j, err, prog_fd;
        int servers[__NR_KIND][ARRAY_SIZE(families)] = {};
        int conns[__NR_KIND][ARRAY_SIZE(families)] = {};
        struct tuple tuples[__NR_KIND][ARRAY_SIZE(families)];
@@ -394,7 +394,7 @@ static void test_cls_redirect_common(struct bpf_program *prog)
                        goto cleanup;
        }
 
-       tattr.prog_fd = bpf_program__fd(prog);
+       prog_fd = bpf_program__fd(prog);
        for (i = 0; i < ARRAY_SIZE(tests); i++) {
                struct test_cfg *test = &tests[i];
 
@@ -415,7 +415,7 @@ static void test_cls_redirect_common(struct bpf_program *prog)
                        if (CHECK_FAIL(!tattr.data_size_in))
                                continue;
 
-                       err = bpf_prog_test_run_xattr(&tattr);
+                       err = bpf_prog_test_run_opts(prog_fd, &tattr);
                        if (CHECK_FAIL(err))
                                continue;
 
index 561c518..6a5a1c0 100644 (file)
@@ -7,8 +7,22 @@
 void test_core_kern_lskel(void)
 {
        struct core_kern_lskel *skel;
+       int link_fd;
 
        skel = core_kern_lskel__open_and_load();
-       ASSERT_OK_PTR(skel, "open_and_load");
+       if (!ASSERT_OK_PTR(skel, "open_and_load"))
+               return;
+
+       link_fd = core_kern_lskel__core_relo_proto__attach(skel);
+       if (!ASSERT_GT(link_fd, 0, "attach(core_relo_proto)"))
+               goto cleanup;
+
+       /* trigger tracepoints */
+       usleep(1);
+       ASSERT_TRUE(skel->bss->proto_out[0], "bpf_core_type_exists");
+       ASSERT_FALSE(skel->bss->proto_out[1], "!bpf_core_type_exists");
+       ASSERT_TRUE(skel->bss->proto_out[2], "bpf_core_type_exists. nested");
+
+cleanup:
        core_kern_lskel__destroy(skel);
 }
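
The checked values come from progs/core_kern.c, which is not shown here; a
hypothetical sketch of the new core_relo_proto program. The typedef names are
illustrative only; the triple-underscore suffix is the CO-RE convention for a
"flavor" absent from kernel BTF, and bpf_core_type_exists() comes from
bpf_core_read.h and is resolved at load time, so any tracepoint hit suffices
to publish the results:

        SEC("raw_tp/sys_enter")
        int core_relo_proto(void *ctx)
        {
                proto_out[0] = bpf_core_type_exists(func_proto_typedef);            /* 1 */
                proto_out[1] = bpf_core_type_exists(func_proto_typedef___missing);  /* 0 */
                proto_out[2] = bpf_core_type_exists(func_proto_typedef_nested);     /* 1 */
                return 0;
        }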
diff --git a/tools/testing/selftests/bpf/prog_tests/core_kern_overflow.c b/tools/testing/selftests/bpf/prog_tests/core_kern_overflow.c
new file mode 100644 (file)
index 0000000..04cc145
--- /dev/null
@@ -0,0 +1,13 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "test_progs.h"
+#include "core_kern_overflow.lskel.h"
+
+void test_core_kern_overflow_lskel(void)
+{
+       struct core_kern_overflow_lskel *skel;
+
+       skel = core_kern_overflow_lskel__open_and_load();
+       if (!ASSERT_NULL(skel, "open_and_load"))
+               core_kern_overflow_lskel__destroy(skel);
+}
index cbaa44f..5aa52cc 100644 (file)
@@ -26,10 +26,10 @@ static void test_dummy_st_ops_attach(void)
 static void test_dummy_init_ret_value(void)
 {
        __u64 args[1] = {0};
-       struct bpf_prog_test_run_attr attr = {
-               .ctx_size_in = sizeof(args),
+       LIBBPF_OPTS(bpf_test_run_opts, attr,
                .ctx_in = args,
-       };
+               .ctx_size_in = sizeof(args),
+       );
        struct dummy_st_ops *skel;
        int fd, err;
 
@@ -38,8 +38,7 @@ static void test_dummy_init_ret_value(void)
                return;
 
        fd = bpf_program__fd(skel->progs.test_1);
-       attr.prog_fd = fd;
-       err = bpf_prog_test_run_xattr(&attr);
+       err = bpf_prog_test_run_opts(fd, &attr);
        ASSERT_OK(err, "test_run");
        ASSERT_EQ(attr.retval, 0xf2f3f4f5, "test_ret");
 
@@ -53,10 +52,10 @@ static void test_dummy_init_ptr_arg(void)
                .val = exp_retval,
        };
        __u64 args[1] = {(unsigned long)&in_state};
-       struct bpf_prog_test_run_attr attr = {
-               .ctx_size_in = sizeof(args),
+       LIBBPF_OPTS(bpf_test_run_opts, attr,
                .ctx_in = args,
-       };
+               .ctx_size_in = sizeof(args),
+       );
        struct dummy_st_ops *skel;
        int fd, err;
 
@@ -65,8 +64,7 @@ static void test_dummy_init_ptr_arg(void)
                return;
 
        fd = bpf_program__fd(skel->progs.test_1);
-       attr.prog_fd = fd;
-       err = bpf_prog_test_run_xattr(&attr);
+       err = bpf_prog_test_run_opts(fd, &attr);
        ASSERT_OK(err, "test_run");
        ASSERT_EQ(in_state.val, 0x5a, "test_ptr_ret");
        ASSERT_EQ(attr.retval, exp_retval, "test_ret");
@@ -77,10 +75,10 @@ static void test_dummy_init_ptr_arg(void)
 static void test_dummy_multiple_args(void)
 {
        __u64 args[5] = {0, -100, 0x8a5f, 'c', 0x1234567887654321ULL};
-       struct bpf_prog_test_run_attr attr = {
-               .ctx_size_in = sizeof(args),
+       LIBBPF_OPTS(bpf_test_run_opts, attr,
                .ctx_in = args,
-       };
+               .ctx_size_in = sizeof(args),
+       );
        struct dummy_st_ops *skel;
        int fd, err;
        size_t i;
@@ -91,8 +89,7 @@ static void test_dummy_multiple_args(void)
                return;
 
        fd = bpf_program__fd(skel->progs.test_2);
-       attr.prog_fd = fd;
-       err = bpf_prog_test_run_xattr(&attr);
+       err = bpf_prog_test_run_opts(fd, &attr);
        ASSERT_OK(err, "test_run");
        for (i = 0; i < ARRAY_SIZE(args); i++) {
                snprintf(name, sizeof(name), "arg %zu", i);
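
For BPF_PROG_TYPE_STRUCT_OPS test runs, ctx_in is interpreted as the argument
array of the invoked op, so args[0] above arrives as test_1()'s state pointer.
A sketch of the BPF side consistent with the asserted values (roughly
progs/dummy_st_ops.c; the exact body is an assumption):

        SEC("struct_ops/test_1")
        int BPF_PROG(test_1, struct bpf_dummy_ops_state *state)
        {
                int ret;

                if (!state)
                        return 0xf2f3f4f5;      /* checked by test_dummy_init_ret_value() */

                ret = state->val;
                state->val = 0x5a;              /* checked by test_dummy_init_ptr_arg() */
                return ret;
        }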
index 4374ac8..130f5b8 100644 (file)
@@ -9,38 +9,34 @@ void test_fentry_fexit(void)
        struct fentry_test_lskel *fentry_skel = NULL;
        struct fexit_test_lskel *fexit_skel = NULL;
        __u64 *fentry_res, *fexit_res;
-       __u32 duration = 0, retval;
        int err, prog_fd, i;
+       LIBBPF_OPTS(bpf_test_run_opts, topts);
 
        fentry_skel = fentry_test_lskel__open_and_load();
-       if (CHECK(!fentry_skel, "fentry_skel_load", "fentry skeleton failed\n"))
+       if (!ASSERT_OK_PTR(fentry_skel, "fentry_skel_load"))
                goto close_prog;
        fexit_skel = fexit_test_lskel__open_and_load();
-       if (CHECK(!fexit_skel, "fexit_skel_load", "fexit skeleton failed\n"))
+       if (!ASSERT_OK_PTR(fexit_skel, "fexit_skel_load"))
                goto close_prog;
 
        err = fentry_test_lskel__attach(fentry_skel);
-       if (CHECK(err, "fentry_attach", "fentry attach failed: %d\n", err))
+       if (!ASSERT_OK(err, "fentry_attach"))
                goto close_prog;
        err = fexit_test_lskel__attach(fexit_skel);
-       if (CHECK(err, "fexit_attach", "fexit attach failed: %d\n", err))
+       if (!ASSERT_OK(err, "fexit_attach"))
                goto close_prog;
 
        prog_fd = fexit_skel->progs.test1.prog_fd;
-       err = bpf_prog_test_run(prog_fd, 1, NULL, 0,
-                               NULL, NULL, &retval, &duration);
-       CHECK(err || retval, "ipv6",
-             "err %d errno %d retval %d duration %d\n",
-             err, errno, retval, duration);
+       err = bpf_prog_test_run_opts(prog_fd, &topts);
+       ASSERT_OK(err, "ipv6 test_run");
+       ASSERT_OK(topts.retval, "ipv6 test retval");
 
        fentry_res = (__u64 *)fentry_skel->bss;
        fexit_res = (__u64 *)fexit_skel->bss;
        printf("%lld\n", fentry_skel->bss->test1_result);
        for (i = 0; i < 8; i++) {
-               CHECK(fentry_res[i] != 1, "result",
-                     "fentry_test%d failed err %lld\n", i + 1, fentry_res[i]);
-               CHECK(fexit_res[i] != 1, "result",
-                     "fexit_test%d failed err %lld\n", i + 1, fexit_res[i]);
+               ASSERT_EQ(fentry_res[i], 1, "fentry result");
+               ASSERT_EQ(fexit_res[i], 1, "fexit result");
        }
 
 close_prog:
index 12921b3..c0d1d61 100644 (file)
@@ -6,9 +6,9 @@
 static int fentry_test(struct fentry_test_lskel *fentry_skel)
 {
        int err, prog_fd, i;
-       __u32 duration = 0, retval;
        int link_fd;
        __u64 *result;
+       LIBBPF_OPTS(bpf_test_run_opts, topts);
 
        err = fentry_test_lskel__attach(fentry_skel);
        if (!ASSERT_OK(err, "fentry_attach"))
@@ -20,10 +20,9 @@ static int fentry_test(struct fentry_test_lskel *fentry_skel)
                return -1;
 
        prog_fd = fentry_skel->progs.test1.prog_fd;
-       err = bpf_prog_test_run(prog_fd, 1, NULL, 0,
-                               NULL, NULL, &retval, &duration);
+       err = bpf_prog_test_run_opts(prog_fd, &topts);
        ASSERT_OK(err, "test_run");
-       ASSERT_EQ(retval, 0, "test_run");
+       ASSERT_EQ(topts.retval, 0, "test_run");
 
        result = (__u64 *)fentry_skel->bss;
        for (i = 0; i < sizeof(*fentry_skel->bss) / sizeof(__u64); i++) {
index c52f99f..d9aad15 100644 (file)
@@ -58,12 +58,17 @@ static void test_fexit_bpf2bpf_common(const char *obj_file,
                                      test_cb cb)
 {
        struct bpf_object *obj = NULL, *tgt_obj;
-       __u32 retval, tgt_prog_id, info_len;
+       __u32 tgt_prog_id, info_len;
        struct bpf_prog_info prog_info = {};
        struct bpf_program **prog = NULL, *p;
        struct bpf_link **link = NULL;
        int err, tgt_fd, i;
        struct btf *btf;
+       LIBBPF_OPTS(bpf_test_run_opts, topts,
+               .data_in = &pkt_v6,
+               .data_size_in = sizeof(pkt_v6),
+               .repeat = 1,
+       );
 
        err = bpf_prog_test_load(target_obj_file, BPF_PROG_TYPE_UNSPEC,
                            &tgt_obj, &tgt_fd);
@@ -132,7 +137,7 @@ static void test_fexit_bpf2bpf_common(const char *obj_file,
                                             &link_info, &info_len);
                ASSERT_OK(err, "link_fd_get_info");
                ASSERT_EQ(link_info.tracing.attach_type,
-                         bpf_program__get_expected_attach_type(prog[i]),
+                         bpf_program__expected_attach_type(prog[i]),
                          "link_attach_type");
                ASSERT_EQ(link_info.tracing.target_obj_id, tgt_prog_id, "link_tgt_obj_id");
                ASSERT_EQ(link_info.tracing.target_btf_id, btf_id, "link_tgt_btf_id");
@@ -147,10 +152,9 @@ static void test_fexit_bpf2bpf_common(const char *obj_file,
        if (!run_prog)
                goto close_prog;
 
-       err = bpf_prog_test_run(tgt_fd, 1, &pkt_v6, sizeof(pkt_v6),
-                               NULL, NULL, &retval, NULL);
+       err = bpf_prog_test_run_opts(tgt_fd, &topts);
        ASSERT_OK(err, "prog_run");
-       ASSERT_EQ(retval, 0, "prog_run_ret");
+       ASSERT_EQ(topts.retval, 0, "prog_run_ret");
 
        if (check_data_map(obj, prog_cnt, false))
                goto close_prog;
@@ -225,29 +229,31 @@ static int test_second_attach(struct bpf_object *obj)
        const char *tgt_obj_file = "./test_pkt_access.o";
        struct bpf_program *prog = NULL;
        struct bpf_object *tgt_obj;
-       __u32 duration = 0, retval;
        struct bpf_link *link;
        int err = 0, tgt_fd;
+       LIBBPF_OPTS(bpf_test_run_opts, topts,
+               .data_in = &pkt_v6,
+               .data_size_in = sizeof(pkt_v6),
+               .repeat = 1,
+       );
 
        prog = bpf_object__find_program_by_name(obj, prog_name);
-       if (CHECK(!prog, "find_prog", "prog %s not found\n", prog_name))
+       if (!ASSERT_OK_PTR(prog, "find_prog"))
                return -ENOENT;
 
        err = bpf_prog_test_load(tgt_obj_file, BPF_PROG_TYPE_UNSPEC,
                            &tgt_obj, &tgt_fd);
-       if (CHECK(err, "second_prog_load", "file %s err %d errno %d\n",
-                 tgt_obj_file, err, errno))
+       if (!ASSERT_OK(err, "second_prog_load"))
                return err;
 
        link = bpf_program__attach_freplace(prog, tgt_fd, tgt_name);
        if (!ASSERT_OK_PTR(link, "second_link"))
                goto out;
 
-       err = bpf_prog_test_run(tgt_fd, 1, &pkt_v6, sizeof(pkt_v6),
-                               NULL, NULL, &retval, &duration);
-       if (CHECK(err || retval, "ipv6",
-                 "err %d errno %d retval %d duration %d\n",
-                 err, errno, retval, duration))
+       err = bpf_prog_test_run_opts(tgt_fd, &topts);
+       if (!ASSERT_OK(err, "ipv6 test_run"))
+               goto out;
+       if (!ASSERT_OK(topts.retval, "ipv6 retval"))
                goto out;
 
        err = check_data_map(obj, 1, true);
index e4cede6..3ee2107 100644 (file)
@@ -10,9 +10,7 @@ void test_fexit_stress(void)
        char test_skb[128] = {};
        int fexit_fd[CNT] = {};
        int link_fd[CNT] = {};
-       __u32 duration = 0;
        char error[4096];
-       __u32 prog_ret;
        int err, i, filter_fd;
 
        const struct bpf_insn trace_program[] = {
@@ -36,9 +34,15 @@ void test_fexit_stress(void)
                .log_size = sizeof(error),
        );
 
+       LIBBPF_OPTS(bpf_test_run_opts, topts,
+               .data_in = test_skb,
+               .data_size_in = sizeof(test_skb),
+               .repeat = 1,
+       );
+
        err = libbpf_find_vmlinux_btf_id("bpf_fentry_test1",
                                         trace_opts.expected_attach_type);
-       if (CHECK(err <= 0, "find_vmlinux_btf_id", "failed: %d\n", err))
+       if (!ASSERT_GT(err, 0, "find_vmlinux_btf_id"))
                goto out;
        trace_opts.attach_btf_id = err;
 
@@ -47,24 +51,20 @@ void test_fexit_stress(void)
                                            trace_program,
                                            sizeof(trace_program) / sizeof(struct bpf_insn),
                                            &trace_opts);
-               if (CHECK(fexit_fd[i] < 0, "fexit loaded",
-                         "failed: %d errno %d\n", fexit_fd[i], errno))
+               if (!ASSERT_GE(fexit_fd[i], 0, "fexit load"))
                        goto out;
                link_fd[i] = bpf_raw_tracepoint_open(NULL, fexit_fd[i]);
-               if (CHECK(link_fd[i] < 0, "fexit attach failed",
-                         "prog %d failed: %d err %d\n", i, link_fd[i], errno))
+               if (!ASSERT_GE(link_fd[i], 0, "fexit attach"))
                        goto out;
        }
 
        filter_fd = bpf_prog_load(BPF_PROG_TYPE_SOCKET_FILTER, NULL, "GPL",
                                  skb_program, sizeof(skb_program) / sizeof(struct bpf_insn),
                                  &skb_opts);
-       if (CHECK(filter_fd < 0, "test_program_loaded", "failed: %d errno %d\n",
-                 filter_fd, errno))
+       if (!ASSERT_GE(filter_fd, 0, "test_program_loaded"))
                goto out;
 
-       err = bpf_prog_test_run(filter_fd, 1, test_skb, sizeof(test_skb), 0,
-                               0, &prog_ret, 0);
+       err = bpf_prog_test_run_opts(filter_fd, &topts);
        close(filter_fd);
        CHECK_FAIL(err);
 out:
index d4887d8..101b734 100644 (file)
@@ -6,9 +6,9 @@
 static int fexit_test(struct fexit_test_lskel *fexit_skel)
 {
        int err, prog_fd, i;
-       __u32 duration = 0, retval;
        int link_fd;
        __u64 *result;
+       LIBBPF_OPTS(bpf_test_run_opts, topts);
 
        err = fexit_test_lskel__attach(fexit_skel);
        if (!ASSERT_OK(err, "fexit_attach"))
@@ -20,10 +20,9 @@ static int fexit_test(struct fexit_test_lskel *fexit_skel)
                return -1;
 
        prog_fd = fexit_skel->progs.test1.prog_fd;
-       err = bpf_prog_test_run(prog_fd, 1, NULL, 0,
-                               NULL, NULL, &retval, &duration);
+       err = bpf_prog_test_run_opts(prog_fd, &topts);
        ASSERT_OK(err, "test_run");
-       ASSERT_EQ(retval, 0, "test_run");
+       ASSERT_EQ(topts.retval, 0, "test_run");
 
        result = (__u64 *)fexit_skel->bss;
        for (i = 0; i < sizeof(*fexit_skel->bss) / sizeof(__u64); i++) {
index ac54e3f..0c1661e 100644 (file)
@@ -13,8 +13,9 @@
 #endif
 
 #define CHECK_FLOW_KEYS(desc, got, expected)                           \
-       CHECK_ATTR(memcmp(&got, &expected, sizeof(got)) != 0,           \
+       _CHECK(memcmp(&got, &expected, sizeof(got)) != 0,               \
              desc,                                                     \
+             topts.duration,                                           \
              "nhoff=%u/%u "                                            \
              "thoff=%u/%u "                                            \
              "addr_proto=0x%x/0x%x "                                   \
@@ -457,7 +458,7 @@ static int init_prog_array(struct bpf_object *obj, struct bpf_map *prog_array)
        if (map_fd < 0)
                return -1;
 
-       for (i = 0; i < bpf_map__def(prog_array)->max_entries; i++) {
+       for (i = 0; i < bpf_map__max_entries(prog_array); i++) {
                snprintf(prog_name, sizeof(prog_name), "flow_dissector_%d", i);
 
                prog = bpf_object__find_program_by_name(obj, prog_name);
@@ -487,7 +488,7 @@ static void run_tests_skb_less(int tap_fd, struct bpf_map *keys)
                /* Keep in sync with 'flags' from eth_get_headlen. */
                __u32 eth_get_headlen_flags =
                        BPF_FLOW_DISSECTOR_F_PARSE_1ST_FRAG;
-               struct bpf_prog_test_run_attr tattr = {};
+               LIBBPF_OPTS(bpf_test_run_opts, topts);
                struct bpf_flow_keys flow_keys = {};
                __u32 key = (__u32)(tests[i].keys.sport) << 16 |
                            tests[i].keys.dport;
@@ -503,13 +504,12 @@ static void run_tests_skb_less(int tap_fd, struct bpf_map *keys)
                CHECK(err < 0, "tx_tap", "err %d errno %d\n", err, errno);
 
                err = bpf_map_lookup_elem(keys_fd, &key, &flow_keys);
-               CHECK_ATTR(err, tests[i].name, "bpf_map_lookup_elem %d\n", err);
+               ASSERT_OK(err, "bpf_map_lookup_elem");
 
-               CHECK_ATTR(err, tests[i].name, "skb-less err %d\n", err);
                CHECK_FLOW_KEYS(tests[i].name, flow_keys, tests[i].keys);
 
                err = bpf_map_delete_elem(keys_fd, &key);
-               CHECK_ATTR(err, tests[i].name, "bpf_map_delete_elem %d\n", err);
+               ASSERT_OK(err, "bpf_map_delete_elem");
        }
 }
 
@@ -573,27 +573,24 @@ void test_flow_dissector(void)
 
        for (i = 0; i < ARRAY_SIZE(tests); i++) {
                struct bpf_flow_keys flow_keys;
-               struct bpf_prog_test_run_attr tattr = {
-                       .prog_fd = prog_fd,
+               LIBBPF_OPTS(bpf_test_run_opts, topts,
                        .data_in = &tests[i].pkt,
                        .data_size_in = sizeof(tests[i].pkt),
                        .data_out = &flow_keys,
-               };
+               );
                static struct bpf_flow_keys ctx = {};
 
                if (tests[i].flags) {
-                       tattr.ctx_in = &ctx;
-                       tattr.ctx_size_in = sizeof(ctx);
+                       topts.ctx_in = &ctx;
+                       topts.ctx_size_in = sizeof(ctx);
                        ctx.flags = tests[i].flags;
                }
 
-               err = bpf_prog_test_run_xattr(&tattr);
-               CHECK_ATTR(tattr.data_size_out != sizeof(flow_keys) ||
-                          err || tattr.retval != 1,
-                          tests[i].name,
-                          "err %d errno %d retval %d duration %d size %u/%zu\n",
-                          err, errno, tattr.retval, tattr.duration,
-                          tattr.data_size_out, sizeof(flow_keys));
+               err = bpf_prog_test_run_opts(prog_fd, &topts);
+               ASSERT_OK(err, "test_run");
+               ASSERT_EQ(topts.retval, 1, "test_run retval");
+               ASSERT_EQ(topts.data_size_out, sizeof(flow_keys),
+                         "test_run data_size_out");
                CHECK_FLOW_KEYS(tests[i].name, flow_keys, tests[i].keys);
        }
 
index 93ac3f2..36afb40 100644 (file)
@@ -5,7 +5,6 @@
 void serial_test_flow_dissector_load_bytes(void)
 {
        struct bpf_flow_keys flow_keys;
-       __u32 duration = 0, retval, size;
        struct bpf_insn prog[] = {
                // BPF_REG_1 - 1st argument: context
                // BPF_REG_2 - 2nd argument: offset, start at first byte
@@ -27,22 +26,25 @@ void serial_test_flow_dissector_load_bytes(void)
                BPF_EXIT_INSN(),
        };
        int fd, err;
+       LIBBPF_OPTS(bpf_test_run_opts, topts,
+               .data_in = &pkt_v4,
+               .data_size_in = sizeof(pkt_v4),
+               .data_out = &flow_keys,
+               .data_size_out = sizeof(flow_keys),
+               .repeat = 1,
+       );
 
        /* make sure bpf_skb_load_bytes is not allowed from skb-less context
         */
        fd = bpf_test_load_program(BPF_PROG_TYPE_FLOW_DISSECTOR, prog,
                              ARRAY_SIZE(prog), "GPL", 0, NULL, 0);
-       CHECK(fd < 0,
-             "flow_dissector-bpf_skb_load_bytes-load",
-             "fd %d errno %d\n",
-             fd, errno);
+       ASSERT_GE(fd, 0, "bpf_test_load_program good fd");
 
-       err = bpf_prog_test_run(fd, 1, &pkt_v4, sizeof(pkt_v4),
-                               &flow_keys, &size, &retval, &duration);
-       CHECK(size != sizeof(flow_keys) || err || retval != 1,
-             "flow_dissector-bpf_skb_load_bytes",
-             "err %d errno %d retval %d duration %d size %u/%zu\n",
-             err, errno, retval, duration, size, sizeof(flow_keys));
+       err = bpf_prog_test_run_opts(fd, &topts);
+       ASSERT_OK(err, "test_run");
+       ASSERT_EQ(topts.data_size_out, sizeof(flow_keys),
+                 "test_run data_size_out");
+       ASSERT_EQ(topts.retval, 1, "test_run retval");
 
        if (fd >= 0)
                close(fd);
index 68eb12a..044df13 100644 (file)
@@ -12,8 +12,13 @@ static void test_hash_map(void)
        int i, err, hashmap_fd, max_entries, percpu_map_fd;
        struct for_each_hash_map_elem *skel;
        __u64 *percpu_valbuf = NULL;
-       __u32 key, num_cpus, retval;
+       __u32 key, num_cpus;
        __u64 val;
+       LIBBPF_OPTS(bpf_test_run_opts, topts,
+               .data_in = &pkt_v4,
+               .data_size_in = sizeof(pkt_v4),
+               .repeat = 1,
+       );
 
        skel = for_each_hash_map_elem__open_and_load();
        if (!ASSERT_OK_PTR(skel, "for_each_hash_map_elem__open_and_load"))
@@ -42,11 +47,10 @@ static void test_hash_map(void)
        if (!ASSERT_OK(err, "percpu_map_update"))
                goto out;
 
-       err = bpf_prog_test_run(bpf_program__fd(skel->progs.test_pkt_access),
-                               1, &pkt_v4, sizeof(pkt_v4), NULL, NULL,
-                               &retval, &duration);
-       if (CHECK(err || retval, "ipv4", "err %d errno %d retval %d\n",
-                 err, errno, retval))
+       err = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.test_pkt_access), &topts);
+       duration = topts.duration;
+       if (CHECK(err || topts.retval, "ipv4", "err %d errno %d retval %d\n",
+                 err, errno, topts.retval))
                goto out;
 
        ASSERT_EQ(skel->bss->hashmap_output, 4, "hashmap_output");
@@ -69,11 +73,16 @@ out:
 
 static void test_array_map(void)
 {
-       __u32 key, num_cpus, max_entries, retval;
+       __u32 key, num_cpus, max_entries;
        int i, arraymap_fd, percpu_map_fd, err;
        struct for_each_array_map_elem *skel;
        __u64 *percpu_valbuf = NULL;
        __u64 val, expected_total;
+       LIBBPF_OPTS(bpf_test_run_opts, topts,
+               .data_in = &pkt_v4,
+               .data_size_in = sizeof(pkt_v4),
+               .repeat = 1,
+       );
 
        skel = for_each_array_map_elem__open_and_load();
        if (!ASSERT_OK_PTR(skel, "for_each_array_map_elem__open_and_load"))
@@ -106,11 +115,10 @@ static void test_array_map(void)
        if (!ASSERT_OK(err, "percpu_map_update"))
                goto out;
 
-       err = bpf_prog_test_run(bpf_program__fd(skel->progs.test_pkt_access),
-                               1, &pkt_v4, sizeof(pkt_v4), NULL, NULL,
-                               &retval, &duration);
-       if (CHECK(err || retval, "ipv4", "err %d errno %d retval %d\n",
-                 err, errno, retval))
+       err = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.test_pkt_access), &topts);
+       duration = topts.duration;
+       if (CHECK(err || topts.retval, "ipv4", "err %d errno %d retval %d\n",
+                 err, errno, topts.retval))
                goto out;
 
        ASSERT_EQ(skel->bss->arraymap_output, expected_total, "array_output");
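
Both subtests exercise bpf_for_each_map_elem() from the test_pkt_access
program; a hedged sketch of the callback shape (the helper and the callback
signature are the real ones, the identifiers are illustrative):

        static __u64 sum_elem(struct bpf_map *map, __u32 *key, __u64 *val,
                              struct callback_ctx *data)
        {
                data->output += *val;
                return 0;       /* return 1 to stop the iteration early */
        }

        SEC("tc")
        int test_pkt_access(struct __sk_buff *skb)
        {
                struct callback_ctx data = {};

                bpf_for_each_map_elem(&arraymap, sum_elem, &data, 0);
                arraymap_output = data.output;
                return 0;
        }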
index 85c4271..28cf639 100644 (file)
@@ -5,8 +5,8 @@
 void test_get_func_args_test(void)
 {
        struct get_func_args_test *skel = NULL;
-       __u32 duration = 0, retval;
        int err, prog_fd;
+       LIBBPF_OPTS(bpf_test_run_opts, topts);
 
        skel = get_func_args_test__open_and_load();
        if (!ASSERT_OK_PTR(skel, "get_func_args_test__open_and_load"))
@@ -20,19 +20,17 @@ void test_get_func_args_test(void)
         * fentry/fexit programs.
         */
        prog_fd = bpf_program__fd(skel->progs.test1);
-       err = bpf_prog_test_run(prog_fd, 1, NULL, 0,
-                               NULL, NULL, &retval, &duration);
+       err = bpf_prog_test_run_opts(prog_fd, &topts);
        ASSERT_OK(err, "test_run");
-       ASSERT_EQ(retval, 0, "test_run");
+       ASSERT_EQ(topts.retval, 0, "test_run");
 
        /* This runs bpf_modify_return_test function and triggers
         * fmod_ret_test and fexit_test programs.
         */
        prog_fd = bpf_program__fd(skel->progs.fmod_ret_test);
-       err = bpf_prog_test_run(prog_fd, 1, NULL, 0,
-                               NULL, NULL, &retval, &duration);
+       err = bpf_prog_test_run_opts(prog_fd, &topts);
        ASSERT_OK(err, "test_run");
-       ASSERT_EQ(retval, 1234, "test_run");
+       ASSERT_EQ(topts.retval, 1234, "test_run");
 
        ASSERT_EQ(skel->bss->test1_result, 1, "test1_result");
        ASSERT_EQ(skel->bss->test2_result, 1, "test2_result");
index 02a465f..938dbd4 100644 (file)
@@ -5,8 +5,8 @@
 void test_get_func_ip_test(void)
 {
        struct get_func_ip_test *skel = NULL;
-       __u32 duration = 0, retval;
        int err, prog_fd;
+       LIBBPF_OPTS(bpf_test_run_opts, topts);
 
        skel = get_func_ip_test__open();
        if (!ASSERT_OK_PTR(skel, "get_func_ip_test__open"))
@@ -29,14 +29,12 @@ void test_get_func_ip_test(void)
                goto cleanup;
 
        prog_fd = bpf_program__fd(skel->progs.test1);
-       err = bpf_prog_test_run(prog_fd, 1, NULL, 0,
-                               NULL, NULL, &retval, &duration);
+       err = bpf_prog_test_run_opts(prog_fd, &topts);
        ASSERT_OK(err, "test_run");
-       ASSERT_EQ(retval, 0, "test_run");
+       ASSERT_EQ(topts.retval, 0, "test_run");
 
        prog_fd = bpf_program__fd(skel->progs.test5);
-       err = bpf_prog_test_run(prog_fd, 1, NULL, 0,
-                               NULL, NULL, &retval, &duration);
+       err = bpf_prog_test_run_opts(prog_fd, &topts);
 
        ASSERT_OK(err, "test_run");
 
index 8d5a602..5308de1 100644 (file)
@@ -27,7 +27,7 @@ void test_get_stackid_cannot_attach(void)
                return;
 
        /* override program type */
-       bpf_program__set_perf_event(skel->progs.oncpu);
+       bpf_program__set_type(skel->progs.oncpu, BPF_PROG_TYPE_PERF_EVENT);
 
        err = test_stacktrace_build_id__load(skel);
        if (CHECK(err, "skel_load", "skeleton load failed: %d\n", err))
index 9da131b..6fb3d31 100644 (file)
@@ -121,7 +121,7 @@ static void test_global_data_rdonly(struct bpf_object *obj, __u32 duration)
        if (CHECK_FAIL(map_fd < 0))
                return;
 
-       buff = malloc(bpf_map__def(map)->value_size);
+       buff = malloc(bpf_map__value_size(map));
        if (buff)
                err = bpf_map_update_elem(map_fd, &zero, buff, 0);
        free(buff);
@@ -132,24 +132,26 @@ static void test_global_data_rdonly(struct bpf_object *obj, __u32 duration)
 void test_global_data(void)
 {
        const char *file = "./test_global_data.o";
-       __u32 duration = 0, retval;
        struct bpf_object *obj;
        int err, prog_fd;
+       LIBBPF_OPTS(bpf_test_run_opts, topts,
+               .data_in = &pkt_v4,
+               .data_size_in = sizeof(pkt_v4),
+               .repeat = 1,
+       );
 
        err = bpf_prog_test_load(file, BPF_PROG_TYPE_SCHED_CLS, &obj, &prog_fd);
-       if (CHECK(err, "load program", "error %d loading %s\n", err, file))
+       if (!ASSERT_OK(err, "load program"))
                return;
 
-       err = bpf_prog_test_run(prog_fd, 1, &pkt_v4, sizeof(pkt_v4),
-                               NULL, NULL, &retval, &duration);
-       CHECK(err || retval, "pass global data run",
-             "err %d errno %d retval %d duration %d\n",
-             err, errno, retval, duration);
+       err = bpf_prog_test_run_opts(prog_fd, &topts);
+       ASSERT_OK(err, "pass global data run err");
+       ASSERT_OK(topts.retval, "pass global data run retval");
 
-       test_global_data_number(obj, duration);
-       test_global_data_string(obj, duration);
-       test_global_data_struct(obj, duration);
-       test_global_data_rdonly(obj, duration);
+       test_global_data_number(obj, topts.duration);
+       test_global_data_string(obj, topts.duration);
+       test_global_data_struct(obj, topts.duration);
+       test_global_data_rdonly(obj, topts.duration);
 
        bpf_object__close(obj);
 }
index 1db86ea..57331c6 100644 (file)
@@ -20,7 +20,7 @@ void test_global_data_init(void)
        if (CHECK_FAIL(!map || !bpf_map__is_internal(map)))
                goto out;
 
-       sz = bpf_map__def(map)->value_size;
+       sz = bpf_map__value_size(map);
        newval = malloc(sz);
        if (CHECK_FAIL(!newval))
                goto out;
index 93a2439..29039a3 100644 (file)
@@ -40,19 +40,21 @@ static void test_global_func_args0(struct bpf_object *obj)
 void test_global_func_args(void)
 {
        const char *file = "./test_global_func_args.o";
-       __u32 retval;
        struct bpf_object *obj;
        int err, prog_fd;
+       LIBBPF_OPTS(bpf_test_run_opts, topts,
+               .data_in = &pkt_v4,
+               .data_size_in = sizeof(pkt_v4),
+               .repeat = 1,
+       );
 
        err = bpf_prog_test_load(file, BPF_PROG_TYPE_CGROUP_SKB, &obj, &prog_fd);
        if (CHECK(err, "load program", "error %d loading %s\n", err, file))
                return;
 
-       err = bpf_prog_test_run(prog_fd, 1, &pkt_v4, sizeof(pkt_v4),
-                               NULL, NULL, &retval, &duration);
-       CHECK(err || retval, "pass global func args run",
-             "err %d errno %d retval %d duration %d\n",
-             err, errno, retval, duration);
+       err = bpf_prog_test_run_opts(prog_fd, &topts);
+       ASSERT_OK(err, "test_run");
+       ASSERT_OK(topts.retval, "test_run retval");
 
        test_global_func_args0(obj);
 
index ce10d2f..1cee695 100644 (file)
@@ -53,24 +53,24 @@ static void on_sample(void *ctx, int cpu, void *data, __u32 size)
 void serial_test_kfree_skb(void)
 {
        struct __sk_buff skb = {};
-       struct bpf_prog_test_run_attr tattr = {
+       LIBBPF_OPTS(bpf_test_run_opts, topts,
                .data_in = &pkt_v6,
                .data_size_in = sizeof(pkt_v6),
                .ctx_in = &skb,
                .ctx_size_in = sizeof(skb),
-       };
+       );
        struct kfree_skb *skel = NULL;
        struct bpf_link *link;
        struct bpf_object *obj;
        struct perf_buffer *pb = NULL;
-       int err;
+       int err, prog_fd;
        bool passed = false;
        __u32 duration = 0;
        const int zero = 0;
        bool test_ok[2];
 
        err = bpf_prog_test_load("./test_pkt_access.o", BPF_PROG_TYPE_SCHED_CLS,
-                           &obj, &tattr.prog_fd);
+                                &obj, &prog_fd);
        if (CHECK(err, "prog_load sched cls", "err %d errno %d\n", err, errno))
                return;
 
@@ -100,11 +100,9 @@ void serial_test_kfree_skb(void)
                goto close_prog;
 
        memcpy(skb.cb, &cb, sizeof(cb));
-       err = bpf_prog_test_run_xattr(&tattr);
-       duration = tattr.duration;
-       CHECK(err || tattr.retval, "ipv6",
-             "err %d errno %d retval %d duration %d\n",
-             err, errno, tattr.retval, duration);
+       err = bpf_prog_test_run_opts(prog_fd, &topts);
+       ASSERT_OK(err, "ipv6 test_run");
+       ASSERT_OK(topts.retval, "ipv6 test_run retval");
 
        /* read perf buffer */
        err = perf_buffer__poll(pb, 100);
index 7d7445c..c00eb97 100644 (file)
@@ -9,23 +9,31 @@
 static void test_main(void)
 {
        struct kfunc_call_test_lskel *skel;
-       int prog_fd, retval, err;
+       int prog_fd, err;
+       LIBBPF_OPTS(bpf_test_run_opts, topts,
+               .data_in = &pkt_v4,
+               .data_size_in = sizeof(pkt_v4),
+               .repeat = 1,
+       );
 
        skel = kfunc_call_test_lskel__open_and_load();
        if (!ASSERT_OK_PTR(skel, "skel"))
                return;
 
        prog_fd = skel->progs.kfunc_call_test1.prog_fd;
-       err = bpf_prog_test_run(prog_fd, 1, &pkt_v4, sizeof(pkt_v4),
-                               NULL, NULL, (__u32 *)&retval, NULL);
+       err = bpf_prog_test_run_opts(prog_fd, &topts);
        ASSERT_OK(err, "bpf_prog_test_run(test1)");
-       ASSERT_EQ(retval, 12, "test1-retval");
+       ASSERT_EQ(topts.retval, 12, "test1-retval");
 
        prog_fd = skel->progs.kfunc_call_test2.prog_fd;
-       err = bpf_prog_test_run(prog_fd, 1, &pkt_v4, sizeof(pkt_v4),
-                               NULL, NULL, (__u32 *)&retval, NULL);
+       err = bpf_prog_test_run_opts(prog_fd, &topts);
        ASSERT_OK(err, "bpf_prog_test_run(test2)");
-       ASSERT_EQ(retval, 3, "test2-retval");
+       ASSERT_EQ(topts.retval, 3, "test2-retval");
+
+       prog_fd = skel->progs.kfunc_call_test_ref_btf_id.prog_fd;
+       err = bpf_prog_test_run_opts(prog_fd, &topts);
+       ASSERT_OK(err, "bpf_prog_test_run(test_ref_btf_id)");
+       ASSERT_EQ(topts.retval, 0, "test_ref_btf_id-retval");
 
        kfunc_call_test_lskel__destroy(skel);
 }
@@ -33,17 +41,21 @@ static void test_main(void)
 static void test_subprog(void)
 {
        struct kfunc_call_test_subprog *skel;
-       int prog_fd, retval, err;
+       int prog_fd, err;
+       LIBBPF_OPTS(bpf_test_run_opts, topts,
+               .data_in = &pkt_v4,
+               .data_size_in = sizeof(pkt_v4),
+               .repeat = 1,
+       );
 
        skel = kfunc_call_test_subprog__open_and_load();
        if (!ASSERT_OK_PTR(skel, "skel"))
                return;
 
        prog_fd = bpf_program__fd(skel->progs.kfunc_call_test1);
-       err = bpf_prog_test_run(prog_fd, 1, &pkt_v4, sizeof(pkt_v4),
-                               NULL, NULL, (__u32 *)&retval, NULL);
+       err = bpf_prog_test_run_opts(prog_fd, &topts);
        ASSERT_OK(err, "bpf_prog_test_run(test1)");
-       ASSERT_EQ(retval, 10, "test1-retval");
+       ASSERT_EQ(topts.retval, 10, "test1-retval");
        ASSERT_NEQ(skel->data->active_res, -1, "active_res");
        ASSERT_EQ(skel->data->sk_state_res, BPF_TCP_CLOSE, "sk_state_res");
 
@@ -53,17 +65,21 @@ static void test_subprog(void)
 static void test_subprog_lskel(void)
 {
        struct kfunc_call_test_subprog_lskel *skel;
-       int prog_fd, retval, err;
+       int prog_fd, err;
+       LIBBPF_OPTS(bpf_test_run_opts, topts,
+               .data_in = &pkt_v4,
+               .data_size_in = sizeof(pkt_v4),
+               .repeat = 1,
+       );
 
        skel = kfunc_call_test_subprog_lskel__open_and_load();
        if (!ASSERT_OK_PTR(skel, "skel"))
                return;
 
        prog_fd = skel->progs.kfunc_call_test1.prog_fd;
-       err = bpf_prog_test_run(prog_fd, 1, &pkt_v4, sizeof(pkt_v4),
-                               NULL, NULL, (__u32 *)&retval, NULL);
+       err = bpf_prog_test_run_opts(prog_fd, &topts);
        ASSERT_OK(err, "bpf_prog_test_run(test1)");
-       ASSERT_EQ(retval, 10, "test1-retval");
+       ASSERT_EQ(topts.retval, 10, "test1-retval");
        ASSERT_NEQ(skel->data->active_res, -1, "active_res");
        ASSERT_EQ(skel->data->sk_state_res, BPF_TCP_CLOSE, "sk_state_res");
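
The kfunc test programs call exported kernel test functions directly; a
__ksym extern is resolved against kernel BTF at load time. A sketch close to
progs/kfunc_call_test.c (the kernel-side bpf_kfunc_call_test2() returns
a + b, hence the retval of 3 asserted above):

        extern int bpf_kfunc_call_test2(struct sock *sk, __u32 a, __u32 b) __ksym;

        SEC("tc")
        int kfunc_call_test2(struct __sk_buff *skb)
        {
                struct bpf_sock *sk = skb->sk;

                if (!sk)
                        return -1;
                sk = bpf_sk_fullsock(sk);
                if (!sk)
                        return -1;
                return bpf_kfunc_call_test2((struct sock *)sk, 1, 2);
        }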
 
index d490ad8..a1ebac7 100644 (file)
@@ -6,11 +6,15 @@
 #include "test_ksyms_module.lskel.h"
 #include "test_ksyms_module.skel.h"
 
-void test_ksyms_module_lskel(void)
+static void test_ksyms_module_lskel(void)
 {
        struct test_ksyms_module_lskel *skel;
-       int retval;
        int err;
+       LIBBPF_OPTS(bpf_test_run_opts, topts,
+               .data_in = &pkt_v4,
+               .data_size_in = sizeof(pkt_v4),
+               .repeat = 1,
+       );
 
        if (!env.has_testmod) {
                test__skip();
@@ -20,20 +24,24 @@ void test_ksyms_module_lskel(void)
        skel = test_ksyms_module_lskel__open_and_load();
        if (!ASSERT_OK_PTR(skel, "test_ksyms_module_lskel__open_and_load"))
                return;
-       err = bpf_prog_test_run(skel->progs.load.prog_fd, 1, &pkt_v4, sizeof(pkt_v4),
-                               NULL, NULL, (__u32 *)&retval, NULL);
+       err = bpf_prog_test_run_opts(skel->progs.load.prog_fd, &topts);
        if (!ASSERT_OK(err, "bpf_prog_test_run"))
                goto cleanup;
-       ASSERT_EQ(retval, 0, "retval");
+       ASSERT_EQ(topts.retval, 0, "retval");
        ASSERT_EQ(skel->bss->out_bpf_testmod_ksym, 42, "bpf_testmod_ksym");
 cleanup:
        test_ksyms_module_lskel__destroy(skel);
 }
 
-void test_ksyms_module_libbpf(void)
+static void test_ksyms_module_libbpf(void)
 {
        struct test_ksyms_module *skel;
-       int retval, err;
+       int err;
+       LIBBPF_OPTS(bpf_test_run_opts, topts,
+               .data_in = &pkt_v4,
+               .data_size_in = sizeof(pkt_v4),
+               .repeat = 1,
+       );
 
        if (!env.has_testmod) {
                test__skip();
@@ -43,11 +51,10 @@ void test_ksyms_module_libbpf(void)
        skel = test_ksyms_module__open_and_load();
        if (!ASSERT_OK_PTR(skel, "test_ksyms_module__open"))
                return;
-       err = bpf_prog_test_run(bpf_program__fd(skel->progs.load), 1, &pkt_v4,
-                               sizeof(pkt_v4), NULL, NULL, (__u32 *)&retval, NULL);
+       err = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.load), &topts);
        if (!ASSERT_OK(err, "bpf_prog_test_run"))
                goto cleanup;
-       ASSERT_EQ(retval, 0, "retval");
+       ASSERT_EQ(topts.retval, 0, "retval");
        ASSERT_EQ(skel->bss->out_bpf_testmod_ksym, 42, "bpf_testmod_ksym");
 cleanup:
        test_ksyms_module__destroy(skel);
index 540ef28..55f733f 100644 (file)
@@ -23,12 +23,16 @@ static void test_l4lb(const char *file)
                __u8 flags;
        } real_def = {.dst = MAGIC_VAL};
        __u32 ch_key = 11, real_num = 3;
-       __u32 duration, retval, size;
        int err, i, prog_fd, map_fd;
        __u64 bytes = 0, pkts = 0;
        struct bpf_object *obj;
        char buf[128];
        u32 *magic = (u32 *)buf;
+       LIBBPF_OPTS(bpf_test_run_opts, topts,
+               .data_out = buf,
+               .data_size_out = sizeof(buf),
+               .repeat = NUM_ITER,
+       );
 
        err = bpf_prog_test_load(file, BPF_PROG_TYPE_SCHED_CLS, &obj, &prog_fd);
        if (CHECK_FAIL(err))
@@ -49,19 +53,24 @@ static void test_l4lb(const char *file)
                goto out;
        bpf_map_update_elem(map_fd, &real_num, &real_def, 0);
 
-       err = bpf_prog_test_run(prog_fd, NUM_ITER, &pkt_v4, sizeof(pkt_v4),
-                               buf, &size, &retval, &duration);
-       CHECK(err || retval != 7/*TC_ACT_REDIRECT*/ || size != 54 ||
-             *magic != MAGIC_VAL, "ipv4",
-             "err %d errno %d retval %d size %d magic %x\n",
-             err, errno, retval, size, *magic);
+       topts.data_in = &pkt_v4;
+       topts.data_size_in = sizeof(pkt_v4);
 
-       err = bpf_prog_test_run(prog_fd, NUM_ITER, &pkt_v6, sizeof(pkt_v6),
-                               buf, &size, &retval, &duration);
-       CHECK(err || retval != 7/*TC_ACT_REDIRECT*/ || size != 74 ||
-             *magic != MAGIC_VAL, "ipv6",
-             "err %d errno %d retval %d size %d magic %x\n",
-             err, errno, retval, size, *magic);
+       err = bpf_prog_test_run_opts(prog_fd, &topts);
+       ASSERT_OK(err, "test_run");
+       ASSERT_EQ(topts.retval, 7 /*TC_ACT_REDIRECT*/, "ipv4 test_run retval");
+       ASSERT_EQ(topts.data_size_out, 54, "ipv4 test_run data_size_out");
+       ASSERT_EQ(*magic, MAGIC_VAL, "ipv4 magic");
+
+       topts.data_in = &pkt_v6;
+       topts.data_size_in = sizeof(pkt_v6);
+       topts.data_size_out = sizeof(buf); /* reset out size */
+
+       err = bpf_prog_test_run_opts(prog_fd, &topts);
+       ASSERT_OK(err, "test_run");
+       ASSERT_EQ(topts.retval, 7 /*TC_ACT_REDIRECT*/, "ipv6 test_run retval");
+       ASSERT_EQ(topts.data_size_out, 74, "ipv6 test_run data_size_out");
+       ASSERT_EQ(*magic, MAGIC_VAL, "ipv6 magic");
 
        map_fd = bpf_find_map(__func__, obj, "stats");
        if (map_fd < 0)
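
Note the explicit topts.data_size_out = sizeof(buf) before the IPv6 run:
data_size_out is an in/out field that the kernel overwrites with the size of
the produced output, so it has to be reset whenever an opts struct is reused.
The same reset shows up in the pkt_access and queue_stack_map conversions
below.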
index e469b02..1ef377a 100644 (file)
@@ -202,7 +202,7 @@ static void bpf_btf_load_log_buf(void)
        const void *raw_btf_data;
        __u32 raw_btf_size;
        struct btf *btf;
-       char *log_buf;
+       char *log_buf = NULL;
        int fd = -1;
 
        btf = btf__new_empty();
index 23d19e9..e4e99b3 100644 (file)
@@ -4,14 +4,17 @@
 
 static void *spin_lock_thread(void *arg)
 {
-       __u32 duration, retval;
        int err, prog_fd = *(u32 *) arg;
+       LIBBPF_OPTS(bpf_test_run_opts, topts,
+               .data_in = &pkt_v4,
+               .data_size_in = sizeof(pkt_v4),
+               .repeat = 10000,
+       );
+
+       err = bpf_prog_test_run_opts(prog_fd, &topts);
+       ASSERT_OK(err, "test_run_opts err");
+       ASSERT_OK(topts.retval, "test_run_opts retval");
 
-       err = bpf_prog_test_run(prog_fd, 10000, &pkt_v4, sizeof(pkt_v4),
-                               NULL, NULL, &retval, &duration);
-       CHECK(err || retval, "",
-             "err %d errno %d retval %d duration %d\n",
-             err, errno, retval, duration);
        pthread_exit(arg);
 }
 
index 2737255..43e502a 100644 (file)
@@ -9,10 +9,16 @@
 void test_map_ptr(void)
 {
        struct map_ptr_kern_lskel *skel;
-       __u32 duration = 0, retval;
        char buf[128];
        int err;
        int page_size = getpagesize();
+       LIBBPF_OPTS(bpf_test_run_opts, topts,
+               .data_in = &pkt_v4,
+               .data_size_in = sizeof(pkt_v4),
+               .data_out = buf,
+               .data_size_out = sizeof(buf),
+               .repeat = 1,
+       );
 
        skel = map_ptr_kern_lskel__open();
        if (!ASSERT_OK_PTR(skel, "skel_open"))
@@ -26,14 +32,12 @@ void test_map_ptr(void)
 
        skel->bss->page_size = page_size;
 
-       err = bpf_prog_test_run(skel->progs.cg_skb.prog_fd, 1, &pkt_v4,
-                               sizeof(pkt_v4), buf, NULL, &retval, NULL);
+       err = bpf_prog_test_run_opts(skel->progs.cg_skb.prog_fd, &topts);
 
-       if (CHECK(err, "test_run", "err=%d errno=%d\n", err, errno))
+       if (!ASSERT_OK(err, "test_run"))
                goto cleanup;
 
-       if (CHECK(!retval, "retval", "retval=%d map_type=%u line=%u\n", retval,
-                 skel->bss->g_map_type, skel->bss->g_line))
+       if (!ASSERT_NEQ(topts.retval, 0, "test_run retval"))
                goto cleanup;
 
 cleanup:
index b772fe3..5d9955a 100644 (file)
@@ -15,39 +15,31 @@ static void run_test(__u32 input_retval, __u16 want_side_effect, __s16 want_ret)
 {
        struct modify_return *skel = NULL;
        int err, prog_fd;
-       __u32 duration = 0, retval;
        __u16 side_effect;
        __s16 ret;
+       LIBBPF_OPTS(bpf_test_run_opts, topts);
 
        skel = modify_return__open_and_load();
-       if (CHECK(!skel, "skel_load", "modify_return skeleton failed\n"))
+       if (!ASSERT_OK_PTR(skel, "skel_load"))
                goto cleanup;
 
        err = modify_return__attach(skel);
-       if (CHECK(err, "modify_return", "attach failed: %d\n", err))
+       if (!ASSERT_OK(err, "modify_return__attach failed"))
                goto cleanup;
 
        skel->bss->input_retval = input_retval;
        prog_fd = bpf_program__fd(skel->progs.fmod_ret_test);
-       err = bpf_prog_test_run(prog_fd, 1, NULL, 0, NULL, 0,
-                               &retval, &duration);
+       err = bpf_prog_test_run_opts(prog_fd, &topts);
+       ASSERT_OK(err, "test_run");
 
-       CHECK(err, "test_run", "err %d errno %d\n", err, errno);
+       side_effect = UPPER(topts.retval);
+       ret = LOWER(topts.retval);
 
-       side_effect = UPPER(retval);
-       ret  = LOWER(retval);
-
-       CHECK(ret != want_ret, "test_run",
-             "unexpected ret: %d, expected: %d\n", ret, want_ret);
-       CHECK(side_effect != want_side_effect, "modify_return",
-             "unexpected side_effect: %d\n", side_effect);
-
-       CHECK(skel->bss->fentry_result != 1, "modify_return",
-             "fentry failed\n");
-       CHECK(skel->bss->fexit_result != 1, "modify_return",
-             "fexit failed\n");
-       CHECK(skel->bss->fmod_ret_result != 1, "modify_return",
-             "fmod_ret failed\n");
+       ASSERT_EQ(ret, want_ret, "test_run ret");
+       ASSERT_EQ(side_effect, want_side_effect, "modify_return side_effect");
+       ASSERT_EQ(skel->bss->fentry_result, 1, "modify_return fentry_result");
+       ASSERT_EQ(skel->bss->fexit_result, 1, "modify_return fexit_result");
+       ASSERT_EQ(skel->bss->fmod_ret_result, 1, "modify_return fmod_ret_result");
 
 cleanup:
        modify_return__destroy(skel);
@@ -63,4 +55,3 @@ void serial_test_modify_return(void)
                 0 /* want_side_effect */,
                 -EINVAL /* want_ret */);
 }
-
index 6628710..0bcccdc 100644 (file)
@@ -6,23 +6,27 @@ void test_pkt_access(void)
 {
        const char *file = "./test_pkt_access.o";
        struct bpf_object *obj;
-       __u32 duration, retval;
        int err, prog_fd;
+       LIBBPF_OPTS(bpf_test_run_opts, topts,
+               .data_in = &pkt_v4,
+               .data_size_in = sizeof(pkt_v4),
+               .repeat = 100000,
+       );
 
        err = bpf_prog_test_load(file, BPF_PROG_TYPE_SCHED_CLS, &obj, &prog_fd);
        if (CHECK_FAIL(err))
                return;
 
-       err = bpf_prog_test_run(prog_fd, 100000, &pkt_v4, sizeof(pkt_v4),
-                               NULL, NULL, &retval, &duration);
-       CHECK(err || retval, "ipv4",
-             "err %d errno %d retval %d duration %d\n",
-             err, errno, retval, duration);
+       err = bpf_prog_test_run_opts(prog_fd, &topts);
+       ASSERT_OK(err, "ipv4 test_run_opts err");
+       ASSERT_OK(topts.retval, "ipv4 test_run_opts retval");
+
+       topts.data_in = &pkt_v6;
+       topts.data_size_in = sizeof(pkt_v6);
+       topts.data_size_out = 0; /* reset from last call */
+       err = bpf_prog_test_run_opts(prog_fd, &topts);
+       ASSERT_OK(err, "ipv6 test_run_opts err");
+       ASSERT_OK(topts.retval, "ipv6 test_run_opts retval");
 
-       err = bpf_prog_test_run(prog_fd, 100000, &pkt_v6, sizeof(pkt_v6),
-                               NULL, NULL, &retval, &duration);
-       CHECK(err || retval, "ipv6",
-             "err %d errno %d retval %d duration %d\n",
-             err, errno, retval, duration);
        bpf_object__close(obj);
 }
index c9d2d6a..00ee1dd 100644 (file)
@@ -6,18 +6,20 @@ void test_pkt_md_access(void)
 {
        const char *file = "./test_pkt_md_access.o";
        struct bpf_object *obj;
-       __u32 duration, retval;
        int err, prog_fd;
+       LIBBPF_OPTS(bpf_test_run_opts, topts,
+               .data_in = &pkt_v4,
+               .data_size_in = sizeof(pkt_v4),
+               .repeat = 10,
+       );
 
        err = bpf_prog_test_load(file, BPF_PROG_TYPE_SCHED_CLS, &obj, &prog_fd);
        if (CHECK_FAIL(err))
                return;
 
-       err = bpf_prog_test_run(prog_fd, 10, &pkt_v4, sizeof(pkt_v4),
-                               NULL, NULL, &retval, &duration);
-       CHECK(err || retval, "",
-             "err %d errno %d retval %d duration %d\n",
-             err, errno, retval, duration);
+       err = bpf_prog_test_run_opts(prog_fd, &topts);
+       ASSERT_OK(err, "test_run_opts err");
+       ASSERT_OK(topts.retval, "test_run_opts retval");
 
        bpf_object__close(obj);
 }
diff --git a/tools/testing/selftests/bpf/prog_tests/prog_run_opts.c b/tools/testing/selftests/bpf/prog_tests/prog_run_opts.c
new file mode 100644 (file)
index 0000000..1ccd2bd
--- /dev/null
@@ -0,0 +1,77 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <test_progs.h>
+#include <network_helpers.h>
+
+#include "test_pkt_access.skel.h"
+
+static const __u32 duration;
+
+static void check_run_cnt(int prog_fd, __u64 run_cnt)
+{
+       struct bpf_prog_info info = {};
+       __u32 info_len = sizeof(info);
+       int err;
+
+       err = bpf_obj_get_info_by_fd(prog_fd, &info, &info_len);
+       if (CHECK(err, "get_prog_info", "failed to get bpf_prog_info for fd %d\n", prog_fd))
+               return;
+
+       CHECK(run_cnt != info.run_cnt, "run_cnt",
+             "incorrect number of repetitions, want %llu have %llu\n", run_cnt, info.run_cnt);
+}
+
+void test_prog_run_opts(void)
+{
+       struct test_pkt_access *skel;
+       int err, stats_fd = -1, prog_fd;
+       char buf[10] = {};
+       __u64 run_cnt = 0;
+
+       LIBBPF_OPTS(bpf_test_run_opts, topts,
+               .repeat = 1,
+               .data_in = &pkt_v4,
+               .data_size_in = sizeof(pkt_v4),
+               .data_out = buf,
+               .data_size_out = 5,
+       );
+
+       stats_fd = bpf_enable_stats(BPF_STATS_RUN_TIME);
+       if (!ASSERT_GE(stats_fd, 0, "enable_stats good fd"))
+               return;
+
+       skel = test_pkt_access__open_and_load();
+       if (!ASSERT_OK_PTR(skel, "open_and_load"))
+               goto cleanup;
+
+       prog_fd = bpf_program__fd(skel->progs.test_pkt_access);
+
+       err = bpf_prog_test_run_opts(prog_fd, &topts);
+       ASSERT_EQ(errno, ENOSPC, "test_run errno");
+       ASSERT_ERR(err, "test_run");
+       ASSERT_OK(topts.retval, "test_run retval");
+
+       ASSERT_EQ(topts.data_size_out, sizeof(pkt_v4), "test_run data_size_out");
+       ASSERT_EQ(buf[5], 0, "overflow, BPF_PROG_TEST_RUN ignored size hint");
+
+       run_cnt += topts.repeat;
+       check_run_cnt(prog_fd, run_cnt);
+
+       topts.data_out = NULL;
+       topts.data_size_out = 0;
+       topts.repeat = 2;
+       errno = 0;
+
+       err = bpf_prog_test_run_opts(prog_fd, &topts);
+       ASSERT_OK(errno, "run_no_output errno");
+       ASSERT_OK(err, "run_no_output err");
+       ASSERT_OK(topts.retval, "run_no_output retval");
+
+       run_cnt += topts.repeat;
+       check_run_cnt(prog_fd, run_cnt);
+
+cleanup:
+       if (skel)
+               test_pkt_access__destroy(skel);
+       if (stats_fd >= 0)
+               close(stats_fd);
+}
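
Two details of the rewrite are worth calling out. The first run deliberately
hands BPF_PROG_TEST_RUN a 5-byte data_out buffer for a packet-sized output,
so the kernel truncates the copy, reports the full size through
data_size_out, and fails the syscall with ENOSPC, which the test now asserts
through errno. And unlike the deleted xattr variant below, the opts version
no longer re-runs with data_size_out = 1 to check for -EINVAL.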
diff --git a/tools/testing/selftests/bpf/prog_tests/prog_run_xattr.c b/tools/testing/selftests/bpf/prog_tests/prog_run_xattr.c
deleted file mode 100644 (file)
index 89fc98f..0000000
+++ /dev/null
@@ -1,83 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include <test_progs.h>
-#include <network_helpers.h>
-
-#include "test_pkt_access.skel.h"
-
-static const __u32 duration;
-
-static void check_run_cnt(int prog_fd, __u64 run_cnt)
-{
-       struct bpf_prog_info info = {};
-       __u32 info_len = sizeof(info);
-       int err;
-
-       err = bpf_obj_get_info_by_fd(prog_fd, &info, &info_len);
-       if (CHECK(err, "get_prog_info", "failed to get bpf_prog_info for fd %d\n", prog_fd))
-               return;
-
-       CHECK(run_cnt != info.run_cnt, "run_cnt",
-             "incorrect number of repetitions, want %llu have %llu\n", run_cnt, info.run_cnt);
-}
-
-void test_prog_run_xattr(void)
-{
-       struct test_pkt_access *skel;
-       int err, stats_fd = -1;
-       char buf[10] = {};
-       __u64 run_cnt = 0;
-
-       struct bpf_prog_test_run_attr tattr = {
-               .repeat = 1,
-               .data_in = &pkt_v4,
-               .data_size_in = sizeof(pkt_v4),
-               .data_out = buf,
-               .data_size_out = 5,
-       };
-
-       stats_fd = bpf_enable_stats(BPF_STATS_RUN_TIME);
-       if (CHECK_ATTR(stats_fd < 0, "enable_stats", "failed %d\n", errno))
-               return;
-
-       skel = test_pkt_access__open_and_load();
-       if (CHECK_ATTR(!skel, "open_and_load", "failed\n"))
-               goto cleanup;
-
-       tattr.prog_fd = bpf_program__fd(skel->progs.test_pkt_access);
-
-       err = bpf_prog_test_run_xattr(&tattr);
-       CHECK_ATTR(err >= 0 || errno != ENOSPC || tattr.retval, "run",
-             "err %d errno %d retval %d\n", err, errno, tattr.retval);
-
-       CHECK_ATTR(tattr.data_size_out != sizeof(pkt_v4), "data_size_out",
-             "incorrect output size, want %zu have %u\n",
-             sizeof(pkt_v4), tattr.data_size_out);
-
-       CHECK_ATTR(buf[5] != 0, "overflow",
-             "BPF_PROG_TEST_RUN ignored size hint\n");
-
-       run_cnt += tattr.repeat;
-       check_run_cnt(tattr.prog_fd, run_cnt);
-
-       tattr.data_out = NULL;
-       tattr.data_size_out = 0;
-       tattr.repeat = 2;
-       errno = 0;
-
-       err = bpf_prog_test_run_xattr(&tattr);
-       CHECK_ATTR(err || errno || tattr.retval, "run_no_output",
-             "err %d errno %d retval %d\n", err, errno, tattr.retval);
-
-       tattr.data_size_out = 1;
-       err = bpf_prog_test_run_xattr(&tattr);
-       CHECK_ATTR(err != -EINVAL, "run_wrong_size_out", "err %d\n", err);
-
-       run_cnt += tattr.repeat;
-       check_run_cnt(tattr.prog_fd, run_cnt);
-
-cleanup:
-       if (skel)
-               test_pkt_access__destroy(skel);
-       if (stats_fd >= 0)
-               close(stats_fd);
-}
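
The deleted file covered the same paths through the now-deprecated attr-based entry point. As a before/after sketch of the migration (prog_fd, pkt, and len are placeholders, not a drop-in conversion):

        #include <bpf/bpf.h>

        static int run_old_vs_new(int prog_fd, void *pkt, size_t len)
        {
                int err;

                /* Old, deprecated: the program fd travels inside the attr. */
                struct bpf_prog_test_run_attr tattr = {
                        .prog_fd = prog_fd,
                        .data_in = pkt,
                        .data_size_in = len,
                };
                err = bpf_prog_test_run_xattr(&tattr);

                /* New: the fd is an explicit argument; LIBBPF_OPTS() sets
                 * the .sz field that keeps the opts struct extensible. */
                LIBBPF_OPTS(bpf_test_run_opts, topts,
                        .data_in = pkt,
                        .data_size_in = len,
                );
                err = bpf_prog_test_run_opts(prog_fd, &topts);
                return err;
        }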
index b9822f9..d2743fc 100644 (file)
@@ -10,11 +10,18 @@ enum {
 static void test_queue_stack_map_by_type(int type)
 {
        const int MAP_SIZE = 32;
-       __u32 vals[MAP_SIZE], duration, retval, size, val;
+       __u32 vals[MAP_SIZE], val;
        int i, err, prog_fd, map_in_fd, map_out_fd;
        char file[32], buf[128];
        struct bpf_object *obj;
        struct iphdr iph;
+       LIBBPF_OPTS(bpf_test_run_opts, topts,
+               .data_in = &pkt_v4,
+               .data_size_in = sizeof(pkt_v4),
+               .data_out = buf,
+               .data_size_out = sizeof(buf),
+               .repeat = 1,
+       );
 
        /* Fill test values to be used */
        for (i = 0; i < MAP_SIZE; i++)
@@ -58,38 +65,37 @@ static void test_queue_stack_map_by_type(int type)
                        pkt_v4.iph.saddr = vals[MAP_SIZE - 1 - i] * 5;
                }
 
-               err = bpf_prog_test_run(prog_fd, 1, &pkt_v4, sizeof(pkt_v4),
-                                       buf, &size, &retval, &duration);
-               if (err || retval || size != sizeof(pkt_v4))
+               topts.data_size_out = sizeof(buf);
+               err = bpf_prog_test_run_opts(prog_fd, &topts);
+               if (err || topts.retval ||
+                   topts.data_size_out != sizeof(pkt_v4))
                        break;
                memcpy(&iph, buf + sizeof(struct ethhdr), sizeof(iph));
                if (iph.daddr != val)
                        break;
        }
 
-       CHECK(err || retval || size != sizeof(pkt_v4) || iph.daddr != val,
-             "bpf_map_pop_elem",
-             "err %d errno %d retval %d size %d iph->daddr %u\n",
-             err, errno, retval, size, iph.daddr);
+       ASSERT_OK(err, "bpf_map_pop_elem");
+       ASSERT_OK(topts.retval, "bpf_map_pop_elem test retval");
+       ASSERT_EQ(topts.data_size_out, sizeof(pkt_v4),
+                 "bpf_map_pop_elem data_size_out");
+       ASSERT_EQ(iph.daddr, val, "bpf_map_pop_elem iph.daddr");
 
        /* Queue is empty, program should return TC_ACT_SHOT */
-       err = bpf_prog_test_run(prog_fd, 1, &pkt_v4, sizeof(pkt_v4),
-                               buf, &size, &retval, &duration);
-       CHECK(err || retval != 2 /* TC_ACT_SHOT */|| size != sizeof(pkt_v4),
-             "check-queue-stack-map-empty",
-             "err %d errno %d retval %d size %d\n",
-             err, errno, retval, size);
+       topts.data_size_out = sizeof(buf);
+       err = bpf_prog_test_run_opts(prog_fd, &topts);
+       ASSERT_OK(err, "check-queue-stack-map-empty");
+       ASSERT_EQ(topts.retval, 2 /* TC_ACT_SHOT */,
+                 "check-queue-stack-map-empty test retval");
+       ASSERT_EQ(topts.data_size_out, sizeof(pkt_v4),
+                 "check-queue-stack-map-empty data_size_out");
 
        /* Check that the program pushed elements correctly */
        for (i = 0; i < MAP_SIZE; i++) {
                err = bpf_map_lookup_and_delete_elem(map_out_fd, NULL, &val);
-               if (err || val != vals[i] * 5)
-                       break;
+               ASSERT_OK(err, "bpf_map_lookup_and_delete_elem");
+               ASSERT_EQ(val, vals[i] * 5, "bpf_map_push_elem val");
        }
-
-       CHECK(i != MAP_SIZE && (err || val != vals[i] * 5),
-             "bpf_map_push_elem", "err %d value %u\n", err, val);
-
 out:
        pkt_v4.iph.saddr = 0;
        bpf_object__close(obj);
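
Draining the queue/stack maps used above from userspace relies on lookup-and-delete with a NULL key; in isolation (drain_queue() is a hypothetical helper, map_fd an fd for a queue of __u32 values):

        #include <stdio.h>
        #include <bpf/bpf.h>

        /* Pop every element of a BPF_MAP_TYPE_QUEUE. Queue/stack maps
         * take no key: NULL pops the head (queue) or top (stack). */
        static void drain_queue(int map_fd)
        {
                __u32 val;

                while (bpf_map_lookup_and_delete_elem(map_fd, NULL, &val) == 0)
                        printf("popped %u\n", val);
                /* the loop ends with errno == ENOENT once the map is empty */
        }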
index 41720a6..fe5b8fa 100644 (file)
@@ -5,59 +5,54 @@
 #include "bpf/libbpf_internal.h"
 #include "test_raw_tp_test_run.skel.h"
 
-static int duration;
-
 void test_raw_tp_test_run(void)
 {
-       struct bpf_prog_test_run_attr test_attr = {};
        int comm_fd = -1, err, nr_online, i, prog_fd;
        __u64 args[2] = {0x1234ULL, 0x5678ULL};
        int expected_retval = 0x1234 + 0x5678;
        struct test_raw_tp_test_run *skel;
        char buf[] = "new_name";
        bool *online = NULL;
-       DECLARE_LIBBPF_OPTS(bpf_test_run_opts, opts,
-                           .ctx_in = args,
-                           .ctx_size_in = sizeof(args),
-                           .flags = BPF_F_TEST_RUN_ON_CPU,
-               );
+       LIBBPF_OPTS(bpf_test_run_opts, opts,
+               .ctx_in = args,
+               .ctx_size_in = sizeof(args),
+               .flags = BPF_F_TEST_RUN_ON_CPU,
+       );
 
        err = parse_cpu_mask_file("/sys/devices/system/cpu/online", &online,
                                  &nr_online);
-       if (CHECK(err, "parse_cpu_mask_file", "err %d\n", err))
+       if (!ASSERT_OK(err, "parse_cpu_mask_file"))
                return;
 
        skel = test_raw_tp_test_run__open_and_load();
-       if (CHECK(!skel, "skel_open", "failed to open skeleton\n"))
+       if (!ASSERT_OK_PTR(skel, "skel_open"))
                goto cleanup;
 
        err = test_raw_tp_test_run__attach(skel);
-       if (CHECK(err, "skel_attach", "skeleton attach failed: %d\n", err))
+       if (!ASSERT_OK(err, "skel_attach"))
                goto cleanup;
 
        comm_fd = open("/proc/self/comm", O_WRONLY|O_TRUNC);
-       if (CHECK(comm_fd < 0, "open /proc/self/comm", "err %d\n", errno))
+       if (!ASSERT_GE(comm_fd, 0, "open /proc/self/comm"))
                goto cleanup;
 
        err = write(comm_fd, buf, sizeof(buf));
-       CHECK(err < 0, "task rename", "err %d", errno);
+       ASSERT_GE(err, 0, "task rename");
 
-       CHECK(skel->bss->count == 0, "check_count", "didn't increase\n");
-       CHECK(skel->data->on_cpu != 0xffffffff, "check_on_cpu", "got wrong value\n");
+       ASSERT_NEQ(skel->bss->count, 0, "check_count");
+       ASSERT_EQ(skel->data->on_cpu, 0xffffffff, "check_on_cpu");
 
        prog_fd = bpf_program__fd(skel->progs.rename);
-       test_attr.prog_fd = prog_fd;
-       test_attr.ctx_in = args;
-       test_attr.ctx_size_in = sizeof(__u64);
+       opts.ctx_in = args;
+       opts.ctx_size_in = sizeof(__u64);
 
-       err = bpf_prog_test_run_xattr(&test_attr);
-       CHECK(err == 0, "test_run", "should fail for too small ctx\n");
+       err = bpf_prog_test_run_opts(prog_fd, &opts);
+       ASSERT_NEQ(err, 0, "test_run should fail for too small ctx");
 
-       test_attr.ctx_size_in = sizeof(args);
-       err = bpf_prog_test_run_xattr(&test_attr);
-       CHECK(err < 0, "test_run", "err %d\n", errno);
-       CHECK(test_attr.retval != expected_retval, "check_retval",
-             "expect 0x%x, got 0x%x\n", expected_retval, test_attr.retval);
+       opts.ctx_size_in = sizeof(args);
+       err = bpf_prog_test_run_opts(prog_fd, &opts);
+       ASSERT_OK(err, "test_run");
+       ASSERT_EQ(opts.retval, expected_retval, "check_retval");
 
        for (i = 0; i < nr_online; i++) {
                if (!online[i])
@@ -66,28 +61,23 @@ void test_raw_tp_test_run(void)
                opts.cpu = i;
                opts.retval = 0;
                err = bpf_prog_test_run_opts(prog_fd, &opts);
-               CHECK(err < 0, "test_run_opts", "err %d\n", errno);
-               CHECK(skel->data->on_cpu != i, "check_on_cpu",
-                     "expect %d got %d\n", i, skel->data->on_cpu);
-               CHECK(opts.retval != expected_retval,
-                     "check_retval", "expect 0x%x, got 0x%x\n",
-                     expected_retval, opts.retval);
+               ASSERT_OK(err, "test_run_opts");
+               ASSERT_EQ(skel->data->on_cpu, i, "check_on_cpu");
+               ASSERT_EQ(opts.retval, expected_retval, "check_retval");
        }
 
        /* invalid cpu ID should fail with ENXIO */
        opts.cpu = 0xffffffff;
        err = bpf_prog_test_run_opts(prog_fd, &opts);
-       CHECK(err >= 0 || errno != ENXIO,
-             "test_run_opts_fail",
-             "should failed with ENXIO\n");
+       ASSERT_EQ(errno, ENXIO, "test_run_opts should fail with ENXIO");
+       ASSERT_ERR(err, "test_run_opts_fail");
 
        /* non-zero cpu w/o BPF_F_TEST_RUN_ON_CPU should fail with EINVAL */
        opts.cpu = 1;
        opts.flags = 0;
        err = bpf_prog_test_run_opts(prog_fd, &opts);
-       CHECK(err >= 0 || errno != EINVAL,
-             "test_run_opts_fail",
-             "should failed with EINVAL\n");
+       ASSERT_EQ(errno, EINVAL, "test_run_opts should fail with EINVAL");
+       ASSERT_ERR(err, "test_run_opts_fail");
 
 cleanup:
        close(comm_fd);
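
The CPU-pinned mode being converted above comes down to two opts fields plus a flag; a sketch assuming prog_fd is a loaded raw_tp program and args its two-u64 context:

        #include <bpf/bpf.h>

        /* Run a raw tracepoint program on one specific CPU. */
        static int run_on_cpu(int prog_fd, __u64 args[2], int cpu)
        {
                LIBBPF_OPTS(bpf_test_run_opts, opts,
                        .ctx_in = args,
                        .ctx_size_in = 2 * sizeof(__u64),
                        .flags = BPF_F_TEST_RUN_ON_CPU,
                        .cpu = cpu,
                );
                int err = bpf_prog_test_run_opts(prog_fd, &opts);

                if (err)
                        return err; /* ENXIO for an offline/invalid cpu */
                return opts.retval;
        }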
index 239bacc..f4aa7da 100644 (file)
@@ -56,21 +56,23 @@ void serial_test_raw_tp_writable_test_run(void)
                0,
        };
 
-       __u32 prog_ret;
-       int err = bpf_prog_test_run(filter_fd, 1, test_skb, sizeof(test_skb), 0,
-                                   0, &prog_ret, 0);
+       LIBBPF_OPTS(bpf_test_run_opts, topts,
+               .data_in = test_skb,
+               .data_size_in = sizeof(test_skb),
+               .repeat = 1,
+       );
+       int err = bpf_prog_test_run_opts(filter_fd, &topts);
        CHECK(err != 42, "test_run",
              "tracepoint did not modify return value\n");
-       CHECK(prog_ret != 0, "test_run_ret",
+       CHECK(topts.retval != 0, "test_run_ret",
              "socket_filter did not return 0\n");
 
        close(tp_fd);
 
-       err = bpf_prog_test_run(filter_fd, 1, test_skb, sizeof(test_skb), 0, 0,
-                               &prog_ret, 0);
+       err = bpf_prog_test_run_opts(filter_fd, &topts);
        CHECK(err != 0, "test_run_notrace",
              "test_run failed with %d errno %d\n", err, errno);
-       CHECK(prog_ret != 0, "test_run_ret_notrace",
+       CHECK(topts.retval != 0, "test_run_ret_notrace",
              "socket_filter did not return 0\n");
 
 out_filterfd:
index aecfe66..70b49da 100644 (file)
@@ -13,10 +13,14 @@ static void test_signal_pending_by_type(enum bpf_prog_type prog_type)
        struct itimerval timeo = {
                .it_value.tv_usec = 100000, /* 100ms */
        };
-       __u32 duration = 0, retval;
        int prog_fd;
        int err;
        int i;
+       LIBBPF_OPTS(bpf_test_run_opts, topts,
+               .data_in = &pkt_v4,
+               .data_size_in = sizeof(pkt_v4),
+               .repeat = 0xffffffff,
+       );
 
        for (i = 0; i < ARRAY_SIZE(prog); i++)
                prog[i] = BPF_ALU64_IMM(BPF_MOV, BPF_REG_0, 0);
@@ -24,20 +28,17 @@ static void test_signal_pending_by_type(enum bpf_prog_type prog_type)
 
        prog_fd = bpf_test_load_program(prog_type, prog, ARRAY_SIZE(prog),
                                   "GPL", 0, NULL, 0);
-       CHECK(prog_fd < 0, "test-run", "errno %d\n", errno);
+       ASSERT_GE(prog_fd, 0, "test-run load");
 
        err = sigaction(SIGALRM, &sigalrm_action, NULL);
-       CHECK(err, "test-run-signal-sigaction", "errno %d\n", errno);
+       ASSERT_OK(err, "test-run-signal-sigaction");
 
        err = setitimer(ITIMER_REAL, &timeo, NULL);
-       CHECK(err, "test-run-signal-timer", "errno %d\n", errno);
-
-       err = bpf_prog_test_run(prog_fd, 0xffffffff, &pkt_v4, sizeof(pkt_v4),
-                               NULL, NULL, &retval, &duration);
-       CHECK(duration > 500000000, /* 500ms */
-             "test-run-signal-duration",
-             "duration %dns > 500ms\n",
-             duration);
+       ASSERT_OK(err, "test-run-signal-timer");
+
+       err = bpf_prog_test_run_opts(prog_fd, &topts);
+       ASSERT_LE(topts.duration, 500000000 /* 500ms */,
+                 "test-run-signal-duration");
 
        signal(SIGALRM, SIG_DFL);
 }
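
The 0xffffffff repeat count above only terminates because a pending signal aborts the kernel's run loop; the timer setup, stripped of the harness (on_alarm() is a placeholder no-op handler):

        #include <signal.h>
        #include <sys/time.h>

        static void on_alarm(int sig) { (void)sig; }

        /* Arm a 100ms one-shot timer; once SIGALRM is pending, the
         * kernel breaks out of the huge-repeat test run instead of
         * spinning for minutes. */
        static void arm_interrupt_timer(void)
        {
                struct itimerval timeo = {
                        .it_value.tv_usec = 100000, /* 100ms */
                };

                signal(SIGALRM, on_alarm);
                setitimer(ITIMER_REAL, &timeo, NULL);
        }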
index b5319ba..ce0e555 100644 (file)
@@ -20,97 +20,72 @@ void test_skb_ctx(void)
                .gso_size = 10,
                .hwtstamp = 11,
        };
-       struct bpf_prog_test_run_attr tattr = {
+       LIBBPF_OPTS(bpf_test_run_opts, tattr,
                .data_in = &pkt_v4,
                .data_size_in = sizeof(pkt_v4),
                .ctx_in = &skb,
                .ctx_size_in = sizeof(skb),
                .ctx_out = &skb,
                .ctx_size_out = sizeof(skb),
-       };
+       );
        struct bpf_object *obj;
-       int err;
-       int i;
+       int err, prog_fd, i;
 
-       err = bpf_prog_test_load("./test_skb_ctx.o", BPF_PROG_TYPE_SCHED_CLS, &obj,
-                           &tattr.prog_fd);
-       if (CHECK_ATTR(err, "load", "err %d errno %d\n", err, errno))
+       err = bpf_prog_test_load("./test_skb_ctx.o", BPF_PROG_TYPE_SCHED_CLS,
+                                &obj, &prog_fd);
+       if (!ASSERT_OK(err, "load"))
                return;
 
        /* ctx_in != NULL, ctx_size_in == 0 */
 
        tattr.ctx_size_in = 0;
-       err = bpf_prog_test_run_xattr(&tattr);
-       CHECK_ATTR(err == 0, "ctx_size_in", "err %d errno %d\n", err, errno);
+       err = bpf_prog_test_run_opts(prog_fd, &tattr);
+       ASSERT_NEQ(err, 0, "ctx_size_in");
        tattr.ctx_size_in = sizeof(skb);
 
        /* ctx_out != NULL, ctx_size_out == 0 */
 
        tattr.ctx_size_out = 0;
-       err = bpf_prog_test_run_xattr(&tattr);
-       CHECK_ATTR(err == 0, "ctx_size_out", "err %d errno %d\n", err, errno);
+       err = bpf_prog_test_run_opts(prog_fd, &tattr);
+       ASSERT_NEQ(err, 0, "ctx_size_out");
        tattr.ctx_size_out = sizeof(skb);
 
        /* non-zero [len, tc_index] fields should be rejected */
 
        skb.len = 1;
-       err = bpf_prog_test_run_xattr(&tattr);
-       CHECK_ATTR(err == 0, "len", "err %d errno %d\n", err, errno);
+       err = bpf_prog_test_run_opts(prog_fd, &tattr);
+       ASSERT_NEQ(err, 0, "len");
        skb.len = 0;
 
        skb.tc_index = 1;
-       err = bpf_prog_test_run_xattr(&tattr);
-       CHECK_ATTR(err == 0, "tc_index", "err %d errno %d\n", err, errno);
+       err = bpf_prog_test_run_opts(prog_fd, &tattr);
+       ASSERT_NEQ(err, 0, "tc_index");
        skb.tc_index = 0;
 
        /* non-zero [hash, sk] fields should be rejected */
 
        skb.hash = 1;
-       err = bpf_prog_test_run_xattr(&tattr);
-       CHECK_ATTR(err == 0, "hash", "err %d errno %d\n", err, errno);
+       err = bpf_prog_test_run_opts(prog_fd, &tattr);
+       ASSERT_NEQ(err, 0, "hash");
        skb.hash = 0;
 
        skb.sk = (struct bpf_sock *)1;
-       err = bpf_prog_test_run_xattr(&tattr);
-       CHECK_ATTR(err == 0, "sk", "err %d errno %d\n", err, errno);
+       err = bpf_prog_test_run_opts(prog_fd, &tattr);
+       ASSERT_NEQ(err, 0, "sk");
        skb.sk = 0;
 
-       err = bpf_prog_test_run_xattr(&tattr);
-       CHECK_ATTR(err != 0 || tattr.retval,
-                  "run",
-                  "err %d errno %d retval %d\n",
-                  err, errno, tattr.retval);
-
-       CHECK_ATTR(tattr.ctx_size_out != sizeof(skb),
-                  "ctx_size_out",
-                  "incorrect output size, want %zu have %u\n",
-                  sizeof(skb), tattr.ctx_size_out);
+       err = bpf_prog_test_run_opts(prog_fd, &tattr);
+       ASSERT_OK(err, "test_run");
+       ASSERT_OK(tattr.retval, "test_run retval");
+       ASSERT_EQ(tattr.ctx_size_out, sizeof(skb), "ctx_size_out");
 
        for (i = 0; i < 5; i++)
-               CHECK_ATTR(skb.cb[i] != i + 2,
-                          "ctx_out_cb",
-                          "skb->cb[i] == %d, expected %d\n",
-                          skb.cb[i], i + 2);
-       CHECK_ATTR(skb.priority != 7,
-                  "ctx_out_priority",
-                  "skb->priority == %d, expected %d\n",
-                  skb.priority, 7);
-       CHECK_ATTR(skb.ifindex != 1,
-                  "ctx_out_ifindex",
-                  "skb->ifindex == %d, expected %d\n",
-                  skb.ifindex, 1);
-       CHECK_ATTR(skb.ingress_ifindex != 11,
-                  "ctx_out_ingress_ifindex",
-                  "skb->ingress_ifindex == %d, expected %d\n",
-                  skb.ingress_ifindex, 11);
-       CHECK_ATTR(skb.tstamp != 8,
-                  "ctx_out_tstamp",
-                  "skb->tstamp == %lld, expected %d\n",
-                  skb.tstamp, 8);
-       CHECK_ATTR(skb.mark != 10,
-                  "ctx_out_mark",
-                  "skb->mark == %u, expected %d\n",
-                  skb.mark, 10);
+               ASSERT_EQ(skb.cb[i], i + 2, "ctx_out_cb");
+       ASSERT_EQ(skb.priority, 7, "ctx_out_priority");
+       ASSERT_EQ(skb.ifindex, 1, "ctx_out_ifindex");
+       ASSERT_EQ(skb.ingress_ifindex, 11, "ctx_out_ingress_ifindex");
+       ASSERT_EQ(skb.tstamp, 8, "ctx_out_tstamp");
+       ASSERT_EQ(skb.mark, 10, "ctx_out_mark");
 
        bpf_object__close(obj);
 }
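
The ctx round trip asserted above can be shown in isolation: the same __sk_buff is handed in via ctx_in and read back via ctx_out (prog_fd and pkt are placeholders):

        #include <linux/bpf.h>
        #include <bpf/bpf.h>

        /* Feed skb metadata into a SCHED_CLS test run and observe the
         * fields the program wrote back. */
        static int run_with_skb_ctx(int prog_fd, void *pkt, size_t pkt_len)
        {
                struct __sk_buff skb = { .mark = 10, .priority = 7 };
                LIBBPF_OPTS(bpf_test_run_opts, topts,
                        .data_in = pkt,
                        .data_size_in = pkt_len,
                        .ctx_in = &skb,
                        .ctx_size_in = sizeof(skb),
                        .ctx_out = &skb,
                        .ctx_size_out = sizeof(skb),
                );
                int err = bpf_prog_test_run_opts(prog_fd, &topts);

                /* on success, skb reflects any writes the program made */
                return err;
        }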
index 6f802a1..97dc8b1 100644 (file)
@@ -9,22 +9,22 @@ void test_skb_helpers(void)
                .gso_segs = 8,
                .gso_size = 10,
        };
-       struct bpf_prog_test_run_attr tattr = {
+       LIBBPF_OPTS(bpf_test_run_opts, topts,
                .data_in = &pkt_v4,
                .data_size_in = sizeof(pkt_v4),
                .ctx_in = &skb,
                .ctx_size_in = sizeof(skb),
                .ctx_out = &skb,
                .ctx_size_out = sizeof(skb),
-       };
+       );
        struct bpf_object *obj;
-       int err;
+       int err, prog_fd;
 
-       err = bpf_prog_test_load("./test_skb_helpers.o", BPF_PROG_TYPE_SCHED_CLS, &obj,
-                           &tattr.prog_fd);
-       if (CHECK_ATTR(err, "load", "err %d errno %d\n", err, errno))
+       err = bpf_prog_test_load("./test_skb_helpers.o",
+                                BPF_PROG_TYPE_SCHED_CLS, &obj, &prog_fd);
+       if (!ASSERT_OK(err, "load"))
                return;
-       err = bpf_prog_test_run_xattr(&tattr);
-       CHECK_ATTR(err, "len", "err %d errno %d\n", err, errno);
+       err = bpf_prog_test_run_opts(prog_fd, &topts);
+       ASSERT_OK(err, "test_run");
        bpf_object__close(obj);
 }
index 9fc040e..9d211b5 100644 (file)
@@ -1,9 +1,11 @@
 // SPDX-License-Identifier: GPL-2.0
 /* Copyright (c) 2019 Facebook */
 
+#define _GNU_SOURCE
 #include <netinet/in.h>
 #include <arpa/inet.h>
 #include <unistd.h>
+#include <sched.h>
 #include <stdlib.h>
 #include <string.h>
 #include <errno.h>
@@ -20,6 +22,7 @@
 enum bpf_linum_array_idx {
        EGRESS_LINUM_IDX,
        INGRESS_LINUM_IDX,
+       READ_SK_DST_PORT_LINUM_IDX,
        __NR_BPF_LINUM_ARRAY_IDX,
 };
 
@@ -42,8 +45,16 @@ static __u64 child_cg_id;
 static int linum_map_fd;
 static __u32 duration;
 
-static __u32 egress_linum_idx = EGRESS_LINUM_IDX;
-static __u32 ingress_linum_idx = INGRESS_LINUM_IDX;
+static bool create_netns(void)
+{
+       if (!ASSERT_OK(unshare(CLONE_NEWNET), "create netns"))
+               return false;
+
+       if (!ASSERT_OK(system("ip link set dev lo up"), "bring up lo"))
+               return false;
+
+       return true;
+}
 
 static void print_sk(const struct bpf_sock *sk, const char *prefix)
 {
@@ -91,19 +102,24 @@ static void check_result(void)
 {
        struct bpf_tcp_sock srv_tp, cli_tp, listen_tp;
        struct bpf_sock srv_sk, cli_sk, listen_sk;
-       __u32 ingress_linum, egress_linum;
+       __u32 idx, ingress_linum, egress_linum, linum;
        int err;
 
-       err = bpf_map_lookup_elem(linum_map_fd, &egress_linum_idx,
-                                 &egress_linum);
+       idx = EGRESS_LINUM_IDX;
+       err = bpf_map_lookup_elem(linum_map_fd, &idx, &egress_linum);
        CHECK(err < 0, "bpf_map_lookup_elem(linum_map_fd)",
              "err:%d errno:%d\n", err, errno);
 
-       err = bpf_map_lookup_elem(linum_map_fd, &ingress_linum_idx,
-                                 &ingress_linum);
+       idx = INGRESS_LINUM_IDX;
+       err = bpf_map_lookup_elem(linum_map_fd, &idx, &ingress_linum);
        CHECK(err < 0, "bpf_map_lookup_elem(linum_map_fd)",
              "err:%d errno:%d\n", err, errno);
 
+       idx = READ_SK_DST_PORT_LINUM_IDX;
+       err = bpf_map_lookup_elem(linum_map_fd, &idx, &linum);
+       ASSERT_OK(err, "bpf_map_lookup_elem(linum_map_fd, READ_SK_DST_PORT_IDX)");
+       ASSERT_EQ(linum, 0, "failure in read_sk_dst_port on line");
+
        memcpy(&srv_sk, &skel->bss->srv_sk, sizeof(srv_sk));
        memcpy(&srv_tp, &skel->bss->srv_tp, sizeof(srv_tp));
        memcpy(&cli_sk, &skel->bss->cli_sk, sizeof(cli_sk));
@@ -262,7 +278,7 @@ static void test(void)
        char buf[DATA_LEN];
 
        /* Prepare listen_fd */
-       listen_fd = start_server(AF_INET6, SOCK_STREAM, "::1", 0, 0);
+       listen_fd = start_server(AF_INET6, SOCK_STREAM, "::1", 0xcafe, 0);
        /* start_server() has logged the error details */
        if (CHECK_FAIL(listen_fd == -1))
                goto done;
@@ -330,8 +346,12 @@ done:
 
 void serial_test_sock_fields(void)
 {
-       struct bpf_link *egress_link = NULL, *ingress_link = NULL;
        int parent_cg_fd = -1, child_cg_fd = -1;
+       struct bpf_link *link;
+
+       /* Use a dedicated netns to have a fixed listen port */
+       if (!create_netns())
+               return;
 
        /* Create a cgroup, get fd, and join it */
        parent_cg_fd = test__join_cgroup(PARENT_CGROUP);
@@ -352,15 +372,20 @@ void serial_test_sock_fields(void)
        if (CHECK(!skel, "test_sock_fields__open_and_load", "failed\n"))
                goto done;
 
-       egress_link = bpf_program__attach_cgroup(skel->progs.egress_read_sock_fields,
-                                                child_cg_fd);
-       if (!ASSERT_OK_PTR(egress_link, "attach_cgroup(egress)"))
+       link = bpf_program__attach_cgroup(skel->progs.egress_read_sock_fields, child_cg_fd);
+       if (!ASSERT_OK_PTR(link, "attach_cgroup(egress_read_sock_fields)"))
+               goto done;
+       skel->links.egress_read_sock_fields = link;
+
+       link = bpf_program__attach_cgroup(skel->progs.ingress_read_sock_fields, child_cg_fd);
+       if (!ASSERT_OK_PTR(link, "attach_cgroup(ingress_read_sock_fields)"))
                goto done;
+       skel->links.ingress_read_sock_fields = link;
 
-       ingress_link = bpf_program__attach_cgroup(skel->progs.ingress_read_sock_fields,
-                                                 child_cg_fd);
-       if (!ASSERT_OK_PTR(ingress_link, "attach_cgroup(ingress)"))
+       link = bpf_program__attach_cgroup(skel->progs.read_sk_dst_port, child_cg_fd);
+       if (!ASSERT_OK_PTR(link, "attach_cgroup(read_sk_dst_port"))
                goto done;
+       skel->links.read_sk_dst_port = link;
 
        linum_map_fd = bpf_map__fd(skel->maps.linum_map);
        sk_pkt_out_cnt_fd = bpf_map__fd(skel->maps.sk_pkt_out_cnt);
@@ -369,8 +394,7 @@ void serial_test_sock_fields(void)
        test();
 
 done:
-       bpf_link__destroy(egress_link);
-       bpf_link__destroy(ingress_link);
+       test_sock_fields__detach(skel);
        test_sock_fields__destroy(skel);
        if (child_cg_fd >= 0)
                close(child_cg_fd);
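
The create_netns() helper added above is a common selftest idiom; outside the harness it is roughly:

        #define _GNU_SOURCE
        #include <sched.h>
        #include <stdlib.h>

        /* Move the calling process into a fresh network namespace and
         * bring loopback up, so the test can listen on a fixed port
         * (0xcafe above) without colliding with anything on the host. */
        static int enter_fresh_netns(void)
        {
                if (unshare(CLONE_NEWNET))
                        return -1;
                return system("ip link set dev lo up");
        }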
index 85db0f4..cec5c08 100644 (file)
@@ -8,6 +8,7 @@
 #include "test_sockmap_update.skel.h"
 #include "test_sockmap_invalid_update.skel.h"
 #include "test_sockmap_skb_verdict_attach.skel.h"
+#include "test_sockmap_progs_query.skel.h"
 #include "bpf_iter_sockmap.skel.h"
 
 #define TCP_REPAIR             19      /* TCP sock is under repair right now */
@@ -139,12 +140,16 @@ out:
 
 static void test_sockmap_update(enum bpf_map_type map_type)
 {
-       struct bpf_prog_test_run_attr tattr;
        int err, prog, src, duration = 0;
        struct test_sockmap_update *skel;
        struct bpf_map *dst_map;
        const __u32 zero = 0;
        char dummy[14] = {0};
+       LIBBPF_OPTS(bpf_test_run_opts, topts,
+               .data_in = dummy,
+               .data_size_in = sizeof(dummy),
+               .repeat = 1,
+       );
        __s64 sk;
 
        sk = connected_socket_v4();
@@ -166,16 +171,10 @@ static void test_sockmap_update(enum bpf_map_type map_type)
        if (CHECK(err, "update_elem(src)", "errno=%u\n", errno))
                goto out;
 
-       tattr = (struct bpf_prog_test_run_attr){
-               .prog_fd = prog,
-               .repeat = 1,
-               .data_in = dummy,
-               .data_size_in = sizeof(dummy),
-       };
-
-       err = bpf_prog_test_run_xattr(&tattr);
-       if (CHECK_ATTR(err || !tattr.retval, "bpf_prog_test_run",
-                      "errno=%u retval=%u\n", errno, tattr.retval))
+       err = bpf_prog_test_run_opts(prog, &topts);
+       if (!ASSERT_OK(err, "test_run"))
+               goto out;
+       if (!ASSERT_NEQ(topts.retval, 0, "test_run retval"))
                goto out;
 
        compare_cookies(skel->maps.src, dst_map);
@@ -315,6 +314,63 @@ out:
        test_sockmap_skb_verdict_attach__destroy(skel);
 }
 
+static __u32 query_prog_id(int prog_fd)
+{
+       struct bpf_prog_info info = {};
+       __u32 info_len = sizeof(info);
+       int err;
+
+       err = bpf_obj_get_info_by_fd(prog_fd, &info, &info_len);
+       if (!ASSERT_OK(err, "bpf_obj_get_info_by_fd") ||
+           !ASSERT_EQ(info_len, sizeof(info), "bpf_obj_get_info_by_fd"))
+               return 0;
+
+       return info.id;
+}
+
+static void test_sockmap_progs_query(enum bpf_attach_type attach_type)
+{
+       struct test_sockmap_progs_query *skel;
+       int err, map_fd, verdict_fd;
+       __u32 attach_flags = 0;
+       __u32 prog_ids[3] = {};
+       __u32 prog_cnt = 3;
+
+       skel = test_sockmap_progs_query__open_and_load();
+       if (!ASSERT_OK_PTR(skel, "test_sockmap_progs_query__open_and_load"))
+               return;
+
+       map_fd = bpf_map__fd(skel->maps.sock_map);
+
+       if (attach_type == BPF_SK_MSG_VERDICT)
+               verdict_fd = bpf_program__fd(skel->progs.prog_skmsg_verdict);
+       else
+               verdict_fd = bpf_program__fd(skel->progs.prog_skb_verdict);
+
+       err = bpf_prog_query(map_fd, attach_type, 0 /* query flags */,
+                            &attach_flags, prog_ids, &prog_cnt);
+       ASSERT_OK(err, "bpf_prog_query failed");
+       ASSERT_EQ(attach_flags, 0, "wrong attach_flags on query");
+       ASSERT_EQ(prog_cnt, 0, "wrong program count on query");
+
+       err = bpf_prog_attach(verdict_fd, map_fd, attach_type, 0);
+       if (!ASSERT_OK(err, "bpf_prog_attach failed"))
+               goto out;
+
+       prog_cnt = 1;
+       err = bpf_prog_query(map_fd, attach_type, 0 /* query flags */,
+                            &attach_flags, prog_ids, &prog_cnt);
+       ASSERT_OK(err, "bpf_prog_query failed");
+       ASSERT_EQ(attach_flags, 0, "wrong attach_flags on query");
+       ASSERT_EQ(prog_cnt, 1, "wrong program count on query");
+       ASSERT_EQ(prog_ids[0], query_prog_id(verdict_fd),
+                 "wrong prog_ids on query");
+
+       bpf_prog_detach2(verdict_fd, map_fd, attach_type);
+out:
+       test_sockmap_progs_query__destroy(skel);
+}
+
 void test_sockmap_basic(void)
 {
        if (test__start_subtest("sockmap create_update_free"))
@@ -341,4 +397,12 @@ void test_sockmap_basic(void)
                test_sockmap_skb_verdict_attach(BPF_SK_SKB_STREAM_VERDICT,
                                                BPF_SK_SKB_VERDICT);
        }
+       if (test__start_subtest("sockmap msg_verdict progs query"))
+               test_sockmap_progs_query(BPF_SK_MSG_VERDICT);
+       if (test__start_subtest("sockmap stream_parser progs query"))
+               test_sockmap_progs_query(BPF_SK_SKB_STREAM_PARSER);
+       if (test__start_subtest("sockmap stream_verdict progs query"))
+               test_sockmap_progs_query(BPF_SK_SKB_STREAM_VERDICT);
+       if (test__start_subtest("sockmap skb_verdict progs query"))
+               test_sockmap_progs_query(BPF_SK_SKB_VERDICT);
 }
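
The new subtests exercise the generic bpf_prog_query() contract against a sockmap fd; a minimal sketch of listing the attached program IDs (list_progs() is hypothetical):

        #include <bpf/bpf.h>

        /* Return the number of programs attached to the sockmap for one
         * attach type, filling prog_ids up to its capacity. */
        static int list_progs(int map_fd, enum bpf_attach_type type)
        {
                __u32 prog_ids[8] = {}, attach_flags = 0;
                __u32 prog_cnt = 8;

                if (bpf_prog_query(map_fd, type, 0 /* query flags */,
                                   &attach_flags, prog_ids, &prog_cnt))
                        return -1;
                return prog_cnt; /* prog_ids[0..prog_cnt-1] are valid */
        }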
index 7e21bfa..2cf0c7a 100644 (file)
@@ -1413,14 +1413,12 @@ close_srv1:
 
 static void test_ops_cleanup(const struct bpf_map *map)
 {
-       const struct bpf_map_def *def;
        int err, mapfd;
        u32 key;
 
-       def = bpf_map__def(map);
        mapfd = bpf_map__fd(map);
 
-       for (key = 0; key < def->max_entries; key++) {
+       for (key = 0; key < bpf_map__max_entries(map); key++) {
                err = bpf_map_delete_elem(mapfd, &key);
                if (err && errno != EINVAL && errno != ENOENT)
                        FAIL_ERRNO("map_delete: expected EINVAL/ENOENT");
@@ -1443,13 +1441,13 @@ static const char *family_str(sa_family_t family)
 
 static const char *map_type_str(const struct bpf_map *map)
 {
-       const struct bpf_map_def *def;
+       int type;
 
-       def = bpf_map__def(map);
-       if (IS_ERR(def))
+       if (!map)
                return "invalid";
+       type = bpf_map__type(map);
 
-       switch (def->type) {
+       switch (type) {
        case BPF_MAP_TYPE_SOCKMAP:
                return "sockmap";
        case BPF_MAP_TYPE_SOCKHASH:
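
Dropping bpf_map__def() here follows libbpf's move from exposing struct bpf_map_def to per-field getters; the replacement pattern in isolation:

        #include <stdio.h>
        #include <bpf/libbpf.h>

        /* Old (deprecated): bpf_map__def(map)->max_entries and
         * bpf_map__def(map)->type.
         * New: dedicated getters, no struct layout exposed. */
        static void describe_map(const struct bpf_map *map)
        {
                __u32 max = bpf_map__max_entries(map);
                enum bpf_map_type type = bpf_map__type(map);

                printf("map type %d, %u entries\n", (int)type, max);
        }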
index 4b937e5..30a99d2 100644 (file)
@@ -173,11 +173,11 @@ static int getsetsockopt(void)
        }
 
        memset(&buf, 0, sizeof(buf));
-       buf.zc.address = 12345; /* rejected by BPF */
+       buf.zc.address = 12345; /* Not page aligned. Rejected by tcp_zerocopy_receive() */
        optlen = sizeof(buf.zc);
        errno = 0;
        err = getsockopt(fd, SOL_TCP, TCP_ZEROCOPY_RECEIVE, &buf, &optlen);
-       if (errno != EPERM) {
+       if (errno != EINVAL) {
                log_err("Unexpected getsockopt(TCP_ZEROCOPY_RECEIVE) err=%d errno=%d",
                        err, errno);
                goto err;
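
The errno change tracks where the rejection now happens: the unaligned address reaches tcp_zerocopy_receive(), which returns EINVAL, instead of being blocked earlier by the BPF sockopt hook with EPERM. A sketch of the probe, assuming fd is an established TCP socket:

        #include <sys/socket.h>
        #include <netinet/in.h>  /* IPPROTO_TCP */
        #include <linux/tcp.h>   /* struct tcp_zerocopy_receive */
        #include <errno.h>

        static int probe_zerocopy(int fd)
        {
                struct tcp_zerocopy_receive zc = {
                        .address = 12345, /* not page aligned */
                };
                socklen_t optlen = sizeof(zc);

                if (getsockopt(fd, IPPROTO_TCP, TCP_ZEROCOPY_RECEIVE,
                               &zc, &optlen))
                        return errno; /* EINVAL on current kernels */
                return 0;
        }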
index 6307f5d..8e329ea 100644 (file)
@@ -4,14 +4,16 @@
 
 static void *spin_lock_thread(void *arg)
 {
-       __u32 duration, retval;
        int err, prog_fd = *(u32 *) arg;
+       LIBBPF_OPTS(bpf_test_run_opts, topts,
+               .data_in = &pkt_v4,
+               .data_size_in = sizeof(pkt_v4),
+               .repeat = 10000,
+       );
 
-       err = bpf_prog_test_run(prog_fd, 10000, &pkt_v4, sizeof(pkt_v4),
-                               NULL, NULL, &retval, &duration);
-       CHECK(err || retval, "",
-             "err %d errno %d retval %d duration %d\n",
-             err, errno, retval, duration);
+       err = bpf_prog_test_run_opts(prog_fd, &topts);
+       ASSERT_OK(err, "test_run");
+       ASSERT_OK(topts.retval, "test_run retval");
        pthread_exit(arg);
 }
 
index 0a91d8d..f45a1d7 100644 (file)
@@ -42,7 +42,7 @@ retry:
                return;
 
        /* override program type */
-       bpf_program__set_perf_event(skel->progs.oncpu);
+       bpf_program__set_type(skel->progs.oncpu, BPF_PROG_TYPE_PERF_EVENT);
 
        err = test_stacktrace_build_id__load(skel);
        if (CHECK(err, "skel_load", "skeleton load failed: %d\n", err))
index 81e997a..f4d4000 100644 (file)
@@ -20,20 +20,20 @@ void test_syscall(void)
                .log_buf = (uintptr_t) verifier_log,
                .log_size = sizeof(verifier_log),
        };
-       struct bpf_prog_test_run_attr tattr = {
+       LIBBPF_OPTS(bpf_test_run_opts, tattr,
                .ctx_in = &ctx,
                .ctx_size_in = sizeof(ctx),
-       };
+       );
        struct syscall *skel = NULL;
        __u64 key = 12, value = 0;
-       int err;
+       int err, prog_fd;
 
        skel = syscall__open_and_load();
        if (!ASSERT_OK_PTR(skel, "skel_load"))
                goto cleanup;
 
-       tattr.prog_fd = bpf_program__fd(skel->progs.bpf_prog);
-       err = bpf_prog_test_run_xattr(&tattr);
+       prog_fd = bpf_program__fd(skel->progs.bpf_prog);
+       err = bpf_prog_test_run_opts(prog_fd, &tattr);
        ASSERT_EQ(err, 0, "err");
        ASSERT_EQ(tattr.retval, 1, "retval");
        ASSERT_GT(ctx.map_fd, 0, "ctx.map_fd");
index 5dc0f42..c4da87e 100644 (file)
@@ -12,9 +12,13 @@ static void test_tailcall_1(void)
        struct bpf_map *prog_array;
        struct bpf_program *prog;
        struct bpf_object *obj;
-       __u32 retval, duration;
        char prog_name[32];
        char buff[128] = {};
+       LIBBPF_OPTS(bpf_test_run_opts, topts,
+               .data_in = buff,
+               .data_size_in = sizeof(buff),
+               .repeat = 1,
+       );
 
        err = bpf_prog_test_load("tailcall1.o", BPF_PROG_TYPE_SCHED_CLS, &obj,
                            &prog_fd);
@@ -37,7 +41,7 @@ static void test_tailcall_1(void)
        if (CHECK_FAIL(map_fd < 0))
                goto out;
 
-       for (i = 0; i < bpf_map__def(prog_array)->max_entries; i++) {
+       for (i = 0; i < bpf_map__max_entries(prog_array); i++) {
                snprintf(prog_name, sizeof(prog_name), "classifier_%d", i);
 
                prog = bpf_object__find_program_by_name(obj, prog_name);
@@ -53,23 +57,21 @@ static void test_tailcall_1(void)
                        goto out;
        }
 
-       for (i = 0; i < bpf_map__def(prog_array)->max_entries; i++) {
-               err = bpf_prog_test_run(main_fd, 1, buff, sizeof(buff), 0,
-                                       &duration, &retval, NULL);
-               CHECK(err || retval != i, "tailcall",
-                     "err %d errno %d retval %d\n", err, errno, retval);
+       for (i = 0; i < bpf_map__max_entries(prog_array); i++) {
+               err = bpf_prog_test_run_opts(main_fd, &topts);
+               ASSERT_OK(err, "tailcall");
+               ASSERT_EQ(topts.retval, i, "tailcall retval");
 
                err = bpf_map_delete_elem(map_fd, &i);
                if (CHECK_FAIL(err))
                        goto out;
        }
 
-       err = bpf_prog_test_run(main_fd, 1, buff, sizeof(buff), 0,
-                               &duration, &retval, NULL);
-       CHECK(err || retval != 3, "tailcall", "err %d errno %d retval %d\n",
-             err, errno, retval);
+       err = bpf_prog_test_run_opts(main_fd, &topts);
+       ASSERT_OK(err, "tailcall");
+       ASSERT_EQ(topts.retval, 3, "tailcall retval");
 
-       for (i = 0; i < bpf_map__def(prog_array)->max_entries; i++) {
+       for (i = 0; i < bpf_map__max_entries(prog_array); i++) {
                snprintf(prog_name, sizeof(prog_name), "classifier_%d", i);
 
                prog = bpf_object__find_program_by_name(obj, prog_name);
@@ -85,13 +87,12 @@ static void test_tailcall_1(void)
                        goto out;
        }
 
-       err = bpf_prog_test_run(main_fd, 1, buff, sizeof(buff), 0,
-                               &duration, &retval, NULL);
-       CHECK(err || retval != 0, "tailcall", "err %d errno %d retval %d\n",
-             err, errno, retval);
+       err = bpf_prog_test_run_opts(main_fd, &topts);
+       ASSERT_OK(err, "tailcall");
+       ASSERT_OK(topts.retval, "tailcall retval");
 
-       for (i = 0; i < bpf_map__def(prog_array)->max_entries; i++) {
-               j = bpf_map__def(prog_array)->max_entries - 1 - i;
+       for (i = 0; i < bpf_map__max_entries(prog_array); i++) {
+               j = bpf_map__max_entries(prog_array) - 1 - i;
                snprintf(prog_name, sizeof(prog_name), "classifier_%d", j);
 
                prog = bpf_object__find_program_by_name(obj, prog_name);
@@ -107,33 +108,30 @@ static void test_tailcall_1(void)
                        goto out;
        }
 
-       for (i = 0; i < bpf_map__def(prog_array)->max_entries; i++) {
-               j = bpf_map__def(prog_array)->max_entries - 1 - i;
+       for (i = 0; i < bpf_map__max_entries(prog_array); i++) {
+               j = bpf_map__max_entries(prog_array) - 1 - i;
 
-               err = bpf_prog_test_run(main_fd, 1, buff, sizeof(buff), 0,
-                                       &duration, &retval, NULL);
-               CHECK(err || retval != j, "tailcall",
-                     "err %d errno %d retval %d\n", err, errno, retval);
+               err = bpf_prog_test_run_opts(main_fd, &topts);
+               ASSERT_OK(err, "tailcall");
+               ASSERT_EQ(topts.retval, j, "tailcall retval");
 
                err = bpf_map_delete_elem(map_fd, &i);
                if (CHECK_FAIL(err))
                        goto out;
        }
 
-       err = bpf_prog_test_run(main_fd, 1, buff, sizeof(buff), 0,
-                               &duration, &retval, NULL);
-       CHECK(err || retval != 3, "tailcall", "err %d errno %d retval %d\n",
-             err, errno, retval);
+       err = bpf_prog_test_run_opts(main_fd, &topts);
+       ASSERT_OK(err, "tailcall");
+       ASSERT_EQ(topts.retval, 3, "tailcall retval");
 
-       for (i = 0; i < bpf_map__def(prog_array)->max_entries; i++) {
+       for (i = 0; i < bpf_map__max_entries(prog_array); i++) {
                err = bpf_map_delete_elem(map_fd, &i);
                if (CHECK_FAIL(err >= 0 || errno != ENOENT))
                        goto out;
 
-               err = bpf_prog_test_run(main_fd, 1, buff, sizeof(buff), 0,
-                                       &duration, &retval, NULL);
-               CHECK(err || retval != 3, "tailcall",
-                     "err %d errno %d retval %d\n", err, errno, retval);
+               err = bpf_prog_test_run_opts(main_fd, &topts);
+               ASSERT_OK(err, "tailcall");
+               ASSERT_EQ(topts.retval, 3, "tailcall retval");
        }
 
 out:
@@ -150,9 +148,13 @@ static void test_tailcall_2(void)
        struct bpf_map *prog_array;
        struct bpf_program *prog;
        struct bpf_object *obj;
-       __u32 retval, duration;
        char prog_name[32];
        char buff[128] = {};
+       LIBBPF_OPTS(bpf_test_run_opts, topts,
+               .data_in = buff,
+               .data_size_in = sizeof(buff),
+               .repeat = 1,
+       );
 
        err = bpf_prog_test_load("tailcall2.o", BPF_PROG_TYPE_SCHED_CLS, &obj,
                            &prog_fd);
@@ -175,7 +177,7 @@ static void test_tailcall_2(void)
        if (CHECK_FAIL(map_fd < 0))
                goto out;
 
-       for (i = 0; i < bpf_map__def(prog_array)->max_entries; i++) {
+       for (i = 0; i < bpf_map__max_entries(prog_array); i++) {
                snprintf(prog_name, sizeof(prog_name), "classifier_%d", i);
 
                prog = bpf_object__find_program_by_name(obj, prog_name);
@@ -191,30 +193,27 @@ static void test_tailcall_2(void)
                        goto out;
        }
 
-       err = bpf_prog_test_run(main_fd, 1, buff, sizeof(buff), 0,
-                               &duration, &retval, NULL);
-       CHECK(err || retval != 2, "tailcall", "err %d errno %d retval %d\n",
-             err, errno, retval);
+       err = bpf_prog_test_run_opts(main_fd, &topts);
+       ASSERT_OK(err, "tailcall");
+       ASSERT_EQ(topts.retval, 2, "tailcall retval");
 
        i = 2;
        err = bpf_map_delete_elem(map_fd, &i);
        if (CHECK_FAIL(err))
                goto out;
 
-       err = bpf_prog_test_run(main_fd, 1, buff, sizeof(buff), 0,
-                               &duration, &retval, NULL);
-       CHECK(err || retval != 1, "tailcall", "err %d errno %d retval %d\n",
-             err, errno, retval);
+       err = bpf_prog_test_run_opts(main_fd, &topts);
+       ASSERT_OK(err, "tailcall");
+       ASSERT_EQ(topts.retval, 1, "tailcall retval");
 
        i = 0;
        err = bpf_map_delete_elem(map_fd, &i);
        if (CHECK_FAIL(err))
                goto out;
 
-       err = bpf_prog_test_run(main_fd, 1, buff, sizeof(buff), 0,
-                               &duration, &retval, NULL);
-       CHECK(err || retval != 3, "tailcall", "err %d errno %d retval %d\n",
-             err, errno, retval);
+       err = bpf_prog_test_run_opts(main_fd, &topts);
+       ASSERT_OK(err, "tailcall");
+       ASSERT_EQ(topts.retval, 3, "tailcall retval");
 out:
        bpf_object__close(obj);
 }
@@ -225,8 +224,12 @@ static void test_tailcall_count(const char *which)
        struct bpf_map *prog_array, *data_map;
        struct bpf_program *prog;
        struct bpf_object *obj;
-       __u32 retval, duration;
        char buff[128] = {};
+       LIBBPF_OPTS(bpf_test_run_opts, topts,
+               .data_in = buff,
+               .data_size_in = sizeof(buff),
+               .repeat = 1,
+       );
 
        err = bpf_prog_test_load(which, BPF_PROG_TYPE_SCHED_CLS, &obj,
                            &prog_fd);
@@ -262,10 +265,9 @@ static void test_tailcall_count(const char *which)
        if (CHECK_FAIL(err))
                goto out;
 
-       err = bpf_prog_test_run(main_fd, 1, buff, sizeof(buff), 0,
-                               &duration, &retval, NULL);
-       CHECK(err || retval != 1, "tailcall", "err %d errno %d retval %d\n",
-             err, errno, retval);
+       err = bpf_prog_test_run_opts(main_fd, &topts);
+       ASSERT_OK(err, "tailcall");
+       ASSERT_EQ(topts.retval, 1, "tailcall retval");
 
        data_map = bpf_object__find_map_by_name(obj, "tailcall.bss");
        if (CHECK_FAIL(!data_map || !bpf_map__is_internal(data_map)))
@@ -277,18 +279,17 @@ static void test_tailcall_count(const char *which)
 
        i = 0;
        err = bpf_map_lookup_elem(data_fd, &i, &val);
-       CHECK(err || val != 33, "tailcall count", "err %d errno %d count %d\n",
-             err, errno, val);
+       ASSERT_OK(err, "tailcall count");
+       ASSERT_EQ(val, 33, "tailcall count");
 
        i = 0;
        err = bpf_map_delete_elem(map_fd, &i);
        if (CHECK_FAIL(err))
                goto out;
 
-       err = bpf_prog_test_run(main_fd, 1, buff, sizeof(buff), 0,
-                               &duration, &retval, NULL);
-       CHECK(err || retval != 0, "tailcall", "err %d errno %d retval %d\n",
-             err, errno, retval);
+       err = bpf_prog_test_run_opts(main_fd, &topts);
+       ASSERT_OK(err, "tailcall");
+       ASSERT_OK(topts.retval, "tailcall retval");
 out:
        bpf_object__close(obj);
 }
@@ -319,10 +320,14 @@ static void test_tailcall_4(void)
        struct bpf_map *prog_array, *data_map;
        struct bpf_program *prog;
        struct bpf_object *obj;
-       __u32 retval, duration;
        static const int zero = 0;
        char buff[128] = {};
        char prog_name[32];
+       LIBBPF_OPTS(bpf_test_run_opts, topts,
+               .data_in = buff,
+               .data_size_in = sizeof(buff),
+               .repeat = 1,
+       );
 
        err = bpf_prog_test_load("tailcall4.o", BPF_PROG_TYPE_SCHED_CLS, &obj,
                            &prog_fd);
@@ -353,7 +358,7 @@ static void test_tailcall_4(void)
        if (CHECK_FAIL(map_fd < 0))
                return;
 
-       for (i = 0; i < bpf_map__def(prog_array)->max_entries; i++) {
+       for (i = 0; i < bpf_map__max_entries(prog_array); i++) {
                snprintf(prog_name, sizeof(prog_name), "classifier_%d", i);
 
                prog = bpf_object__find_program_by_name(obj, prog_name);
@@ -369,18 +374,17 @@ static void test_tailcall_4(void)
                        goto out;
        }
 
-       for (i = 0; i < bpf_map__def(prog_array)->max_entries; i++) {
+       for (i = 0; i < bpf_map__max_entries(prog_array); i++) {
                err = bpf_map_update_elem(data_fd, &zero, &i, BPF_ANY);
                if (CHECK_FAIL(err))
                        goto out;
 
-               err = bpf_prog_test_run(main_fd, 1, buff, sizeof(buff), 0,
-                                       &duration, &retval, NULL);
-               CHECK(err || retval != i, "tailcall",
-                     "err %d errno %d retval %d\n", err, errno, retval);
+               err = bpf_prog_test_run_opts(main_fd, &topts);
+               ASSERT_OK(err, "tailcall");
+               ASSERT_EQ(topts.retval, i, "tailcall retval");
        }
 
-       for (i = 0; i < bpf_map__def(prog_array)->max_entries; i++) {
+       for (i = 0; i < bpf_map__max_entries(prog_array); i++) {
                err = bpf_map_update_elem(data_fd, &zero, &i, BPF_ANY);
                if (CHECK_FAIL(err))
                        goto out;
@@ -389,10 +393,9 @@ static void test_tailcall_4(void)
                if (CHECK_FAIL(err))
                        goto out;
 
-               err = bpf_prog_test_run(main_fd, 1, buff, sizeof(buff), 0,
-                                       &duration, &retval, NULL);
-               CHECK(err || retval != 3, "tailcall",
-                     "err %d errno %d retval %d\n", err, errno, retval);
+               err = bpf_prog_test_run_opts(main_fd, &topts);
+               ASSERT_OK(err, "tailcall");
+               ASSERT_EQ(topts.retval, 3, "tailcall retval");
        }
 out:
        bpf_object__close(obj);
@@ -407,10 +410,14 @@ static void test_tailcall_5(void)
        struct bpf_map *prog_array, *data_map;
        struct bpf_program *prog;
        struct bpf_object *obj;
-       __u32 retval, duration;
        static const int zero = 0;
        char buff[128] = {};
        char prog_name[32];
+       LIBBPF_OPTS(bpf_test_run_opts, topts,
+               .data_in = buff,
+               .data_size_in = sizeof(buff),
+               .repeat = 1,
+       );
 
        err = bpf_prog_test_load("tailcall5.o", BPF_PROG_TYPE_SCHED_CLS, &obj,
                            &prog_fd);
@@ -441,7 +448,7 @@ static void test_tailcall_5(void)
        if (CHECK_FAIL(map_fd < 0))
                return;
 
-       for (i = 0; i < bpf_map__def(prog_array)->max_entries; i++) {
+       for (i = 0; i < bpf_map__max_entries(prog_array); i++) {
                snprintf(prog_name, sizeof(prog_name), "classifier_%d", i);
 
                prog = bpf_object__find_program_by_name(obj, prog_name);
@@ -457,18 +464,17 @@ static void test_tailcall_5(void)
                        goto out;
        }
 
-       for (i = 0; i < bpf_map__def(prog_array)->max_entries; i++) {
+       for (i = 0; i < bpf_map__max_entries(prog_array); i++) {
                err = bpf_map_update_elem(data_fd, &zero, &key[i], BPF_ANY);
                if (CHECK_FAIL(err))
                        goto out;
 
-               err = bpf_prog_test_run(main_fd, 1, buff, sizeof(buff), 0,
-                                       &duration, &retval, NULL);
-               CHECK(err || retval != i, "tailcall",
-                     "err %d errno %d retval %d\n", err, errno, retval);
+               err = bpf_prog_test_run_opts(main_fd, &topts);
+               ASSERT_OK(err, "tailcall");
+               ASSERT_EQ(topts.retval, i, "tailcall retval");
        }
 
-       for (i = 0; i < bpf_map__def(prog_array)->max_entries; i++) {
+       for (i = 0; i < bpf_map__max_entries(prog_array); i++) {
                err = bpf_map_update_elem(data_fd, &zero, &key[i], BPF_ANY);
                if (CHECK_FAIL(err))
                        goto out;
@@ -477,10 +483,9 @@ static void test_tailcall_5(void)
                if (CHECK_FAIL(err))
                        goto out;
 
-               err = bpf_prog_test_run(main_fd, 1, buff, sizeof(buff), 0,
-                                       &duration, &retval, NULL);
-               CHECK(err || retval != 3, "tailcall",
-                     "err %d errno %d retval %d\n", err, errno, retval);
+               err = bpf_prog_test_run_opts(main_fd, &topts);
+               ASSERT_OK(err, "tailcall");
+               ASSERT_EQ(topts.retval, 3, "tailcall retval");
        }
 out:
        bpf_object__close(obj);
@@ -495,8 +500,12 @@ static void test_tailcall_bpf2bpf_1(void)
        struct bpf_map *prog_array;
        struct bpf_program *prog;
        struct bpf_object *obj;
-       __u32 retval, duration;
        char prog_name[32];
+       LIBBPF_OPTS(bpf_test_run_opts, topts,
+               .data_in = &pkt_v4,
+               .data_size_in = sizeof(pkt_v4),
+               .repeat = 1,
+       );
 
        err = bpf_prog_test_load("tailcall_bpf2bpf1.o", BPF_PROG_TYPE_SCHED_CLS,
                            &obj, &prog_fd);
@@ -520,7 +529,7 @@ static void test_tailcall_bpf2bpf_1(void)
                goto out;
 
        /* nop -> jmp */
-       for (i = 0; i < bpf_map__def(prog_array)->max_entries; i++) {
+       for (i = 0; i < bpf_map__max_entries(prog_array); i++) {
                snprintf(prog_name, sizeof(prog_name), "classifier_%d", i);
 
                prog = bpf_object__find_program_by_name(obj, prog_name);
@@ -536,10 +545,9 @@ static void test_tailcall_bpf2bpf_1(void)
                        goto out;
        }
 
-       err = bpf_prog_test_run(main_fd, 1, &pkt_v4, sizeof(pkt_v4), 0,
-                               0, &retval, &duration);
-       CHECK(err || retval != 1, "tailcall",
-             "err %d errno %d retval %d\n", err, errno, retval);
+       err = bpf_prog_test_run_opts(main_fd, &topts);
+       ASSERT_OK(err, "tailcall");
+       ASSERT_EQ(topts.retval, 1, "tailcall retval");
 
        /* jmp -> nop, call subprog that will do tailcall */
        i = 1;
@@ -547,10 +555,9 @@ static void test_tailcall_bpf2bpf_1(void)
        if (CHECK_FAIL(err))
                goto out;
 
-       err = bpf_prog_test_run(main_fd, 1, &pkt_v4, sizeof(pkt_v4), 0,
-                               0, &retval, &duration);
-       CHECK(err || retval != 0, "tailcall", "err %d errno %d retval %d\n",
-             err, errno, retval);
+       err = bpf_prog_test_run_opts(main_fd, &topts);
+       ASSERT_OK(err, "tailcall");
+       ASSERT_OK(topts.retval, "tailcall retval");
 
        /* make sure that subprog can access ctx and entry prog that
         * called this subprog can properly return
@@ -560,11 +567,9 @@ static void test_tailcall_bpf2bpf_1(void)
        if (CHECK_FAIL(err))
                goto out;
 
-       err = bpf_prog_test_run(main_fd, 1, &pkt_v4, sizeof(pkt_v4), 0,
-                               0, &retval, &duration);
-       CHECK(err || retval != sizeof(pkt_v4) * 2,
-             "tailcall", "err %d errno %d retval %d\n",
-             err, errno, retval);
+       err = bpf_prog_test_run_opts(main_fd, &topts);
+       ASSERT_OK(err, "tailcall");
+       ASSERT_EQ(topts.retval, sizeof(pkt_v4) * 2, "tailcall retval");
 out:
        bpf_object__close(obj);
 }
@@ -579,8 +584,12 @@ static void test_tailcall_bpf2bpf_2(void)
        struct bpf_map *prog_array, *data_map;
        struct bpf_program *prog;
        struct bpf_object *obj;
-       __u32 retval, duration;
        char buff[128] = {};
+       LIBBPF_OPTS(bpf_test_run_opts, topts,
+               .data_in = buff,
+               .data_size_in = sizeof(buff),
+               .repeat = 1,
+       );
 
        err = bpf_prog_test_load("tailcall_bpf2bpf2.o", BPF_PROG_TYPE_SCHED_CLS,
                            &obj, &prog_fd);
@@ -616,10 +625,9 @@ static void test_tailcall_bpf2bpf_2(void)
        if (CHECK_FAIL(err))
                goto out;
 
-       err = bpf_prog_test_run(main_fd, 1, buff, sizeof(buff), 0,
-                               &duration, &retval, NULL);
-       CHECK(err || retval != 1, "tailcall", "err %d errno %d retval %d\n",
-             err, errno, retval);
+       err = bpf_prog_test_run_opts(main_fd, &topts);
+       ASSERT_OK(err, "tailcall");
+       ASSERT_EQ(topts.retval, 1, "tailcall retval");
 
        data_map = bpf_object__find_map_by_name(obj, "tailcall.bss");
        if (CHECK_FAIL(!data_map || !bpf_map__is_internal(data_map)))
@@ -631,18 +639,17 @@ static void test_tailcall_bpf2bpf_2(void)
 
        i = 0;
        err = bpf_map_lookup_elem(data_fd, &i, &val);
-       CHECK(err || val != 33, "tailcall count", "err %d errno %d count %d\n",
-             err, errno, val);
+       ASSERT_OK(err, "tailcall count");
+       ASSERT_EQ(val, 33, "tailcall count");
 
        i = 0;
        err = bpf_map_delete_elem(map_fd, &i);
        if (CHECK_FAIL(err))
                goto out;
 
-       err = bpf_prog_test_run(main_fd, 1, buff, sizeof(buff), 0,
-                               &duration, &retval, NULL);
-       CHECK(err || retval != 0, "tailcall", "err %d errno %d retval %d\n",
-             err, errno, retval);
+       err = bpf_prog_test_run_opts(main_fd, &topts);
+       ASSERT_OK(err, "tailcall");
+       ASSERT_OK(topts.retval, "tailcall retval");
 out:
        bpf_object__close(obj);
 }
@@ -657,8 +664,12 @@ static void test_tailcall_bpf2bpf_3(void)
        struct bpf_map *prog_array;
        struct bpf_program *prog;
        struct bpf_object *obj;
-       __u32 retval, duration;
        char prog_name[32];
+       LIBBPF_OPTS(bpf_test_run_opts, topts,
+               .data_in = &pkt_v4,
+               .data_size_in = sizeof(pkt_v4),
+               .repeat = 1,
+       );
 
        err = bpf_prog_test_load("tailcall_bpf2bpf3.o", BPF_PROG_TYPE_SCHED_CLS,
                            &obj, &prog_fd);
@@ -681,7 +692,7 @@ static void test_tailcall_bpf2bpf_3(void)
        if (CHECK_FAIL(map_fd < 0))
                goto out;
 
-       for (i = 0; i < bpf_map__def(prog_array)->max_entries; i++) {
+       for (i = 0; i < bpf_map__max_entries(prog_array); i++) {
                snprintf(prog_name, sizeof(prog_name), "classifier_%d", i);
 
                prog = bpf_object__find_program_by_name(obj, prog_name);
@@ -697,33 +708,27 @@ static void test_tailcall_bpf2bpf_3(void)
                        goto out;
        }
 
-       err = bpf_prog_test_run(main_fd, 1, &pkt_v4, sizeof(pkt_v4), 0,
-                               &duration, &retval, NULL);
-       CHECK(err || retval != sizeof(pkt_v4) * 3,
-             "tailcall", "err %d errno %d retval %d\n",
-             err, errno, retval);
+       err = bpf_prog_test_run_opts(main_fd, &topts);
+       ASSERT_OK(err, "tailcall");
+       ASSERT_EQ(topts.retval, sizeof(pkt_v4) * 3, "tailcall retval");
 
        i = 1;
        err = bpf_map_delete_elem(map_fd, &i);
        if (CHECK_FAIL(err))
                goto out;
 
-       err = bpf_prog_test_run(main_fd, 1, &pkt_v4, sizeof(pkt_v4), 0,
-                               &duration, &retval, NULL);
-       CHECK(err || retval != sizeof(pkt_v4),
-             "tailcall", "err %d errno %d retval %d\n",
-             err, errno, retval);
+       err = bpf_prog_test_run_opts(main_fd, &topts);
+       ASSERT_OK(err, "tailcall");
+       ASSERT_EQ(topts.retval, sizeof(pkt_v4), "tailcall retval");
 
        i = 0;
        err = bpf_map_delete_elem(map_fd, &i);
        if (CHECK_FAIL(err))
                goto out;
 
-       err = bpf_prog_test_run(main_fd, 1, &pkt_v4, sizeof(pkt_v4), 0,
-                               &duration, &retval, NULL);
-       CHECK(err || retval != sizeof(pkt_v4) * 2,
-             "tailcall", "err %d errno %d retval %d\n",
-             err, errno, retval);
+       err = bpf_prog_test_run_opts(main_fd, &topts);
+       ASSERT_OK(err, "tailcall");
+       ASSERT_EQ(topts.retval, sizeof(pkt_v4) * 2, "tailcall retval");
 out:
        bpf_object__close(obj);
 }
@@ -754,8 +759,12 @@ static void test_tailcall_bpf2bpf_4(bool noise)
        struct bpf_map *prog_array, *data_map;
        struct bpf_program *prog;
        struct bpf_object *obj;
-       __u32 retval, duration;
        char prog_name[32];
+       LIBBPF_OPTS(bpf_test_run_opts, topts,
+               .data_in = &pkt_v4,
+               .data_size_in = sizeof(pkt_v4),
+               .repeat = 1,
+       );
 
        err = bpf_prog_test_load("tailcall_bpf2bpf4.o", BPF_PROG_TYPE_SCHED_CLS,
                            &obj, &prog_fd);
@@ -778,7 +787,7 @@ static void test_tailcall_bpf2bpf_4(bool noise)
        if (CHECK_FAIL(map_fd < 0))
                goto out;
 
-       for (i = 0; i < bpf_map__def(prog_array)->max_entries; i++) {
+       for (i = 0; i < bpf_map__max_entries(prog_array); i++) {
                snprintf(prog_name, sizeof(prog_name), "classifier_%d", i);
 
                prog = bpf_object__find_program_by_name(obj, prog_name);
@@ -809,15 +818,14 @@ static void test_tailcall_bpf2bpf_4(bool noise)
        if (CHECK_FAIL(err))
                goto out;
 
-       err = bpf_prog_test_run(main_fd, 1, &pkt_v4, sizeof(pkt_v4), 0,
-                               &duration, &retval, NULL);
-       CHECK(err || retval != sizeof(pkt_v4) * 3, "tailcall", "err %d errno %d retval %d\n",
-             err, errno, retval);
+       err = bpf_prog_test_run_opts(main_fd, &topts);
+       ASSERT_OK(err, "tailcall");
+       ASSERT_EQ(topts.retval, sizeof(pkt_v4) * 3, "tailcall retval");
 
        i = 0;
        err = bpf_map_lookup_elem(data_fd, &i, &val);
-       CHECK(err || val.count != 31, "tailcall count", "err %d errno %d count %d\n",
-             err, errno, val.count);
+       ASSERT_OK(err, "tailcall count");
+       ASSERT_EQ(val.count, 31, "tailcall count");
 
 out:
        bpf_object__close(obj);
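
Every tailcall variant above shares the same userspace plumbing: look up each classifier by name and store its fd into the prog-array slot a tail call should dispatch through. In isolation (install_slot() and the program name are placeholders):

        #include <bpf/libbpf.h>
        #include <bpf/bpf.h>

        /* Put the fd of program "classifier_0" into jmp-table slot 'slot'. */
        static int install_slot(struct bpf_object *obj, int map_fd, int slot)
        {
                struct bpf_program *prog;
                int prog_fd;

                prog = bpf_object__find_program_by_name(obj, "classifier_0");
                if (!prog)
                        return -1;
                prog_fd = bpf_program__fd(prog);
                if (prog_fd < 0)
                        return -1;
                return bpf_map_update_elem(map_fd, &slot, &prog_fd, BPF_ANY);
        }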
index 37c20b5..61935e7 100644 (file)
@@ -3,18 +3,22 @@
 #include <test_progs.h>
 #include "test_task_pt_regs.skel.h"
 
+/* uprobe attach point */
+static void trigger_func(void)
+{
+       asm volatile ("");
+}
+
 void test_task_pt_regs(void)
 {
        struct test_task_pt_regs *skel;
        struct bpf_link *uprobe_link;
-       size_t uprobe_offset;
-       ssize_t base_addr;
+       ssize_t uprobe_offset;
        bool match;
 
-       base_addr = get_base_addr();
-       if (!ASSERT_GT(base_addr, 0, "get_base_addr"))
+       uprobe_offset = get_uprobe_offset(&trigger_func);
+       if (!ASSERT_GE(uprobe_offset, 0, "uprobe_offset"))
                return;
-       uprobe_offset = get_uprobe_offset(&get_base_addr, base_addr);
 
        skel = test_task_pt_regs__open_and_load();
        if (!ASSERT_OK_PTR(skel, "skel_open"))
@@ -32,7 +36,7 @@ void test_task_pt_regs(void)
        skel->links.handle_uprobe = uprobe_link;
 
        /* trigger & validate uprobe */
-       get_base_addr();
+       trigger_func();
 
        if (!ASSERT_EQ(skel->bss->uprobe_res, 1, "check_uprobe_res"))
                goto cleanup;
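
The attach step for this test falls outside the visible hunk; it presumably goes through bpf_program__attach_uprobe() with the offset computed above. A sketch under that assumption, with /proc/self/exe as the binary path and pid 0 meaning the calling process:

    /* sketch: attach the skeleton's uprobe at trigger_func's file offset */
    uprobe_link = bpf_program__attach_uprobe(skel->progs.handle_uprobe,
                                             false /* not a uretprobe */,
                                             0 /* this process */,
                                             "/proc/self/exe",
                                             uprobe_offset);
    if (!ASSERT_OK_PTR(uprobe_link, "attach_uprobe"))
            goto cleanup;
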
diff --git a/tools/testing/selftests/bpf/prog_tests/test_bpf_syscall_macro.c b/tools/testing/selftests/bpf/prog_tests/test_bpf_syscall_macro.c
new file mode 100644 (file)
index 0000000..c381faa
--- /dev/null
@@ -0,0 +1,73 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright 2022 Sony Group Corporation */
+#include <sys/prctl.h>
+#include <test_progs.h>
+#include "bpf_syscall_macro.skel.h"
+
+void test_bpf_syscall_macro(void)
+{
+       struct bpf_syscall_macro *skel = NULL;
+       int err;
+       int exp_arg1 = 1001;
+       unsigned long exp_arg2 = 12;
+       unsigned long exp_arg3 = 13;
+       unsigned long exp_arg4 = 14;
+       unsigned long exp_arg5 = 15;
+
+       /* check whether it can open program */
+       skel = bpf_syscall_macro__open();
+       if (!ASSERT_OK_PTR(skel, "bpf_syscall_macro__open"))
+               return;
+
+       skel->rodata->filter_pid = getpid();
+
+       /* check whether it can load program */
+       err = bpf_syscall_macro__load(skel);
+       if (!ASSERT_OK(err, "bpf_syscall_macro__load"))
+               goto cleanup;
+
+       /* check whether it can attach kprobe */
+       err = bpf_syscall_macro__attach(skel);
+       if (!ASSERT_OK(err, "bpf_syscall_macro__attach"))
+               goto cleanup;
+
+       /* check whether args of syscall are copied correctly */
+       prctl(exp_arg1, exp_arg2, exp_arg3, exp_arg4, exp_arg5);
+#if defined(__aarch64__) || defined(__s390__)
+       ASSERT_NEQ(skel->bss->arg1, exp_arg1, "syscall_arg1");
+#else
+       ASSERT_EQ(skel->bss->arg1, exp_arg1, "syscall_arg1");
+#endif
+       ASSERT_EQ(skel->bss->arg2, exp_arg2, "syscall_arg2");
+       ASSERT_EQ(skel->bss->arg3, exp_arg3, "syscall_arg3");
+       /* arg4 cannot be copied correctly via PT_REGS_PARM4 on x86_64 */
+#ifdef __x86_64__
+       ASSERT_NEQ(skel->bss->arg4_cx, exp_arg4, "syscall_arg4_from_cx");
+#else
+       ASSERT_EQ(skel->bss->arg4_cx, exp_arg4, "syscall_arg4_from_cx");
+#endif
+       ASSERT_EQ(skel->bss->arg4, exp_arg4, "syscall_arg4");
+       ASSERT_EQ(skel->bss->arg5, exp_arg5, "syscall_arg5");
+
+       /* check whether args of syscall are copied correctly for CORE variants */
+       ASSERT_EQ(skel->bss->arg1_core, exp_arg1, "syscall_arg1_core_variant");
+       ASSERT_EQ(skel->bss->arg2_core, exp_arg2, "syscall_arg2_core_variant");
+       ASSERT_EQ(skel->bss->arg3_core, exp_arg3, "syscall_arg3_core_variant");
+       /* arg4 cannot be copied correctly via PT_REGS_PARM4_CORE on x86_64 */
+#ifdef __x86_64__
+       ASSERT_NEQ(skel->bss->arg4_core_cx, exp_arg4, "syscall_arg4_from_cx_core_variant");
+#else
+       ASSERT_EQ(skel->bss->arg4_core_cx, exp_arg4, "syscall_arg4_from_cx_core_variant");
+#endif
+       ASSERT_EQ(skel->bss->arg4_core, exp_arg4, "syscall_arg4_core_variant");
+       ASSERT_EQ(skel->bss->arg5_core, exp_arg5, "syscall_arg5_core_variant");
+
+       ASSERT_EQ(skel->bss->option_syscall, exp_arg1, "BPF_KPROBE_SYSCALL_option");
+       ASSERT_EQ(skel->bss->arg2_syscall, exp_arg2, "BPF_KPROBE_SYSCALL_arg2");
+       ASSERT_EQ(skel->bss->arg3_syscall, exp_arg3, "BPF_KPROBE_SYSCALL_arg3");
+       ASSERT_EQ(skel->bss->arg4_syscall, exp_arg4, "BPF_KPROBE_SYSCALL_arg4");
+       ASSERT_EQ(skel->bss->arg5_syscall, exp_arg5, "BPF_KPROBE_SYSCALL_arg5");
+
+cleanup:
+       bpf_syscall_macro__destroy(skel);
+}
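
On the BPF side (not shown in this patch), the *_syscall counters come from libbpf's BPF_KPROBE_SYSCALL macro, which unwraps the nested pt_regs of the syscall wrapper so arguments arrive as plain parameters on every architecture. A minimal sketch, not the test's exact program, assuming vmlinux.h and SYS_PREFIX from the selftests' bpf_misc.h:

    // SPDX-License-Identifier: GPL-2.0
    #include "vmlinux.h"
    #include <bpf/bpf_helpers.h>
    #include <bpf/bpf_tracing.h>
    #include "bpf_misc.h"

    int option_syscall;
    unsigned long arg2_syscall;

    SEC("kprobe/" SYS_PREFIX "sys_prctl")
    int BPF_KPROBE_SYSCALL(prctl_enter, int option, unsigned long arg2)
    {
            /* arguments already pulled out of the wrapped pt_regs */
            option_syscall = option;
            arg2_syscall = arg2;
            return 0;
    }

    char _license[] SEC("license") = "GPL";
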
index 4ca2751..de24e8f 100644 (file)
@@ -8,20 +8,20 @@
 
 static int sanity_run(struct bpf_program *prog)
 {
-       struct bpf_prog_test_run_attr test_attr = {};
+       LIBBPF_OPTS(bpf_test_run_opts, test_attr);
        __u64 args[] = {1, 2, 3};
-       __u32 duration = 0;
        int err, prog_fd;
 
        prog_fd = bpf_program__fd(prog);
-       test_attr.prog_fd = prog_fd;
        test_attr.ctx_in = args;
        test_attr.ctx_size_in = sizeof(args);
-       err = bpf_prog_test_run_xattr(&test_attr);
-       if (CHECK(err || test_attr.retval, "test_run",
-                 "err %d errno %d retval %d duration %d\n",
-                 err, errno, test_attr.retval, duration))
+       err = bpf_prog_test_run_opts(prog_fd, &test_attr);
+       if (!ASSERT_OK(err, "test_run"))
+               return -1;
+
+       if (!ASSERT_OK(test_attr.retval, "test_run retval"))
                return -1;
+
        return 0;
 }
 
index cf12155..ae93411 100644 (file)
@@ -6,15 +6,18 @@
 
 static int sanity_run(struct bpf_program *prog)
 {
-       __u32 duration, retval;
        int err, prog_fd;
+       LIBBPF_OPTS(bpf_test_run_opts, topts,
+               .data_in = &pkt_v4,
+               .data_size_in = sizeof(pkt_v4),
+               .repeat = 1,
+       );
 
        prog_fd = bpf_program__fd(prog);
-       err = bpf_prog_test_run(prog_fd, 1, &pkt_v4, sizeof(pkt_v4),
-                               NULL, NULL, &retval, &duration);
-       if (CHECK(err || retval != 123, "test_run",
-                 "err %d errno %d retval %d duration %d\n",
-                 err, errno, retval, duration))
+       err = bpf_prog_test_run_opts(prog_fd, &topts);
+       if (!ASSERT_OK(err, "test_run"))
+               return -1;
+       if (!ASSERT_EQ(topts.retval, 123, "test_run retval"))
                return -1;
        return 0;
 }
index 0f4e49e..7eb0492 100644 (file)
@@ -6,7 +6,7 @@
 static int timer(struct timer *timer_skel)
 {
        int err, prog_fd;
-       __u32 duration = 0, retval;
+       LIBBPF_OPTS(bpf_test_run_opts, topts);
 
        err = timer__attach(timer_skel);
        if (!ASSERT_OK(err, "timer_attach"))
@@ -16,10 +16,9 @@ static int timer(struct timer *timer_skel)
        ASSERT_EQ(timer_skel->data->callback2_check, 52, "callback2_check1");
 
        prog_fd = bpf_program__fd(timer_skel->progs.test1);
-       err = bpf_prog_test_run(prog_fd, 1, NULL, 0,
-                               NULL, NULL, &retval, &duration);
+       err = bpf_prog_test_run_opts(prog_fd, &topts);
        ASSERT_OK(err, "test_run");
-       ASSERT_EQ(retval, 0, "test_run");
+       ASSERT_EQ(topts.retval, 0, "test_run");
        timer__detach(timer_skel);
 
        usleep(50); /* 10 usecs should be enough, but give it extra */
index 949a061..2ee5f5a 100644 (file)
@@ -6,19 +6,18 @@
 
 static int timer_mim(struct timer_mim *timer_skel)
 {
-       __u32 duration = 0, retval;
        __u64 cnt1, cnt2;
        int err, prog_fd, key1 = 1;
+       LIBBPF_OPTS(bpf_test_run_opts, topts);
 
        err = timer_mim__attach(timer_skel);
        if (!ASSERT_OK(err, "timer_attach"))
                return err;
 
        prog_fd = bpf_program__fd(timer_skel->progs.test1);
-       err = bpf_prog_test_run(prog_fd, 1, NULL, 0,
-                               NULL, NULL, &retval, &duration);
+       err = bpf_prog_test_run_opts(prog_fd, &topts);
        ASSERT_OK(err, "test_run");
-       ASSERT_EQ(retval, 0, "test_run");
+       ASSERT_EQ(topts.retval, 0, "test_run");
        timer_mim__detach(timer_skel);
 
        /* check that timer_cb[12] are incrementing 'cnt' */
index 924441d..aabdff7 100644 (file)
@@ -23,8 +23,12 @@ void test_trace_ext(void)
        int err, pkt_fd, ext_fd;
        struct bpf_program *prog;
        char buf[100];
-       __u32 retval;
        __u64 len;
+       LIBBPF_OPTS(bpf_test_run_opts, topts,
+               .data_in = &pkt_v4,
+               .data_size_in = sizeof(pkt_v4),
+               .repeat = 1,
+       );
 
        /* open/load/attach test_pkt_md_access */
        skel_pkt = test_pkt_md_access__open_and_load();
@@ -77,32 +81,32 @@ void test_trace_ext(void)
 
        /* load/attach tracing */
        err = test_trace_ext_tracing__load(skel_trace);
-       if (CHECK(err, "setup", "tracing/test_pkt_md_access_new load failed\n")) {
+       if (!ASSERT_OK(err, "tracing/test_pkt_md_access_new load")) {
                libbpf_strerror(err, buf, sizeof(buf));
                fprintf(stderr, "%s\n", buf);
                goto cleanup;
        }
 
        err = test_trace_ext_tracing__attach(skel_trace);
-       if (CHECK(err, "setup", "tracing/test_pkt_md_access_new attach failed: %d\n", err))
+       if (!ASSERT_OK(err, "tracing/test_pkt_md_access_new attach"))
                goto cleanup;
 
        /* trigger the test */
-       err = bpf_prog_test_run(pkt_fd, 1, &pkt_v4, sizeof(pkt_v4),
-                               NULL, NULL, &retval, &duration);
-       CHECK(err || retval, "run", "err %d errno %d retval %d\n", err, errno, retval);
+       err = bpf_prog_test_run_opts(pkt_fd, &topts);
+       ASSERT_OK(err, "test_run_opts err");
+       ASSERT_OK(topts.retval, "test_run_opts retval");
 
        bss_ext = skel_ext->bss;
        bss_trace = skel_trace->bss;
 
        len = bss_ext->ext_called;
 
-       CHECK(bss_ext->ext_called == 0,
-               "check", "failed to trigger freplace/test_pkt_md_access\n");
-       CHECK(bss_trace->fentry_called != len,
-               "check", "failed to trigger fentry/test_pkt_md_access_new\n");
-       CHECK(bss_trace->fexit_called != len,
-               "check", "failed to trigger fexit/test_pkt_md_access_new\n");
+       ASSERT_NEQ(bss_ext->ext_called, 0,
+                 "failed to trigger freplace/test_pkt_md_access");
+       ASSERT_EQ(bss_trace->fentry_called, len,
+                 "failed to trigger fentry/test_pkt_md_access_new");
+       ASSERT_EQ(bss_trace->fexit_called, len,
+                  "failed to trigger fexit/test_pkt_md_access_new");
 
 cleanup:
        test_trace_ext_tracing__destroy(skel_trace);
index ac65456..ec21c53 100644 (file)
@@ -13,8 +13,14 @@ void test_xdp(void)
        char buf[128];
        struct ipv6hdr iph6;
        struct iphdr iph;
-       __u32 duration, retval, size;
        int err, prog_fd, map_fd;
+       LIBBPF_OPTS(bpf_test_run_opts, topts,
+               .data_in = &pkt_v4,
+               .data_size_in = sizeof(pkt_v4),
+               .data_out = buf,
+               .data_size_out = sizeof(buf),
+               .repeat = 1,
+       );
 
        err = bpf_prog_test_load(file, BPF_PROG_TYPE_XDP, &obj, &prog_fd);
        if (CHECK_FAIL(err))
@@ -26,21 +32,23 @@ void test_xdp(void)
        bpf_map_update_elem(map_fd, &key4, &value4, 0);
        bpf_map_update_elem(map_fd, &key6, &value6, 0);
 
-       err = bpf_prog_test_run(prog_fd, 1, &pkt_v4, sizeof(pkt_v4),
-                               buf, &size, &retval, &duration);
+       err = bpf_prog_test_run_opts(prog_fd, &topts);
        memcpy(&iph, buf + sizeof(struct ethhdr), sizeof(iph));
-       CHECK(err || retval != XDP_TX || size != 74 ||
-             iph.protocol != IPPROTO_IPIP, "ipv4",
-             "err %d errno %d retval %d size %d\n",
-             err, errno, retval, size);
+       ASSERT_OK(err, "test_run");
+       ASSERT_EQ(topts.retval, XDP_TX, "ipv4 test_run retval");
+       ASSERT_EQ(topts.data_size_out, 74, "ipv4 test_run data_size_out");
+       ASSERT_EQ(iph.protocol, IPPROTO_IPIP, "ipv4 test_run iph.protocol");
 
-       err = bpf_prog_test_run(prog_fd, 1, &pkt_v6, sizeof(pkt_v6),
-                               buf, &size, &retval, &duration);
+       topts.data_in = &pkt_v6;
+       topts.data_size_in = sizeof(pkt_v6);
+       topts.data_size_out = sizeof(buf);
+
+       err = bpf_prog_test_run_opts(prog_fd, &topts);
        memcpy(&iph6, buf + sizeof(struct ethhdr), sizeof(iph6));
-       CHECK(err || retval != XDP_TX || size != 114 ||
-             iph6.nexthdr != IPPROTO_IPV6, "ipv6",
-             "err %d errno %d retval %d size %d\n",
-             err, errno, retval, size);
+       ASSERT_OK(err, "test_run");
+       ASSERT_EQ(topts.retval, XDP_TX, "ipv6 test_run retval");
+       ASSERT_EQ(topts.data_size_out, 114, "ipv6 test_run data_size_out");
+       ASSERT_EQ(iph6.nexthdr, IPPROTO_IPV6, "ipv6 test_run iph6.nexthdr");
 out:
        bpf_object__close(obj);
 }
diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_adjust_frags.c b/tools/testing/selftests/bpf/prog_tests/xdp_adjust_frags.c
new file mode 100644 (file)
index 0000000..d18e6f3
--- /dev/null
@@ -0,0 +1,110 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <test_progs.h>
+#include <network_helpers.h>
+
+static void test_xdp_update_frags(void)
+{
+       const char *file = "./test_xdp_update_frags.o";
+       struct bpf_program *prog;
+       struct bpf_object *obj;
+       int err, prog_fd;
+       __u32 *offset;
+       __u8 *buf;
+       LIBBPF_OPTS(bpf_test_run_opts, topts);
+
+       obj = bpf_object__open(file);
+       if (libbpf_get_error(obj))
+               return;
+
+       prog = bpf_object__next_program(obj, NULL);
+       if (bpf_object__load(obj))
+               return;
+
+       prog_fd = bpf_program__fd(prog);
+
+       buf = malloc(128);
+       if (!ASSERT_OK_PTR(buf, "alloc buf 128b"))
+               goto out;
+
+       memset(buf, 0, 128);
+       offset = (__u32 *)buf;
+       *offset = 16;
+       buf[*offset] = 0xaa;            /* marker at offset 16 (head) */
+       buf[*offset + 15] = 0xaa;       /* marker at offset 31 (head) */
+
+       topts.data_in = buf;
+       topts.data_out = buf;
+       topts.data_size_in = 128;
+       topts.data_size_out = 128;
+
+       err = bpf_prog_test_run_opts(prog_fd, &topts);
+
+       /* test_xdp_update_frags: buf[16,31]: 0xaa -> 0xbb */
+       ASSERT_OK(err, "xdp_update_frag");
+       ASSERT_EQ(topts.retval, XDP_PASS, "xdp_update_frag retval");
+       ASSERT_EQ(buf[16], 0xbb, "xdp_update_frag buf[16]");
+       ASSERT_EQ(buf[31], 0xbb, "xdp_update_frag buf[31]");
+
+       free(buf);
+
+       buf = malloc(9000);
+       if (!ASSERT_OK_PTR(buf, "alloc buf 9Kb"))
+               goto out;
+
+       memset(buf, 0, 9000);
+       offset = (__u32 *)buf;
+       *offset = 5000;
+       buf[*offset] = 0xaa;            /* marker at offset 5000 (frag0) */
+       buf[*offset + 15] = 0xaa;       /* marker at offset 5015 (frag0) */
+
+       topts.data_in = buf;
+       topts.data_out = buf;
+       topts.data_size_in = 9000;
+       topts.data_size_out = 9000;
+
+       err = bpf_prog_test_run_opts(prog_fd, &topts);
+
+       /* test_xdp_update_frags: buf[5000,5015]: 0xaa -> 0xbb */
+       ASSERT_OK(err, "xdp_update_frag");
+       ASSERT_EQ(topts.retval, XDP_PASS, "xdp_update_frag retval");
+       ASSERT_EQ(buf[5000], 0xbb, "xdp_update_frag buf[5000]");
+       ASSERT_EQ(buf[5015], 0xbb, "xdp_update_frag buf[5015]");
+
+       memset(buf, 0, 9000);
+       offset = (__u32 *)buf;
+       *offset = 3510;
+       buf[*offset] = 0xaa;            /* marker at offset 3510 (head) */
+       buf[*offset + 15] = 0xaa;       /* marker at offset 3525 (frag0) */
+
+       err = bpf_prog_test_run_opts(prog_fd, &topts);
+
+       /* test_xdp_update_frags: buf[3510,3525]: 0xaa -> 0xbb */
+       ASSERT_OK(err, "xdp_update_frag");
+       ASSERT_EQ(topts.retval, XDP_PASS, "xdp_update_frag retval");
+       ASSERT_EQ(buf[3510], 0xbb, "xdp_update_frag buf[3510]");
+       ASSERT_EQ(buf[3525], 0xbb, "xdp_update_frag buf[3525]");
+
+       memset(buf, 0, 9000);
+       offset = (__u32 *)buf;
+       *offset = 7606;
+       buf[*offset] = 0xaa;            /* marker at offset 7606 (frag0) */
+       buf[*offset + 15] = 0xaa;       /* marker at offset 7621 (frag1) */
+
+       err = bpf_prog_test_run_opts(prog_fd, &topts);
+
+       /* test_xdp_update_frags: buf[7606,7621]: 0xaa -> 0xbb */
+       ASSERT_OK(err, "xdp_update_frag");
+       ASSERT_EQ(topts.retval, XDP_PASS, "xdp_update_frag retval");
+       ASSERT_EQ(buf[7606], 0xbb, "xdp_update_frag buf[7606]");
+       ASSERT_EQ(buf[7621], 0xbb, "xdp_update_frag buf[7621]");
+
+       free(buf);
+out:
+       bpf_object__close(obj);
+}
+
+void test_xdp_adjust_frags(void)
+{
+       if (test__start_subtest("xdp_adjust_frags"))
+               test_xdp_update_frags();
+}
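
The (head)/(frag0)/(frag1) comments follow from how the test-run infrastructure builds a multi-buffer xdp_buff: assuming 4K pages, XDP_PACKET_HEADROOM (256) and the skb_shared_info tailroom (320), the linear head carries the first 3520 payload bytes and each frag carries up to a page. A rough sketch of that mapping, under exactly those assumptions:

    #define PAGE_SZ         4096
    #define HEADROOM        256     /* XDP_PACKET_HEADROOM */
    #define TAILROOM        320     /* SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) */

    /* sketch: which buffer does payload byte 'off' land in? */
    static const char *buf_of(unsigned int off)
    {
            unsigned int head = PAGE_SZ - HEADROOM - TAILROOM;      /* 3520 */

            if (off < head)
                    return "head";
            return (off - head) < PAGE_SZ ? "frag0" : "frag1";
    }
    /* buf_of(3510) == "head", buf_of(5015) == "frag0", buf_of(7621) == "frag1" */
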
index 3f5a17c..21ceac2 100644 (file)
@@ -5,28 +5,35 @@
 static void test_xdp_adjust_tail_shrink(void)
 {
        const char *file = "./test_xdp_adjust_tail_shrink.o";
-       __u32 duration, retval, size, expect_sz;
+       __u32 expect_sz;
        struct bpf_object *obj;
        int err, prog_fd;
        char buf[128];
+       LIBBPF_OPTS(bpf_test_run_opts, topts,
+               .data_in = &pkt_v4,
+               .data_size_in = sizeof(pkt_v4),
+               .data_out = buf,
+               .data_size_out = sizeof(buf),
+               .repeat = 1,
+       );
 
        err = bpf_prog_test_load(file, BPF_PROG_TYPE_XDP, &obj, &prog_fd);
-       if (CHECK_FAIL(err))
+       if (!ASSERT_OK(err, "test_xdp_adjust_tail_shrink"))
                return;
 
-       err = bpf_prog_test_run(prog_fd, 1, &pkt_v4, sizeof(pkt_v4),
-                               buf, &size, &retval, &duration);
-
-       CHECK(err || retval != XDP_DROP,
-             "ipv4", "err %d errno %d retval %d size %d\n",
-             err, errno, retval, size);
+       err = bpf_prog_test_run_opts(prog_fd, &topts);
+       ASSERT_OK(err, "ipv4");
+       ASSERT_EQ(topts.retval, XDP_DROP, "ipv4 retval");
 
        expect_sz = sizeof(pkt_v6) - 20;  /* Test shrink with 20 bytes */
-       err = bpf_prog_test_run(prog_fd, 1, &pkt_v6, sizeof(pkt_v6),
-                               buf, &size, &retval, &duration);
-       CHECK(err || retval != XDP_TX || size != expect_sz,
-             "ipv6", "err %d errno %d retval %d size %d expect-size %d\n",
-             err, errno, retval, size, expect_sz);
+       topts.data_in = &pkt_v6;
+       topts.data_size_in = sizeof(pkt_v6);
+       topts.data_size_out = sizeof(buf);
+       err = bpf_prog_test_run_opts(prog_fd, &topts);
+       ASSERT_OK(err, "ipv6");
+       ASSERT_EQ(topts.retval, XDP_TX, "ipv6 retval");
+       ASSERT_EQ(topts.data_size_out, expect_sz, "ipv6 size");
+
        bpf_object__close(obj);
 }
 
@@ -35,25 +42,31 @@ static void test_xdp_adjust_tail_grow(void)
        const char *file = "./test_xdp_adjust_tail_grow.o";
        struct bpf_object *obj;
        char buf[4096]; /* avoid segfault: large buf to hold grow results */
-       __u32 duration, retval, size, expect_sz;
+       __u32 expect_sz;
        int err, prog_fd;
+       LIBBPF_OPTS(bpf_test_run_opts, topts,
+               .data_in = &pkt_v4,
+               .data_size_in = sizeof(pkt_v4),
+               .data_out = buf,
+               .data_size_out = sizeof(buf),
+               .repeat = 1,
+       );
 
        err = bpf_prog_test_load(file, BPF_PROG_TYPE_XDP, &obj, &prog_fd);
-       if (CHECK_FAIL(err))
+       if (!ASSERT_OK(err, "test_xdp_adjust_tail_grow"))
                return;
 
-       err = bpf_prog_test_run(prog_fd, 1, &pkt_v4, sizeof(pkt_v4),
-                               buf, &size, &retval, &duration);
-       CHECK(err || retval != XDP_DROP,
-             "ipv4", "err %d errno %d retval %d size %d\n",
-             err, errno, retval, size);
+       err = bpf_prog_test_run_opts(prog_fd, &topts);
+       ASSERT_OK(err, "ipv4");
+       ASSERT_EQ(topts.retval, XDP_DROP, "ipv4 retval");
 
        expect_sz = sizeof(pkt_v6) + 40; /* Test grow with 40 bytes */
-       err = bpf_prog_test_run(prog_fd, 1, &pkt_v6, sizeof(pkt_v6) /* 74 */,
-                               buf, &size, &retval, &duration);
-       CHECK(err || retval != XDP_TX || size != expect_sz,
-             "ipv6", "err %d errno %d retval %d size %d expect-size %d\n",
-             err, errno, retval, size, expect_sz);
+       topts.data_in = &pkt_v6;
+       topts.data_size_in = sizeof(pkt_v6);
+       err = bpf_prog_test_run_opts(prog_fd, &topts);
+       ASSERT_OK(err, "ipv6");
+       ASSERT_EQ(topts.retval, XDP_TX, "ipv6 retval");
+       ASSERT_EQ(topts.data_size_out, expect_sz, "ipv6 size");
 
        bpf_object__close(obj);
 }
@@ -65,18 +78,18 @@ static void test_xdp_adjust_tail_grow2(void)
        int tailroom = 320; /* SKB_DATA_ALIGN(sizeof(struct skb_shared_info))*/;
        struct bpf_object *obj;
        int err, cnt, i;
-       int max_grow;
+       int max_grow, prog_fd;
 
-       struct bpf_prog_test_run_attr tattr = {
+       LIBBPF_OPTS(bpf_test_run_opts, tattr,
                .repeat         = 1,
                .data_in        = &buf,
                .data_out       = &buf,
                .data_size_in   = 0, /* Per test */
                .data_size_out  = 0, /* Per test */
-       };
+       );
 
-       err = bpf_prog_test_load(file, BPF_PROG_TYPE_XDP, &obj, &tattr.prog_fd);
-       if (CHECK_ATTR(err, "load", "err %d errno %d\n", err, errno))
+       err = bpf_prog_test_load(file, BPF_PROG_TYPE_XDP, &obj, &prog_fd);
+       if (!ASSERT_OK(err, "test_xdp_adjust_tail_grow"))
                return;
 
        /* Test case-64 */
@@ -84,49 +97,171 @@ static void test_xdp_adjust_tail_grow2(void)
        tattr.data_size_in  =  64; /* Determine test case via pkt size */
        tattr.data_size_out = 128; /* Limit copy_size */
        /* Kernel side alloc packet memory area that is zero init */
-       err = bpf_prog_test_run_xattr(&tattr);
+       err = bpf_prog_test_run_opts(prog_fd, &tattr);
 
-       CHECK_ATTR(errno != ENOSPC /* Due limit copy_size in bpf_test_finish */
-                  || tattr.retval != XDP_TX
-                  || tattr.data_size_out != 192, /* Expected grow size */
-                  "case-64",
-                  "err %d errno %d retval %d size %d\n",
-                  err, errno, tattr.retval, tattr.data_size_out);
+       ASSERT_EQ(errno, ENOSPC, "case-64 errno"); /* due to the copy_size limit in bpf_test_finish */
+       ASSERT_EQ(tattr.retval, XDP_TX, "case-64 retval");
+       ASSERT_EQ(tattr.data_size_out, 192, "case-64 data_size_out"); /* Expected grow size */
 
        /* Extra checks for data contents */
-       CHECK_ATTR(tattr.data_size_out != 192
-                  || buf[0]   != 1 ||  buf[63]  != 1  /*  0-63  memset to 1 */
-                  || buf[64]  != 0 ||  buf[127] != 0  /* 64-127 memset to 0 */
-                  || buf[128] != 1 ||  buf[191] != 1, /*128-191 memset to 1 */
-                  "case-64-data",
-                  "err %d errno %d retval %d size %d\n",
-                  err, errno, tattr.retval, tattr.data_size_out);
+       ASSERT_EQ(buf[0], 1, "case-64-data buf[0]"); /*  0-63  memset to 1 */
+       ASSERT_EQ(buf[63], 1, "case-64-data buf[63]");
+       ASSERT_EQ(buf[64], 0, "case-64-data buf[64]"); /* 64-127 memset to 0 */
+       ASSERT_EQ(buf[127], 0, "case-64-data buf[127]");
+       ASSERT_EQ(buf[128], 1, "case-64-data buf[128]"); /* 128-191 memset to 1 */
+       ASSERT_EQ(buf[191], 1, "case-64-data buf[191]");
 
        /* Test case-128 */
        memset(buf, 2, sizeof(buf));
        tattr.data_size_in  = 128; /* Determine test case via pkt size */
        tattr.data_size_out = sizeof(buf);   /* Copy everything */
-       err = bpf_prog_test_run_xattr(&tattr);
+       err = bpf_prog_test_run_opts(prog_fd, &tattr);
 
        max_grow = 4096 - XDP_PACKET_HEADROOM - tailroom; /* 3520 */
-       CHECK_ATTR(err
-                  || tattr.retval != XDP_TX
-                  || tattr.data_size_out != max_grow,/* Expect max grow size */
-                  "case-128",
-                  "err %d errno %d retval %d size %d expect-size %d\n",
-                  err, errno, tattr.retval, tattr.data_size_out, max_grow);
+       ASSERT_OK(err, "case-128");
+       ASSERT_EQ(tattr.retval, XDP_TX, "case-128 retval");
+       ASSERT_EQ(tattr.data_size_out, max_grow, "case-128 data_size_out"); /* Expect max grow */
 
        /* Extra checks for data content: Count grow size, will contain zeros */
        for (i = 0, cnt = 0; i < sizeof(buf); i++) {
                if (buf[i] == 0)
                        cnt++;
        }
-       CHECK_ATTR((cnt != (max_grow - tattr.data_size_in)) /* Grow increase */
-                  || tattr.data_size_out != max_grow, /* Total grow size */
-                  "case-128-data",
-                  "err %d errno %d retval %d size %d grow-size %d\n",
-                  err, errno, tattr.retval, tattr.data_size_out, cnt);
+       ASSERT_EQ(cnt, max_grow - tattr.data_size_in, "case-128-data cnt"); /* Grow increase */
+       ASSERT_EQ(tattr.data_size_out, max_grow, "case-128-data data_size_out"); /* Total grow */
+
+       bpf_object__close(obj);
+}
+
+static void test_xdp_adjust_frags_tail_shrink(void)
+{
+       const char *file = "./test_xdp_adjust_tail_shrink.o";
+       __u32 exp_size;
+       struct bpf_program *prog;
+       struct bpf_object *obj;
+       int err, prog_fd;
+       __u8 *buf;
+       LIBBPF_OPTS(bpf_test_run_opts, topts);
+
+       /* For the individual test cases, the first byte in the packet
+        * indicates which test will be run.
+        */
+       obj = bpf_object__open(file);
+       if (libbpf_get_error(obj))
+               return;
+
+       prog = bpf_object__next_program(obj, NULL);
+       if (bpf_object__load(obj))
+               return;
+
+       prog_fd = bpf_program__fd(prog);
+
+       buf = malloc(9000);
+       if (!ASSERT_OK_PTR(buf, "alloc buf 9Kb"))
+               goto out;
+
+       memset(buf, 0, 9000);
+
+       /* Test case removing 10 bytes from last frag, NOT freeing it */
+       exp_size = 8990; /* 9000 - 10 */
+       topts.data_in = buf;
+       topts.data_out = buf;
+       topts.data_size_in = 9000;
+       topts.data_size_out = 9000;
+       err = bpf_prog_test_run_opts(prog_fd, &topts);
+
+       ASSERT_OK(err, "9Kb-10b");
+       ASSERT_EQ(topts.retval, XDP_TX, "9Kb-10b retval");
+       ASSERT_EQ(topts.data_size_out, exp_size, "9Kb-10b size");
+
+       /* Test case removing one of two pages, assuming 4K pages */
+       buf[0] = 1;
+       exp_size = 4900; /* 9000 - 4100 */
+
+       topts.data_size_out = 9000; /* reset from previous invocation */
+       err = bpf_prog_test_run_opts(prog_fd, &topts);
+
+       ASSERT_OK(err, "9Kb-4Kb");
+       ASSERT_EQ(topts.retval, XDP_TX, "9Kb-4Kb retval");
+       ASSERT_EQ(topts.data_size_out, exp_size, "9Kb-4Kb size");
+
+       /* Test case removing two pages resulting in a linear xdp_buff */
+       buf[0] = 2;
+       exp_size = 800; /* 9000 - 8200 */
+       topts.data_size_out = 9000; /* reset from previous invocation */
+       err = bpf_prog_test_run_opts(prog_fd, &topts);
+
+       ASSERT_OK(err, "9Kb-9Kb");
+       ASSERT_EQ(topts.retval, XDP_TX, "9Kb-9Kb retval");
+       ASSERT_EQ(topts.data_size_out, exp_size, "9Kb-9Kb size");
+
+       free(buf);
+out:
+       bpf_object__close(obj);
+}
+
+static void test_xdp_adjust_frags_tail_grow(void)
+{
+       const char *file = "./test_xdp_adjust_tail_grow.o";
+       __u32 exp_size;
+       struct bpf_program *prog;
+       struct bpf_object *obj;
+       int err, i, prog_fd;
+       __u8 *buf;
+       LIBBPF_OPTS(bpf_test_run_opts, topts);
+
+       obj = bpf_object__open(file);
+       if (libbpf_get_error(obj))
+               return;
+
+       prog = bpf_object__next_program(obj, NULL);
+       if (bpf_object__load(obj))
+               return;
+
+       prog_fd = bpf_program__fd(prog);
+
+       buf = malloc(16384);
+       if (!ASSERT_OK_PTR(buf, "alloc buf 16Kb"))
+               goto out;
+
+       /* Test case adding 10 bytes to last frag */
+       memset(buf, 1, 16384);
+       exp_size = 9000 + 10;
+
+       topts.data_in = buf;
+       topts.data_out = buf;
+       topts.data_size_in = 9000;
+       topts.data_size_out = 16384;
+       err = bpf_prog_test_run_opts(prog_fd, &topts);
+
+       ASSERT_OK(err, "9Kb+10b");
+       ASSERT_EQ(topts.retval, XDP_TX, "9Kb+10b retval");
+       ASSERT_EQ(topts.data_size_out, exp_size, "9Kb+10b size");
+
+       for (i = 0; i < 9000; i++)
+               ASSERT_EQ(buf[i], 1, "9Kb+10b-old");
+
+       for (i = 9000; i < 9010; i++)
+               ASSERT_EQ(buf[i], 0, "9Kb+10b-new");
+
+       for (i = 9010; i < 16384; i++)
+               ASSERT_EQ(buf[i], 1, "9Kb+10b-untouched");
+
+       /* Test a too large grow */
+       memset(buf, 1, 16384);
+       exp_size = 9001;
+
+       topts.data_in = topts.data_out = buf;
+       topts.data_size_in = 9001;
+       topts.data_size_out = 16384;
+       err = bpf_prog_test_run_opts(prog_fd, &topts);
+
+       ASSERT_OK(err, "too large grow");
+       ASSERT_EQ(topts.retval, XDP_DROP, "too large grow retval");
+       ASSERT_EQ(topts.data_size_out, exp_size, "too large grow size");
 
+       free(buf);
+out:
        bpf_object__close(obj);
 }
 
@@ -138,4 +273,8 @@ void test_xdp_adjust_tail(void)
                test_xdp_adjust_tail_grow();
        if (test__start_subtest("xdp_adjust_tail_grow2"))
                test_xdp_adjust_tail_grow2();
+       if (test__start_subtest("xdp_adjust_frags_tail_shrink"))
+               test_xdp_adjust_frags_tail_shrink();
+       if (test__start_subtest("xdp_adjust_frags_tail_grow"))
+               test_xdp_adjust_frags_tail_grow();
 }
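
The shrink/grow objects these subtests load resize the packet from the BPF side via bpf_xdp_adjust_tail(); the real test programs pick their behavior from the incoming packet size, but the mechanism boils down to a sketch like this:

    // SPDX-License-Identifier: GPL-2.0
    #include <linux/bpf.h>
    #include <bpf/bpf_helpers.h>

    SEC("xdp")
    int xdp_shrink_20(struct xdp_md *xdp)
    {
            /* negative delta shrinks the tail, positive grows it;
             * a nonzero return means the request did not fit
             */
            if (bpf_xdp_adjust_tail(xdp, -20))
                    return XDP_DROP;
            return XDP_TX;
    }

    char _license[] SEC("license") = "GPL";
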
index c6fa390..62aa3ed 100644 (file)
@@ -11,8 +11,7 @@ void serial_test_xdp_attach(void)
        const char *file = "./test_xdp.o";
        struct bpf_prog_info info = {};
        int err, fd1, fd2, fd3;
-       DECLARE_LIBBPF_OPTS(bpf_xdp_set_link_opts, opts,
-                           .old_fd = -1);
+       LIBBPF_OPTS(bpf_xdp_attach_opts, opts);
 
        len = sizeof(info);
 
@@ -38,49 +37,47 @@ void serial_test_xdp_attach(void)
        if (CHECK_FAIL(err))
                goto out_2;
 
-       err = bpf_set_link_xdp_fd_opts(IFINDEX_LO, fd1, XDP_FLAGS_REPLACE,
-                                      &opts);
+       err = bpf_xdp_attach(IFINDEX_LO, fd1, XDP_FLAGS_REPLACE, &opts);
        if (CHECK(err, "load_ok", "initial load failed"))
                goto out_close;
 
-       err = bpf_get_link_xdp_id(IFINDEX_LO, &id0, 0);
+       err = bpf_xdp_query_id(IFINDEX_LO, 0, &id0);
        if (CHECK(err || id0 != id1, "id1_check",
                  "loaded prog id %u != id1 %u, err %d", id0, id1, err))
                goto out_close;
 
-       err = bpf_set_link_xdp_fd_opts(IFINDEX_LO, fd2, XDP_FLAGS_REPLACE,
-                                      &opts);
+       err = bpf_xdp_attach(IFINDEX_LO, fd2, XDP_FLAGS_REPLACE, &opts);
        if (CHECK(!err, "load_fail", "load with expected id didn't fail"))
                goto out;
 
-       opts.old_fd = fd1;
-       err = bpf_set_link_xdp_fd_opts(IFINDEX_LO, fd2, 0, &opts);
+       opts.old_prog_fd = fd1;
+       err = bpf_xdp_attach(IFINDEX_LO, fd2, 0, &opts);
        if (CHECK(err, "replace_ok", "replace valid old_fd failed"))
                goto out;
-       err = bpf_get_link_xdp_id(IFINDEX_LO, &id0, 0);
+       err = bpf_xdp_query_id(IFINDEX_LO, 0, &id0);
        if (CHECK(err || id0 != id2, "id2_check",
                  "loaded prog id %u != id2 %u, err %d", id0, id2, err))
                goto out_close;
 
-       err = bpf_set_link_xdp_fd_opts(IFINDEX_LO, fd3, 0, &opts);
+       err = bpf_xdp_attach(IFINDEX_LO, fd3, 0, &opts);
        if (CHECK(!err, "replace_fail", "replace invalid old_fd didn't fail"))
                goto out;
 
-       err = bpf_set_link_xdp_fd_opts(IFINDEX_LO, -1, 0, &opts);
+       err = bpf_xdp_detach(IFINDEX_LO, 0, &opts);
        if (CHECK(!err, "remove_fail", "remove invalid old_fd didn't fail"))
                goto out;
 
-       opts.old_fd = fd2;
-       err = bpf_set_link_xdp_fd_opts(IFINDEX_LO, -1, 0, &opts);
+       opts.old_prog_fd = fd2;
+       err = bpf_xdp_detach(IFINDEX_LO, 0, &opts);
        if (CHECK(err, "remove_ok", "remove valid old_fd failed"))
                goto out;
 
-       err = bpf_get_link_xdp_id(IFINDEX_LO, &id0, 0);
+       err = bpf_xdp_query_id(IFINDEX_LO, 0, &id0);
        if (CHECK(err || id0 != 0, "unload_check",
                  "loaded prog id %u != 0, err %d", id0, err))
                goto out_close;
 out:
-       bpf_set_link_xdp_fd(IFINDEX_LO, -1, 0);
+       bpf_xdp_detach(IFINDEX_LO, 0, NULL);
 out_close:
        bpf_object__close(obj3);
 out_2:
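
The XDP attach API mapping used in this and the following hunks, condensed into one hypothetical helper (assuming bpf_xdp_attach()/bpf_xdp_detach()/bpf_xdp_query_id() as introduced on the libbpf side; note bpf_xdp_query_id() takes flags before the out-parameter, the reverse of bpf_get_link_xdp_id()):

    #include <linux/if_link.h>
    #include <bpf/libbpf.h>

    /* hypothetical helper: replace old_fd with new_fd on ifindex, verify, detach */
    static int xdp_swap_and_detach(int ifindex, int old_fd, int new_fd)
    {
            LIBBPF_OPTS(bpf_xdp_attach_opts, opts, .old_prog_fd = old_fd);
            __u32 id = 0;
            int err;

            /* was: bpf_set_link_xdp_fd_opts(ifindex, new_fd, XDP_FLAGS_REPLACE, &opts) */
            err = bpf_xdp_attach(ifindex, new_fd, XDP_FLAGS_REPLACE, &opts);
            if (err)
                    return err;

            /* was: bpf_get_link_xdp_id(ifindex, &id, 0) */
            err = bpf_xdp_query_id(ifindex, 0, &id);
            if (err)
                    return err;

            /* was: bpf_set_link_xdp_fd(ifindex, -1, 0) */
            return bpf_xdp_detach(ifindex, 0, NULL);
    }
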
index c98a897..76967d8 100644 (file)
@@ -10,40 +10,101 @@ struct meta {
        int pkt_len;
 };
 
+struct test_ctx_s {
+       bool passed;
+       int pkt_size;
+};
+
+struct test_ctx_s test_ctx;
+
 static void on_sample(void *ctx, int cpu, void *data, __u32 size)
 {
-       int duration = 0;
        struct meta *meta = (struct meta *)data;
        struct ipv4_packet *trace_pkt_v4 = data + sizeof(*meta);
+       unsigned char *raw_pkt = data + sizeof(*meta);
+       struct test_ctx_s *tst_ctx = ctx;
+
+       ASSERT_GE(size, sizeof(pkt_v4) + sizeof(*meta), "check_size");
+       ASSERT_EQ(meta->ifindex, if_nametoindex("lo"), "check_meta_ifindex");
+       ASSERT_EQ(meta->pkt_len, tst_ctx->pkt_size, "check_meta_pkt_len");
+       ASSERT_EQ(memcmp(trace_pkt_v4, &pkt_v4, sizeof(pkt_v4)), 0,
+                 "check_packet_content");
+
+       if (meta->pkt_len > sizeof(pkt_v4)) {
+               for (int i = 0; i < meta->pkt_len - sizeof(pkt_v4); i++)
+                       ASSERT_EQ(raw_pkt[i + sizeof(pkt_v4)], (unsigned char)i,
+                                 "check_packet_content");
+       }
+
+       tst_ctx->passed = true;
+}
 
-       if (CHECK(size < sizeof(pkt_v4) + sizeof(*meta),
-                 "check_size", "size %u < %zu\n",
-                 size, sizeof(pkt_v4) + sizeof(*meta)))
-               return;
+#define BUF_SZ 9000
 
-       if (CHECK(meta->ifindex != if_nametoindex("lo"), "check_meta_ifindex",
-                 "meta->ifindex = %d\n", meta->ifindex))
+static void run_xdp_bpf2bpf_pkt_size(int pkt_fd, struct perf_buffer *pb,
+                                    struct test_xdp_bpf2bpf *ftrace_skel,
+                                    int pkt_size)
+{
+       __u8 *buf, *buf_in;
+       int err;
+       LIBBPF_OPTS(bpf_test_run_opts, topts);
+
+       if (!ASSERT_LE(pkt_size, BUF_SZ, "pkt_size") ||
+           !ASSERT_GE(pkt_size, sizeof(pkt_v4), "pkt_size"))
                return;
 
-       if (CHECK(meta->pkt_len != sizeof(pkt_v4), "check_meta_pkt_len",
-                 "meta->pkt_len = %zd\n", sizeof(pkt_v4)))
+       buf_in = malloc(BUF_SZ);
+       if (!ASSERT_OK_PTR(buf_in, "buf_in malloc()"))
                return;
 
-       if (CHECK(memcmp(trace_pkt_v4, &pkt_v4, sizeof(pkt_v4)),
-                 "check_packet_content", "content not the same\n"))
+       buf = malloc(BUF_SZ);
+       if (!ASSERT_OK_PTR(buf, "buf malloc()")) {
+               free(buf_in);
                return;
+       }
+
+       test_ctx.passed = false;
+       test_ctx.pkt_size = pkt_size;
+
+       memcpy(buf_in, &pkt_v4, sizeof(pkt_v4));
+       if (pkt_size > sizeof(pkt_v4)) {
+               for (int i = 0; i < (pkt_size - sizeof(pkt_v4)); i++)
+                       buf_in[i + sizeof(pkt_v4)] = i;
+       }
+
+       /* Run test program */
+       topts.data_in = buf_in;
+       topts.data_size_in = pkt_size;
+       topts.data_out = buf;
+       topts.data_size_out = BUF_SZ;
+
+       err = bpf_prog_test_run_opts(pkt_fd, &topts);
+
+       ASSERT_OK(err, "ipv4");
+       ASSERT_EQ(topts.retval, XDP_PASS, "ipv4 retval");
+       ASSERT_EQ(topts.data_size_out, pkt_size, "ipv4 size");
+
+       /* Make sure bpf_xdp_output() was triggered and it sent the expected
+        * data to the perf ring buffer.
+        */
+       err = perf_buffer__poll(pb, 100);
+
+       ASSERT_GE(err, 0, "perf_buffer__poll");
+       ASSERT_TRUE(test_ctx.passed, "test passed");
+       /* Verify test results */
+       ASSERT_EQ(ftrace_skel->bss->test_result_fentry, if_nametoindex("lo"),
+                 "fentry result");
+       ASSERT_EQ(ftrace_skel->bss->test_result_fexit, XDP_PASS, "fexit result");
 
-       *(bool *)ctx = true;
+       free(buf);
+       free(buf_in);
 }
 
 void test_xdp_bpf2bpf(void)
 {
-       __u32 duration = 0, retval, size;
-       char buf[128];
        int err, pkt_fd, map_fd;
-       bool passed = false;
-       struct iphdr iph;
-       struct iptnl_info value4 = {.family = AF_INET};
+       int pkt_sizes[] = {sizeof(pkt_v4), 1024, 4100, 8200};
+       struct iptnl_info value4 = {.family = AF_INET6};
        struct test_xdp *pkt_skel = NULL;
        struct test_xdp_bpf2bpf *ftrace_skel = NULL;
        struct vip key4 = {.protocol = 6, .family = AF_INET};
@@ -52,7 +113,7 @@ void test_xdp_bpf2bpf(void)
 
        /* Load XDP program to introspect */
        pkt_skel = test_xdp__open_and_load();
-       if (CHECK(!pkt_skel, "pkt_skel_load", "test_xdp skeleton failed\n"))
+       if (!ASSERT_OK_PTR(pkt_skel, "test_xdp__open_and_load"))
                return;
 
        pkt_fd = bpf_program__fd(pkt_skel->progs._xdp_tx_iptunnel);
@@ -62,7 +123,7 @@ void test_xdp_bpf2bpf(void)
 
        /* Load trace program */
        ftrace_skel = test_xdp_bpf2bpf__open();
-       if (CHECK(!ftrace_skel, "__open", "ftrace skeleton failed\n"))
+       if (!ASSERT_OK_PTR(ftrace_skel, "test_xdp_bpf2bpf__open"))
                goto out;
 
        /* Demonstrate the bpf_program__set_attach_target() API rather than
@@ -77,50 +138,24 @@ void test_xdp_bpf2bpf(void)
        bpf_program__set_attach_target(prog, pkt_fd, "_xdp_tx_iptunnel");
 
        err = test_xdp_bpf2bpf__load(ftrace_skel);
-       if (CHECK(err, "__load", "ftrace skeleton failed\n"))
+       if (!ASSERT_OK(err, "test_xdp_bpf2bpf__load"))
                goto out;
 
        err = test_xdp_bpf2bpf__attach(ftrace_skel);
-       if (CHECK(err, "ftrace_attach", "ftrace attach failed: %d\n", err))
+       if (!ASSERT_OK(err, "test_xdp_bpf2bpf__attach"))
                goto out;
 
        /* Set up perf buffer */
-       pb = perf_buffer__new(bpf_map__fd(ftrace_skel->maps.perf_buf_map), 1,
-                             on_sample, NULL, &passed, NULL);
+       pb = perf_buffer__new(bpf_map__fd(ftrace_skel->maps.perf_buf_map), 8,
+                             on_sample, NULL, &test_ctx, NULL);
        if (!ASSERT_OK_PTR(pb, "perf_buf__new"))
                goto out;
 
-       /* Run test program */
-       err = bpf_prog_test_run(pkt_fd, 1, &pkt_v4, sizeof(pkt_v4),
-                               buf, &size, &retval, &duration);
-       memcpy(&iph, buf + sizeof(struct ethhdr), sizeof(iph));
-       if (CHECK(err || retval != XDP_TX || size != 74 ||
-                 iph.protocol != IPPROTO_IPIP, "ipv4",
-                 "err %d errno %d retval %d size %d\n",
-                 err, errno, retval, size))
-               goto out;
-
-       /* Make sure bpf_xdp_output() was triggered and it sent the expected
-        * data to the perf ring buffer.
-        */
-       err = perf_buffer__poll(pb, 100);
-       if (CHECK(err < 0, "perf_buffer__poll", "err %d\n", err))
-               goto out;
-
-       CHECK_FAIL(!passed);
-
-       /* Verify test results */
-       if (CHECK(ftrace_skel->bss->test_result_fentry != if_nametoindex("lo"),
-                 "result", "fentry failed err %llu\n",
-                 ftrace_skel->bss->test_result_fentry))
-               goto out;
-
-       CHECK(ftrace_skel->bss->test_result_fexit != XDP_TX, "result",
-             "fexit failed err %llu\n", ftrace_skel->bss->test_result_fexit);
-
+       for (int i = 0; i < ARRAY_SIZE(pkt_sizes); i++)
+               run_xdp_bpf2bpf_pkt_size(pkt_fd, pb, ftrace_skel,
+                                        pkt_sizes[i]);
 out:
-       if (pb)
-               perf_buffer__free(pb);
+       perf_buffer__free(pb);
        test_xdp__destroy(pkt_skel);
        test_xdp_bpf2bpf__destroy(ftrace_skel);
 }
index fd812bd..f775a16 100644 (file)
@@ -3,11 +3,12 @@
 #include <linux/if_link.h>
 #include <test_progs.h>
 
+#include "test_xdp_with_cpumap_frags_helpers.skel.h"
 #include "test_xdp_with_cpumap_helpers.skel.h"
 
 #define IFINDEX_LO     1
 
-void serial_test_xdp_cpumap_attach(void)
+static void test_xdp_with_cpumap_helpers(void)
 {
        struct test_xdp_with_cpumap_helpers *skel;
        struct bpf_prog_info info = {};
@@ -23,11 +24,11 @@ void serial_test_xdp_cpumap_attach(void)
                return;
 
        prog_fd = bpf_program__fd(skel->progs.xdp_redir_prog);
-       err = bpf_set_link_xdp_fd(IFINDEX_LO, prog_fd, XDP_FLAGS_SKB_MODE);
+       err = bpf_xdp_attach(IFINDEX_LO, prog_fd, XDP_FLAGS_SKB_MODE, NULL);
        if (!ASSERT_OK(err, "Generic attach of program with 8-byte CPUMAP"))
                goto out_close;
 
-       err = bpf_set_link_xdp_fd(IFINDEX_LO, -1, XDP_FLAGS_SKB_MODE);
+       err = bpf_xdp_detach(IFINDEX_LO, XDP_FLAGS_SKB_MODE, NULL);
        ASSERT_OK(err, "XDP program detach");
 
        prog_fd = bpf_program__fd(skel->progs.xdp_dummy_cm);
@@ -45,15 +46,76 @@ void serial_test_xdp_cpumap_attach(void)
        ASSERT_EQ(info.id, val.bpf_prog.id, "Match program id to cpumap entry prog_id");
 
        /* can not attach BPF_XDP_CPUMAP program to a device */
-       err = bpf_set_link_xdp_fd(IFINDEX_LO, prog_fd, XDP_FLAGS_SKB_MODE);
+       err = bpf_xdp_attach(IFINDEX_LO, prog_fd, XDP_FLAGS_SKB_MODE, NULL);
        if (!ASSERT_NEQ(err, 0, "Attach of BPF_XDP_CPUMAP program"))
-               bpf_set_link_xdp_fd(IFINDEX_LO, -1, XDP_FLAGS_SKB_MODE);
+               bpf_xdp_detach(IFINDEX_LO, XDP_FLAGS_SKB_MODE, NULL);
 
        val.qsize = 192;
        val.bpf_prog.fd = bpf_program__fd(skel->progs.xdp_dummy_prog);
        err = bpf_map_update_elem(map_fd, &idx, &val, 0);
        ASSERT_NEQ(err, 0, "Add non-BPF_XDP_CPUMAP program to cpumap entry");
 
+       /* Try to attach BPF_XDP program with frags to cpumap when we have
+        * already loaded a BPF_XDP program on the map
+        */
+       idx = 1;
+       val.qsize = 192;
+       val.bpf_prog.fd = bpf_program__fd(skel->progs.xdp_dummy_cm_frags);
+       err = bpf_map_update_elem(map_fd, &idx, &val, 0);
+       ASSERT_NEQ(err, 0, "Add BPF_XDP program with frags to cpumap entry");
+
 out_close:
        test_xdp_with_cpumap_helpers__destroy(skel);
 }
+
+static void test_xdp_with_cpumap_frags_helpers(void)
+{
+       struct test_xdp_with_cpumap_frags_helpers *skel;
+       struct bpf_prog_info info = {};
+       __u32 len = sizeof(info);
+       struct bpf_cpumap_val val = {
+               .qsize = 192,
+       };
+       int err, frags_prog_fd, map_fd;
+       __u32 idx = 0;
+
+       skel = test_xdp_with_cpumap_frags_helpers__open_and_load();
+       if (!ASSERT_OK_PTR(skel, "test_xdp_with_cpumap_frags_helpers__open_and_load"))
+               return;
+
+       frags_prog_fd = bpf_program__fd(skel->progs.xdp_dummy_cm_frags);
+       map_fd = bpf_map__fd(skel->maps.cpu_map);
+       err = bpf_obj_get_info_by_fd(frags_prog_fd, &info, &len);
+       if (!ASSERT_OK(err, "bpf_obj_get_info_by_fd"))
+               goto out_close;
+
+       val.bpf_prog.fd = frags_prog_fd;
+       err = bpf_map_update_elem(map_fd, &idx, &val, 0);
+       ASSERT_OK(err, "Add program to cpumap entry");
+
+       err = bpf_map_lookup_elem(map_fd, &idx, &val);
+       ASSERT_OK(err, "Read cpumap entry");
+       ASSERT_EQ(info.id, val.bpf_prog.id,
+                 "Match program id to cpumap entry prog_id");
+
+       /* Try to attach BPF_XDP program to cpumap when we have
+        * already loaded a BPF_XDP program with frags on the map
+        */
+       idx = 1;
+       val.qsize = 192;
+       val.bpf_prog.fd = bpf_program__fd(skel->progs.xdp_dummy_cm);
+       err = bpf_map_update_elem(map_fd, &idx, &val, 0);
+       ASSERT_NEQ(err, 0, "Add BPF_XDP program to cpumap entry");
+
+out_close:
+       test_xdp_with_cpumap_frags_helpers__destroy(skel);
+}
+
+void serial_test_xdp_cpumap_attach(void)
+{
+       if (test__start_subtest("CPUMAP with programs in entries"))
+               test_xdp_with_cpumap_helpers();
+
+       if (test__start_subtest("CPUMAP with frags programs in entries"))
+               test_xdp_with_cpumap_frags_helpers();
+}
index 3079d55..ead4001 100644 (file)
@@ -4,6 +4,7 @@
 #include <test_progs.h>
 
 #include "test_xdp_devmap_helpers.skel.h"
+#include "test_xdp_with_devmap_frags_helpers.skel.h"
 #include "test_xdp_with_devmap_helpers.skel.h"
 
 #define IFINDEX_LO 1
@@ -25,11 +26,11 @@ static void test_xdp_with_devmap_helpers(void)
                return;
 
        dm_fd = bpf_program__fd(skel->progs.xdp_redir_prog);
-       err = bpf_set_link_xdp_fd(IFINDEX_LO, dm_fd, XDP_FLAGS_SKB_MODE);
+       err = bpf_xdp_attach(IFINDEX_LO, dm_fd, XDP_FLAGS_SKB_MODE, NULL);
        if (!ASSERT_OK(err, "Generic attach of program with 8-byte devmap"))
                goto out_close;
 
-       err = bpf_set_link_xdp_fd(IFINDEX_LO, -1, XDP_FLAGS_SKB_MODE);
+       err = bpf_xdp_detach(IFINDEX_LO, XDP_FLAGS_SKB_MODE, NULL);
        ASSERT_OK(err, "XDP program detach");
 
        dm_fd = bpf_program__fd(skel->progs.xdp_dummy_dm);
@@ -47,15 +48,24 @@ static void test_xdp_with_devmap_helpers(void)
        ASSERT_EQ(info.id, val.bpf_prog.id, "Match program id to devmap entry prog_id");
 
        /* can not attach BPF_XDP_DEVMAP program to a device */
-       err = bpf_set_link_xdp_fd(IFINDEX_LO, dm_fd, XDP_FLAGS_SKB_MODE);
+       err = bpf_xdp_attach(IFINDEX_LO, dm_fd, XDP_FLAGS_SKB_MODE, NULL);
        if (!ASSERT_NEQ(err, 0, "Attach of BPF_XDP_DEVMAP program"))
-               bpf_set_link_xdp_fd(IFINDEX_LO, -1, XDP_FLAGS_SKB_MODE);
+               bpf_xdp_detach(IFINDEX_LO, XDP_FLAGS_SKB_MODE, NULL);
 
        val.ifindex = 1;
        val.bpf_prog.fd = bpf_program__fd(skel->progs.xdp_dummy_prog);
        err = bpf_map_update_elem(map_fd, &idx, &val, 0);
        ASSERT_NEQ(err, 0, "Add non-BPF_XDP_DEVMAP program to devmap entry");
 
+       /* Try to attach BPF_XDP program with frags to devmap when we have
+        * already loaded a BPF_XDP program on the map
+        */
+       idx = 1;
+       val.ifindex = 1;
+       val.bpf_prog.fd = bpf_program__fd(skel->progs.xdp_dummy_dm_frags);
+       err = bpf_map_update_elem(map_fd, &idx, &val, 0);
+       ASSERT_NEQ(err, 0, "Add BPF_XDP program with frags to devmap entry");
+
 out_close:
        test_xdp_with_devmap_helpers__destroy(skel);
 }
@@ -71,12 +81,57 @@ static void test_neg_xdp_devmap_helpers(void)
        }
 }
 
+static void test_xdp_with_devmap_frags_helpers(void)
+{
+       struct test_xdp_with_devmap_frags_helpers *skel;
+       struct bpf_prog_info info = {};
+       struct bpf_devmap_val val = {
+               .ifindex = IFINDEX_LO,
+       };
+       __u32 len = sizeof(info);
+       int err, dm_fd_frags, map_fd;
+       __u32 idx = 0;
+
+       skel = test_xdp_with_devmap_frags_helpers__open_and_load();
+       if (!ASSERT_OK_PTR(skel, "test_xdp_with_devmap_frags_helpers__open_and_load"))
+               return;
+
+       dm_fd_frags = bpf_program__fd(skel->progs.xdp_dummy_dm_frags);
+       map_fd = bpf_map__fd(skel->maps.dm_ports);
+       err = bpf_obj_get_info_by_fd(dm_fd_frags, &info, &len);
+       if (!ASSERT_OK(err, "bpf_obj_get_info_by_fd"))
+               goto out_close;
+
+       val.bpf_prog.fd = dm_fd_frags;
+       err = bpf_map_update_elem(map_fd, &idx, &val, 0);
+       ASSERT_OK(err, "Add frags program to devmap entry");
+
+       err = bpf_map_lookup_elem(map_fd, &idx, &val);
+       ASSERT_OK(err, "Read devmap entry");
+       ASSERT_EQ(info.id, val.bpf_prog.id,
+                 "Match program id to devmap entry prog_id");
+
+       /* Try to attach BPF_XDP program to devmap when we have
+        * already loaded a BPF_XDP program with frags on the map
+        */
+       idx = 1;
+       val.ifindex = 1;
+       val.bpf_prog.fd = bpf_program__fd(skel->progs.xdp_dummy_dm);
+       err = bpf_map_update_elem(map_fd, &idx, &val, 0);
+       ASSERT_NEQ(err, 0, "Add BPF_XDP program to devmap entry");
+
+out_close:
+       test_xdp_with_devmap_frags_helpers__destroy(skel);
+}
 
 void serial_test_xdp_devmap_attach(void)
 {
        if (test__start_subtest("DEVMAP with programs in entries"))
                test_xdp_with_devmap_helpers();
 
+       if (test__start_subtest("DEVMAP with frags programs in entries"))
+               test_xdp_with_devmap_frags_helpers();
+
        if (test__start_subtest("Verifier check of DEVMAP programs"))
                test_neg_xdp_devmap_helpers();
 }
index abe48e8..0d01ff6 100644 (file)
@@ -14,13 +14,13 @@ void serial_test_xdp_info(void)
 
        /* Get prog_id for XDP_ATTACHED_NONE mode */
 
-       err = bpf_get_link_xdp_id(IFINDEX_LO, &prog_id, 0);
+       err = bpf_xdp_query_id(IFINDEX_LO, 0, &prog_id);
        if (CHECK(err, "get_xdp_none", "errno=%d\n", errno))
                return;
        if (CHECK(prog_id, "prog_id_none", "unexpected prog_id=%u\n", prog_id))
                return;
 
-       err = bpf_get_link_xdp_id(IFINDEX_LO, &prog_id, XDP_FLAGS_SKB_MODE);
+       err = bpf_xdp_query_id(IFINDEX_LO, XDP_FLAGS_SKB_MODE, &prog_id);
        if (CHECK(err, "get_xdp_none_skb", "errno=%d\n", errno))
                return;
        if (CHECK(prog_id, "prog_id_none_skb", "unexpected prog_id=%u\n",
@@ -37,32 +37,32 @@ void serial_test_xdp_info(void)
        if (CHECK(err, "get_prog_info", "errno=%d\n", errno))
                goto out_close;
 
-       err = bpf_set_link_xdp_fd(IFINDEX_LO, prog_fd, XDP_FLAGS_SKB_MODE);
+       err = bpf_xdp_attach(IFINDEX_LO, prog_fd, XDP_FLAGS_SKB_MODE, NULL);
        if (CHECK(err, "set_xdp_skb", "errno=%d\n", errno))
                goto out_close;
 
        /* Get prog_id for single prog mode */
 
-       err = bpf_get_link_xdp_id(IFINDEX_LO, &prog_id, 0);
+       err = bpf_xdp_query_id(IFINDEX_LO, 0, &prog_id);
        if (CHECK(err, "get_xdp", "errno=%d\n", errno))
                goto out;
        if (CHECK(prog_id != info.id, "prog_id", "prog_id not available\n"))
                goto out;
 
-       err = bpf_get_link_xdp_id(IFINDEX_LO, &prog_id, XDP_FLAGS_SKB_MODE);
+       err = bpf_xdp_query_id(IFINDEX_LO, XDP_FLAGS_SKB_MODE, &prog_id);
        if (CHECK(err, "get_xdp_skb", "errno=%d\n", errno))
                goto out;
        if (CHECK(prog_id != info.id, "prog_id_skb", "prog_id not available\n"))
                goto out;
 
-       err = bpf_get_link_xdp_id(IFINDEX_LO, &prog_id, XDP_FLAGS_DRV_MODE);
+       err = bpf_xdp_query_id(IFINDEX_LO, XDP_FLAGS_DRV_MODE, &prog_id);
        if (CHECK(err, "get_xdp_drv", "errno=%d\n", errno))
                goto out;
        if (CHECK(prog_id, "prog_id_drv", "unexpected prog_id=%u\n", prog_id))
                goto out;
 
 out:
-       bpf_set_link_xdp_fd(IFINDEX_LO, -1, 0);
+       bpf_xdp_detach(IFINDEX_LO, 0, NULL);
 out_close:
        bpf_object__close(obj);
 }
index b2b357f..3e9d5c5 100644 (file)
@@ -8,9 +8,9 @@
 
 void serial_test_xdp_link(void)
 {
-       DECLARE_LIBBPF_OPTS(bpf_xdp_set_link_opts, opts, .old_fd = -1);
        struct test_xdp_link *skel1 = NULL, *skel2 = NULL;
        __u32 id1, id2, id0 = 0, prog_fd1, prog_fd2;
+       LIBBPF_OPTS(bpf_xdp_attach_opts, opts);
        struct bpf_link_info link_info;
        struct bpf_prog_info prog_info;
        struct bpf_link *link;
@@ -41,12 +41,12 @@ void serial_test_xdp_link(void)
        id2 = prog_info.id;
 
        /* set initial prog attachment */
-       err = bpf_set_link_xdp_fd_opts(IFINDEX_LO, prog_fd1, XDP_FLAGS_REPLACE, &opts);
+       err = bpf_xdp_attach(IFINDEX_LO, prog_fd1, XDP_FLAGS_REPLACE, &opts);
        if (!ASSERT_OK(err, "fd_attach"))
                goto cleanup;
 
        /* validate prog ID */
-       err = bpf_get_link_xdp_id(IFINDEX_LO, &id0, 0);
+       err = bpf_xdp_query_id(IFINDEX_LO, 0, &id0);
        if (!ASSERT_OK(err, "id1_check_err") || !ASSERT_EQ(id0, id1, "id1_check_val"))
                goto cleanup;
 
@@ -55,14 +55,14 @@ void serial_test_xdp_link(void)
        if (!ASSERT_ERR_PTR(link, "link_attach_should_fail")) {
                bpf_link__destroy(link);
                /* best-effort detach prog */
-               opts.old_fd = prog_fd1;
-               bpf_set_link_xdp_fd_opts(IFINDEX_LO, -1, XDP_FLAGS_REPLACE, &opts);
+               opts.old_prog_fd = prog_fd1;
+               bpf_xdp_detach(IFINDEX_LO, XDP_FLAGS_REPLACE, &opts);
                goto cleanup;
        }
 
        /* detach BPF program */
-       opts.old_fd = prog_fd1;
-       err = bpf_set_link_xdp_fd_opts(IFINDEX_LO, -1, XDP_FLAGS_REPLACE, &opts);
+       opts.old_prog_fd = prog_fd1;
+       err = bpf_xdp_detach(IFINDEX_LO, XDP_FLAGS_REPLACE, &opts);
        if (!ASSERT_OK(err, "prog_detach"))
                goto cleanup;
 
@@ -73,23 +73,23 @@ void serial_test_xdp_link(void)
        skel1->links.xdp_handler = link;
 
        /* validate prog ID */
-       err = bpf_get_link_xdp_id(IFINDEX_LO, &id0, 0);
+       err = bpf_xdp_query_id(IFINDEX_LO, 0, &id0);
        if (!ASSERT_OK(err, "id1_check_err") || !ASSERT_EQ(id0, id1, "id1_check_val"))
                goto cleanup;
 
        /* BPF prog attach is not allowed to replace BPF link */
-       opts.old_fd = prog_fd1;
-       err = bpf_set_link_xdp_fd_opts(IFINDEX_LO, prog_fd2, XDP_FLAGS_REPLACE, &opts);
+       opts.old_prog_fd = prog_fd1;
+       err = bpf_xdp_attach(IFINDEX_LO, prog_fd2, XDP_FLAGS_REPLACE, &opts);
        if (!ASSERT_ERR(err, "prog_attach_fail"))
                goto cleanup;
 
        /* Can't force-update when BPF link is active */
-       err = bpf_set_link_xdp_fd(IFINDEX_LO, prog_fd2, 0);
+       err = bpf_xdp_attach(IFINDEX_LO, prog_fd2, 0, NULL);
        if (!ASSERT_ERR(err, "prog_update_fail"))
                goto cleanup;
 
        /* Can't force-detach when BPF link is active */
-       err = bpf_set_link_xdp_fd(IFINDEX_LO, -1, 0);
+       err = bpf_xdp_detach(IFINDEX_LO, 0, NULL);
        if (!ASSERT_ERR(err, "prog_detach_fail"))
                goto cleanup;
 
@@ -109,7 +109,7 @@ void serial_test_xdp_link(void)
                goto cleanup;
        skel2->links.xdp_handler = link;
 
-       err = bpf_get_link_xdp_id(IFINDEX_LO, &id0, 0);
+       err = bpf_xdp_query_id(IFINDEX_LO, 0, &id0);
        if (!ASSERT_OK(err, "id2_check_err") || !ASSERT_EQ(id0, id2, "id2_check_val"))
                goto cleanup;
 
index 0281095..92ef0aa 100644 (file)
@@ -25,43 +25,49 @@ void test_xdp_noinline(void)
                __u8 flags;
        } real_def = {.dst = MAGIC_VAL};
        __u32 ch_key = 11, real_num = 3;
-       __u32 duration = 0, retval, size;
        int err, i;
        __u64 bytes = 0, pkts = 0;
        char buf[128];
        u32 *magic = (u32 *)buf;
+       LIBBPF_OPTS(bpf_test_run_opts, topts,
+               .data_in = &pkt_v4,
+               .data_size_in = sizeof(pkt_v4),
+               .data_out = buf,
+               .data_size_out = sizeof(buf),
+               .repeat = NUM_ITER,
+       );
 
        skel = test_xdp_noinline__open_and_load();
-       if (CHECK(!skel, "skel_open_and_load", "failed\n"))
+       if (!ASSERT_OK_PTR(skel, "skel_open_and_load"))
                return;
 
        bpf_map_update_elem(bpf_map__fd(skel->maps.vip_map), &key, &value, 0);
        bpf_map_update_elem(bpf_map__fd(skel->maps.ch_rings), &ch_key, &real_num, 0);
        bpf_map_update_elem(bpf_map__fd(skel->maps.reals), &real_num, &real_def, 0);
 
-       err = bpf_prog_test_run(bpf_program__fd(skel->progs.balancer_ingress_v4),
-                               NUM_ITER, &pkt_v4, sizeof(pkt_v4),
-                               buf, &size, &retval, &duration);
-       CHECK(err || retval != 1 || size != 54 ||
-             *magic != MAGIC_VAL, "ipv4",
-             "err %d errno %d retval %d size %d magic %x\n",
-             err, errno, retval, size, *magic);
+       err = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.balancer_ingress_v4), &topts);
+       ASSERT_OK(err, "ipv4 test_run");
+       ASSERT_EQ(topts.retval, 1, "ipv4 test_run retval");
+       ASSERT_EQ(topts.data_size_out, 54, "ipv4 test_run data_size_out");
+       ASSERT_EQ(*magic, MAGIC_VAL, "ipv4 test_run magic");
 
-       err = bpf_prog_test_run(bpf_program__fd(skel->progs.balancer_ingress_v6),
-                               NUM_ITER, &pkt_v6, sizeof(pkt_v6),
-                               buf, &size, &retval, &duration);
-       CHECK(err || retval != 1 || size != 74 ||
-             *magic != MAGIC_VAL, "ipv6",
-             "err %d errno %d retval %d size %d magic %x\n",
-             err, errno, retval, size, *magic);
+       topts.data_in = &pkt_v6;
+       topts.data_size_in = sizeof(pkt_v6);
+       topts.data_out = buf;
+       topts.data_size_out = sizeof(buf);
+
+       err = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.balancer_ingress_v6), &topts);
+       ASSERT_OK(err, "ipv6 test_run");
+       ASSERT_EQ(topts.retval, 1, "ipv6 test_run retval");
+       ASSERT_EQ(topts.data_size_out, 74, "ipv6 test_run data_size_out");
+       ASSERT_EQ(*magic, MAGIC_VAL, "ipv6 test_run magic");
 
        bpf_map_lookup_elem(bpf_map__fd(skel->maps.stats), &stats_key, stats);
        for (i = 0; i < nr_cpus; i++) {
                bytes += stats[i].bytes;
                pkts += stats[i].pkts;
        }
-       CHECK(bytes != MAGIC_BYTES * NUM_ITER * 2 || pkts != NUM_ITER * 2,
-             "stats", "bytes %lld pkts %lld\n",
-             (unsigned long long)bytes, (unsigned long long)pkts);
+       ASSERT_EQ(bytes, MAGIC_BYTES * NUM_ITER * 2, "stats bytes");
+       ASSERT_EQ(pkts, NUM_ITER * 2, "stats pkts");
        test_xdp_noinline__destroy(skel);
 }
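The same conversion recurs throughout this series: the multi-argument bpf_prog_test_run() becomes bpf_prog_test_run_opts(), with inputs and results carried in one bpf_test_run_opts struct. A minimal sketch of the pattern; run_once is an illustrative helper, not from the patch:

#include <bpf/bpf.h>

/* returns the prog's retval, or a negative error from the test run */
static int run_once(int prog_fd, void *pkt, __u32 pkt_len)
{
        char out[128];
        LIBBPF_OPTS(bpf_test_run_opts, topts,
                .data_in = pkt,
                .data_size_in = pkt_len,
                .data_out = out,
                .data_size_out = sizeof(out),
                .repeat = 1,
        );
        int err = bpf_prog_test_run_opts(prog_fd, &topts);

        /* topts.retval, topts.data_size_out and topts.duration replace
         * the old retval/size/duration out-parameters
         */
        return err ? err : (int)topts.retval;
}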
index 15a3900..f543d1b 100644 (file)
@@ -4,22 +4,25 @@
 void test_xdp_perf(void)
 {
        const char *file = "./xdp_dummy.o";
-       __u32 duration, retval, size;
        struct bpf_object *obj;
        char in[128], out[128];
        int err, prog_fd;
+       LIBBPF_OPTS(bpf_test_run_opts, topts,
+               .data_in = in,
+               .data_size_in = sizeof(in),
+               .data_out = out,
+               .data_size_out = sizeof(out),
+               .repeat = 1000000,
+       );
 
        err = bpf_prog_test_load(file, BPF_PROG_TYPE_XDP, &obj, &prog_fd);
        if (CHECK_FAIL(err))
                return;
 
-       err = bpf_prog_test_run(prog_fd, 1000000, &in[0], 128,
-                               out, &size, &retval, &duration);
-
-       CHECK(err || retval != XDP_PASS || size != 128,
-             "xdp-perf",
-             "err %d errno %d retval %d size %d\n",
-             err, errno, retval, size);
+       err = bpf_prog_test_run_opts(prog_fd, &topts);
+       ASSERT_OK(err, "test_run");
+       ASSERT_EQ(topts.retval, XDP_PASS, "test_run retval");
+       ASSERT_EQ(topts.data_size_out, 128, "test_run data_size_out");
 
        bpf_object__close(obj);
 }
index d9a88dd..7efcbdb 100644 (file)
@@ -5,6 +5,7 @@
 #include <linux/bpf.h>
 #include <stdbool.h>
 #include <bpf/bpf_helpers.h>
+#include "bpf_misc.h"
 
 char _license[] SEC("license") = "GPL";
 
@@ -87,7 +88,7 @@ bloom_callback(struct bpf_map *map, __u32 *key, void *val,
        return 0;
 }
 
-SEC("fentry/__x64_sys_getpgid")
+SEC("fentry/" SYS_PREFIX "sys_getpgid")
 int bloom_lookup(void *ctx)
 {
        struct callback_ctx data;
@@ -100,7 +101,7 @@ int bloom_lookup(void *ctx)
        return 0;
 }
 
-SEC("fentry/__x64_sys_getpgid")
+SEC("fentry/" SYS_PREFIX "sys_getpgid")
 int bloom_update(void *ctx)
 {
        struct callback_ctx data;
@@ -113,7 +114,7 @@ int bloom_update(void *ctx)
        return 0;
 }
 
-SEC("fentry/__x64_sys_getpgid")
+SEC("fentry/" SYS_PREFIX "sys_getpgid")
 int bloom_hashmap_lookup(void *ctx)
 {
        __u64 *result;
index 1316f3d..f245fcf 100644 (file)
@@ -3,6 +3,7 @@
 
 #include <linux/bpf.h>
 #include <bpf/bpf_helpers.h>
+#include "bpf_misc.h"
 
 char _license[] SEC("license") = "GPL";
 
@@ -51,7 +52,7 @@ check_elem(struct bpf_map *map, __u32 *key, __u32 *val,
        return 0;
 }
 
-SEC("fentry/__x64_sys_getpgid")
+SEC("fentry/" SYS_PREFIX "sys_getpgid")
 int inner_map(void *ctx)
 {
        struct bpf_map *inner_map;
@@ -70,7 +71,7 @@ int inner_map(void *ctx)
        return 0;
 }
 
-SEC("fentry/__x64_sys_getpgid")
+SEC("fentry/" SYS_PREFIX "sys_getpgid")
 int check_bloom(void *ctx)
 {
        struct callback_ctx data;
diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_setsockopt_unix.c b/tools/testing/selftests/bpf/progs/bpf_iter_setsockopt_unix.c
new file mode 100644 (file)
index 0000000..eafc877
--- /dev/null
@@ -0,0 +1,60 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright Amazon.com Inc. or its affiliates. */
+#include "bpf_iter.h"
+#include "bpf_tracing_net.h"
+#include <bpf/bpf_helpers.h>
+#include <limits.h>
+
+#define AUTOBIND_LEN 6
+char sun_path[AUTOBIND_LEN];
+
+#define NR_CASES 5
+int sndbuf_setsockopt[NR_CASES] = {-1, 0, 8192, INT_MAX / 2, INT_MAX};
+int sndbuf_getsockopt[NR_CASES] = {-1, -1, -1, -1, -1};
+int sndbuf_getsockopt_expected[NR_CASES];
+
+static inline int cmpname(struct unix_sock *unix_sk)
+{
+       int i;
+
+       for (i = 0; i < AUTOBIND_LEN; i++) {
+               if (unix_sk->addr->name->sun_path[i] != sun_path[i])
+                       return -1;
+       }
+
+       return 0;
+}
+
+SEC("iter/unix")
+int change_sndbuf(struct bpf_iter__unix *ctx)
+{
+       struct unix_sock *unix_sk = ctx->unix_sk;
+       int i, err;
+
+       if (!unix_sk || !unix_sk->addr)
+               return 0;
+
+       if (unix_sk->addr->name->sun_path[0])
+               return 0;
+
+       if (cmpname(unix_sk))
+               return 0;
+
+       for (i = 0; i < NR_CASES; i++) {
+               err = bpf_setsockopt(unix_sk, SOL_SOCKET, SO_SNDBUF,
+                                    &sndbuf_setsockopt[i],
+                                    sizeof(sndbuf_setsockopt[i]));
+               if (err)
+                       break;
+
+               err = bpf_getsockopt(unix_sk, SOL_SOCKET, SO_SNDBUF,
+                                    &sndbuf_getsockopt[i],
+                                    sizeof(sndbuf_getsockopt[i]));
+               if (err)
+                       break;
+       }
+
+       return 0;
+}
+
+char _license[] SEC("license") = "GPL";
index c86b93f..d227412 100644 (file)
@@ -2,6 +2,7 @@
 /* Copyright (c) 2020 Facebook */
 #include "bpf_iter.h"
 #include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
 
 char _license[] SEC("license") = "GPL";
 
@@ -23,3 +24,56 @@ int dump_task(struct bpf_iter__task *ctx)
        BPF_SEQ_PRINTF(seq, "%8d %8d\n", task->tgid, task->pid);
        return 0;
 }
+
+int num_expected_failure_copy_from_user_task = 0;
+int num_success_copy_from_user_task = 0;
+
+SEC("iter.s/task")
+int dump_task_sleepable(struct bpf_iter__task *ctx)
+{
+       struct seq_file *seq = ctx->meta->seq;
+       struct task_struct *task = ctx->task;
+       static const char info[] = "    === END ===";
+       struct pt_regs *regs;
+       void *ptr;
+       uint32_t user_data = 0;
+       int ret;
+
+       if (task == (void *)0) {
+               BPF_SEQ_PRINTF(seq, "%s\n", info);
+               return 0;
+       }
+
+       /* Read an invalid pointer and ensure we get an error */
+       ptr = NULL;
+       ret = bpf_copy_from_user_task(&user_data, sizeof(uint32_t), ptr, task, 0);
+       if (ret) {
+               ++num_expected_failure_copy_from_user_task;
+       } else {
+               BPF_SEQ_PRINTF(seq, "%s\n", info);
+               return 0;
+       }
+
+       /* Try to read the contents of the task's instruction pointer from the
+        * remote task's address space.
+        */
+       regs = (struct pt_regs *)bpf_task_pt_regs(task);
+       if (regs == (void *)0) {
+               BPF_SEQ_PRINTF(seq, "%s\n", info);
+               return 0;
+       }
+       ptr = (void *)PT_REGS_IP(regs);
+
+       ret = bpf_copy_from_user_task(&user_data, sizeof(uint32_t), ptr, task, 0);
+       if (ret) {
+               BPF_SEQ_PRINTF(seq, "%s\n", info);
+               return 0;
+       }
+       ++num_success_copy_from_user_task;
+
+       if (ctx->meta->seq_num == 0)
+               BPF_SEQ_PRINTF(seq, "    tgid      gid     data\n");
+
+       BPF_SEQ_PRINTF(seq, "%8d %8d %8d\n", task->tgid, task->pid, user_data);
+       return 0;
+}
index c21e3f5..e6aefae 100644 (file)
@@ -63,7 +63,7 @@ int dump_unix(struct bpf_iter__unix *ctx)
                        BPF_SEQ_PRINTF(seq, " @");
 
                        for (i = 1; i < len; i++) {
-                               /* unix_mkname() tests this upper bound. */
+                               /* unix_validate_addr() tests this upper bound. */
                                if (i >= sizeof(struct sockaddr_un))
                                        break;
 
index 12349e4..e085652 100644 (file)
@@ -3,6 +3,7 @@
 
 #include "vmlinux.h"
 #include <bpf/bpf_helpers.h>
+#include "bpf_misc.h"
 
 char _license[] SEC("license") = "GPL";
 
@@ -53,7 +54,7 @@ static int nested_callback1(__u32 index, void *data)
        return 0;
 }
 
-SEC("fentry/__x64_sys_nanosleep")
+SEC("fentry/" SYS_PREFIX "sys_nanosleep")
 int test_prog(void *ctx)
 {
        struct callback_ctx data = {};
@@ -71,7 +72,7 @@ int test_prog(void *ctx)
        return 0;
 }
 
-SEC("fentry/__x64_sys_nanosleep")
+SEC("fentry/" SYS_PREFIX "sys_nanosleep")
 int prog_null_ctx(void *ctx)
 {
        if (bpf_get_current_pid_tgid() >> 32 != pid)
@@ -82,7 +83,7 @@ int prog_null_ctx(void *ctx)
        return 0;
 }
 
-SEC("fentry/__x64_sys_nanosleep")
+SEC("fentry/" SYS_PREFIX "sys_nanosleep")
 int prog_invalid_flags(void *ctx)
 {
        struct callback_ctx data = {};
@@ -95,7 +96,7 @@ int prog_invalid_flags(void *ctx)
        return 0;
 }
 
-SEC("fentry/__x64_sys_nanosleep")
+SEC("fentry/" SYS_PREFIX "sys_nanosleep")
 int prog_nested_calls(void *ctx)
 {
        struct callback_ctx data = {};
index 9dafdc2..4ce76eb 100644 (file)
@@ -3,6 +3,7 @@
 
 #include "vmlinux.h"
 #include <bpf/bpf_helpers.h>
+#include "bpf_misc.h"
 
 char _license[] SEC("license") = "GPL";
 
@@ -14,7 +15,7 @@ static int empty_callback(__u32 index, void *data)
        return 0;
 }
 
-SEC("fentry/__x64_sys_getpgid")
+SEC("fentry/" SYS_PREFIX "sys_getpgid")
 int benchmark(void *ctx)
 {
        for (int i = 0; i < 1000; i++) {
diff --git a/tools/testing/selftests/bpf/progs/bpf_misc.h b/tools/testing/selftests/bpf/progs/bpf_misc.h
new file mode 100644 (file)
index 0000000..5bb11fe
--- /dev/null
@@ -0,0 +1,19 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __BPF_MISC_H__
+#define __BPF_MISC_H__
+
+#if defined(__TARGET_ARCH_x86)
+#define SYSCALL_WRAPPER 1
+#define SYS_PREFIX "__x64_"
+#elif defined(__TARGET_ARCH_s390)
+#define SYSCALL_WRAPPER 1
+#define SYS_PREFIX "__s390x_"
+#elif defined(__TARGET_ARCH_arm64)
+#define SYSCALL_WRAPPER 1
+#define SYS_PREFIX "__arm64_"
+#else
+#define SYSCALL_WRAPPER 0
+#define SYS_PREFIX "__se_"
+#endif
+
+#endif
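A sketch of how the programs below consume this header: the arch-specific syscall-wrapper prefix is pasted into the section name at compile time, so the same source attaches on x86, s390 and arm64 alike (on_getpgid is an illustrative name):

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include "bpf_misc.h"

/* expands to "fentry/__x64_sys_getpgid" when built for x86 */
SEC("fentry/" SYS_PREFIX "sys_getpgid")
int on_getpgid(void *ctx)
{
        return 0;
}

char _license[] SEC("license") = "GPL";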
diff --git a/tools/testing/selftests/bpf/progs/bpf_mod_race.c b/tools/testing/selftests/bpf/progs/bpf_mod_race.c
new file mode 100644 (file)
index 0000000..82a5c6c
--- /dev/null
@@ -0,0 +1,100 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <vmlinux.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+
+const volatile struct {
+       /* thread to activate trace programs for */
+       pid_t tgid;
+       /* return error from __init function */
+       int inject_error;
+       /* uffd monitored range start address */
+       void *fault_addr;
+} bpf_mod_race_config = { -1 };
+
+int bpf_blocking = 0;
+int res_try_get_module = -1;
+
+static __always_inline bool check_thread_id(void)
+{
+       struct task_struct *task = bpf_get_current_task_btf();
+
+       return task->tgid == bpf_mod_race_config.tgid;
+}
+
+/* The trace of execution is something like this:
+ *
+ * finit_module()
+ *   load_module()
+ *     prepare_coming_module()
+ *       notifier_call(MODULE_STATE_COMING)
+ *         btf_parse_module()
+ *         btf_alloc_id()              // Visible to userspace at this point
+ *         list_add(btf_mod->list, &btf_modules)
+ *     do_init_module()
+ *       freeinit = kmalloc()
+ *       ret = mod->init()
+ *         bpf_prog_widen_race()
+ *           bpf_copy_from_user()
+ *             ...<sleep>...
+ *       if (ret < 0)
+ *         ...
+ *         free_module()
+ * return ret
+ *
+ * At this point, the module loading thread is blocked, and we now load the program:
+ *
+ * bpf_check
+ *   add_kfunc_call/check_pseudo_btf_id
+ *     btf_try_get_module
+ *       try_get_module_live == false
+ *     return -ENXIO
+ *
+ * Without the fix (try_get_module_live in btf_try_get_module):
+ *
+ * bpf_check
+ *   add_kfunc_call/check_pseudo_btf_id
+ *     btf_try_get_module
+ *       try_get_module == true
+ *     <store module reference in btf_kfunc_tab or used_btf array>
+ *   ...
+ * return fd
+ *
+ * Now, if we inject an error in the blocked program, our module will be freed
+ * (going straight from MODULE_STATE_COMING to MODULE_STATE_GOING).
+ * Later, when the bpf program is freed, it will try to module_put the
+ * already freed module. This is why try_get_module_live returns false if
+ * mod->state is not MODULE_STATE_LIVE.
+ */
+
+SEC("fmod_ret.s/bpf_fentry_test1")
+int BPF_PROG(widen_race, int a, int ret)
+{
+       char dst;
+
+       if (!check_thread_id())
+               return 0;
+       /* Indicate that we will attempt to block */
+       bpf_blocking = 1;
+       bpf_copy_from_user(&dst, 1, bpf_mod_race_config.fault_addr);
+       return bpf_mod_race_config.inject_error;
+}
+
+SEC("fexit/do_init_module")
+int BPF_PROG(fexit_init_module, struct module *mod, int ret)
+{
+       if (!check_thread_id())
+               return 0;
+       /* Indicate that we finished blocking */
+       bpf_blocking = 2;
+       return 0;
+}
+
+SEC("fexit/btf_try_get_module")
+int BPF_PROG(fexit_module_get, const struct btf *btf, struct module *mod)
+{
+       res_try_get_module = !!mod;
+       return 0;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/bpf_syscall_macro.c b/tools/testing/selftests/bpf/progs/bpf_syscall_macro.c
new file mode 100644 (file)
index 0000000..05838ed
--- /dev/null
@@ -0,0 +1,84 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright 2022 Sony Group Corporation */
+#include <vmlinux.h>
+
+#include <bpf/bpf_core_read.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+#include "bpf_misc.h"
+
+int arg1 = 0;
+unsigned long arg2 = 0;
+unsigned long arg3 = 0;
+unsigned long arg4_cx = 0;
+unsigned long arg4 = 0;
+unsigned long arg5 = 0;
+
+int arg1_core = 0;
+unsigned long arg2_core = 0;
+unsigned long arg3_core = 0;
+unsigned long arg4_core_cx = 0;
+unsigned long arg4_core = 0;
+unsigned long arg5_core = 0;
+
+int option_syscall = 0;
+unsigned long arg2_syscall = 0;
+unsigned long arg3_syscall = 0;
+unsigned long arg4_syscall = 0;
+unsigned long arg5_syscall = 0;
+
+const volatile pid_t filter_pid = 0;
+
+SEC("kprobe/" SYS_PREFIX "sys_prctl")
+int BPF_KPROBE(handle_sys_prctl)
+{
+       struct pt_regs *real_regs;
+       pid_t pid = bpf_get_current_pid_tgid() >> 32;
+       unsigned long tmp = 0;
+
+       if (pid != filter_pid)
+               return 0;
+
+       real_regs = PT_REGS_SYSCALL_REGS(ctx);
+
+       /* test for PT_REGS_PARM */
+
+#if !defined(bpf_target_arm64) && !defined(bpf_target_s390)
+       bpf_probe_read_kernel(&tmp, sizeof(tmp), &PT_REGS_PARM1_SYSCALL(real_regs));
+#endif
+       arg1 = tmp;
+       bpf_probe_read_kernel(&arg2, sizeof(arg2), &PT_REGS_PARM2_SYSCALL(real_regs));
+       bpf_probe_read_kernel(&arg3, sizeof(arg3), &PT_REGS_PARM3_SYSCALL(real_regs));
+       bpf_probe_read_kernel(&arg4_cx, sizeof(arg4_cx), &PT_REGS_PARM4(real_regs));
+       bpf_probe_read_kernel(&arg4, sizeof(arg4), &PT_REGS_PARM4_SYSCALL(real_regs));
+       bpf_probe_read_kernel(&arg5, sizeof(arg5), &PT_REGS_PARM5_SYSCALL(real_regs));
+
+       /* test for the CORE variant of PT_REGS_PARM */
+       arg1_core = PT_REGS_PARM1_CORE_SYSCALL(real_regs);
+       arg2_core = PT_REGS_PARM2_CORE_SYSCALL(real_regs);
+       arg3_core = PT_REGS_PARM3_CORE_SYSCALL(real_regs);
+       arg4_core_cx = PT_REGS_PARM4_CORE(real_regs);
+       arg4_core = PT_REGS_PARM4_CORE_SYSCALL(real_regs);
+       arg5_core = PT_REGS_PARM5_CORE_SYSCALL(real_regs);
+
+       return 0;
+}
+
+SEC("kprobe/" SYS_PREFIX "sys_prctl")
+int BPF_KPROBE_SYSCALL(prctl_enter, int option, unsigned long arg2,
+                      unsigned long arg3, unsigned long arg4, unsigned long arg5)
+{
+       pid_t pid = bpf_get_current_pid_tgid() >> 32;
+
+       if (pid != filter_pid)
+               return 0;
+
+       option_syscall = option;
+       arg2_syscall = arg2;
+       arg3_syscall = arg3;
+       arg4_syscall = arg4;
+       arg5_syscall = arg5;
+       return 0;
+}
+
+char _license[] SEC("license") = "GPL";
index e0f4260..1c1289b 100644 (file)
@@ -5,6 +5,8 @@
 #define AF_INET                        2
 #define AF_INET6               10
 
+#define SOL_SOCKET             1
+#define SO_SNDBUF              7
 #define __SO_ACCEPTCON         (1 << 16)
 
 #define SOL_TCP                        6
diff --git a/tools/testing/selftests/bpf/progs/btf_type_tag_user.c b/tools/testing/selftests/bpf/progs/btf_type_tag_user.c
new file mode 100644 (file)
index 0000000..5523f77
--- /dev/null
@@ -0,0 +1,40 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2022 Facebook */
+#include "vmlinux.h"
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+
+struct bpf_testmod_btf_type_tag_1 {
+       int a;
+};
+
+struct bpf_testmod_btf_type_tag_2 {
+       struct bpf_testmod_btf_type_tag_1 *p;
+};
+
+int g;
+
+SEC("fentry/bpf_testmod_test_btf_type_tag_user_1")
+int BPF_PROG(test_user1, struct bpf_testmod_btf_type_tag_1 *arg)
+{
+       g = arg->a;
+       return 0;
+}
+
+SEC("fentry/bpf_testmod_test_btf_type_tag_user_2")
+int BPF_PROG(test_user2, struct bpf_testmod_btf_type_tag_2 *arg)
+{
+       g = arg->p->a;
+       return 0;
+}
+
+/* int __sys_getsockname(int fd, struct sockaddr __user *usockaddr,
+ *                       int __user *usockaddr_len);
+ */
+SEC("fentry/__sys_getsockname")
+int BPF_PROG(test_sys_getsockname, int fd, struct sockaddr *usockaddr,
+            int *usockaddr_len)
+{
+       g = usockaddr->sa_family;
+       return 0;
+}
diff --git a/tools/testing/selftests/bpf/progs/cgroup_getset_retval_getsockopt.c b/tools/testing/selftests/bpf/progs/cgroup_getset_retval_getsockopt.c
new file mode 100644 (file)
index 0000000..b2a409e
--- /dev/null
@@ -0,0 +1,45 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+/*
+ * Copyright 2021 Google LLC.
+ */
+
+#include <errno.h>
+#include <linux/bpf.h>
+#include <bpf/bpf_helpers.h>
+
+__u32 invocations = 0;
+__u32 assertion_error = 0;
+__u32 retval_value = 0;
+__u32 ctx_retval_value = 0;
+
+SEC("cgroup/getsockopt")
+int get_retval(struct bpf_sockopt *ctx)
+{
+       retval_value = bpf_get_retval();
+       ctx_retval_value = ctx->retval;
+       __sync_fetch_and_add(&invocations, 1);
+
+       return 1;
+}
+
+SEC("cgroup/getsockopt")
+int set_eisconn(struct bpf_sockopt *ctx)
+{
+       __sync_fetch_and_add(&invocations, 1);
+
+       if (bpf_set_retval(-EISCONN))
+               assertion_error = 1;
+
+       return 1;
+}
+
+SEC("cgroup/getsockopt")
+int clear_retval(struct bpf_sockopt *ctx)
+{
+       __sync_fetch_and_add(&invocations, 1);
+
+       ctx->retval = 0;
+
+       return 1;
+}
diff --git a/tools/testing/selftests/bpf/progs/cgroup_getset_retval_setsockopt.c b/tools/testing/selftests/bpf/progs/cgroup_getset_retval_setsockopt.c
new file mode 100644 (file)
index 0000000..d6e5903
--- /dev/null
@@ -0,0 +1,52 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+/*
+ * Copyright 2021 Google LLC.
+ */
+
+#include <errno.h>
+#include <linux/bpf.h>
+#include <bpf/bpf_helpers.h>
+
+__u32 invocations = 0;
+__u32 assertion_error = 0;
+__u32 retval_value = 0;
+
+SEC("cgroup/setsockopt")
+int get_retval(struct bpf_sockopt *ctx)
+{
+       retval_value = bpf_get_retval();
+       __sync_fetch_and_add(&invocations, 1);
+
+       return 1;
+}
+
+SEC("cgroup/setsockopt")
+int set_eunatch(struct bpf_sockopt *ctx)
+{
+       __sync_fetch_and_add(&invocations, 1);
+
+       if (bpf_set_retval(-EUNATCH))
+               assertion_error = 1;
+
+       return 0;
+}
+
+SEC("cgroup/setsockopt")
+int set_eisconn(struct bpf_sockopt *ctx)
+{
+       __sync_fetch_and_add(&invocations, 1);
+
+       if (bpf_set_retval(-EISCONN))
+               assertion_error = 1;
+
+       return 0;
+}
+
+SEC("cgroup/setsockopt")
+int legacy_eperm(struct bpf_sockopt *ctx)
+{
+       __sync_fetch_and_add(&invocations, 1);
+
+       return 0;
+}
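Both files above exercise the new bpf_get_retval()/bpf_set_retval() helpers. As these tests read, returning 0 from a cgroup/setsockopt program rejects the syscall, and bpf_set_retval() chooses the error the caller sees instead of the historical blanket -EPERM that legacy_eperm still relies on. A hedged minimal sketch (reject_eunatch is an illustrative name):

#include <errno.h>
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

SEC("cgroup/setsockopt")
int reject_eunatch(struct bpf_sockopt *ctx)
{
        /* the caller's setsockopt() should fail with EUNATCH */
        if (bpf_set_retval(-EUNATCH))
                return 0; /* could not set retval; reject anyway */

        return 0; /* returning 0 rejects the syscall */
}

char _license[] SEC("license") = "GPL";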
index 13499cc..2715fe2 100644 (file)
@@ -101,4 +101,20 @@ int balancer_ingress(struct __sk_buff *ctx)
        return 0;
 }
 
+typedef int (*func_proto_typedef___match)(long);
+typedef int (*func_proto_typedef___doesnt_match)(char *);
+typedef int (*func_proto_typedef_nested1)(func_proto_typedef___match);
+
+int proto_out[3];
+
+SEC("raw_tracepoint/sys_enter")
+int core_relo_proto(void *ctx)
+{
+       proto_out[0] = bpf_core_type_exists(func_proto_typedef___match);
+       proto_out[1] = bpf_core_type_exists(func_proto_typedef___doesnt_match);
+       proto_out[2] = bpf_core_type_exists(func_proto_typedef_nested1);
+
+       return 0;
+}
+
 char LICENSE[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/core_kern_overflow.c b/tools/testing/selftests/bpf/progs/core_kern_overflow.c
new file mode 100644 (file)
index 0000000..f0d5652
--- /dev/null
@@ -0,0 +1,22 @@
+// SPDX-License-Identifier: GPL-2.0
+#include "vmlinux.h"
+
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+#include <bpf/bpf_core_read.h>
+
+typedef int (*func_proto_typedef)(long);
+typedef int (*func_proto_typedef_nested1)(func_proto_typedef);
+typedef int (*func_proto_typedef_nested2)(func_proto_typedef_nested1);
+
+int proto_out;
+
+SEC("raw_tracepoint/sys_enter")
+int core_relo_proto(void *ctx)
+{
+       proto_out = bpf_core_type_exists(func_proto_typedef_nested2);
+
+       return 0;
+}
+
+char LICENSE[] SEC("license") = "GPL";
index bca92c9..106dc75 100644 (file)
@@ -3,6 +3,7 @@
 #include "vmlinux.h"
 #include <bpf/bpf_helpers.h>
 #include <bpf/bpf_tracing.h>
+#include "bpf_misc.h"
 
 char LICENSE[] SEC("license") = "GPL";
 
@@ -10,8 +11,8 @@ int pid = 0;
 int fentry_cnt = 0;
 int fexit_cnt = 0;
 
-SEC("fentry/__x64_sys_nanosleep")
-int BPF_PROG(nanosleep_fentry, const struct pt_regs *regs)
+SEC("fentry/" SYS_PREFIX "sys_nanosleep")
+int nanosleep_fentry(void *ctx)
 {
        if (bpf_get_current_pid_tgid() >> 32 != pid)
                return 0;
@@ -20,8 +21,8 @@ int BPF_PROG(nanosleep_fentry, const struct pt_regs *regs)
        return 0;
 }
 
-SEC("fexit/__x64_sys_nanosleep")
-int BPF_PROG(nanosleep_fexit, const struct pt_regs *regs, int ret)
+SEC("fexit/" SYS_PREFIX "sys_nanosleep")
+int nanosleep_fexit(void *ctx)
 {
        if (bpf_get_current_pid_tgid() >> 32 != pid)
                return 0;
index 68a5a9d..7e94412 100644 (file)
@@ -7,12 +7,12 @@
 #include <bpf/bpf_endian.h>
 #include <bpf/bpf_helpers.h>
 
-struct bpf_map_def SEC("maps") sock_map = {
-       .type = BPF_MAP_TYPE_SOCKMAP,
-       .key_size = sizeof(int),
-       .value_size = sizeof(int),
-       .max_entries = 2,
-};
+struct {
+       __uint(type, BPF_MAP_TYPE_SOCKMAP);
+       __type(key, int);
+       __type(value, int);
+       __uint(max_entries, 2);
+} sock_map SEC(".maps");
 
 SEC("freplace/cls_redirect")
 int freplace_cls_redirect_test(struct __sk_buff *skb)
diff --git a/tools/testing/selftests/bpf/progs/kfunc_call_race.c b/tools/testing/selftests/bpf/progs/kfunc_call_race.c
new file mode 100644 (file)
index 0000000..4e8fed7
--- /dev/null
@@ -0,0 +1,14 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <vmlinux.h>
+#include <bpf/bpf_helpers.h>
+
+extern void bpf_testmod_test_mod_kfunc(int i) __ksym;
+
+SEC("tc")
+int kfunc_call_fail(struct __sk_buff *ctx)
+{
+       bpf_testmod_test_mod_kfunc(0);
+       return 0;
+}
+
+char _license[] SEC("license") = "GPL";
index 8a8cf59..5aecbb9 100644 (file)
@@ -1,13 +1,20 @@
 // SPDX-License-Identifier: GPL-2.0
 /* Copyright (c) 2021 Facebook */
-#include <linux/bpf.h>
+#include <vmlinux.h>
 #include <bpf/bpf_helpers.h>
-#include "bpf_tcp_helpers.h"
 
 extern int bpf_kfunc_call_test2(struct sock *sk, __u32 a, __u32 b) __ksym;
 extern __u64 bpf_kfunc_call_test1(struct sock *sk, __u32 a, __u64 b,
                                  __u32 c, __u64 d) __ksym;
 
+extern struct prog_test_ref_kfunc *bpf_kfunc_call_test_acquire(unsigned long *sp) __ksym;
+extern void bpf_kfunc_call_test_release(struct prog_test_ref_kfunc *p) __ksym;
+extern void bpf_kfunc_call_test_pass_ctx(struct __sk_buff *skb) __ksym;
+extern void bpf_kfunc_call_test_pass1(struct prog_test_pass1 *p) __ksym;
+extern void bpf_kfunc_call_test_pass2(struct prog_test_pass2 *p) __ksym;
+extern void bpf_kfunc_call_test_mem_len_pass1(void *mem, int len) __ksym;
+extern void bpf_kfunc_call_test_mem_len_fail2(__u64 *mem, int len) __ksym;
+
 SEC("tc")
 int kfunc_call_test2(struct __sk_buff *skb)
 {
@@ -44,4 +51,45 @@ int kfunc_call_test1(struct __sk_buff *skb)
        return ret;
 }
 
+SEC("tc")
+int kfunc_call_test_ref_btf_id(struct __sk_buff *skb)
+{
+       struct prog_test_ref_kfunc *pt;
+       unsigned long s = 0;
+       int ret = 0;
+
+       pt = bpf_kfunc_call_test_acquire(&s);
+       if (pt) {
+               if (pt->a != 42 || pt->b != 108)
+                       ret = -1;
+               bpf_kfunc_call_test_release(pt);
+       }
+       return ret;
+}
+
+SEC("tc")
+int kfunc_call_test_pass(struct __sk_buff *skb)
+{
+       struct prog_test_pass1 p1 = {};
+       struct prog_test_pass2 p2 = {};
+       short a = 0;
+       __u64 b = 0;
+       long c = 0;
+       char d = 0;
+       int e = 0;
+
+       bpf_kfunc_call_test_pass_ctx(skb);
+       bpf_kfunc_call_test_pass1(&p1);
+       bpf_kfunc_call_test_pass2(&p2);
+
+       bpf_kfunc_call_test_mem_len_pass1(&a, sizeof(a));
+       bpf_kfunc_call_test_mem_len_pass1(&b, sizeof(b));
+       bpf_kfunc_call_test_mem_len_pass1(&c, sizeof(c));
+       bpf_kfunc_call_test_mem_len_pass1(&d, sizeof(d));
+       bpf_kfunc_call_test_mem_len_pass1(&e, sizeof(e));
+       bpf_kfunc_call_test_mem_len_fail2(&b, -1);
+
+       return 0;
+}
+
 char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/ksym_race.c b/tools/testing/selftests/bpf/progs/ksym_race.c
new file mode 100644 (file)
index 0000000..def97f2
--- /dev/null
@@ -0,0 +1,13 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <vmlinux.h>
+#include <bpf/bpf_helpers.h>
+
+extern int bpf_testmod_ksym_percpu __ksym;
+
+SEC("tc")
+int ksym_fail(struct __sk_buff *ctx)
+{
+       return *(int *)bpf_this_cpu_ptr(&bpf_testmod_ksym_percpu);
+}
+
+char _license[] SEC("license") = "GPL";
index e5ab483..45204fe 100644 (file)
@@ -4,6 +4,7 @@
 #include <linux/bpf.h>
 #include <stdint.h>
 #include <bpf/bpf_helpers.h>
+#include "bpf_misc.h"
 
 char _license[] SEC("license") = "GPL";
 
@@ -18,7 +19,7 @@ const volatile int batch_cnt = 0;
 long sample_val = 42;
 long dropped __attribute__((aligned(128))) = 0;
 
-SEC("fentry/__x64_sys_getpgid")
+SEC("fentry/" SYS_PREFIX "sys_getpgid")
 int bench_perfbuf(void *ctx)
 {
        __u64 *sample;
index 123607d..6a46849 100644 (file)
@@ -4,6 +4,7 @@
 #include <linux/bpf.h>
 #include <stdint.h>
 #include <bpf/bpf_helpers.h>
+#include "bpf_misc.h"
 
 char _license[] SEC("license") = "GPL";
 
@@ -30,7 +31,7 @@ static __always_inline long get_flags()
        return sz >= wakeup_data_size ? BPF_RB_FORCE_WAKEUP : BPF_RB_NO_WAKEUP;
 }
 
-SEC("fentry/__x64_sys_getpgid")
+SEC("fentry/" SYS_PREFIX "sys_getpgid")
 int bench_ringbuf(void *ctx)
 {
        long *sample, flags;
index 1612a32..495990d 100644 (file)
@@ -2,19 +2,19 @@
 #include <linux/bpf.h>
 #include <bpf/bpf_helpers.h>
 
-struct bpf_map_def SEC("maps") htab = {
-       .type = BPF_MAP_TYPE_HASH,
-       .key_size = sizeof(__u32),
-       .value_size = sizeof(long),
-       .max_entries = 2,
-};
+struct {
+       __uint(type, BPF_MAP_TYPE_HASH);
+       __type(key, __u32);
+       __type(value, long);
+       __uint(max_entries, 2);
+} htab SEC(".maps");
 
-struct bpf_map_def SEC("maps") array = {
-       .type = BPF_MAP_TYPE_ARRAY,
-       .key_size = sizeof(__u32),
-       .value_size = sizeof(long),
-       .max_entries = 2,
-};
+struct {
+       __uint(type, BPF_MAP_TYPE_ARRAY);
+       __type(key, __u32);
+       __type(value, long);
+       __uint(max_entries, 2);
+} array SEC(".maps");
 
 /* Sample program which should always load for testing control paths. */
 SEC(".text") int func()
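Many hunks in this patch repeat the conversion above: legacy struct bpf_map_def SEC("maps") declarations become BTF-defined maps in SEC(".maps"). __type() records the real key/value types in BTF (so tools such as bpftool can pretty-print entries), while __uint(key_size, ...) keeps only a raw size with no type info, as the btf_map conversion further down does. A sketch of both spellings; the map names are illustrative:

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

/* typed: key/value types land in BTF */
struct {
        __uint(type, BPF_MAP_TYPE_HASH);
        __type(key, __u32);
        __type(value, long);
        __uint(max_entries, 2);
} typed_map SEC(".maps");

/* size-only: same layout, but no key/value type info in BTF */
struct {
        __uint(type, BPF_MAP_TYPE_ARRAY);
        __uint(key_size, sizeof(__u32));
        __uint(value_size, sizeof(long));
        __uint(max_entries, 2);
} sized_map SEC(".maps");

Lookups are unchanged either way, e.g. bpf_map_lookup_elem(&typed_map, &key).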
index 95d5b94..c9abfe3 100644 (file)
@@ -7,8 +7,6 @@ int bpf_prog1(struct __sk_buff *skb)
 {
        void *data_end = (void *)(long) skb->data_end;
        void *data = (void *)(long) skb->data;
-       __u32 lport = skb->local_port;
-       __u32 rport = skb->remote_port;
        __u8 *d = data;
        int err;
 
index 79c8139..c8d8100 100644 (file)
@@ -72,18 +72,19 @@ int _getsockopt(struct bpf_sockopt *ctx)
                 * reasons.
                 */
 
-               if (optval + sizeof(struct tcp_zerocopy_receive) > optval_end)
-                       return 0; /* EPERM, bounds check */
+               /* Check that optval contains an address (__u64) */
+               if (optval + sizeof(__u64) > optval_end)
+                       return 0; /* bounds check */
 
                if (((struct tcp_zerocopy_receive *)optval)->address != 0)
-                       return 0; /* EPERM, unexpected data */
+                       return 0; /* unexpected data */
 
                return 1;
        }
 
        if (ctx->level == SOL_IP && ctx->optname == IP_FREEBIND) {
                if (optval + 1 > optval_end)
-                       return 0; /* EPERM, bounds check */
+                       return 0; /* bounds check */
 
                ctx->retval = 0; /* Reset system call return value to zero */
 
@@ -96,24 +97,24 @@ int _getsockopt(struct bpf_sockopt *ctx)
                 * bytes of data.
                 */
                if (optval_end - optval != page_size)
-                       return 0; /* EPERM, unexpected data size */
+                       return 0; /* unexpected data size */
 
                return 1;
        }
 
        if (ctx->level != SOL_CUSTOM)
-               return 0; /* EPERM, deny everything except custom level */
+               return 0; /* deny everything except custom level */
 
        if (optval + 1 > optval_end)
-               return 0; /* EPERM, bounds check */
+               return 0; /* bounds check */
 
        storage = bpf_sk_storage_get(&socket_storage_map, ctx->sk, 0,
                                     BPF_SK_STORAGE_GET_F_CREATE);
        if (!storage)
-               return 0; /* EPERM, couldn't get sk storage */
+               return 0; /* couldn't get sk storage */
 
        if (!ctx->retval)
-               return 0; /* EPERM, kernel should not have handled
+               return 0; /* kernel should not have handled
                           * SOL_CUSTOM, something is wrong!
                           */
        ctx->retval = 0; /* Reset system call return value to zero */
@@ -152,7 +153,7 @@ int _setsockopt(struct bpf_sockopt *ctx)
                /* Overwrite SO_SNDBUF value */
 
                if (optval + sizeof(__u32) > optval_end)
-                       return 0; /* EPERM, bounds check */
+                       return 0; /* bounds check */
 
                *(__u32 *)optval = 0x55AA;
                ctx->optlen = 4;
@@ -164,7 +165,7 @@ int _setsockopt(struct bpf_sockopt *ctx)
                /* Always use cubic */
 
                if (optval + 5 > optval_end)
-                       return 0; /* EPERM, bounds check */
+                       return 0; /* bounds check */
 
                memcpy(optval, "cubic", 5);
                ctx->optlen = 5;
@@ -175,10 +176,10 @@ int _setsockopt(struct bpf_sockopt *ctx)
        if (ctx->level == SOL_IP && ctx->optname == IP_FREEBIND) {
                /* Original optlen is larger than PAGE_SIZE. */
                if (ctx->optlen != page_size * 2)
-                       return 0; /* EPERM, unexpected data size */
+                       return 0; /* unexpected data size */
 
                if (optval + 1 > optval_end)
-                       return 0; /* EPERM, bounds check */
+                       return 0; /* bounds check */
 
                /* Make sure we can trim the buffer. */
                optval[0] = 0;
@@ -189,21 +190,21 @@ int _setsockopt(struct bpf_sockopt *ctx)
                 * bytes of data.
                 */
                if (optval_end - optval != page_size)
-                       return 0; /* EPERM, unexpected data size */
+                       return 0; /* unexpected data size */
 
                return 1;
        }
 
        if (ctx->level != SOL_CUSTOM)
-               return 0; /* EPERM, deny everything except custom level */
+               return 0; /* deny everything except custom level */
 
        if (optval + 1 > optval_end)
-               return 0; /* EPERM, bounds check */
+               return 0; /* bounds check */
 
        storage = bpf_sk_storage_get(&socket_storage_map, ctx->sk, 0,
                                     BPF_SK_STORAGE_GET_F_CREATE);
        if (!storage)
-               return 0; /* EPERM, couldn't get sk storage */
+               return 0; /* couldn't get sk storage */
 
        storage->val = optval[0];
        ctx->optlen = -1; /* BPF has consumed this option, don't call kernel
diff --git a/tools/testing/selftests/bpf/progs/test_bpf_nf.c b/tools/testing/selftests/bpf/progs/test_bpf_nf.c
new file mode 100644 (file)
index 0000000..f00a973
--- /dev/null
@@ -0,0 +1,118 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <vmlinux.h>
+#include <bpf/bpf_helpers.h>
+
+#define EAFNOSUPPORT 97
+#define EPROTO 71
+#define ENONET 64
+#define EINVAL 22
+#define ENOENT 2
+
+int test_einval_bpf_tuple = 0;
+int test_einval_reserved = 0;
+int test_einval_netns_id = 0;
+int test_einval_len_opts = 0;
+int test_eproto_l4proto = 0;
+int test_enonet_netns_id = 0;
+int test_enoent_lookup = 0;
+int test_eafnosupport = 0;
+
+struct nf_conn;
+
+struct bpf_ct_opts___local {
+       s32 netns_id;
+       s32 error;
+       u8 l4proto;
+       u8 reserved[3];
+} __attribute__((preserve_access_index));
+
+struct nf_conn *bpf_xdp_ct_lookup(struct xdp_md *, struct bpf_sock_tuple *, u32,
+                                 struct bpf_ct_opts___local *, u32) __ksym;
+struct nf_conn *bpf_skb_ct_lookup(struct __sk_buff *, struct bpf_sock_tuple *, u32,
+                                 struct bpf_ct_opts___local *, u32) __ksym;
+void bpf_ct_release(struct nf_conn *) __ksym;
+
+static __always_inline void
+nf_ct_test(struct nf_conn *(*func)(void *, struct bpf_sock_tuple *, u32,
+                                  struct bpf_ct_opts___local *, u32),
+          void *ctx)
+{
+       struct bpf_ct_opts___local opts_def = { .l4proto = IPPROTO_TCP, .netns_id = -1 };
+       struct bpf_sock_tuple bpf_tuple;
+       struct nf_conn *ct;
+
+       __builtin_memset(&bpf_tuple, 0, sizeof(bpf_tuple.ipv4));
+
+       ct = func(ctx, NULL, 0, &opts_def, sizeof(opts_def));
+       if (ct)
+               bpf_ct_release(ct);
+       else
+               test_einval_bpf_tuple = opts_def.error;
+
+       opts_def.reserved[0] = 1;
+       ct = func(ctx, &bpf_tuple, sizeof(bpf_tuple.ipv4), &opts_def, sizeof(opts_def));
+       opts_def.reserved[0] = 0;
+       opts_def.l4proto = IPPROTO_TCP;
+       if (ct)
+               bpf_ct_release(ct);
+       else
+               test_einval_reserved = opts_def.error;
+
+       opts_def.netns_id = -2;
+       ct = func(ctx, &bpf_tuple, sizeof(bpf_tuple.ipv4), &opts_def, sizeof(opts_def));
+       opts_def.netns_id = -1;
+       if (ct)
+               bpf_ct_release(ct);
+       else
+               test_einval_netns_id = opts_def.error;
+
+       ct = func(ctx, &bpf_tuple, sizeof(bpf_tuple.ipv4), &opts_def, sizeof(opts_def) - 1);
+       if (ct)
+               bpf_ct_release(ct);
+       else
+               test_einval_len_opts = opts_def.error;
+
+       opts_def.l4proto = IPPROTO_ICMP;
+       ct = func(ctx, &bpf_tuple, sizeof(bpf_tuple.ipv4), &opts_def, sizeof(opts_def));
+       opts_def.l4proto = IPPROTO_TCP;
+       if (ct)
+               bpf_ct_release(ct);
+       else
+               test_eproto_l4proto = opts_def.error;
+
+       opts_def.netns_id = 0xf00f;
+       ct = func(ctx, &bpf_tuple, sizeof(bpf_tuple.ipv4), &opts_def, sizeof(opts_def));
+       opts_def.netns_id = -1;
+       if (ct)
+               bpf_ct_release(ct);
+       else
+               test_enonet_netns_id = opts_def.error;
+
+       ct = func(ctx, &bpf_tuple, sizeof(bpf_tuple.ipv4), &opts_def, sizeof(opts_def));
+       if (ct)
+               bpf_ct_release(ct);
+       else
+               test_enoent_lookup = opts_def.error;
+
+       ct = func(ctx, &bpf_tuple, sizeof(bpf_tuple.ipv4) - 1, &opts_def, sizeof(opts_def));
+       if (ct)
+               bpf_ct_release(ct);
+       else
+               test_eafnosupport = opts_def.error;
+}
+
+SEC("xdp")
+int nf_xdp_ct_test(struct xdp_md *ctx)
+{
+       nf_ct_test((void *)bpf_xdp_ct_lookup, ctx);
+       return 0;
+}
+
+SEC("tc")
+int nf_skb_ct_test(struct __sk_buff *ctx)
+{
+       nf_ct_test((void *)bpf_skb_ct_lookup, ctx);
+       return 0;
+}
+
+char _license[] SEC("license") = "GPL";
index 160ead6..07c94df 100644 (file)
@@ -9,12 +9,15 @@ struct ipv_counts {
        unsigned int v6;
 };
 
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
 struct bpf_map_def SEC("maps") btf_map = {
        .type = BPF_MAP_TYPE_ARRAY,
        .key_size = sizeof(int),
        .value_size = sizeof(struct ipv_counts),
        .max_entries = 4,
 };
+#pragma GCC diagnostic pop
 
 BPF_ANNOTATE_KV_PAIR(btf_map, int, struct ipv_counts);
 
index 1884a5b..762671a 100644 (file)
@@ -9,6 +9,8 @@ struct ipv_counts {
        unsigned int v6;
 };
 
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
 /* just to validate we can handle maps in multiple sections */
 struct bpf_map_def SEC("maps") btf_map_legacy = {
        .type = BPF_MAP_TYPE_ARRAY,
@@ -16,6 +18,7 @@ struct bpf_map_def SEC("maps") btf_map_legacy = {
        .value_size = sizeof(long long),
        .max_entries = 4,
 };
+#pragma GCC diagnostic pop
 
 BPF_ANNOTATE_KV_PAIR(btf_map_legacy, int, struct ipv_counts);
 
index 15e0f99..1dabb88 100644 (file)
@@ -8,12 +8,12 @@ struct ipv_counts {
        unsigned int v6;
 };
 
-struct bpf_map_def SEC("maps") btf_map = {
-       .type = BPF_MAP_TYPE_ARRAY,
-       .key_size = sizeof(int),
-       .value_size = sizeof(struct ipv_counts),
-       .max_entries = 4,
-};
+struct {
+       __uint(type, BPF_MAP_TYPE_ARRAY);
+       __uint(key_size, sizeof(int));
+       __uint(value_size, sizeof(struct ipv_counts));
+       __uint(max_entries, 4);
+} btf_map SEC(".maps");
 
 __attribute__((noinline))
 int test_long_fname_2(void)
index 8812a90..702578a 100644 (file)
@@ -7,20 +7,7 @@
 
 #include <bpf/bpf_helpers.h>
 #include <bpf/bpf_tracing.h>
-
-#if defined(__TARGET_ARCH_x86)
-#define SYSCALL_WRAPPER 1
-#define SYS_PREFIX "__x64_"
-#elif defined(__TARGET_ARCH_s390)
-#define SYSCALL_WRAPPER 1
-#define SYS_PREFIX "__s390x_"
-#elif defined(__TARGET_ARCH_arm64)
-#define SYSCALL_WRAPPER 1
-#define SYS_PREFIX "__arm64_"
-#else
-#define SYSCALL_WRAPPER 0
-#define SYS_PREFIX ""
-#endif
+#include "bpf_misc.h"
 
 static struct sockaddr_in old;
 
index eaa7d9d..5bdc0d3 100644 (file)
@@ -3,6 +3,7 @@
 
 #include <linux/bpf.h>
 #include <bpf/bpf_helpers.h>
+#include "bpf_misc.h"
 
 char _license[] SEC("license") = "GPL";
 
@@ -35,7 +36,7 @@ long prod_pos = 0;
 /* inner state */
 long seq = 0;
 
-SEC("fentry/__x64_sys_getpgid")
+SEC("fentry/" SYS_PREFIX "sys_getpgid")
 int test_ringbuf(void *ctx)
 {
        int cur_pid = bpf_get_current_pid_tgid() >> 32;
index 83b0aaa..bf5b7ca 100644 (file)
@@ -392,6 +392,7 @@ int ctx_narrow_access(struct bpf_sk_lookup *ctx)
 {
        struct bpf_sock *sk;
        int err, family;
+       __u32 val_u32;
        bool v4;
 
        v4 = (ctx->family == AF_INET);
@@ -418,6 +419,11 @@ int ctx_narrow_access(struct bpf_sk_lookup *ctx)
        if (LSW(ctx->remote_port, 0) != SRC_PORT)
                return SK_DROP;
 
+       /* Load from remote_port field with zero padding (backward compatibility) */
+       val_u32 = *(__u32 *)&ctx->remote_port;
+       if (val_u32 != bpf_htonl(bpf_ntohs(SRC_PORT) << 16))
+               return SK_DROP;
+
        /* Narrow loads from local_port field. Expect DST_PORT. */
        if (LSB(ctx->local_port, 0) != ((DST_PORT >> 0) & 0xff) ||
            LSB(ctx->local_port, 1) != ((DST_PORT >> 8) & 0xff) ||
index c304cd5..37aacc6 100644 (file)
 
 #define NUM_CGROUP_LEVELS      4
 
-struct bpf_map_def SEC("maps") cgroup_ids = {
-       .type = BPF_MAP_TYPE_ARRAY,
-       .key_size = sizeof(__u32),
-       .value_size = sizeof(__u64),
-       .max_entries = NUM_CGROUP_LEVELS,
-};
+struct {
+       __uint(type, BPF_MAP_TYPE_ARRAY);
+       __type(key, __u32);
+       __type(value, __u64);
+       __uint(max_entries, NUM_CGROUP_LEVELS);
+} cgroup_ids SEC(".maps");
 
 static __always_inline void log_nth_level(struct __sk_buff *skb, __u32 level)
 {
index 81b57b9..246f1f0 100644 (file)
@@ -12,6 +12,7 @@
 enum bpf_linum_array_idx {
        EGRESS_LINUM_IDX,
        INGRESS_LINUM_IDX,
+       READ_SK_DST_PORT_LINUM_IDX,
        __NR_BPF_LINUM_ARRAY_IDX,
 };
 
@@ -250,4 +251,44 @@ int ingress_read_sock_fields(struct __sk_buff *skb)
        return CG_OK;
 }
 
+static __noinline bool sk_dst_port__load_word(struct bpf_sock *sk)
+{
+       __u32 *word = (__u32 *)&sk->dst_port;
+       return word[0] == bpf_htonl(0xcafe0000);
+}
+
+static __noinline bool sk_dst_port__load_half(struct bpf_sock *sk)
+{
+       __u16 *half = (__u16 *)&sk->dst_port;
+       return half[0] == bpf_htons(0xcafe);
+}
+
+static __noinline bool sk_dst_port__load_byte(struct bpf_sock *sk)
+{
+       __u8 *byte = (__u8 *)&sk->dst_port;
+       return byte[0] == 0xca && byte[1] == 0xfe;
+}
+
+SEC("cgroup_skb/egress")
+int read_sk_dst_port(struct __sk_buff *skb)
+{
+       __u32 linum, linum_idx;
+       struct bpf_sock *sk;
+
+       linum_idx = READ_SK_DST_PORT_LINUM_IDX;
+
+       sk = skb->sk;
+       if (!sk)
+               RET_LOG();
+
+       if (!sk_dst_port__load_word(sk))
+               RET_LOG();
+       if (!sk_dst_port__load_half(sk))
+               RET_LOG();
+       if (!sk_dst_port__load_byte(sk))
+               RET_LOG();
+
+       return CG_OK;
+}
+
 char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/test_sockmap_progs_query.c b/tools/testing/selftests/bpf/progs/test_sockmap_progs_query.c
new file mode 100644 (file)
index 0000000..9d58d61
--- /dev/null
@@ -0,0 +1,24 @@
+// SPDX-License-Identifier: GPL-2.0
+#include "vmlinux.h"
+#include <bpf/bpf_helpers.h>
+
+struct {
+       __uint(type, BPF_MAP_TYPE_SOCKMAP);
+       __uint(max_entries, 1);
+       __type(key, __u32);
+       __type(value, __u64);
+} sock_map SEC(".maps");
+
+SEC("sk_skb")
+int prog_skb_verdict(struct __sk_buff *skb)
+{
+       return SK_PASS;
+}
+
+SEC("sk_msg")
+int prog_skmsg_verdict(struct sk_msg_md *msg)
+{
+       return SK_PASS;
+}
+
+char _license[] SEC("license") = "GPL";
index bf28814..950a70b 100644 (file)
 #define THROTTLE_RATE_BPS (5 * 1000 * 1000)
 
 /* flow_key => last_tstamp timestamp used */
-struct bpf_map_def SEC("maps") flow_map = {
-       .type = BPF_MAP_TYPE_HASH,
-       .key_size = sizeof(uint32_t),
-       .value_size = sizeof(uint64_t),
-       .max_entries = 1,
-};
+struct {
+       __uint(type, BPF_MAP_TYPE_HASH);
+       __type(key, uint32_t);
+       __type(value, uint64_t);
+       __uint(max_entries, 1);
+} flow_map SEC(".maps");
 
 static inline int throttle_flow(struct __sk_buff *skb)
 {
index cd747cd..6edebce 100644 (file)
 #include <bpf/bpf_helpers.h>
 #include <bpf/bpf_endian.h>
 
-struct bpf_map_def SEC("maps") results = {
-       .type = BPF_MAP_TYPE_ARRAY,
-       .key_size = sizeof(__u32),
-       .value_size = sizeof(__u32),
-       .max_entries = 3,
-};
+struct {
+       __uint(type, BPF_MAP_TYPE_ARRAY);
+       __type(key, __u32);
+       __type(value, __u32);
+       __uint(max_entries, 3);
+} results SEC(".maps");
 
 static __always_inline __s64 gen_syncookie(void *data_end, struct bpf_sock *sk,
                                           void *iph, __u32 ip_size,
index 199c61b..53b64c9 100644 (file)
@@ -7,11 +7,10 @@ int _xdp_adjust_tail_grow(struct xdp_md *xdp)
 {
        void *data_end = (void *)(long)xdp->data_end;
        void *data = (void *)(long)xdp->data;
-       unsigned int data_len;
+       int data_len = bpf_xdp_get_buff_len(xdp);
        int offset = 0;
 
        /* Data length determines the test case */
-       data_len = data_end - data;
 
        if (data_len == 54) { /* sizeof(pkt_v4) */
                offset = 4096; /* test too large offset */
@@ -20,7 +19,12 @@ int _xdp_adjust_tail_grow(struct xdp_md *xdp)
        } else if (data_len == 64) {
                offset = 128;
        } else if (data_len == 128) {
-               offset = 4096 - 256 - 320 - data_len; /* Max tail grow 3520 */
+               /* Max tail grow 3520 */
+               offset = 4096 - 256 - 320 - data_len;
+       } else if (data_len == 9000) {
+               offset = 10;
+       } else if (data_len == 9001) {
+               offset = 4096;
        } else {
                return XDP_ABORTED; /* No matching test */
        }
index b744825..ca68c03 100644 (file)
 SEC("xdp")
 int _xdp_adjust_tail_shrink(struct xdp_md *xdp)
 {
-       void *data_end = (void *)(long)xdp->data_end;
-       void *data = (void *)(long)xdp->data;
+       __u8 *data_end = (void *)(long)xdp->data_end;
+       __u8 *data = (void *)(long)xdp->data;
        int offset = 0;
 
-       if (data_end - data == 54) /* sizeof(pkt_v4) */
+       switch (bpf_xdp_get_buff_len(xdp)) {
+       case 54:
+               /* sizeof(pkt_v4) */
                offset = 256; /* shrink too much */
-       else
+               break;
+       case 9000:
+               /* non-linear buff test cases */
+               if (data + 1 > data_end)
+                       return XDP_DROP;
+
+               switch (data[0]) {
+               case 0:
+                       offset = 10;
+                       break;
+               case 1:
+                       offset = 4100;
+                       break;
+               case 2:
+                       offset = 8200;
+                       break;
+               default:
+                       return XDP_DROP;
+               }
+               break;
+       default:
                offset = 20;
+               break;
+       }
        if (bpf_xdp_adjust_tail(xdp, 0 - offset))
                return XDP_DROP;
        return XDP_TX;
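Both adjust_tail programs now size their test cases with bpf_xdp_get_buff_len(), which counts the whole multi-buffer packet, whereas data_end - data covers only the linear head. A minimal sketch of the distinction; len_check and the 4096 threshold are illustrative:

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

SEC("xdp.frags")
int len_check(struct xdp_md *xdp)
{
        /* xdp->data_end - xdp->data would count only the linear head;
         * the helper returns the full length including fragments
         */
        __u64 total = bpf_xdp_get_buff_len(xdp);

        return total > 4096 ? XDP_DROP : XDP_PASS;
}

char _license[] SEC("license") = "GPL";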
index 58cf434..3379d30 100644 (file)
@@ -49,7 +49,7 @@ int BPF_PROG(trace_on_entry, struct xdp_buff *xdp)
        void *data = (void *)(long)xdp->data;
 
        meta.ifindex = xdp->rxq->dev->ifindex;
-       meta.pkt_len = data_end - data;
+       meta.pkt_len = bpf_xdp_get_buff_len((struct xdp_md *)xdp);
        bpf_xdp_output(xdp, &perf_buf_map,
                       ((__u64) meta.pkt_len << 32) |
                       BPF_F_CURRENT_CPU,
diff --git a/tools/testing/selftests/bpf/progs/test_xdp_update_frags.c b/tools/testing/selftests/bpf/progs/test_xdp_update_frags.c
new file mode 100644 (file)
index 0000000..2a3496d
--- /dev/null
@@ -0,0 +1,42 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#include <linux/bpf.h>
+#include <linux/if_ether.h>
+#include <bpf/bpf_helpers.h>
+
+int _version SEC("version") = 1;
+
+SEC("xdp.frags")
+int xdp_adjust_frags(struct xdp_md *xdp)
+{
+       __u8 *data_end = (void *)(long)xdp->data_end;
+       __u8 *data = (void *)(long)xdp->data;
+       __u8 val[16] = {};
+       __u32 offset;
+       int err;
+
+       if (data + sizeof(__u32) > data_end)
+               return XDP_DROP;
+
+       offset = *(__u32 *)data;
+       err = bpf_xdp_load_bytes(xdp, offset, val, sizeof(val));
+       if (err < 0)
+               return XDP_DROP;
+
+       if (val[0] != 0xaa || val[15] != 0xaa) /* marker */
+               return XDP_DROP;
+
+       val[0] = 0xbb; /* update the marker */
+       val[15] = 0xbb;
+       err = bpf_xdp_store_bytes(xdp, offset, val, sizeof(val));
+       if (err < 0)
+               return XDP_DROP;
+
+       return XDP_PASS;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/test_xdp_with_cpumap_frags_helpers.c b/tools/testing/selftests/bpf/progs/test_xdp_with_cpumap_frags_helpers.c
new file mode 100644 (file)
index 0000000..97ed625
--- /dev/null
@@ -0,0 +1,27 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/bpf.h>
+#include <bpf/bpf_helpers.h>
+
+#define IFINDEX_LO     1
+
+struct {
+       __uint(type, BPF_MAP_TYPE_CPUMAP);
+       __uint(key_size, sizeof(__u32));
+       __uint(value_size, sizeof(struct bpf_cpumap_val));
+       __uint(max_entries, 4);
+} cpu_map SEC(".maps");
+
+SEC("xdp/cpumap")
+int xdp_dummy_cm(struct xdp_md *ctx)
+{
+       return XDP_PASS;
+}
+
+SEC("xdp.frags/cpumap")
+int xdp_dummy_cm_frags(struct xdp_md *ctx)
+{
+       return XDP_PASS;
+}
+
+char _license[] SEC("license") = "GPL";
index 5320250..20ec672 100644 (file)
@@ -24,7 +24,7 @@ int xdp_dummy_prog(struct xdp_md *ctx)
        return XDP_PASS;
 }
 
-SEC("xdp_cpumap/dummy_cm")
+SEC("xdp/cpumap")
 int xdp_dummy_cm(struct xdp_md *ctx)
 {
        if (ctx->ingress_ifindex == IFINDEX_LO)
@@ -33,4 +33,10 @@ int xdp_dummy_cm(struct xdp_md *ctx)
        return XDP_PASS;
 }
 
+SEC("xdp.frags/cpumap")
+int xdp_dummy_cm_frags(struct xdp_md *ctx)
+{
+       return XDP_PASS;
+}
+
 char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/test_xdp_with_devmap_frags_helpers.c b/tools/testing/selftests/bpf/progs/test_xdp_with_devmap_frags_helpers.c
new file mode 100644 (file)
index 0000000..cdcf7de
--- /dev/null
@@ -0,0 +1,27 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/bpf.h>
+#include <bpf/bpf_helpers.h>
+
+struct {
+       __uint(type, BPF_MAP_TYPE_DEVMAP);
+       __uint(key_size, sizeof(__u32));
+       __uint(value_size, sizeof(struct bpf_devmap_val));
+       __uint(max_entries, 4);
+} dm_ports SEC(".maps");
+
+/* valid program on DEVMAP entry via SEC name;
+ * has access to egress and ingress ifindex
+ */
+SEC("xdp/devmap")
+int xdp_dummy_dm(struct xdp_md *ctx)
+{
+       return XDP_PASS;
+}
+
+SEC("xdp.frags/devmap")
+int xdp_dummy_dm_frags(struct xdp_md *ctx)
+{
+       return XDP_PASS;
+}
+
+char _license[] SEC("license") = "GPL";
index 1e6b9c3..4139a14 100644 (file)
@@ -27,7 +27,7 @@ int xdp_dummy_prog(struct xdp_md *ctx)
 /* valid program on DEVMAP entry via SEC name;
  * has access to egress and ingress ifindex
  */
-SEC("xdp_devmap/map_prog")
+SEC("xdp/devmap")
 int xdp_dummy_dm(struct xdp_md *ctx)
 {
        char fmt[] = "devmap redirect: dev %u -> dev %u len %u\n";
@@ -40,4 +40,11 @@ int xdp_dummy_dm(struct xdp_md *ctx)
 
        return XDP_PASS;
 }
+
+SEC("xdp.frags/devmap")
+int xdp_dummy_dm_frags(struct xdp_md *ctx)
+{
+       return XDP_PASS;
+}
+
 char _license[] SEC("license") = "GPL";
index 119582a..6695478 100644 (file)
@@ -4,6 +4,7 @@
 #include "vmlinux.h"
 #include <bpf/bpf_helpers.h>
 #include <bpf/bpf_tracing.h>
+#include "bpf_misc.h"
 
 char _license[] SEC("license") = "GPL";
 
@@ -12,7 +13,7 @@ int trace_printk_ran = 0;
 
 const char fmt[] = "Testing,testing %d\n";
 
-SEC("fentry/__x64_sys_nanosleep")
+SEC("fentry/" SYS_PREFIX "sys_nanosleep")
 int sys_enter(void *ctx)
 {
        trace_printk_ret = bpf_trace_printk(fmt, sizeof(fmt),
index d327241..969306c 100644 (file)
@@ -4,6 +4,7 @@
 #include "vmlinux.h"
 #include <bpf/bpf_helpers.h>
 #include <bpf/bpf_tracing.h>
+#include "bpf_misc.h"
 
 char _license[] SEC("license") = "GPL";
 
@@ -11,7 +12,7 @@ int null_data_vprintk_ret = 0;
 int trace_vprintk_ret = 0;
 int trace_vprintk_ran = 0;
 
-SEC("fentry/__x64_sys_nanosleep")
+SEC("fentry/" SYS_PREFIX "sys_nanosleep")
 int sys_enter(void *ctx)
 {
        static const char one[] = "1";
index 2098f3f..2ab049b 100644 (file)
@@ -5,6 +5,7 @@
 #include <asm/unistd.h>
 #include <bpf/bpf_helpers.h>
 #include <bpf/bpf_tracing.h>
+#include "bpf_misc.h"
 
 char _license[] SEC("license") = "GPL";
 
@@ -25,28 +26,28 @@ int BPF_PROG(bench_trigger_raw_tp, struct pt_regs *regs, long id)
        return 0;
 }
 
-SEC("kprobe/__x64_sys_getpgid")
+SEC("kprobe/" SYS_PREFIX "sys_getpgid")
 int bench_trigger_kprobe(void *ctx)
 {
        __sync_add_and_fetch(&hits, 1);
        return 0;
 }
 
-SEC("fentry/__x64_sys_getpgid")
+SEC("fentry/" SYS_PREFIX "sys_getpgid")
 int bench_trigger_fentry(void *ctx)
 {
        __sync_add_and_fetch(&hits, 1);
        return 0;
 }
 
-SEC("fentry.s/__x64_sys_getpgid")
+SEC("fentry.s/" SYS_PREFIX "sys_getpgid")
 int bench_trigger_fentry_sleep(void *ctx)
 {
        __sync_add_and_fetch(&hits, 1);
        return 0;
 }
 
-SEC("fmod_ret/__x64_sys_getpgid")
+SEC("fmod_ret/" SYS_PREFIX "sys_getpgid")
 int bench_trigger_fmodret(void *ctx)
 {
        __sync_add_and_fetch(&hits, 1);
index 8395782..97b26a3 100644 (file)
@@ -70,7 +70,7 @@ int xdp_redirect_map_all_prog(struct xdp_md *ctx)
                                BPF_F_BROADCAST | BPF_F_EXCLUDE_INGRESS);
 }
 
-SEC("xdp_devmap/map_prog")
+SEC("xdp/devmap")
 int xdp_devmap_prog(struct xdp_md *ctx)
 {
        void *data_end = (void *)(long)ctx->data_end;
index b9f1bbb..6e62351 100644 (file)
@@ -61,7 +61,11 @@ static int bpf_map_lookup_elem_with_ref_bit(int fd, unsigned long long key,
        };
        __u8 data[64] = {};
        int mfd, pfd, ret, zero = 0;
-       __u32 retval = 0;
+       LIBBPF_OPTS(bpf_test_run_opts, topts,
+               .data_in = data,
+               .data_size_in = sizeof(data),
+               .repeat = 1,
+       );
 
        mfd = bpf_map_create(BPF_MAP_TYPE_ARRAY, NULL, sizeof(int), sizeof(__u64), 1, NULL);
        if (mfd < 0)
@@ -75,9 +79,8 @@ static int bpf_map_lookup_elem_with_ref_bit(int fd, unsigned long long key,
                return -1;
        }
 
-       ret = bpf_prog_test_run(pfd, 1, data, sizeof(data),
-                               NULL, NULL, &retval, NULL);
-       if (ret < 0 || retval != 42) {
+       ret = bpf_prog_test_run_opts(pfd, &topts);
+       if (ret < 0 || topts.retval != 42) {
                ret = -1;
        } else {
                assert(!bpf_map_lookup_elem(mfd, &zero, value));
index 5620919..826f442 100755 (executable)
 
 # Kselftest framework requirement - SKIP code is 4.
 ksft_skip=4
+readonly NS1="ns1-$(mktemp -u XXXXXX)"
+readonly NS2="ns2-$(mktemp -u XXXXXX)"
+readonly NS3="ns3-$(mktemp -u XXXXXX)"
+readonly NS4="ns4-$(mktemp -u XXXXXX)"
+readonly NS5="ns5-$(mktemp -u XXXXXX)"
+readonly NS6="ns6-$(mktemp -u XXXXXX)"
 
 msg="skip all tests:"
 if [ $UID != 0 ]; then
@@ -41,23 +47,23 @@ cleanup()
        fi
 
        set +e
-       ip netns del ns1 2> /dev/null
-       ip netns del ns2 2> /dev/null
-       ip netns del ns3 2> /dev/null
-       ip netns del ns4 2> /dev/null
-       ip netns del ns5 2> /dev/null
-       ip netns del ns6 2> /dev/null
+       ip netns del ${NS1} 2> /dev/null
+       ip netns del ${NS2} 2> /dev/null
+       ip netns del ${NS3} 2> /dev/null
+       ip netns del ${NS4} 2> /dev/null
+       ip netns del ${NS5} 2> /dev/null
+       ip netns del ${NS6} 2> /dev/null
        rm -f $TMP_FILE
 }
 
 set -e
 
-ip netns add ns1
-ip netns add ns2
-ip netns add ns3
-ip netns add ns4
-ip netns add ns5
-ip netns add ns6
+ip netns add ${NS1}
+ip netns add ${NS2}
+ip netns add ${NS3}
+ip netns add ${NS4}
+ip netns add ${NS5}
+ip netns add ${NS6}
 
 trap cleanup 0 2 3 6 9
 
@@ -67,78 +73,78 @@ ip link add veth5 type veth peer name veth6
 ip link add veth7 type veth peer name veth8
 ip link add veth9 type veth peer name veth10
 
-ip link set veth1 netns ns1
-ip link set veth2 netns ns2
-ip link set veth3 netns ns2
-ip link set veth4 netns ns3
-ip link set veth5 netns ns3
-ip link set veth6 netns ns4
-ip link set veth7 netns ns4
-ip link set veth8 netns ns5
-ip link set veth9 netns ns5
-ip link set veth10 netns ns6
-
-ip netns exec ns1 ip link set dev veth1 up
-ip netns exec ns2 ip link set dev veth2 up
-ip netns exec ns2 ip link set dev veth3 up
-ip netns exec ns3 ip link set dev veth4 up
-ip netns exec ns3 ip link set dev veth5 up
-ip netns exec ns4 ip link set dev veth6 up
-ip netns exec ns4 ip link set dev veth7 up
-ip netns exec ns5 ip link set dev veth8 up
-ip netns exec ns5 ip link set dev veth9 up
-ip netns exec ns6 ip link set dev veth10 up
-ip netns exec ns6 ip link set dev lo up
+ip link set veth1 netns ${NS1}
+ip link set veth2 netns ${NS2}
+ip link set veth3 netns ${NS2}
+ip link set veth4 netns ${NS3}
+ip link set veth5 netns ${NS3}
+ip link set veth6 netns ${NS4}
+ip link set veth7 netns ${NS4}
+ip link set veth8 netns ${NS5}
+ip link set veth9 netns ${NS5}
+ip link set veth10 netns ${NS6}
+
+ip netns exec ${NS1} ip link set dev veth1 up
+ip netns exec ${NS2} ip link set dev veth2 up
+ip netns exec ${NS2} ip link set dev veth3 up
+ip netns exec ${NS3} ip link set dev veth4 up
+ip netns exec ${NS3} ip link set dev veth5 up
+ip netns exec ${NS4} ip link set dev veth6 up
+ip netns exec ${NS4} ip link set dev veth7 up
+ip netns exec ${NS5} ip link set dev veth8 up
+ip netns exec ${NS5} ip link set dev veth9 up
+ip netns exec ${NS6} ip link set dev veth10 up
+ip netns exec ${NS6} ip link set dev lo up
 
 # All link scope addresses and routes required between veths
-ip netns exec ns1 ip -6 addr add fb00::12/16 dev veth1 scope link
-ip netns exec ns1 ip -6 route add fb00::21 dev veth1 scope link
-ip netns exec ns2 ip -6 addr add fb00::21/16 dev veth2 scope link
-ip netns exec ns2 ip -6 addr add fb00::34/16 dev veth3 scope link
-ip netns exec ns2 ip -6 route add fb00::43 dev veth3 scope link
-ip netns exec ns3 ip -6 route add fb00::65 dev veth5 scope link
-ip netns exec ns3 ip -6 addr add fb00::43/16 dev veth4 scope link
-ip netns exec ns3 ip -6 addr add fb00::56/16 dev veth5 scope link
-ip netns exec ns4 ip -6 addr add fb00::65/16 dev veth6 scope link
-ip netns exec ns4 ip -6 addr add fb00::78/16 dev veth7 scope link
-ip netns exec ns4 ip -6 route add fb00::87 dev veth7 scope link
-ip netns exec ns5 ip -6 addr add fb00::87/16 dev veth8 scope link
-ip netns exec ns5 ip -6 addr add fb00::910/16 dev veth9 scope link
-ip netns exec ns5 ip -6 route add fb00::109 dev veth9 scope link
-ip netns exec ns5 ip -6 route add fb00::109 table 117 dev veth9 scope link
-ip netns exec ns6 ip -6 addr add fb00::109/16 dev veth10 scope link
-
-ip netns exec ns1 ip -6 addr add fb00::1/16 dev lo
-ip netns exec ns1 ip -6 route add fb00::6 dev veth1 via fb00::21
-
-ip netns exec ns2 ip -6 route add fb00::6 encap bpf in obj test_lwt_seg6local.o sec encap_srh dev veth2
-ip netns exec ns2 ip -6 route add fd00::1 dev veth3 via fb00::43 scope link
-
-ip netns exec ns3 ip -6 route add fc42::1 dev veth5 via fb00::65
-ip netns exec ns3 ip -6 route add fd00::1 encap seg6local action End.BPF endpoint obj test_lwt_seg6local.o sec add_egr_x dev veth4
-
-ip netns exec ns4 ip -6 route add fd00::2 encap seg6local action End.BPF endpoint obj test_lwt_seg6local.o sec pop_egr dev veth6
-ip netns exec ns4 ip -6 addr add fc42::1 dev lo
-ip netns exec ns4 ip -6 route add fd00::3 dev veth7 via fb00::87
-
-ip netns exec ns5 ip -6 route add fd00::4 table 117 dev veth9 via fb00::109
-ip netns exec ns5 ip -6 route add fd00::3 encap seg6local action End.BPF endpoint obj test_lwt_seg6local.o sec inspect_t dev veth8
-
-ip netns exec ns6 ip -6 addr add fb00::6/16 dev lo
-ip netns exec ns6 ip -6 addr add fd00::4/16 dev lo
-
-ip netns exec ns1 sysctl net.ipv6.conf.all.forwarding=1 > /dev/null
-ip netns exec ns2 sysctl net.ipv6.conf.all.forwarding=1 > /dev/null
-ip netns exec ns3 sysctl net.ipv6.conf.all.forwarding=1 > /dev/null
-ip netns exec ns4 sysctl net.ipv6.conf.all.forwarding=1 > /dev/null
-ip netns exec ns5 sysctl net.ipv6.conf.all.forwarding=1 > /dev/null
-
-ip netns exec ns6 sysctl net.ipv6.conf.all.seg6_enabled=1 > /dev/null
-ip netns exec ns6 sysctl net.ipv6.conf.lo.seg6_enabled=1 > /dev/null
-ip netns exec ns6 sysctl net.ipv6.conf.veth10.seg6_enabled=1 > /dev/null
-
-ip netns exec ns6 nc -l -6 -u -d 7330 > $TMP_FILE &
-ip netns exec ns1 bash -c "echo 'foobar' | nc -w0 -6 -u -p 2121 -s fb00::1 fb00::6 7330"
+ip netns exec ${NS1} ip -6 addr add fb00::12/16 dev veth1 scope link
+ip netns exec ${NS1} ip -6 route add fb00::21 dev veth1 scope link
+ip netns exec ${NS2} ip -6 addr add fb00::21/16 dev veth2 scope link
+ip netns exec ${NS2} ip -6 addr add fb00::34/16 dev veth3 scope link
+ip netns exec ${NS2} ip -6 route add fb00::43 dev veth3 scope link
+ip netns exec ${NS3} ip -6 route add fb00::65 dev veth5 scope link
+ip netns exec ${NS3} ip -6 addr add fb00::43/16 dev veth4 scope link
+ip netns exec ${NS3} ip -6 addr add fb00::56/16 dev veth5 scope link
+ip netns exec ${NS4} ip -6 addr add fb00::65/16 dev veth6 scope link
+ip netns exec ${NS4} ip -6 addr add fb00::78/16 dev veth7 scope link
+ip netns exec ${NS4} ip -6 route add fb00::87 dev veth7 scope link
+ip netns exec ${NS5} ip -6 addr add fb00::87/16 dev veth8 scope link
+ip netns exec ${NS5} ip -6 addr add fb00::910/16 dev veth9 scope link
+ip netns exec ${NS5} ip -6 route add fb00::109 dev veth9 scope link
+ip netns exec ${NS5} ip -6 route add fb00::109 table 117 dev veth9 scope link
+ip netns exec ${NS6} ip -6 addr add fb00::109/16 dev veth10 scope link
+
+ip netns exec ${NS1} ip -6 addr add fb00::1/16 dev lo
+ip netns exec ${NS1} ip -6 route add fb00::6 dev veth1 via fb00::21
+
+ip netns exec ${NS2} ip -6 route add fb00::6 encap bpf in obj test_lwt_seg6local.o sec encap_srh dev veth2
+ip netns exec ${NS2} ip -6 route add fd00::1 dev veth3 via fb00::43 scope link
+
+ip netns exec ${NS3} ip -6 route add fc42::1 dev veth5 via fb00::65
+ip netns exec ${NS3} ip -6 route add fd00::1 encap seg6local action End.BPF endpoint obj test_lwt_seg6local.o sec add_egr_x dev veth4
+
+ip netns exec ${NS4} ip -6 route add fd00::2 encap seg6local action End.BPF endpoint obj test_lwt_seg6local.o sec pop_egr dev veth6
+ip netns exec ${NS4} ip -6 addr add fc42::1 dev lo
+ip netns exec ${NS4} ip -6 route add fd00::3 dev veth7 via fb00::87
+
+ip netns exec ${NS5} ip -6 route add fd00::4 table 117 dev veth9 via fb00::109
+ip netns exec ${NS5} ip -6 route add fd00::3 encap seg6local action End.BPF endpoint obj test_lwt_seg6local.o sec inspect_t dev veth8
+
+ip netns exec ${NS6} ip -6 addr add fb00::6/16 dev lo
+ip netns exec ${NS6} ip -6 addr add fd00::4/16 dev lo
+
+ip netns exec ${NS1} sysctl net.ipv6.conf.all.forwarding=1 > /dev/null
+ip netns exec ${NS2} sysctl net.ipv6.conf.all.forwarding=1 > /dev/null
+ip netns exec ${NS3} sysctl net.ipv6.conf.all.forwarding=1 > /dev/null
+ip netns exec ${NS4} sysctl net.ipv6.conf.all.forwarding=1 > /dev/null
+ip netns exec ${NS5} sysctl net.ipv6.conf.all.forwarding=1 > /dev/null
+
+ip netns exec ${NS6} sysctl net.ipv6.conf.all.seg6_enabled=1 > /dev/null
+ip netns exec ${NS6} sysctl net.ipv6.conf.lo.seg6_enabled=1 > /dev/null
+ip netns exec ${NS6} sysctl net.ipv6.conf.veth10.seg6_enabled=1 > /dev/null
+
+ip netns exec ${NS6} nc -l -6 -u -d 7330 > $TMP_FILE &
+ip netns exec ${NS1} bash -c "echo 'foobar' | nc -w0 -6 -u -p 2121 -s fb00::1 fb00::6 7330"
 sleep 5 # wait long enough to ensure the UDP datagram has arrived at the last segment
 kill -TERM $!
 
index 50f7e74..cbebfaa 100644 (file)
@@ -738,7 +738,7 @@ static void test_sockmap(unsigned int tasks, void *data)
                            sizeof(key), sizeof(value),
                            6, NULL);
        if (fd < 0) {
-               if (!bpf_probe_map_type(BPF_MAP_TYPE_SOCKMAP, 0)) {
+               if (!libbpf_probe_bpf_map_type(BPF_MAP_TYPE_SOCKMAP, NULL)) {
                        printf("%s SKIP (unsupported map type BPF_MAP_TYPE_SOCKMAP)\n",
                               __func__);
                        skips++;
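
bpf_probe_map_type() is replaced here (and in test_verifier below) by libbpf's newer probe API, which returns 1 when the kernel supports the type, 0 when it does not, and a negative error on probe failure. A hedged sketch of the skip pattern these tests rely on:

    #include <stdbool.h>
    #include <bpf/libbpf.h>

    /* True when a test should be skipped because the running kernel
     * lacks the map type; a 0 return means "not supported", mirroring
     * the !libbpf_probe_bpf_map_type() checks in the hunks above.
     */
    static bool map_type_unsupported(enum bpf_map_type type)
    {
            return !libbpf_probe_bpf_map_type(type, NULL);
    }
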
index 6413c14..102e658 100755 (executable)
@@ -4,6 +4,7 @@
 # Copyright (c) 2019 Cloudflare
 
 set -eu
+readonly NS1="ns1-$(mktemp -u XXXXXX)"
 
 wait_for_ip()
 {
@@ -28,12 +29,12 @@ get_prog_id()
 
 ns1_exec()
 {
-       ip netns exec ns1 "$@"
+       ip netns exec ${NS1} "$@"
 }
 
 setup()
 {
-       ip netns add ns1
+       ip netns add ${NS1}
        ns1_exec ip link set lo up
 
        ns1_exec sysctl -w net.ipv4.tcp_syncookies=2
index 76cd903..92e3465 100644 (file)
@@ -31,6 +31,7 @@
 #include <linux/if_ether.h>
 #include <linux/btf.h>
 
+#include <bpf/btf.h>
 #include <bpf/bpf.h>
 #include <bpf/libbpf.h>
 
@@ -66,6 +67,11 @@ static bool unpriv_disabled = false;
 static int skips;
 static bool verbose = false;
 
+struct kfunc_btf_id_pair {
+       const char *kfunc;
+       int insn_idx;
+};
+
 struct bpf_test {
        const char *descr;
        struct bpf_insn insns[MAX_INSNS];
@@ -92,6 +98,7 @@ struct bpf_test {
        int fixup_map_reuseport_array[MAX_FIXUPS];
        int fixup_map_ringbuf[MAX_FIXUPS];
        int fixup_map_timer[MAX_FIXUPS];
+       struct kfunc_btf_id_pair fixup_kfunc_btf_id[MAX_FIXUPS];
        /* Expected verifier log output for result REJECT or VERBOSE_ACCEPT.
         * Can be a tab-separated sequence of expected strings. An empty string
         * means no log verification.
@@ -449,7 +456,7 @@ static int probe_filter_length(const struct bpf_insn *fp)
 
 static bool skip_unsupported_map(enum bpf_map_type map_type)
 {
-       if (!bpf_probe_map_type(map_type, 0)) {
+       if (!libbpf_probe_bpf_map_type(map_type, NULL)) {
                printf("SKIP (unsupported map type %d)\n", map_type);
                skips++;
                return true;
@@ -744,6 +751,7 @@ static void do_test_fixup(struct bpf_test *test, enum bpf_prog_type prog_type,
        int *fixup_map_reuseport_array = test->fixup_map_reuseport_array;
        int *fixup_map_ringbuf = test->fixup_map_ringbuf;
        int *fixup_map_timer = test->fixup_map_timer;
+       struct kfunc_btf_id_pair *fixup_kfunc_btf_id = test->fixup_kfunc_btf_id;
 
        if (test->fill_helper) {
                test->fill_insns = calloc(MAX_TEST_INSNS, sizeof(struct bpf_insn));
@@ -936,6 +944,26 @@ static void do_test_fixup(struct bpf_test *test, enum bpf_prog_type prog_type,
                        fixup_map_timer++;
                } while (*fixup_map_timer);
        }
+
+       /* Patch in kfunc BTF IDs */
+       if (fixup_kfunc_btf_id->kfunc) {
+               struct btf *btf;
+               int btf_id;
+
+               do {
+                       btf_id = 0;
+                       btf = btf__load_vmlinux_btf();
+                       if (btf) {
+                               btf_id = btf__find_by_name_kind(btf,
+                                                               fixup_kfunc_btf_id->kfunc,
+                                                               BTF_KIND_FUNC);
+                               btf_id = btf_id < 0 ? 0 : btf_id;
+                       }
+                       btf__free(btf);
+                       prog[fixup_kfunc_btf_id->insn_idx].imm = btf_id;
+                       fixup_kfunc_btf_id++;
+               } while (fixup_kfunc_btf_id->kfunc);
+       }
 }
 
 struct libcap {
@@ -993,13 +1021,18 @@ static int do_prog_test_run(int fd_prog, bool unpriv, uint32_t expected_val,
 {
        __u8 tmp[TEST_DATA_LEN << 2];
        __u32 size_tmp = sizeof(tmp);
-       uint32_t retval;
        int err, saved_errno;
+       LIBBPF_OPTS(bpf_test_run_opts, topts,
+               .data_in = data,
+               .data_size_in = size_data,
+               .data_out = tmp,
+               .data_size_out = size_tmp,
+               .repeat = 1,
+       );
 
        if (unpriv)
                set_admin(true);
-       err = bpf_prog_test_run(fd_prog, 1, data, size_data,
-                               tmp, &size_tmp, &retval, NULL);
+       err = bpf_prog_test_run_opts(fd_prog, &topts);
        saved_errno = errno;
 
        if (unpriv)
@@ -1023,9 +1056,8 @@ static int do_prog_test_run(int fd_prog, bool unpriv, uint32_t expected_val,
                }
        }
 
-       if (retval != expected_val &&
-           expected_val != POINTER_VALUE) {
-               printf("FAIL retval %d != %d ", retval, expected_val);
+       if (topts.retval != expected_val && expected_val != POINTER_VALUE) {
+               printf("FAIL retval %d != %d ", topts.retval, expected_val);
                return 1;
        }
 
@@ -1148,7 +1180,7 @@ static void do_test_single(struct bpf_test *test, bool unpriv,
         * bpf_probe_prog_type won't give correct answer
         */
        if (fd_prog < 0 && prog_type != BPF_PROG_TYPE_TRACING &&
-           !bpf_probe_prog_type(prog_type, 0)) {
+           !libbpf_probe_bpf_prog_type(prog_type, NULL)) {
                printf("SKIP (unsupported program type %d)\n", prog_type);
                skips++;
                goto close_fds;
index d10cefd..ea69370 100755 (executable)
@@ -2,6 +2,8 @@
 
 # Kselftest framework requirement - SKIP code is 4.
 readonly KSFT_SKIP=4
+readonly NS1="ns1-$(mktemp -u XXXXXX)"
+readonly NS2="ns2-$(mktemp -u XXXXXX)"
 
 cleanup()
 {
@@ -13,8 +15,8 @@ cleanup()
 
        set +e
        ip link del veth1 2> /dev/null
-       ip netns del ns1 2> /dev/null
-       ip netns del ns2 2> /dev/null
+       ip netns del ${NS1} 2> /dev/null
+       ip netns del ${NS2} 2> /dev/null
 }
 
 ip link set dev lo xdp off 2>/dev/null > /dev/null
@@ -24,32 +26,32 @@ if [ $? -ne 0 ];then
 fi
 set -e
 
-ip netns add ns1
-ip netns add ns2
+ip netns add ${NS1}
+ip netns add ${NS2}
 
 trap cleanup 0 2 3 6 9
 
 ip link add veth1 type veth peer name veth2
 
-ip link set veth1 netns ns1
-ip link set veth2 netns ns2
+ip link set veth1 netns ${NS1}
+ip link set veth2 netns ${NS2}
 
-ip netns exec ns1 ip addr add 10.1.1.11/24 dev veth1
-ip netns exec ns2 ip addr add 10.1.1.22/24 dev veth2
+ip netns exec ${NS1} ip addr add 10.1.1.11/24 dev veth1
+ip netns exec ${NS2} ip addr add 10.1.1.22/24 dev veth2
 
-ip netns exec ns1 tc qdisc add dev veth1 clsact
-ip netns exec ns2 tc qdisc add dev veth2 clsact
+ip netns exec ${NS1} tc qdisc add dev veth1 clsact
+ip netns exec ${NS2} tc qdisc add dev veth2 clsact
 
-ip netns exec ns1 tc filter add dev veth1 ingress bpf da obj test_xdp_meta.o sec t
-ip netns exec ns2 tc filter add dev veth2 ingress bpf da obj test_xdp_meta.o sec t
+ip netns exec ${NS1} tc filter add dev veth1 ingress bpf da obj test_xdp_meta.o sec t
+ip netns exec ${NS2} tc filter add dev veth2 ingress bpf da obj test_xdp_meta.o sec t
 
-ip netns exec ns1 ip link set dev veth1 xdp obj test_xdp_meta.o sec x
-ip netns exec ns2 ip link set dev veth2 xdp obj test_xdp_meta.o sec x
+ip netns exec ${NS1} ip link set dev veth1 xdp obj test_xdp_meta.o sec x
+ip netns exec ${NS2} ip link set dev veth2 xdp obj test_xdp_meta.o sec x
 
-ip netns exec ns1 ip link set dev veth1 up
-ip netns exec ns2 ip link set dev veth2 up
+ip netns exec ${NS1} ip link set dev veth1 up
+ip netns exec ${NS2} ip link set dev veth2 up
 
-ip netns exec ns1 ping -c 1 10.1.1.22
-ip netns exec ns2 ping -c 1 10.1.1.11
+ip netns exec ${NS1} ping -c 1 10.1.1.22
+ip netns exec ${NS2} ping -c 1 10.1.1.11
 
 exit 0
index 57c8db9..1d79f31 100755 (executable)
@@ -10,6 +10,8 @@
 #     | xdp forwarding |
 #     ------------------
 
+readonly NS1="ns1-$(mktemp -u XXXXXX)"
+readonly NS2="ns2-$(mktemp -u XXXXXX)"
 ret=0
 
 setup()
@@ -17,27 +19,27 @@ setup()
 
        local xdpmode=$1
 
-       ip netns add ns1
-       ip netns add ns2
+       ip netns add ${NS1}
+       ip netns add ${NS2}
 
-       ip link add veth1 index 111 type veth peer name veth11 netns ns1
-       ip link add veth2 index 222 type veth peer name veth22 netns ns2
+       ip link add veth1 index 111 type veth peer name veth11 netns ${NS1}
+       ip link add veth2 index 222 type veth peer name veth22 netns ${NS2}
 
        ip link set veth1 up
        ip link set veth2 up
-       ip -n ns1 link set dev veth11 up
-       ip -n ns2 link set dev veth22 up
+       ip -n ${NS1} link set dev veth11 up
+       ip -n ${NS2} link set dev veth22 up
 
-       ip -n ns1 addr add 10.1.1.11/24 dev veth11
-       ip -n ns2 addr add 10.1.1.22/24 dev veth22
+       ip -n ${NS1} addr add 10.1.1.11/24 dev veth11
+       ip -n ${NS2} addr add 10.1.1.22/24 dev veth22
 }
 
 cleanup()
 {
        ip link del veth1 2> /dev/null
        ip link del veth2 2> /dev/null
-       ip netns del ns1 2> /dev/null
-       ip netns del ns2 2> /dev/null
+       ip netns del ${NS1} 2> /dev/null
+       ip netns del ${NS2} 2> /dev/null
 }
 
 test_xdp_redirect()
@@ -52,13 +54,13 @@ test_xdp_redirect()
                return 0
        fi
 
-       ip -n ns1 link set veth11 $xdpmode obj xdp_dummy.o sec xdp &> /dev/null
-       ip -n ns2 link set veth22 $xdpmode obj xdp_dummy.o sec xdp &> /dev/null
+       ip -n ${NS1} link set veth11 $xdpmode obj xdp_dummy.o sec xdp &> /dev/null
+       ip -n ${NS2} link set veth22 $xdpmode obj xdp_dummy.o sec xdp &> /dev/null
        ip link set dev veth1 $xdpmode obj test_xdp_redirect.o sec redirect_to_222 &> /dev/null
        ip link set dev veth2 $xdpmode obj test_xdp_redirect.o sec redirect_to_111 &> /dev/null
 
-       if ip netns exec ns1 ping -c 1 10.1.1.22 &> /dev/null &&
-          ip netns exec ns2 ping -c 1 10.1.1.11 &> /dev/null; then
+       if ip netns exec ${NS1} ping -c 1 10.1.1.22 &> /dev/null &&
+          ip netns exec ${NS2} ping -c 1 10.1.1.11 &> /dev/null; then
                echo "selftests: test_xdp_redirect $xdpmode [PASS]";
        else
                ret=1
index 05f8727..cc57cb8 100755 (executable)
@@ -32,6 +32,11 @@ DRV_MODE="xdpgeneric xdpdrv xdpegress"
 PASS=0
 FAIL=0
 LOG_DIR=$(mktemp -d)
+declare -a NS
+NS[0]="ns0-$(mktemp -u XXXXXX)"
+NS[1]="ns1-$(mktemp -u XXXXXX)"
+NS[2]="ns2-$(mktemp -u XXXXXX)"
+NS[3]="ns3-$(mktemp -u XXXXXX)"
 
 test_pass()
 {
@@ -47,11 +52,9 @@ test_fail()
 
 clean_up()
 {
-       for i in $(seq $NUM); do
-               ip link del veth$i 2> /dev/null
-               ip netns del ns$i 2> /dev/null
+       for i in $(seq 0 $NUM); do
+               ip netns del ${NS[$i]} 2> /dev/null
        done
-       ip netns del ns0 2> /dev/null
 }
 
 # Kselftest framework requirement - SKIP code is 4.
@@ -79,23 +82,22 @@ setup_ns()
                mode="xdpdrv"
        fi
 
-       ip netns add ns0
+       ip netns add ${NS[0]}
        for i in $(seq $NUM); do
-               ip netns add ns$i
-               ip -n ns$i link add veth0 index 2 type veth \
-                       peer name veth$i netns ns0 index $((1 + $i))
-               ip -n ns0 link set veth$i up
-               ip -n ns$i link set veth0 up
-
-               ip -n ns$i addr add 192.0.2.$i/24 dev veth0
-               ip -n ns$i addr add 2001:db8::$i/64 dev veth0
+               ip netns add ${NS[$i]}
+               ip -n ${NS[$i]} link add veth0 type veth peer name veth$i netns ${NS[0]}
+               ip -n ${NS[$i]} link set veth0 up
+               ip -n ${NS[0]} link set veth$i up
+
+               ip -n ${NS[$i]} addr add 192.0.2.$i/24 dev veth0
+               ip -n ${NS[$i]} addr add 2001:db8::$i/64 dev veth0
                # Add a neigh entry for IPv4 ping test
-               ip -n ns$i neigh add 192.0.2.253 lladdr 00:00:00:00:00:01 dev veth0
-               ip -n ns$i link set veth0 $mode obj \
+               ip -n ${NS[$i]} neigh add 192.0.2.253 lladdr 00:00:00:00:00:01 dev veth0
+               ip -n ${NS[$i]} link set veth0 $mode obj \
                        xdp_dummy.o sec xdp &> /dev/null || \
                        { test_fail "Unable to load dummy xdp" && exit 1; }
                IFACES="$IFACES veth$i"
-               veth_mac[$i]=$(ip -n ns0 link show veth$i | awk '/link\/ether/ {print $2}')
+               veth_mac[$i]=$(ip -n ${NS[0]} link show veth$i | awk '/link\/ether/ {print $2}')
        done
 }
 
@@ -104,10 +106,10 @@ do_egress_tests()
        local mode=$1
 
        # mac test
-       ip netns exec ns2 tcpdump -e -i veth0 -nn -l -e &> ${LOG_DIR}/mac_ns1-2_${mode}.log &
-       ip netns exec ns3 tcpdump -e -i veth0 -nn -l -e &> ${LOG_DIR}/mac_ns1-3_${mode}.log &
+       ip netns exec ${NS[2]} tcpdump -e -i veth0 -nn -l -e &> ${LOG_DIR}/mac_ns1-2_${mode}.log &
+       ip netns exec ${NS[3]} tcpdump -e -i veth0 -nn -l -e &> ${LOG_DIR}/mac_ns1-3_${mode}.log &
        sleep 0.5
-       ip netns exec ns1 ping 192.0.2.254 -i 0.1 -c 4 &> /dev/null
+       ip netns exec ${NS[1]} ping 192.0.2.254 -i 0.1 -c 4 &> /dev/null
        sleep 0.5
        pkill tcpdump
 
@@ -123,18 +125,18 @@ do_ping_tests()
        local mode=$1
 
        # ping6 test: echo request should be redirected back to itself, not to others
-       ip netns exec ns1 ip neigh add 2001:db8::2 dev veth0 lladdr 00:00:00:00:00:02
+       ip netns exec ${NS[1]} ip neigh add 2001:db8::2 dev veth0 lladdr 00:00:00:00:00:02
 
-       ip netns exec ns1 tcpdump -i veth0 -nn -l -e &> ${LOG_DIR}/ns1-1_${mode}.log &
-       ip netns exec ns2 tcpdump -i veth0 -nn -l -e &> ${LOG_DIR}/ns1-2_${mode}.log &
-       ip netns exec ns3 tcpdump -i veth0 -nn -l -e &> ${LOG_DIR}/ns1-3_${mode}.log &
+       ip netns exec ${NS[1]} tcpdump -i veth0 -nn -l -e &> ${LOG_DIR}/ns1-1_${mode}.log &
+       ip netns exec ${NS[2]} tcpdump -i veth0 -nn -l -e &> ${LOG_DIR}/ns1-2_${mode}.log &
+       ip netns exec ${NS[3]} tcpdump -i veth0 -nn -l -e &> ${LOG_DIR}/ns1-3_${mode}.log &
        sleep 0.5
        # ARP test
-       ip netns exec ns1 arping -q -c 2 -I veth0 192.0.2.254
+       ip netns exec ${NS[1]} arping -q -c 2 -I veth0 192.0.2.254
        # IPv4 test
-       ip netns exec ns1 ping 192.0.2.253 -i 0.1 -c 4 &> /dev/null
+       ip netns exec ${NS[1]} ping 192.0.2.253 -i 0.1 -c 4 &> /dev/null
        # IPv6 test
-       ip netns exec ns1 ping6 2001:db8::2 -i 0.1 -c 2 &> /dev/null
+       ip netns exec ${NS[1]} ping6 2001:db8::2 -i 0.1 -c 2 &> /dev/null
        sleep 0.5
        pkill tcpdump
 
@@ -180,7 +182,7 @@ do_tests()
                xdpgeneric) drv_p="-S";;
        esac
 
-       ip netns exec ns0 ./xdp_redirect_multi $drv_p $IFACES &> ${LOG_DIR}/xdp_redirect_${mode}.log &
+       ip netns exec ${NS[0]} ./xdp_redirect_multi $drv_p $IFACES &> ${LOG_DIR}/xdp_redirect_${mode}.log &
        xdp_pid=$!
        sleep 1
        if ! ps -p $xdp_pid > /dev/null; then
@@ -197,10 +199,10 @@ do_tests()
        kill $xdp_pid
 }
 
-trap clean_up EXIT
-
 check_env
 
+trap clean_up EXIT
+
 for mode in ${DRV_MODE}; do
        setup_ns $mode
        do_tests $mode
index a3a1eae..392d28c 100755 (executable)
@@ -22,6 +22,9 @@ ksft_skip=4
 TESTNAME=xdp_veth
 BPF_FS=$(awk '$3 == "bpf" {print $2; exit}' /proc/mounts)
 BPF_DIR=$BPF_FS/test_$TESTNAME
+readonly NS1="ns1-$(mktemp -u XXXXXX)"
+readonly NS2="ns2-$(mktemp -u XXXXXX)"
+readonly NS3="ns3-$(mktemp -u XXXXXX)"
 
 _cleanup()
 {
@@ -29,9 +32,9 @@ _cleanup()
        ip link del veth1 2> /dev/null
        ip link del veth2 2> /dev/null
        ip link del veth3 2> /dev/null
-       ip netns del ns1 2> /dev/null
-       ip netns del ns2 2> /dev/null
-       ip netns del ns3 2> /dev/null
+       ip netns del ${NS1} 2> /dev/null
+       ip netns del ${NS2} 2> /dev/null
+       ip netns del ${NS3} 2> /dev/null
        rm -rf $BPF_DIR 2> /dev/null
 }
 
@@ -77,24 +80,24 @@ set -e
 
 trap cleanup_skip EXIT
 
-ip netns add ns1
-ip netns add ns2
-ip netns add ns3
+ip netns add ${NS1}
+ip netns add ${NS2}
+ip netns add ${NS3}
 
-ip link add veth1 index 111 type veth peer name veth11 netns ns1
-ip link add veth2 index 122 type veth peer name veth22 netns ns2
-ip link add veth3 index 133 type veth peer name veth33 netns ns3
+ip link add veth1 index 111 type veth peer name veth11 netns ${NS1}
+ip link add veth2 index 122 type veth peer name veth22 netns ${NS2}
+ip link add veth3 index 133 type veth peer name veth33 netns ${NS3}
 
 ip link set veth1 up
 ip link set veth2 up
 ip link set veth3 up
 
-ip -n ns1 addr add 10.1.1.11/24 dev veth11
-ip -n ns3 addr add 10.1.1.33/24 dev veth33
+ip -n ${NS1} addr add 10.1.1.11/24 dev veth11
+ip -n ${NS3} addr add 10.1.1.33/24 dev veth33
 
-ip -n ns1 link set dev veth11 up
-ip -n ns2 link set dev veth22 up
-ip -n ns3 link set dev veth33 up
+ip -n ${NS1} link set dev veth11 up
+ip -n ${NS2} link set dev veth22 up
+ip -n ${NS3} link set dev veth33 up
 
 mkdir $BPF_DIR
 bpftool prog loadall \
@@ -107,12 +110,12 @@ ip link set dev veth1 xdp pinned $BPF_DIR/progs/redirect_map_0
 ip link set dev veth2 xdp pinned $BPF_DIR/progs/redirect_map_1
 ip link set dev veth3 xdp pinned $BPF_DIR/progs/redirect_map_2
 
-ip -n ns1 link set dev veth11 xdp obj xdp_dummy.o sec xdp
-ip -n ns2 link set dev veth22 xdp obj xdp_tx.o sec xdp
-ip -n ns3 link set dev veth33 xdp obj xdp_dummy.o sec xdp
+ip -n ${NS1} link set dev veth11 xdp obj xdp_dummy.o sec xdp
+ip -n ${NS2} link set dev veth22 xdp obj xdp_tx.o sec xdp
+ip -n ${NS3} link set dev veth33 xdp obj xdp_dummy.o sec xdp
 
 trap cleanup EXIT
 
-ip netns exec ns1 ping -c 1 -W 1 10.1.1.33
+ip netns exec ${NS1} ping -c 1 -W 1 10.1.1.33
 
 exit 0
index 0cbc760..810c407 100755 (executable)
@@ -4,6 +4,8 @@
 
 # Kselftest framework requirement - SKIP code is 4.
 readonly KSFT_SKIP=4
+readonly NS1="ns1-$(mktemp -u XXXXXX)"
+readonly NS2="ns2-$(mktemp -u XXXXXX)"
 
 # Allow wrapper scripts to name test
 if [ -z "$TESTNAME" ]; then
@@ -49,15 +51,15 @@ cleanup()
 
        if [ -n "$INTERACTIVE" ]; then
                echo "Namespace setup still active explore with:"
-               echo " ip netns exec ns1 bash"
-               echo " ip netns exec ns2 bash"
+               echo " ip netns exec ${NS1} bash"
+               echo " ip netns exec ${NS2} bash"
                exit $status
        fi
 
        set +e
        ip link del veth1 2> /dev/null
-       ip netns del ns1 2> /dev/null
-       ip netns del ns2 2> /dev/null
+       ip netns del ${NS1} 2> /dev/null
+       ip netns del ${NS2} 2> /dev/null
 }
 
 # Using external program "getopt" to get --long-options
@@ -126,8 +128,8 @@ fi
 # Interactive mode likely requires us to clean up the netns
 if [ -n "$INTERACTIVE" ]; then
        ip link del veth1 2> /dev/null
-       ip netns del ns1 2> /dev/null
-       ip netns del ns2 2> /dev/null
+       ip netns del ${NS1} 2> /dev/null
+       ip netns del ${NS2} 2> /dev/null
 fi
 
 # Exit on failure
@@ -144,8 +146,8 @@ if [ -n "$VERBOSE" ]; then
 fi
 
 # Create two namespaces
-ip netns add ns1
-ip netns add ns2
+ip netns add ${NS1}
+ip netns add ${NS2}
 
 # Run cleanup if failing or on kill
 trap cleanup 0 2 3 6 9
@@ -154,44 +156,44 @@ trap cleanup 0 2 3 6 9
 ip link add veth1 type veth peer name veth2
 
 # Move veth1 and veth2 into the respective namespaces
-ip link set veth1 netns ns1
-ip link set veth2 netns ns2
+ip link set veth1 netns ${NS1}
+ip link set veth2 netns ${NS2}
 
 # NOTICE: XDP requires the VLAN header inside the packet payload
 #  - Thus, disable VLAN offloading driver features
 #  - For veth, REMEMBER to disable TX-side VLAN offload
 #
 # Disable rx-vlan-offload (mostly needed on ns1)
-ip netns exec ns1 ethtool -K veth1 rxvlan off
-ip netns exec ns2 ethtool -K veth2 rxvlan off
+ip netns exec ${NS1} ethtool -K veth1 rxvlan off
+ip netns exec ${NS2} ethtool -K veth2 rxvlan off
 #
 # Disable tx-vlan-offload (mostly needed on ns2)
-ip netns exec ns2 ethtool -K veth2 txvlan off
-ip netns exec ns1 ethtool -K veth1 txvlan off
+ip netns exec ${NS2} ethtool -K veth2 txvlan off
+ip netns exec ${NS1} ethtool -K veth1 txvlan off
 
 export IPADDR1=100.64.41.1
 export IPADDR2=100.64.41.2
 
 # In ns1/veth1 add IP-addr on plain net_device
-ip netns exec ns1 ip addr add ${IPADDR1}/24 dev veth1
-ip netns exec ns1 ip link set veth1 up
+ip netns exec ${NS1} ip addr add ${IPADDR1}/24 dev veth1
+ip netns exec ${NS1} ip link set veth1 up
 
 # In ns2/veth2 create VLAN device
 export VLAN=4011
 export DEVNS2=veth2
-ip netns exec ns2 ip link add link $DEVNS2 name $DEVNS2.$VLAN type vlan id $VLAN
-ip netns exec ns2 ip addr add ${IPADDR2}/24 dev $DEVNS2.$VLAN
-ip netns exec ns2 ip link set $DEVNS2 up
-ip netns exec ns2 ip link set $DEVNS2.$VLAN up
+ip netns exec ${NS2} ip link add link $DEVNS2 name $DEVNS2.$VLAN type vlan id $VLAN
+ip netns exec ${NS2} ip addr add ${IPADDR2}/24 dev $DEVNS2.$VLAN
+ip netns exec ${NS2} ip link set $DEVNS2 up
+ip netns exec ${NS2} ip link set $DEVNS2.$VLAN up
 
 # Bring up lo in the netns (to avoid confusing people using --interactive)
-ip netns exec ns1 ip link set lo up
-ip netns exec ns2 ip link set lo up
+ip netns exec ${NS1} ip link set lo up
+ip netns exec ${NS2} ip link set lo up
 
 # At this point, the hosts cannot reach each other,
 # because ns2 is using VLAN tags on the packets.
 
-ip netns exec ns2 sh -c 'ping -W 1 -c 1 100.64.41.1 || echo "Success: First ping must fail"'
+ip netns exec ${NS2} sh -c 'ping -W 1 -c 1 100.64.41.1 || echo "Success: First ping must fail"'
 
 
 # Now we can use the test_xdp_vlan.c program to pop/push these VLAN tags
@@ -202,19 +204,19 @@ export FILE=test_xdp_vlan.o
 
 # First test: Remove VLAN by setting VLAN ID 0, using "xdp_vlan_change"
 export XDP_PROG=xdp_vlan_change
-ip netns exec ns1 ip link set $DEVNS1 $XDP_MODE object $FILE section $XDP_PROG
+ip netns exec ${NS1} ip link set $DEVNS1 $XDP_MODE object $FILE section $XDP_PROG
 
 # In ns1: egress use TC to add back VLAN tag 4011
 #  (del cmd)
 #  tc qdisc del dev $DEVNS1 clsact 2> /dev/null
 #
-ip netns exec ns1 tc qdisc add dev $DEVNS1 clsact
-ip netns exec ns1 tc filter add dev $DEVNS1 egress \
+ip netns exec ${NS1} tc qdisc add dev $DEVNS1 clsact
+ip netns exec ${NS1} tc filter add dev $DEVNS1 egress \
   prio 1 handle 1 bpf da obj $FILE sec tc_vlan_push
 
 # Now the namespaces can reach each other; test with ping:
-ip netns exec ns2 ping -i 0.2 -W 2 -c 2 $IPADDR1
-ip netns exec ns1 ping -i 0.2 -W 2 -c 2 $IPADDR2
+ip netns exec ${NS2} ping -i 0.2 -W 2 -c 2 $IPADDR1
+ip netns exec ${NS1} ping -i 0.2 -W 2 -c 2 $IPADDR2
 
 # Second test: Replace the XDP prog with one that fully removes the VLAN header
 #
@@ -223,9 +225,9 @@ ip netns exec ns1 ping -i 0.2 -W 2 -c 2 $IPADDR2
 # ETH_P_8021Q indication, and this causes our changes to be overwritten.
 #
 export XDP_PROG=xdp_vlan_remove_outer2
-ip netns exec ns1 ip link set $DEVNS1 $XDP_MODE off
-ip netns exec ns1 ip link set $DEVNS1 $XDP_MODE object $FILE section $XDP_PROG
+ip netns exec ${NS1} ip link set $DEVNS1 $XDP_MODE off
+ip netns exec ${NS1} ip link set $DEVNS1 $XDP_MODE object $FILE section $XDP_PROG
 
 # Now the namespaces should still be able to reach each other; test with ping:
-ip netns exec ns2 ping -i 0.2 -W 2 -c 2 $IPADDR1
-ip netns exec ns1 ping -i 0.2 -W 2 -c 2 $IPADDR2
+ip netns exec ${NS2} ping -i 0.2 -W 2 -c 2 $IPADDR1
+ip netns exec ${NS1} ping -i 0.2 -W 2 -c 2 $IPADDR2
index 7b7f918..ca6abae 100644 (file)
@@ -138,6 +138,29 @@ void read_trace_pipe(void)
        }
 }
 
+ssize_t get_uprobe_offset(const void *addr)
+{
+       size_t start, end, base;
+       char buf[256];
+       bool found = false;
+       FILE *f;
+
+       f = fopen("/proc/self/maps", "r");
+       if (!f)
+               return -errno;
+
+       while (fscanf(f, "%zx-%zx %s %zx %*[^\n]\n", &start, &end, buf, &base) == 4) {
+               if (buf[2] == 'x' && (uintptr_t)addr >= start && (uintptr_t)addr < end) {
+                       found = true;
+                       break;
+               }
+       }
+
+       fclose(f);
+
+       if (!found)
+               return -ESRCH;
+
 #if defined(__powerpc64__) && defined(_CALL_ELF) && _CALL_ELF == 2
 
 #define OP_RT_RA_MASK   0xffff0000UL
@@ -145,10 +168,6 @@ void read_trace_pipe(void)
 #define ADDIS_R2_R12    0x3c4c0000UL
 #define ADDI_R2_R2      0x38420000UL
 
-ssize_t get_uprobe_offset(const void *addr, ssize_t base)
-{
-       u32 *insn = (u32 *)(uintptr_t)addr;
-
        /*
         * A PPC64 ABIv2 function may have a local and a global entry
         * point. We need to use the local entry point when patching
@@ -165,43 +184,16 @@ ssize_t get_uprobe_offset(const void *addr, ssize_t base)
         * lis   r2,XXXX
         * addi  r2,r2,XXXX
         */
-       if ((((*insn & OP_RT_RA_MASK) == ADDIS_R2_R12) ||
-            ((*insn & OP_RT_RA_MASK) == LIS_R2)) &&
-           ((*(insn + 1) & OP_RT_RA_MASK) == ADDI_R2_R2))
-               return (ssize_t)(insn + 2) - base;
-       else
-               return (uintptr_t)addr - base;
-}
-
-#else
+       {
+               const u32 *insn = (const u32 *)(uintptr_t)addr;
 
-ssize_t get_uprobe_offset(const void *addr, ssize_t base)
-{
-       return (uintptr_t)addr - base;
-}
-
-#endif
-
-ssize_t get_base_addr(void)
-{
-       size_t start, offset;
-       char buf[256];
-       FILE *f;
-
-       f = fopen("/proc/self/maps", "r");
-       if (!f)
-               return -errno;
-
-       while (fscanf(f, "%zx-%*x %s %zx %*[^\n]\n",
-                     &start, buf, &offset) == 3) {
-               if (strcmp(buf, "r-xp") == 0) {
-                       fclose(f);
-                       return start - offset;
-               }
+               if ((((*insn & OP_RT_RA_MASK) == ADDIS_R2_R12) ||
+                    ((*insn & OP_RT_RA_MASK) == LIS_R2)) &&
+                   ((*(insn + 1) & OP_RT_RA_MASK) == ADDI_R2_R2))
+                       return (uintptr_t)(insn + 2) - start + base;
        }
-
-       fclose(f);
-       return -EINVAL;
+#endif
+       return (uintptr_t)addr - start + base;
 }
 
 ssize_t get_rel_offset(uintptr_t addr)
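
get_uprobe_offset() now walks /proc/self/maps itself to find the executable mapping containing addr, folds in the PPC64 ABIv2 local-entry-point adjustment, and returns the file offset usable for uprobe attachment; the separate get_base_addr() helper is gone. A usage sketch under the new signature (attach_self and trigger_func are illustrative; the error-return convention of the attach call depends on the libbpf error mode in use):

    #include <bpf/libbpf.h>
    #include "trace_helpers.h"   /* the selftests helper changed above */

    /* Attach a uprobe program to a function in our own executable. */
    static struct bpf_link *attach_self(struct bpf_program *prog,
                                        void (*trigger_func)(void))
    {
            ssize_t off = get_uprobe_offset((const void *)trigger_func);

            if (off < 0)
                    return NULL;
            return bpf_program__attach_uprobe(prog, false /* not a retprobe */,
                                              0 /* own pid */,
                                              "/proc/self/exe", off);
    }
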
index d907b44..238a9c9 100644 (file)
@@ -18,8 +18,7 @@ int kallsyms_find(const char *sym, unsigned long long *addr);
 
 void read_trace_pipe(void);
 
-ssize_t get_uprobe_offset(const void *addr, ssize_t base);
-ssize_t get_base_addr(void);
+ssize_t get_uprobe_offset(const void *addr);
 ssize_t get_rel_offset(uintptr_t addr);
 
 #endif
index d7b74eb..829be2b 100644 (file)
        .result  = ACCEPT,
 },
 {
+       "calls: invalid kfunc call: ptr_to_mem to struct with non-scalar",
+       .insns = {
+       BPF_MOV64_REG(BPF_REG_1, BPF_REG_10),
+       BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -8),
+       BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, BPF_PSEUDO_KFUNC_CALL, 0, 0),
+       BPF_EXIT_INSN(),
+       },
+       .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+       .result = REJECT,
+       .errstr = "arg#0 pointer type STRUCT prog_test_fail1 must point to scalar",
+       .fixup_kfunc_btf_id = {
+               { "bpf_kfunc_call_test_fail1", 2 },
+       },
+},
+{
+       "calls: invalid kfunc call: ptr_to_mem to struct with nesting depth > 4",
+       .insns = {
+       BPF_MOV64_REG(BPF_REG_1, BPF_REG_10),
+       BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -8),
+       BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, BPF_PSEUDO_KFUNC_CALL, 0, 0),
+       BPF_EXIT_INSN(),
+       },
+       .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+       .result = REJECT,
+       .errstr = "max struct nesting depth exceeded\narg#0 pointer type STRUCT prog_test_fail2",
+       .fixup_kfunc_btf_id = {
+               { "bpf_kfunc_call_test_fail2", 2 },
+       },
+},
+{
+       "calls: invalid kfunc call: ptr_to_mem to struct with FAM",
+       .insns = {
+       BPF_MOV64_REG(BPF_REG_1, BPF_REG_10),
+       BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -8),
+       BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, BPF_PSEUDO_KFUNC_CALL, 0, 0),
+       BPF_EXIT_INSN(),
+       },
+       .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+       .result = REJECT,
+       .errstr = "arg#0 pointer type STRUCT prog_test_fail3 must point to scalar",
+       .fixup_kfunc_btf_id = {
+               { "bpf_kfunc_call_test_fail3", 2 },
+       },
+},
+{
+       "calls: invalid kfunc call: reg->type != PTR_TO_CTX",
+       .insns = {
+       BPF_MOV64_REG(BPF_REG_1, BPF_REG_10),
+       BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -8),
+       BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, BPF_PSEUDO_KFUNC_CALL, 0, 0),
+       BPF_EXIT_INSN(),
+       },
+       .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+       .result = REJECT,
+       .errstr = "arg#0 expected pointer to ctx, but got PTR",
+       .fixup_kfunc_btf_id = {
+               { "bpf_kfunc_call_test_pass_ctx", 2 },
+       },
+},
+{
+       "calls: invalid kfunc call: void * not allowed in func proto without mem size arg",
+       .insns = {
+       BPF_MOV64_REG(BPF_REG_1, BPF_REG_10),
+       BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -8),
+       BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, BPF_PSEUDO_KFUNC_CALL, 0, 0),
+       BPF_EXIT_INSN(),
+       },
+       .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+       .result = REJECT,
+       .errstr = "arg#0 pointer type UNKNOWN  must point to scalar",
+       .fixup_kfunc_btf_id = {
+               { "bpf_kfunc_call_test_mem_len_fail1", 2 },
+       },
+},
+{
        "calls: basic sanity",
        .insns = {
        BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 2),
index ce13ece..8c224ea 100644 (file)
        .result = ACCEPT,
 },
 {
-       "sk_fullsock(skb->sk): sk->dst_port [narrow load]",
+       "sk_fullsock(skb->sk): sk->dst_port [word load] (backward compatibility)",
+       .insns = {
+       BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_1, offsetof(struct __sk_buff, sk)),
+       BPF_JMP_IMM(BPF_JNE, BPF_REG_1, 0, 2),
+       BPF_MOV64_IMM(BPF_REG_0, 0),
+       BPF_EXIT_INSN(),
+       BPF_EMIT_CALL(BPF_FUNC_sk_fullsock),
+       BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 2),
+       BPF_MOV64_IMM(BPF_REG_0, 0),
+       BPF_EXIT_INSN(),
+       BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_0, offsetof(struct bpf_sock, dst_port)),
+       BPF_MOV64_IMM(BPF_REG_0, 0),
+       BPF_EXIT_INSN(),
+       },
+       .prog_type = BPF_PROG_TYPE_CGROUP_SKB,
+       .result = ACCEPT,
+},
+{
+       "sk_fullsock(skb->sk): sk->dst_port [half load]",
        .insns = {
        BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_1, offsetof(struct __sk_buff, sk)),
        BPF_JMP_IMM(BPF_JNE, BPF_REG_1, 0, 2),
        .result = ACCEPT,
 },
 {
-       "sk_fullsock(skb->sk): sk->dst_port [load 2nd byte]",
+       "sk_fullsock(skb->sk): sk->dst_port [half load] (invalid)",
+       .insns = {
+       BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_1, offsetof(struct __sk_buff, sk)),
+       BPF_JMP_IMM(BPF_JNE, BPF_REG_1, 0, 2),
+       BPF_MOV64_IMM(BPF_REG_0, 0),
+       BPF_EXIT_INSN(),
+       BPF_EMIT_CALL(BPF_FUNC_sk_fullsock),
+       BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 2),
+       BPF_MOV64_IMM(BPF_REG_0, 0),
+       BPF_EXIT_INSN(),
+       BPF_LDX_MEM(BPF_H, BPF_REG_0, BPF_REG_0, offsetof(struct bpf_sock, dst_port) + 2),
+       BPF_MOV64_IMM(BPF_REG_0, 0),
+       BPF_EXIT_INSN(),
+       },
+       .prog_type = BPF_PROG_TYPE_CGROUP_SKB,
+       .result = REJECT,
+       .errstr = "invalid sock access",
+},
+{
+       "sk_fullsock(skb->sk): sk->dst_port [byte load]",
+       .insns = {
+       BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_1, offsetof(struct __sk_buff, sk)),
+       BPF_JMP_IMM(BPF_JNE, BPF_REG_1, 0, 2),
+       BPF_MOV64_IMM(BPF_REG_0, 0),
+       BPF_EXIT_INSN(),
+       BPF_EMIT_CALL(BPF_FUNC_sk_fullsock),
+       BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 2),
+       BPF_MOV64_IMM(BPF_REG_0, 0),
+       BPF_EXIT_INSN(),
+       BPF_LDX_MEM(BPF_B, BPF_REG_2, BPF_REG_0, offsetof(struct bpf_sock, dst_port)),
+       BPF_LDX_MEM(BPF_B, BPF_REG_2, BPF_REG_0, offsetof(struct bpf_sock, dst_port) + 1),
+       BPF_MOV64_IMM(BPF_REG_0, 0),
+       BPF_EXIT_INSN(),
+       },
+       .prog_type = BPF_PROG_TYPE_CGROUP_SKB,
+       .result = ACCEPT,
+},
+{
+       "sk_fullsock(skb->sk): sk->dst_port [byte load] (invalid)",
+       .insns = {
+       BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_1, offsetof(struct __sk_buff, sk)),
+       BPF_JMP_IMM(BPF_JNE, BPF_REG_1, 0, 2),
+       BPF_MOV64_IMM(BPF_REG_0, 0),
+       BPF_EXIT_INSN(),
+       BPF_EMIT_CALL(BPF_FUNC_sk_fullsock),
+       BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 2),
+       BPF_MOV64_IMM(BPF_REG_0, 0),
+       BPF_EXIT_INSN(),
+       BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_0, offsetof(struct bpf_sock, dst_port) + 2),
+       BPF_MOV64_IMM(BPF_REG_0, 0),
+       BPF_EXIT_INSN(),
+       },
+       .prog_type = BPF_PROG_TYPE_CGROUP_SKB,
+       .result = REJECT,
+       .errstr = "invalid sock access",
+},
+{
+       "sk_fullsock(skb->sk): past sk->dst_port [half load] (invalid)",
        .insns = {
        BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_1, offsetof(struct __sk_buff, sk)),
        BPF_JMP_IMM(BPF_JNE, BPF_REG_1, 0, 2),
        BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 2),
        BPF_MOV64_IMM(BPF_REG_0, 0),
        BPF_EXIT_INSN(),
-       BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_0, offsetof(struct bpf_sock, dst_port) + 1),
+       BPF_LDX_MEM(BPF_H, BPF_REG_0, BPF_REG_0, offsetofend(struct bpf_sock, dst_port)),
        BPF_MOV64_IMM(BPF_REG_0, 0),
        BPF_EXIT_INSN(),
        },
index 51c8224..aaedbf4 100644 (file)
@@ -32,12 +32,12 @@ static void int_exit(int sig)
        int i;
 
        for (i = 0; ifaces[i] > 0; i++) {
-               if (bpf_get_link_xdp_id(ifaces[i], &prog_id, xdp_flags)) {
-                       printf("bpf_get_link_xdp_id failed\n");
+               if (bpf_xdp_query_id(ifaces[i], xdp_flags, &prog_id)) {
+                       printf("bpf_xdp_query_id failed\n");
                        exit(1);
                }
                if (prog_id)
-                       bpf_set_link_xdp_fd(ifaces[i], -1, xdp_flags);
+                       bpf_xdp_detach(ifaces[i], xdp_flags, NULL);
        }
 
        exit(0);
@@ -210,7 +210,7 @@ int main(int argc, char **argv)
                }
 
                /* bind prog_fd to each interface */
-               ret = bpf_set_link_xdp_fd(ifindex, prog_fd, xdp_flags);
+               ret = bpf_xdp_attach(ifindex, prog_fd, xdp_flags, NULL);
                if (ret) {
                        printf("Set xdp fd failed on %d\n", ifindex);
                        goto err_out;
index baa870a..c567856 100644 (file)
@@ -29,7 +29,7 @@ static __u32 xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST;
 
 static void cleanup(int sig)
 {
-       bpf_set_link_xdp_fd(ifindex, -1, xdp_flags);
+       bpf_xdp_detach(ifindex, xdp_flags, NULL);
        if (sig)
                exit(1);
 }
@@ -203,7 +203,7 @@ int main(int argc, char **argv)
 
        printf("XDP setup disrupts network connectivity, hit Ctrl+C to quit\n");
 
-       if (bpf_set_link_xdp_fd(ifindex, prog_fd, xdp_flags) < 0) {
+       if (bpf_xdp_attach(ifindex, prog_fd, xdp_flags, NULL) < 0) {
                fprintf(stderr, "Link set xdp fd failed for %s\n", ifname);
                goto done;
        }
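
These two samples drop the deprecated bpf_set_link_xdp_fd()/bpf_get_link_xdp_id() pair for libbpf's split attach/detach/query API, where the final opts argument may be NULL. A brief sketch of the whole lifecycle, assuming a valid ifindex and prog_fd (xdp_cycle is illustrative):

    #include <bpf/libbpf.h>

    static int xdp_cycle(int ifindex, int prog_fd, __u32 flags)
    {
            __u32 prog_id = 0;
            int err;

            err = bpf_xdp_attach(ifindex, prog_fd, flags, NULL);
            if (err)
                    return err;

            err = bpf_xdp_query_id(ifindex, flags, &prog_id);
            if (err)
                    return err;

            /* ... traffic runs with prog_id installed ... */

            return bpf_xdp_detach(ifindex, flags, NULL);
    }
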
index 0a5d23d..5f8296d 100644 (file)
@@ -266,22 +266,24 @@ static int xsk_configure_umem(struct xsk_umem_info *umem, void *buffer, u64 size
 }
 
 static int xsk_configure_socket(struct xsk_socket_info *xsk, struct xsk_umem_info *umem,
-                               struct ifobject *ifobject, u32 qid)
+                               struct ifobject *ifobject, bool shared)
 {
-       struct xsk_socket_config cfg;
+       struct xsk_socket_config cfg = {};
        struct xsk_ring_cons *rxr;
        struct xsk_ring_prod *txr;
 
        xsk->umem = umem;
        cfg.rx_size = xsk->rxqsize;
        cfg.tx_size = XSK_RING_PROD__DEFAULT_NUM_DESCS;
-       cfg.libbpf_flags = 0;
+       cfg.libbpf_flags = XSK_LIBBPF_FLAGS__INHIBIT_PROG_LOAD;
        cfg.xdp_flags = ifobject->xdp_flags;
        cfg.bind_flags = ifobject->bind_flags;
+       if (shared)
+               cfg.bind_flags |= XDP_SHARED_UMEM;
 
        txr = ifobject->tx_on ? &xsk->tx : NULL;
        rxr = ifobject->rx_on ? &xsk->rx : NULL;
-       return xsk_socket__create(&xsk->xsk, ifobject->ifname, qid, umem->umem, rxr, txr, &cfg);
+       return xsk_socket__create(&xsk->xsk, ifobject->ifname, 0, umem->umem, rxr, txr, &cfg);
 }
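
xsk_configure_socket() now takes a shared flag instead of a queue id: every socket binds to queue 0 of one shared UMEM, and all sockets after the first must set XDP_SHARED_UMEM in their bind flags. A hedged sketch of creating such a follow-on socket (create_shared_socket and its parameters are illustrative):

    #include <linux/if_xdp.h>   /* XDP_SHARED_UMEM */
    #include <bpf/xsk.h>        /* libbpf's AF_XDP helpers, as used by this test */

    static int create_shared_socket(struct xsk_socket **out, const char *ifname,
                                    struct xsk_umem *umem,
                                    struct xsk_ring_cons *rx,
                                    struct xsk_ring_prod *tx)
    {
            struct xsk_socket_config cfg = {
                    .rx_size      = XSK_RING_CONS__DEFAULT_NUM_DESCS,
                    .tx_size      = XSK_RING_PROD__DEFAULT_NUM_DESCS,
                    .libbpf_flags = XSK_LIBBPF_FLAGS__INHIBIT_PROG_LOAD,
                    .bind_flags   = XDP_SHARED_UMEM, /* sockets after the first */
            };

            return xsk_socket__create(out, ifname, 0 /* queue */, umem, rx, tx, &cfg);
    }
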
 
 static struct option long_options[] = {
@@ -387,7 +389,6 @@ static void __test_spec_init(struct test_spec *test, struct ifobject *ifobj_tx,
        for (i = 0; i < MAX_INTERFACES; i++) {
                struct ifobject *ifobj = i ? ifobj_rx : ifobj_tx;
 
-               ifobj->umem = &ifobj->umem_arr[0];
                ifobj->xsk = &ifobj->xsk_arr[0];
                ifobj->use_poll = false;
                ifobj->pacing_on = true;
@@ -401,11 +402,12 @@ static void __test_spec_init(struct test_spec *test, struct ifobject *ifobj_tx,
                        ifobj->tx_on = false;
                }
 
+               memset(ifobj->umem, 0, sizeof(*ifobj->umem));
+               ifobj->umem->num_frames = DEFAULT_UMEM_BUFFERS;
+               ifobj->umem->frame_size = XSK_UMEM__DEFAULT_FRAME_SIZE;
+
                for (j = 0; j < MAX_SOCKETS; j++) {
-                       memset(&ifobj->umem_arr[j], 0, sizeof(ifobj->umem_arr[j]));
                        memset(&ifobj->xsk_arr[j], 0, sizeof(ifobj->xsk_arr[j]));
-                       ifobj->umem_arr[j].num_frames = DEFAULT_UMEM_BUFFERS;
-                       ifobj->umem_arr[j].frame_size = XSK_UMEM__DEFAULT_FRAME_SIZE;
                        ifobj->xsk_arr[j].rxqsize = XSK_RING_CONS__DEFAULT_NUM_DESCS;
                }
        }
@@ -906,7 +908,10 @@ static bool rx_stats_are_valid(struct ifobject *ifobject)
                        return true;
                case STAT_TEST_RX_FULL:
                        xsk_stat = stats.rx_ring_full;
-                       expected_stat -= RX_FULL_RXQSIZE;
+                       if (ifobject->umem->num_frames < XSK_RING_PROD__DEFAULT_NUM_DESCS)
+                               expected_stat = ifobject->umem->num_frames - RX_FULL_RXQSIZE;
+                       else
+                               expected_stat = XSK_RING_PROD__DEFAULT_NUM_DESCS - RX_FULL_RXQSIZE;
                        break;
                case STAT_TEST_RX_FILL_EMPTY:
                        xsk_stat = stats.rx_fill_ring_empty_descs;
@@ -947,7 +952,10 @@ static void tx_stats_validate(struct ifobject *ifobject)
 
 static void thread_common_ops(struct test_spec *test, struct ifobject *ifobject)
 {
+       u64 umem_sz = ifobject->umem->num_frames * ifobject->umem->frame_size;
        int mmap_flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_NORESERVE;
+       int ret, ifindex;
+       void *bufs;
        u32 i;
 
        ifobject->ns_fd = switch_namespace(ifobject->nsname);
@@ -955,23 +963,20 @@ static void thread_common_ops(struct test_spec *test, struct ifobject *ifobject)
        if (ifobject->umem->unaligned_mode)
                mmap_flags |= MAP_HUGETLB;
 
-       for (i = 0; i < test->nb_sockets; i++) {
-               u64 umem_sz = ifobject->umem->num_frames * ifobject->umem->frame_size;
-               u32 ctr = 0;
-               void *bufs;
-               int ret;
+       bufs = mmap(NULL, umem_sz, PROT_READ | PROT_WRITE, mmap_flags, -1, 0);
+       if (bufs == MAP_FAILED)
+               exit_with_error(errno);
 
-               bufs = mmap(NULL, umem_sz, PROT_READ | PROT_WRITE, mmap_flags, -1, 0);
-               if (bufs == MAP_FAILED)
-                       exit_with_error(errno);
+       ret = xsk_configure_umem(ifobject->umem, bufs, umem_sz);
+       if (ret)
+               exit_with_error(-ret);
 
-               ret = xsk_configure_umem(&ifobject->umem_arr[i], bufs, umem_sz);
-               if (ret)
-                       exit_with_error(-ret);
+       for (i = 0; i < test->nb_sockets; i++) {
+               u32 ctr = 0;
 
                while (ctr++ < SOCK_RECONF_CTR) {
-                       ret = xsk_configure_socket(&ifobject->xsk_arr[i], &ifobject->umem_arr[i],
-                                                  ifobject, i);
+                       ret = xsk_configure_socket(&ifobject->xsk_arr[i], ifobject->umem,
+                                                  ifobject, !!i);
                        if (!ret)
                                break;
 
@@ -982,8 +987,22 @@ static void thread_common_ops(struct test_spec *test, struct ifobject *ifobject)
                }
        }
 
-       ifobject->umem = &ifobject->umem_arr[0];
        ifobject->xsk = &ifobject->xsk_arr[0];
+
+       if (!ifobject->rx_on)
+               return;
+
+       ifindex = if_nametoindex(ifobject->ifname);
+       if (!ifindex)
+               exit_with_error(errno);
+
+       ret = xsk_setup_xdp_prog(ifindex, &ifobject->xsk_map_fd);
+       if (ret)
+               exit_with_error(-ret);
+
+       ret = xsk_socket__update_xskmap(ifobject->xsk->xsk, ifobject->xsk_map_fd);
+       if (ret)
+               exit_with_error(-ret);
 }
 
 static void testapp_cleanup_xsk_res(struct ifobject *ifobj)
@@ -1139,14 +1158,16 @@ static void testapp_bidi(struct test_spec *test)
 
 static void swap_xsk_resources(struct ifobject *ifobj_tx, struct ifobject *ifobj_rx)
 {
+       int ret;
+
        xsk_socket__delete(ifobj_tx->xsk->xsk);
-       xsk_umem__delete(ifobj_tx->umem->umem);
        xsk_socket__delete(ifobj_rx->xsk->xsk);
-       xsk_umem__delete(ifobj_rx->umem->umem);
-       ifobj_tx->umem = &ifobj_tx->umem_arr[1];
        ifobj_tx->xsk = &ifobj_tx->xsk_arr[1];
-       ifobj_rx->umem = &ifobj_rx->umem_arr[1];
        ifobj_rx->xsk = &ifobj_rx->xsk_arr[1];
+
+       ret = xsk_socket__update_xskmap(ifobj_rx->xsk->xsk, ifobj_rx->xsk_map_fd);
+       if (ret)
+               exit_with_error(-ret);
 }
 
 static void testapp_bpf_res(struct test_spec *test)
@@ -1405,13 +1426,13 @@ static struct ifobject *ifobject_create(void)
        if (!ifobj->xsk_arr)
                goto out_xsk_arr;
 
-       ifobj->umem_arr = calloc(MAX_SOCKETS, sizeof(*ifobj->umem_arr));
-       if (!ifobj->umem_arr)
-               goto out_umem_arr;
+       ifobj->umem = calloc(1, sizeof(*ifobj->umem));
+       if (!ifobj->umem)
+               goto out_umem;
 
        return ifobj;
 
-out_umem_arr:
+out_umem:
        free(ifobj->xsk_arr);
 out_xsk_arr:
        free(ifobj);
@@ -1420,7 +1441,7 @@ out_xsk_arr:
 
 static void ifobject_delete(struct ifobject *ifobj)
 {
-       free(ifobj->umem_arr);
+       free(ifobj->umem);
        free(ifobj->xsk_arr);
        free(ifobj);
 }
index 2f705f4..62a3e63 100644 (file)
@@ -125,10 +125,10 @@ struct ifobject {
        struct xsk_socket_info *xsk;
        struct xsk_socket_info *xsk_arr;
        struct xsk_umem_info *umem;
-       struct xsk_umem_info *umem_arr;
        thread_func_t func_ptr;
        struct pkt_stream *pkt_stream;
        int ns_fd;
+       int xsk_map_fd;
        u32 dst_ip;
        u32 src_ip;
        u32 xdp_flags;
index 7581a73..21a411b 100644 (file)
@@ -35,4 +35,4 @@ test_unix_oob
 gro
 ioam6_parser
 toeplitz
-cmsg_so_mark
+cmsg_sender
index 9897fa9..3bfeaf0 100644 (file)
@@ -30,6 +30,7 @@ TEST_PROGS += ioam6.sh
 TEST_PROGS += gro.sh
 TEST_PROGS += gre_gso.sh
 TEST_PROGS += cmsg_so_mark.sh
+TEST_PROGS += cmsg_time.sh
 TEST_PROGS += srv6_end_dt46_l3vpn_test.sh
 TEST_PROGS += srv6_end_dt4_l3vpn_test.sh
 TEST_PROGS += srv6_end_dt6_l3vpn_test.sh
@@ -52,7 +53,7 @@ TEST_GEN_FILES += gro
 TEST_GEN_PROGS = reuseport_bpf reuseport_bpf_cpu reuseport_bpf_numa
 TEST_GEN_PROGS += reuseport_dualstack reuseaddr_conflict tls
 TEST_GEN_FILES += toeplitz
-TEST_GEN_FILES += cmsg_so_mark
+TEST_GEN_FILES += cmsg_sender
 
 TEST_FILES := settings
 
diff --git a/tools/testing/selftests/net/cmsg_sender.c b/tools/testing/selftests/net/cmsg_sender.c
new file mode 100644 (file)
index 0000000..24444dc
--- /dev/null
@@ -0,0 +1,380 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+#include <errno.h>
+#include <error.h>
+#include <netdb.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+#include <unistd.h>
+#include <linux/errqueue.h>
+#include <linux/icmp.h>
+#include <linux/icmpv6.h>
+#include <linux/net_tstamp.h>
+#include <linux/types.h>
+#include <linux/udp.h>
+#include <sys/socket.h>
+
+enum {
+       ERN_SUCCESS = 0,
+       /* Well defined errors, callers may depend on these */
+       ERN_SEND = 1,
+       /* Informational, can reorder */
+       ERN_HELP,
+       ERN_SEND_SHORT,
+       ERN_SOCK_CREATE,
+       ERN_RESOLVE,
+       ERN_CMSG_WR,
+       ERN_SOCKOPT,
+       ERN_GETTIME,
+       ERN_RECVERR,
+       ERN_CMSG_RD,
+       ERN_CMSG_RCV,
+};
+
+struct options {
+       bool silent_send;
+       const char *host;
+       const char *service;
+       struct {
+               unsigned int mark;
+       } sockopt;
+       struct {
+               unsigned int family;
+               unsigned int type;
+               unsigned int proto;
+       } sock;
+       struct {
+               bool ena;
+               unsigned int val;
+       } mark;
+       struct {
+               bool ena;
+               unsigned int delay;
+       } txtime;
+       struct {
+               bool ena;
+       } ts;
+} opt = {
+       .sock = {
+               .family = AF_UNSPEC,
+               .type   = SOCK_DGRAM,
+               .proto  = IPPROTO_UDP,
+       },
+};
+
+static struct timespec time_start_real;
+static struct timespec time_start_mono;
+
+static void __attribute__((noreturn)) cs_usage(const char *bin)
+{
+       printf("Usage: %s [opts] <dst host> <dst port / service>\n", bin);
+       printf("Options:\n"
+              "\t\t-s      Silent send() failures\n"
+              "\t\t-4/-6   Force IPv4 / IPv6 only\n"
+              "\t\t-p prot Socket protocol\n"
+              "\t\t        (u = UDP (default); i = ICMP; r = RAW)\n"
+              "\n"
+              "\t\t-m val  Set SO_MARK with given value\n"
+              "\t\t-M val  Set SO_MARK via setsockopt\n"
+              "\t\t-d val  Set SO_TXTIME with given delay (usec)\n"
+              "\t\t-t      Enable time stamp reporting\n"
+              "");
+       exit(ERN_HELP);
+}
+
+static void cs_parse_args(int argc, char *argv[])
+{
+       char o;
+
+       while ((o = getopt(argc, argv, "46sp:m:M:d:t")) != -1) {
+               switch (o) {
+               case 's':
+                       opt.silent_send = true;
+                       break;
+               case '4':
+                       opt.sock.family = AF_INET;
+                       break;
+               case '6':
+                       opt.sock.family = AF_INET6;
+                       break;
+               case 'p':
+                       if (*optarg == 'u' || *optarg == 'U') {
+                               opt.sock.proto = IPPROTO_UDP;
+                       } else if (*optarg == 'i' || *optarg == 'I') {
+                               opt.sock.proto = IPPROTO_ICMP;
+                       } else if (*optarg == 'r') {
+                               opt.sock.type = SOCK_RAW;
+                       } else {
+                               printf("Error: unknown protocol: %s\n", optarg);
+                               cs_usage(argv[0]);
+                       }
+                       break;
+
+               case 'm':
+                       opt.mark.ena = true;
+                       opt.mark.val = atoi(optarg);
+                       break;
+               case 'M':
+                       opt.sockopt.mark = atoi(optarg);
+                       break;
+               case 'd':
+                       opt.txtime.ena = true;
+                       opt.txtime.delay = atoi(optarg);
+                       break;
+               case 't':
+                       opt.ts.ena = true;
+                       break;
+               }
+       }
+
+       if (optind != argc - 2)
+               cs_usage(argv[0]);
+
+       opt.host = argv[optind];
+       opt.service = argv[optind + 1];
+}
+
+static void
+cs_write_cmsg(int fd, struct msghdr *msg, char *cbuf, size_t cbuf_sz)
+{
+       struct cmsghdr *cmsg;
+       size_t cmsg_len;
+
+       msg->msg_control = cbuf;
+       cmsg_len = 0;
+
+       if (opt.mark.ena) {
+               cmsg = (struct cmsghdr *)(cbuf + cmsg_len);
+               cmsg_len += CMSG_SPACE(sizeof(__u32));
+               if (cbuf_sz < cmsg_len)
+                       error(ERN_CMSG_WR, EFAULT, "cmsg buffer too small");
+
+               cmsg->cmsg_level = SOL_SOCKET;
+               cmsg->cmsg_type = SO_MARK;
+               cmsg->cmsg_len = CMSG_LEN(sizeof(__u32));
+               *(__u32 *)CMSG_DATA(cmsg) = opt.mark.val;
+       }
+       if (opt.txtime.ena) {
+               struct sock_txtime so_txtime = {
+                       .clockid = CLOCK_MONOTONIC,
+               };
+               __u64 txtime;
+
+               if (setsockopt(fd, SOL_SOCKET, SO_TXTIME,
+                              &so_txtime, sizeof(so_txtime)))
+                       error(ERN_SOCKOPT, errno, "setsockopt TXTIME");
+
+               txtime = time_start_mono.tv_sec * (1000ULL * 1000 * 1000) +
+                        time_start_mono.tv_nsec +
+                        opt.txtime.delay * 1000;
+
+               cmsg = (struct cmsghdr *)(cbuf + cmsg_len);
+               cmsg_len += CMSG_SPACE(sizeof(txtime));
+               if (cbuf_sz < cmsg_len)
+                       error(ERN_CMSG_WR, EFAULT, "cmsg buffer too small");
+
+               cmsg->cmsg_level = SOL_SOCKET;
+               cmsg->cmsg_type = SCM_TXTIME;
+               cmsg->cmsg_len = CMSG_LEN(sizeof(txtime));
+               memcpy(CMSG_DATA(cmsg), &txtime, sizeof(txtime));
+       }
+       if (opt.ts.ena) {
+               __u32 val = SOF_TIMESTAMPING_SOFTWARE |
+                           SOF_TIMESTAMPING_OPT_TSONLY;
+
+               if (setsockopt(fd, SOL_SOCKET, SO_TIMESTAMPING,
+                              &val, sizeof(val)))
+                       error(ERN_SOCKOPT, errno, "setsockopt TIMESTAMPING");
+
+               cmsg = (struct cmsghdr *)(cbuf + cmsg_len);
+               cmsg_len += CMSG_SPACE(sizeof(__u32));
+               if (cbuf_sz < cmsg_len)
+                       error(ERN_CMSG_WR, EFAULT, "cmsg buffer too small");
+
+               cmsg->cmsg_level = SOL_SOCKET;
+               cmsg->cmsg_type = SO_TIMESTAMPING;
+               cmsg->cmsg_len = CMSG_LEN(sizeof(__u32));
+               *(__u32 *)CMSG_DATA(cmsg) = SOF_TIMESTAMPING_TX_SCHED |
+                                           SOF_TIMESTAMPING_TX_SOFTWARE;
+       }
+
+       if (cmsg_len)
+               msg->msg_controllen = cmsg_len;
+       else
+               msg->msg_control = NULL;
+}
+
+static const char *cs_ts_info2str(unsigned int info)
+{
+       static const char *names[] = {
+               [SCM_TSTAMP_SND]        = "SND",
+               [SCM_TSTAMP_SCHED]      = "SCHED",
+               [SCM_TSTAMP_ACK]        = "ACK",
+       };
+
+       if (info < sizeof(names) / sizeof(names[0]))
+               return names[info];
+       return "unknown";
+}
+
+static void
+cs_read_cmsg(int fd, struct msghdr *msg, char *cbuf, size_t cbuf_sz)
+{
+       struct sock_extended_err *see;
+       struct scm_timestamping *ts;
+       struct cmsghdr *cmsg;
+       int i, err;
+
+       if (!opt.ts.ena)
+               return;
+       msg->msg_control = cbuf;
+       msg->msg_controllen = cbuf_sz;
+
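+       /* Drain the socket error queue; each MSG_ERRQUEUE datagram is
+        * expected to carry one timestamp cmsg and the matching
+        * sock_extended_err cmsg.
+        */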
+       while (true) {
+               ts = NULL;
+               see = NULL;
+               memset(cbuf, 0, cbuf_sz);
+
+               err = recvmsg(fd, msg, MSG_ERRQUEUE);
+               if (err < 0) {
+                       if (errno == EAGAIN)
+                               break;
+                       error(ERN_RECVERR, errno, "recvmsg ERRQ");
+               }
+
+               for (cmsg = CMSG_FIRSTHDR(msg); cmsg != NULL;
+                    cmsg = CMSG_NXTHDR(msg, cmsg)) {
+                       if (cmsg->cmsg_level == SOL_SOCKET &&
+                           cmsg->cmsg_type == SO_TIMESTAMPING_OLD) {
+                               if (cmsg->cmsg_len < sizeof(*ts))
+                                       error(ERN_CMSG_RD, EINVAL, "TS cmsg");
+
+                               ts = (void *)CMSG_DATA(cmsg);
+                       }
+                       if ((cmsg->cmsg_level == SOL_IP &&
+                            cmsg->cmsg_type == IP_RECVERR) ||
+                           (cmsg->cmsg_level == SOL_IPV6 &&
+                            cmsg->cmsg_type == IPV6_RECVERR)) {
+                               if (cmsg->cmsg_len < sizeof(*see))
+                                       error(ERN_CMSG_RD, EINVAL, "sock_err cmsg");
+
+                               see = (void *)CMSG_DATA(cmsg);
+                       }
+               }
+
+               if (!ts)
+                       error(ERN_CMSG_RCV, ENOENT, "TS cmsg not found");
+               if (!see)
+                       error(ERN_CMSG_RCV, ENOENT, "sock_err cmsg not found");
+
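+               /* scm_timestamping holds up to three timestamps; print each
+                * non-zero one relative to test start, in microseconds.
+                */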
+               for (i = 0; i < 3; i++) {
+                       unsigned long long rel_time;
+
+                       if (!ts->ts[i].tv_sec && !ts->ts[i].tv_nsec)
+                               continue;
+
+                       rel_time = (ts->ts[i].tv_sec - time_start_real.tv_sec) *
+                               (1000ULL * 1000) +
+                               (ts->ts[i].tv_nsec - time_start_real.tv_nsec) /
+                               1000;
+                       printf(" %5s ts%d %lluus\n",
+                              cs_ts_info2str(see->ee_info),
+                              i, rel_time);
+               }
+       }
+}
+
+int main(int argc, char *argv[])
+{
+       char buf[] = "blablablabla";
+       struct addrinfo hints, *ai;
+       struct iovec iov[1];
+       struct msghdr msg;
+       char cbuf[1024];
+       int err;
+       int fd;
+
+       cs_parse_args(argc, argv);
+
+       memset(&hints, 0, sizeof(hints));
+       hints.ai_family = opt.sock.family;
+
+       ai = NULL;
+       err = getaddrinfo(opt.host, opt.service, &hints, &ai);
+       if (err) {
+               fprintf(stderr, "Can't resolve address [%s]:%s\n",
+                       opt.host, opt.service);
+               return ERN_RESOLVE;
+       }
+
+       if (ai->ai_family == AF_INET6 && opt.sock.proto == IPPROTO_ICMP)
+               opt.sock.proto = IPPROTO_ICMPV6;
+
+       fd = socket(ai->ai_family, opt.sock.type, opt.sock.proto);
+       if (fd < 0) {
+               fprintf(stderr, "Can't open socket: %s\n", strerror(errno));
+               freeaddrinfo(ai);
+               return ERN_SOCK_CREATE;
+       }
+
+       if (opt.sock.proto == IPPROTO_ICMP) {
+               buf[0] = ICMP_ECHO;
+               buf[1] = 0;
+       } else if (opt.sock.proto == IPPROTO_ICMPV6) {
+               buf[0] = ICMPV6_ECHO_REQUEST;
+               buf[1] = 0;
+       } else if (opt.sock.type == SOCK_RAW) {
+               struct udphdr hdr = { 1, 2, htons(sizeof(buf)), 0 };
+               struct sockaddr_in6 *sin6 = (void *)ai->ai_addr;
+
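+               /* Hand-build a UDP header for the raw socket case. Note that
+                * sockaddr_in and sockaddr_in6 keep the port at the same
+                * offset, so the sockaddr_in6 cast is safe for IPv4, too.
+                */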
+               memcpy(buf, &hdr, sizeof(hdr));
+               sin6->sin6_port = htons(opt.sock.proto);
+       }
+
+       if (opt.sockopt.mark &&
+           setsockopt(fd, SOL_SOCKET, SO_MARK,
+                      &opt.sockopt.mark, sizeof(opt.sockopt.mark)))
+               error(ERN_SOCKOPT, errno, "setsockopt SO_MARK");
+
+       if (clock_gettime(CLOCK_REALTIME, &time_start_real))
+               error(ERN_GETTIME, errno, "gettime REALTIME");
+       if (clock_gettime(CLOCK_MONOTONIC, &time_start_mono))
+               error(ERN_GETTIME, errno, "gettime MONOTONIC");
+
+       iov[0].iov_base = buf;
+       iov[0].iov_len = sizeof(buf);
+
+       memset(&msg, 0, sizeof(msg));
+       msg.msg_name = ai->ai_addr;
+       msg.msg_namelen = ai->ai_addrlen;
+       msg.msg_iov = iov;
+       msg.msg_iovlen = 1;
+
+       cs_write_cmsg(fd, &msg, cbuf, sizeof(cbuf));
+
+       err = sendmsg(fd, &msg, 0);
+       if (err < 0) {
+               if (!opt.silent_send)
+                       fprintf(stderr, "send failed: %s\n", strerror(errno));
+               err = ERN_SEND;
+               goto err_out;
+       } else if (err != sizeof(buf)) {
+               fprintf(stderr, "short send\n");
+               err = ERN_SEND_SHORT;
+               goto err_out;
+       } else {
+               err = ERN_SUCCESS;
+       }
+
+       /* Make sure all timestamps have time to loop back */
+       usleep(opt.txtime.delay);
+
+       cs_read_cmsg(fd, &msg, cbuf, sizeof(cbuf));
+
+err_out:
+       close(fd);
+       freeaddrinfo(ai);
+       return err;
+}
diff --git a/tools/testing/selftests/net/cmsg_so_mark.c b/tools/testing/selftests/net/cmsg_so_mark.c
deleted file mode 100644 (file)
index 27f2804..0000000
+++ /dev/null
@@ -1,67 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-#include <errno.h>
-#include <netdb.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <unistd.h>
-#include <linux/types.h>
-#include <sys/socket.h>
-
-int main(int argc, const char **argv)
-{
-       char cbuf[CMSG_SPACE(sizeof(__u32))];
-       struct addrinfo hints, *ai;
-       struct cmsghdr *cmsg;
-       struct iovec iov[1];
-       struct msghdr msg;
-       int mark;
-       int err;
-       int fd;
-
-       if (argc != 4) {
-               fprintf(stderr, "Usage: %s <dst_ip> <port> <mark>\n", argv[0]);
-               return 1;
-       }
-       mark = atoi(argv[3]);
-
-       memset(&hints, 0, sizeof(hints));
-       hints.ai_family = AF_UNSPEC;
-       hints.ai_socktype = SOCK_DGRAM;
-
-       ai = NULL;
-       err = getaddrinfo(argv[1], argv[2], &hints, &ai);
-       if (err) {
-               fprintf(stderr, "Can't resolve address: %s\n", strerror(errno));
-               return 1;
-       }
-
-       fd = socket(ai->ai_family, SOCK_DGRAM, IPPROTO_UDP);
-       if (fd < 0) {
-               fprintf(stderr, "Can't open socket: %s\n", strerror(errno));
-               freeaddrinfo(ai);
-               return 1;
-       }
-
-       iov[0].iov_base = "bla";
-       iov[0].iov_len = 4;
-
-       msg.msg_name = ai->ai_addr;
-       msg.msg_namelen = ai->ai_addrlen;
-       msg.msg_iov = iov;
-       msg.msg_iovlen = 1;
-       msg.msg_control = cbuf;
-       msg.msg_controllen = sizeof(cbuf);
-
-       cmsg = CMSG_FIRSTHDR(&msg);
-       cmsg->cmsg_level = SOL_SOCKET;
-       cmsg->cmsg_type = SO_MARK;
-       cmsg->cmsg_len = CMSG_LEN(sizeof(__u32));
-       *(__u32 *)CMSG_DATA(cmsg) = mark;
-
-       err = sendmsg(fd, &msg, 0);
-
-       close(fd);
-       freeaddrinfo(ai);
-       return err != 4;
-}
index 19c6aab..1650b86 100755 (executable)
@@ -18,6 +18,8 @@ trap cleanup EXIT
 # Namespaces
 ip netns add $NS
 
+ip netns exec $NS sysctl -w net.ipv4.ping_group_range='0 2147483647' > /dev/null
+
 # Connectivity
 ip -netns $NS link add type dummy
 ip -netns $NS link set dev dummy0 up
@@ -41,15 +43,29 @@ check_result() {
     fi
 }
 
-ip netns exec $NS ./cmsg_so_mark $TGT4 1234 $((MARK + 1))
-check_result $? 0 "IPv4 pass"
-ip netns exec $NS ./cmsg_so_mark $TGT6 1234 $((MARK + 1))
-check_result $? 0 "IPv6 pass"
+for ovr in setsock cmsg both diff; do
+    for i in 4 6; do
+       [ $i == 4 ] && TGT=$TGT4 || TGT=$TGT6
+
+       for p in u i r; do
+           [ $p == "u" ] && prot=UDP
+           [ $p == "i" ] && prot=ICMP
+           [ $p == "r" ] && prot=RAW
+
+           [ $ovr == "setsock" ] && m="-M"
+           [ $ovr == "cmsg" ]    && m="-m"
+           [ $ovr == "both" ]    && m="-M $MARK -m"
+
+           ip netns exec $NS ./cmsg_sender -$i -p $p $m $((MARK + 1)) $TGT 1234
+           check_result $? 0 "$prot $ovr - pass"
+
+           [ $ovr == "diff" ] && m="-M $((MARK + 1)) -m"
 
-ip netns exec $NS ./cmsg_so_mark $TGT4 1234 $MARK
-check_result $? 1 "IPv4 rejection"
-ip netns exec $NS ./cmsg_so_mark $TGT6 1234 $MARK
-check_result $? 1 "IPv6 rejection"
+           ip netns exec $NS ./cmsg_sender -$i -p $p $m $MARK -s $TGT 1234
+           check_result $? 1 "$prot $ovr - rejection"
+       done
+    done
+done
 
 # Summary
 if [ $BAD -ne 0 ]; then
diff --git a/tools/testing/selftests/net/cmsg_time.sh b/tools/testing/selftests/net/cmsg_time.sh
new file mode 100755 (executable)
index 0000000..91161e1
--- /dev/null
@@ -0,0 +1,83 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+NS=ns
+IP4=172.16.0.1/24
+TGT4=172.16.0.2
+IP6=2001:db8:1::1/64
+TGT6=2001:db8:1::2
+
+cleanup()
+{
+    ip netns del $NS
+}
+
+trap cleanup EXIT
+
+# Namespaces
+ip netns add $NS
+
+ip netns exec $NS sysctl -w net.ipv4.ping_group_range='0 2147483647' > /dev/null
+
+# Connectivity
+ip -netns $NS link add type dummy
+ip -netns $NS link set dev dummy0 up
+ip -netns $NS addr add $IP4 dev dummy0
+ip -netns $NS addr add $IP6 dev dummy0
+
+# Need FQ for TXTIME
+ip netns exec $NS tc qdisc replace dev dummy0 root fq
+
+# Test
+BAD=0
+TOTAL=0
+
+check_result() {
+    ((TOTAL++))
+    if [ $1 -ne 0 ]; then
+       echo "  Case $4 returned $1, expected 0"
+       ((BAD++))
+    elif [ "$2" != "$3" ]; then
+       echo "  Case $4 returned '$2', expected '$3'"
+       ((BAD++))
+    fi
+}
+
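+# cmsg_sender -t prints one line per timestamp, e.g. " SCHED ts0 14us" and
+# "   SND ts0 22us", relative to the start of the run (values vary).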
+for i in "-4 $TGT4" "-6 $TGT6"; do
+    for p in u i r; do
+       [ $p == "u" ] && prot=UDPv${i:1:1}
+       [ $p == "i" ] && prot=ICMPv${i:1:1}
+       [ $p == "r" ] && prot=RAWv${i:1:1}
+
+       ts=$(ip netns exec $NS ./cmsg_sender -p $p $i 1234)
+       check_result $? "$ts" "" "$prot - no options"
+
+       ts=$(ip netns exec $NS ./cmsg_sender -p $p $i 1234 -t | wc -l)
+       check_result $? "$ts" "2" "$prot - ts cnt"
+       ts=$(ip netns exec $NS ./cmsg_sender -p $p $i 1234 -t |
+                sed -n "s/.*SCHED ts0 [0-9].*/OK/p")
+       check_result $? "$ts" "OK" "$prot - ts0 SCHED"
+       ts=$(ip netns exec $NS ./cmsg_sender -p $p $i 1234 -t |
+                sed -n "s/.*SND ts0 [0-9].*/OK/p")
+       check_result $? "$ts" "OK" "$prot - ts0 SND"
+
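+       # -d 1000 schedules the packet 1000us into the future, so the SND
+       # stamp must land past 1000us, and well after (>500us) the SCHED one.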
+       ts=$(ip netns exec $NS ./cmsg_sender -p $p $i 1234 -t -d 1000 |
+                awk '/SND/ { if ($3 > 1000) print "OK"; }')
+       check_result $? "$ts" "OK" "$prot - TXTIME abs"
+
+       ts=$(ip netns exec $NS ./cmsg_sender -p $p $i 1234 -t -d 1000 |
+                awk '/SND/ {snd=$3}
+                     /SCHED/ {sch=$3}
+                     END { if (snd - sch > 500) print "OK"; }')
+       check_result $? "$ts" "OK" "$prot - TXTIME rel"
+    done
+done
+
+# Summary
+if [ $BAD -ne 0 ]; then
+    echo "FAIL - $BAD/$TOTAL cases failed"
+    exit 1
+else
+    echo "OK"
+    exit 0
+fi
index 43ea840..4f70baa 100755 (executable)
@@ -96,7 +96,7 @@ fib_rule6_del()
 
 fib_rule6_del_by_pref()
 {
-       pref=$($IP -6 rule show | grep "$1 lookup $TABLE" | cut -d ":" -f 1)
+       pref=$($IP -6 rule show $1 table $RTABLE | cut -d ":" -f 1)
        $IP -6 rule del pref $pref
 }
 
@@ -104,17 +104,36 @@ fib_rule6_test_match_n_redirect()
 {
        local match="$1"
        local getmatch="$2"
+       local description="$3"
 
        $IP -6 rule add $match table $RTABLE
        $IP -6 route get $GW_IP6 $getmatch | grep -q "table $RTABLE"
-       log_test $? 0 "rule6 check: $1"
+       log_test $? 0 "rule6 check: $description"
 
        fib_rule6_del_by_pref "$match"
-       log_test $? 0 "rule6 del by pref: $match"
+       log_test $? 0 "rule6 del by pref: $description"
+}
+
+fib_rule6_test_reject()
+{
+       local match="$1"
+       local rc
+
+       $IP -6 rule add $match table $RTABLE 2>/dev/null
+       rc=$?
+       log_test $rc 2 "rule6 check: $match"
+
+       if [ $rc -eq 0 ]; then
+               $IP -6 rule del $match table $RTABLE
+       fi
 }
 
 fib_rule6_test()
 {
+       local getmatch
+       local match
+       local cnt
+
        # setup the fib rule redirect route
        $IP -6 route add table $RTABLE default via $GW_IP6 dev $DEV onlink
 
@@ -124,8 +143,21 @@ fib_rule6_test()
        match="from $SRC_IP6 iif $DEV"
        fib_rule6_test_match_n_redirect "$match" "$match" "iif redirect to table"
 
+       # Reject dsfield (tos) options which have ECN bits set
+       for cnt in $(seq 1 3); do
+               match="dsfield $cnt"
+               fib_rule6_test_reject "$match"
+       done
+
+       # Don't take ECN bits into account when matching on dsfield
        match="tos 0x10"
-       fib_rule6_test_match_n_redirect "$match" "$match" "tos redirect to table"
+       for cnt in "0x10" "0x11" "0x12" "0x13"; do
+               # Using option 'tos' instead of 'dsfield' as old iproute2
+               # versions don't support 'dsfield' in ip rule show.
+               getmatch="tos $cnt"
+               fib_rule6_test_match_n_redirect "$match" "$getmatch" \
+                                               "$getmatch redirect to table"
+       done
 
        match="fwmark 0x64"
        getmatch="mark 0x64"
@@ -165,7 +197,7 @@ fib_rule4_del()
 
 fib_rule4_del_by_pref()
 {
-       pref=$($IP rule show | grep "$1 lookup $TABLE" | cut -d ":" -f 1)
+       pref=$($IP rule show $1 table $RTABLE | cut -d ":" -f 1)
        $IP rule del pref $pref
 }
 
@@ -173,17 +205,36 @@ fib_rule4_test_match_n_redirect()
 {
        local match="$1"
        local getmatch="$2"
+       local description="$3"
 
        $IP rule add $match table $RTABLE
        $IP route get $GW_IP4 $getmatch | grep -q "table $RTABLE"
-       log_test $? 0 "rule4 check: $1"
+       log_test $? 0 "rule4 check: $description"
 
        fib_rule4_del_by_pref "$match"
-       log_test $? 0 "rule4 del by pref: $match"
+       log_test $? 0 "rule4 del by pref: $description"
+}
+
+fib_rule4_test_reject()
+{
+       local match="$1"
+       local rc
+
+       $IP rule add $match table $RTABLE 2>/dev/null
+       rc=$?
+       log_test $rc 2 "rule4 check: $match"
+
+       if [ $rc -eq 0 ]; then
+               $IP rule del $match table $RTABLE
+       fi
 }
 
 fib_rule4_test()
 {
+       local getmatch
+       local match
+       local cnt
+
        # setup the fib rule redirect route
        $IP route add table $RTABLE default via $GW_IP4 dev $DEV onlink
 
@@ -192,14 +243,27 @@ fib_rule4_test()
 
        # need enable forwarding and disable rp_filter temporarily as all the
        # addresses are in the same subnet and egress device == ingress device.
-       ip netns exec testns sysctl -w net.ipv4.ip_forward=1
-       ip netns exec testns sysctl -w net.ipv4.conf.$DEV.rp_filter=0
+       ip netns exec testns sysctl -qw net.ipv4.ip_forward=1
+       ip netns exec testns sysctl -qw net.ipv4.conf.$DEV.rp_filter=0
        match="from $SRC_IP iif $DEV"
        fib_rule4_test_match_n_redirect "$match" "$match" "iif redirect to table"
-       ip netns exec testns sysctl -w net.ipv4.ip_forward=0
+       ip netns exec testns sysctl -qw net.ipv4.ip_forward=0
+
+       # Reject dsfield (tos) options which have ECN bits set
+       for cnt in $(seq 1 3); do
+               match="dsfield $cnt"
+               fib_rule4_test_reject "$match"
+       done
 
+       # Don't take ECN bits into account when matching on dsfield
        match="tos 0x10"
-       fib_rule4_test_match_n_redirect "$match" "$match" "tos redirect to table"
+       for cnt in "0x10" "0x11" "0x12" "0x13"; do
+               # Using option 'tos' instead of 'dsfield' as old iproute2
+               # versions don't support 'dsfield' in ip rule show.
+               getmatch="tos $cnt"
+               fib_rule4_test_match_n_redirect "$match" "$getmatch" \
+                                               "$getmatch redirect to table"
+       done
 
        match="fwmark 0x64"
        getmatch="mark 0x64"
index 996af1a..bb73235 100755 (executable)
@@ -1447,6 +1447,81 @@ ipv4_local_rt_cache()
        log_test $? 0 "Cached route removed from VRF port device"
 }
 
+ipv4_rt_dsfield()
+{
+       echo
+       echo "IPv4 route with dsfield tests"
+
+       run_cmd "$IP route flush 172.16.102.0/24"
+
+       # New routes should reject dsfield options that interfere with ECN
+       run_cmd "$IP route add 172.16.102.0/24 dsfield 0x01 via 172.16.101.2"
+       log_test $? 2 "Reject route with dsfield 0x01"
+
+       run_cmd "$IP route add 172.16.102.0/24 dsfield 0x02 via 172.16.101.2"
+       log_test $? 2 "Reject route with dsfield 0x02"
+
+       run_cmd "$IP route add 172.16.102.0/24 dsfield 0x03 via 172.16.101.2"
+       log_test $? 2 "Reject route with dsfield 0x03"
+
+       # A generic route that doesn't take DSCP into account
+       run_cmd "$IP route add 172.16.102.0/24 via 172.16.101.2"
+
+       # A more specific route for DSCP 0x10
+       run_cmd "$IP route add 172.16.102.0/24 dsfield 0x10 via 172.16.103.2"
+
+       # DSCP 0x10 should match the specific route, no matter the ECN bits
+       $IP route get fibmatch 172.16.102.1 dsfield 0x10 | \
+               grep -q "via 172.16.103.2"
+       log_test $? 0 "IPv4 route with DSCP and ECN:Not-ECT"
+
+       $IP route get fibmatch 172.16.102.1 dsfield 0x11 | \
+               grep -q "via 172.16.103.2"
+       log_test $? 0 "IPv4 route with DSCP and ECN:ECT(1)"
+
+       $IP route get fibmatch 172.16.102.1 dsfield 0x12 | \
+               grep -q "via 172.16.103.2"
+       log_test $? 0 "IPv4 route with DSCP and ECN:ECT(0)"
+
+       $IP route get fibmatch 172.16.102.1 dsfield 0x13 | \
+               grep -q "via 172.16.103.2"
+       log_test $? 0 "IPv4 route with DSCP and ECN:CE"
+
+       # Unknown DSCP should match the generic route, no matter the ECN bits
+       $IP route get fibmatch 172.16.102.1 dsfield 0x14 | \
+               grep -q "via 172.16.101.2"
+       log_test $? 0 "IPv4 route with unknown DSCP and ECN:Not-ECT"
+
+       $IP route get fibmatch 172.16.102.1 dsfield 0x15 | \
+               grep -q "via 172.16.101.2"
+       log_test $? 0 "IPv4 route with unknown DSCP and ECN:ECT(1)"
+
+       $IP route get fibmatch 172.16.102.1 dsfield 0x16 | \
+               grep -q "via 172.16.101.2"
+       log_test $? 0 "IPv4 route with unknown DSCP and ECN:ECT(0)"
+
+       $IP route get fibmatch 172.16.102.1 dsfield 0x17 | \
+               grep -q "via 172.16.101.2"
+       log_test $? 0 "IPv4 route with unknown DSCP and ECN:CE"
+
+       # Null DSCP should match the generic route, no matter the ECN bits
+       $IP route get fibmatch 172.16.102.1 dsfield 0x00 | \
+               grep -q "via 172.16.101.2"
+       log_test $? 0 "IPv4 route with no DSCP and ECN:Not-ECT"
+
+       $IP route get fibmatch 172.16.102.1 dsfield 0x01 | \
+               grep -q "via 172.16.101.2"
+       log_test $? 0 "IPv4 route with no DSCP and ECN:ECT(1)"
+
+       $IP route get fibmatch 172.16.102.1 dsfield 0x02 | \
+               grep -q "via 172.16.101.2"
+       log_test $? 0 "IPv4 route with no DSCP and ECN:ECT(0)"
+
+       $IP route get fibmatch 172.16.102.1 dsfield 0x03 | \
+               grep -q "via 172.16.101.2"
+       log_test $? 0 "IPv4 route with no DSCP and ECN:CE"
+}
+
 ipv4_route_test()
 {
        route_setup
@@ -1454,6 +1529,7 @@ ipv4_route_test()
        ipv4_rt_add
        ipv4_rt_replace
        ipv4_local_rt_cache
+       ipv4_rt_dsfield
 
        route_cleanup
 }
index b90dff8..64bd00f 100755 (executable)
@@ -28,8 +28,9 @@ h2_destroy()
 
 switch_create()
 {
-       # 10 Seconds ageing time.
-       ip link add dev br0 type bridge vlan_filtering 1 ageing_time 1000 \
+       ip link add dev br0 type bridge \
+               vlan_filtering 1 \
+               ageing_time $LOW_AGEING_TIME \
                mcast_snooping 0
 
        ip link set dev $swp1 master br0
index c15c6c8..1c8a260 100755 (executable)
@@ -27,8 +27,9 @@ h2_destroy()
 
 switch_create()
 {
-       # 10 Seconds ageing time.
-       ip link add dev br0 type bridge ageing_time 1000 mcast_snooping 0
+       ip link add dev br0 type bridge \
+               ageing_time $LOW_AGEING_TIME \
+               mcast_snooping 0
 
        ip link set dev $swp1 master br0
        ip link set dev $swp2 master br0
index e134a5f..1b3b462 100644 (file)
@@ -99,15 +99,15 @@ fib_ipv4_tos_test()
        fib4_trap_check $ns "192.0.2.0/24 dev dummy1 tos 0 metric 1024" false
        check_err $? "Route not in hardware when should"
 
-       ip -n $ns route add 192.0.2.0/24 dev dummy1 tos 2 metric 1024
-       fib4_trap_check $ns "192.0.2.0/24 dev dummy1 tos 2 metric 1024" false
+       ip -n $ns route add 192.0.2.0/24 dev dummy1 tos 8 metric 1024
+       fib4_trap_check $ns "192.0.2.0/24 dev dummy1 tos 8 metric 1024" false
        check_err $? "Highest TOS route not in hardware when should"
 
        fib4_trap_check $ns "192.0.2.0/24 dev dummy1 tos 0 metric 1024" true
        check_err $? "Lowest TOS route still in hardware when should not"
 
-       ip -n $ns route add 192.0.2.0/24 dev dummy1 tos 1 metric 1024
-       fib4_trap_check $ns "192.0.2.0/24 dev dummy1 tos 1 metric 1024" true
+       ip -n $ns route add 192.0.2.0/24 dev dummy1 tos 4 metric 1024
+       fib4_trap_check $ns "192.0.2.0/24 dev dummy1 tos 4 metric 1024" true
        check_err $? "Middle TOS route in hardware when should not"
 
        log_test "IPv4 routes with TOS"
@@ -277,11 +277,11 @@ fib_ipv4_replay_tos_test()
        ip -n $ns link set dev dummy1 up
 
        ip -n $ns route add 192.0.2.0/24 dev dummy1 tos 0
-       ip -n $ns route add 192.0.2.0/24 dev dummy1 tos 1
+       ip -n $ns route add 192.0.2.0/24 dev dummy1 tos 4
 
        devlink -N $ns dev reload $devlink_dev
 
-       fib4_trap_check $ns "192.0.2.0/24 dev dummy1 tos 1" false
+       fib4_trap_check $ns "192.0.2.0/24 dev dummy1 tos 4" false
        check_err $? "Highest TOS route not in hardware when should"
 
        fib4_trap_check $ns "192.0.2.0/24 dev dummy1 tos 0" true
index b0980a2..4a54650 100644 (file)
@@ -41,6 +41,8 @@ NETIF_CREATE=yes
 # Timeout (in seconds) before ping exits regardless of how many packets have
 # been sent or received
 PING_TIMEOUT=5
+# Minimum ageing_time (in centiseconds) supported by hardware
+LOW_AGEING_TIME=1000
 # Flag for tc match, supposed to be skip_sw/skip_hw which means do not process
 # filter by software/hardware
 TC_FLAG=skip_hw
index 7da783d..e7e434a 100644 (file)
@@ -24,6 +24,7 @@ PING_COUNT=${PING_COUNT:=10}
 PING_TIMEOUT=${PING_TIMEOUT:=5}
 WAIT_TIMEOUT=${WAIT_TIMEOUT:=20}
 INTERFACE_TIMEOUT=${INTERFACE_TIMEOUT:=600}
+LOW_AGEING_TIME=${LOW_AGEING_TIME:=1000}
 REQUIRE_JQ=${REQUIRE_JQ:=yes}
 REQUIRE_MZ=${REQUIRE_MZ:=yes}
 
diff --git a/tools/testing/selftests/net/forwarding/pedit_ip.sh b/tools/testing/selftests/net/forwarding/pedit_ip.sh
new file mode 100755 (executable)
index 0000000..d14efb2
--- /dev/null
@@ -0,0 +1,201 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+# This test sends traffic from H1 to H2. Either on ingress of $swp1, or on
+# egress of $swp2, the traffic is acted upon by a pedit action. An ingress
+# filter installed on $h2 verifies that the packet looks as expected.
+#
+# +----------------------+                             +----------------------+
+# | H1                   |                             |                   H2 |
+# |    + $h1             |                             |            $h2 +     |
+# |    | 192.0.2.1/28    |                             |   192.0.2.2/28 |     |
+# +----|-----------------+                             +----------------|-----+
+#      |                                                                |
+# +----|----------------------------------------------------------------|-----+
+# | SW |                                                                |     |
+# |  +-|----------------------------------------------------------------|-+   |
+# |  | + $swp1                       BR                           $swp2 + |   |
+# |  +--------------------------------------------------------------------+   |
+# +---------------------------------------------------------------------------+
+
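+# For example, test_ip4_src on ingress of $swp1 boils down to:
+#   tc filter add dev $swp1 ingress handle 101 pref 1 \
+#      flower action pedit ex munge ip src set 198.51.100.1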
+ALL_TESTS="
+       ping_ipv4
+       ping_ipv6
+       test_ip4_src
+       test_ip4_dst
+       test_ip6_src
+       test_ip6_dst
+"
+
+NUM_NETIFS=4
+source lib.sh
+source tc_common.sh
+
+h1_create()
+{
+       simple_if_init $h1 192.0.2.1/28 2001:db8:1::1/64
+}
+
+h1_destroy()
+{
+       simple_if_fini $h1 192.0.2.1/28 2001:db8:1::1/64
+}
+
+h2_create()
+{
+       simple_if_init $h2 192.0.2.2/28 2001:db8:1::2/64
+       tc qdisc add dev $h2 clsact
+}
+
+h2_destroy()
+{
+       tc qdisc del dev $h2 clsact
+       simple_if_fini $h2 192.0.2.2/28 2001:db8:1::2/64
+}
+
+switch_create()
+{
+       ip link add name br1 up type bridge vlan_filtering 1
+       ip link set dev $swp1 master br1
+       ip link set dev $swp1 up
+       ip link set dev $swp2 master br1
+       ip link set dev $swp2 up
+
+       tc qdisc add dev $swp1 clsact
+       tc qdisc add dev $swp2 clsact
+}
+
+switch_destroy()
+{
+       tc qdisc del dev $swp2 clsact
+       tc qdisc del dev $swp1 clsact
+
+       ip link set dev $swp2 down
+       ip link set dev $swp2 nomaster
+       ip link set dev $swp1 down
+       ip link set dev $swp1 nomaster
+       ip link del dev br1
+}
+
+setup_prepare()
+{
+       h1=${NETIFS[p1]}
+       swp1=${NETIFS[p2]}
+
+       swp2=${NETIFS[p3]}
+       h2=${NETIFS[p4]}
+
+       h2mac=$(mac_get $h2)
+
+       vrf_prepare
+       h1_create
+       h2_create
+       switch_create
+}
+
+cleanup()
+{
+       pre_cleanup
+
+       switch_destroy
+       h2_destroy
+       h1_destroy
+       vrf_cleanup
+}
+
+ping_ipv4()
+{
+       ping_test $h1 192.0.2.2
+}
+
+ping_ipv6()
+{
+       ping6_test $h1 2001:db8:1::2
+}
+
+do_test_pedit_ip()
+{
+       local pedit_locus=$1; shift
+       local pedit_action=$1; shift
+       local match_prot=$1; shift
+       local match_flower=$1; shift
+       local mz_flags=$1; shift
+
+       tc filter add $pedit_locus handle 101 pref 1 \
+          flower action pedit ex munge $pedit_action
+       tc filter add dev $h2 ingress handle 101 pref 1 prot $match_prot \
+          flower skip_hw $match_flower action pass
+
+       RET=0
+
+       $MZ $mz_flags $h1 -c 10 -d 20msec -p 100 -a own -b $h2mac -q -t ip
+
+       local pkts
+       pkts=$(busywait "$TC_HIT_TIMEOUT" until_counter_is ">= 10" \
+                       tc_rule_handle_stats_get "dev $h2 ingress" 101)
+       check_err $? "Expected to get 10 packets, but got $pkts."
+
+       pkts=$(tc_rule_handle_stats_get "$pedit_locus" 101)
+       ((pkts >= 10))
+       check_err $? "Expected to get 10 packets on pedit rule, but got $pkts."
+
+       log_test "$pedit_locus pedit $pedit_action"
+
+       tc filter del dev $h2 ingress pref 1
+       tc filter del $pedit_locus pref 1
+}
+
+do_test_pedit_ip6()
+{
+       local locus=$1; shift
+       local pedit_addr=$1; shift
+       local flower_addr=$1; shift
+
+       do_test_pedit_ip "$locus" "$pedit_addr set 2001:db8:2::1" ipv6  \
+                        "$flower_addr 2001:db8:2::1"                   \
+                        "-6 -A 2001:db8:1::1 -B 2001:db8:1::2"
+}
+
+do_test_pedit_ip4()
+{
+       local locus=$1; shift
+       local pedit_addr=$1; shift
+       local flower_addr=$1; shift
+
+       do_test_pedit_ip "$locus" "$pedit_addr set 198.51.100.1" ip     \
+                        "$flower_addr 198.51.100.1"                    \
+                        "-A 192.0.2.1 -B 192.0.2.2"
+}
+
+test_ip4_src()
+{
+       do_test_pedit_ip4 "dev $swp1 ingress" "ip src" src_ip
+       do_test_pedit_ip4 "dev $swp2 egress"  "ip src" src_ip
+}
+
+test_ip4_dst()
+{
+       do_test_pedit_ip4 "dev $swp1 ingress" "ip dst" dst_ip
+       do_test_pedit_ip4 "dev $swp2 egress"  "ip dst" dst_ip
+}
+
+test_ip6_src()
+{
+       do_test_pedit_ip6 "dev $swp1 ingress" "ip6 src" src_ip
+       do_test_pedit_ip6 "dev $swp2 egress"  "ip6 src" src_ip
+}
+
+test_ip6_dst()
+{
+       do_test_pedit_ip6 "dev $swp1 ingress" "ip6 dst" dst_ip
+       do_test_pedit_ip6 "dev $swp2 egress"  "ip6 dst" dst_ip
+}
+
+trap cleanup EXIT
+
+setup_prepare
+setup_wait
+
+tests_run
+
+exit $EXIT_STATUS
index c0801df..18bb0d0 100755 (executable)
@@ -15,6 +15,7 @@ timeout_test=$((timeout_poll * 2 + 1))
 mptcp_connect=""
 capture=0
 checksum=0
+ip_mptcp=0
 do_all_tests=1
 
 TEST_COUNT=0
@@ -239,6 +240,16 @@ is_v6()
        [ -z "${1##*:*}" ]
 }
 
+is_addr()
+{
+       [ -z "${1##*[.:]*}" ]
+}
+
+is_number()
+{
+       [[ $1 == ?(-)+([0-9]) ]]
+}
+
 # $1: ns, $2: port
 wait_local_port_listen()
 {
@@ -278,6 +289,109 @@ wait_rm_addr()
        done
 }
 
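+# The pm_nl_* helpers drive the in-kernel MPTCP path manager either through
+# the iproute2 "ip mptcp" interface (when ip_mptcp=1) or through the
+# ./pm_nl_ctl test tool.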
+pm_nl_set_limits()
+{
+       local ns=$1
+       local addrs=$2
+       local subflows=$3
+
+       if [ $ip_mptcp -eq 1 ]; then
+               ip -n $ns mptcp limits set add_addr_accepted $addrs subflows $subflows
+       else
+               ip netns exec $ns ./pm_nl_ctl limits $addrs $subflows
+       fi
+}
+
+pm_nl_add_endpoint()
+{
+       local ns=$1
+       local addr=$2
+       local flags
+       local port
+       local dev
+       local id
+       local nr=2
+
+       for p in $@
+       do
+               if [ $p = "flags" ]; then
+                       eval _flags=\$"$nr"
+                       [ -n "$_flags" ] && flags="flags $_flags"
+               fi
+               if [ $p = "dev" ]; then
+                       eval _dev=\$"$nr"
+                       [ -n "$_dev" ] && dev="dev $_dev"
+               fi
+               if [ $p = "id" ]; then
+                       eval _id=\$"$nr"
+                       [ -n "$_id" ] && id="id $_id"
+               fi
+               if [ $p = "port" ]; then
+                       eval _port=\$"$nr"
+                       [ -n "$_port" ] && port="port $_port"
+               fi
+
+               let nr+=1
+       done
+
+       if [ $ip_mptcp -eq 1 ]; then
+               ip -n $ns mptcp endpoint add $addr ${_flags//","/" "} $dev $id $port
+       else
+               ip netns exec $ns ./pm_nl_ctl add $addr $flags $dev $id $port
+       fi
+}
+
+pm_nl_del_endpoint()
+{
+       local ns=$1
+       local id=$2
+       local addr=$3
+
+       if [ $ip_mptcp -eq 1 ]; then
+               ip -n $ns mptcp endpoint delete id $id $addr
+       else
+               ip netns exec $ns ./pm_nl_ctl del $id $addr
+       fi
+}
+
+pm_nl_flush_endpoint()
+{
+       local ns=$1
+
+       if [ $ip_mptcp -eq 1 ]; then
+               ip -n $ns mptcp endpoint flush
+       else
+               ip netns exec $ns ./pm_nl_ctl flush
+       fi
+}
+
+pm_nl_show_endpoints()
+{
+       local ns=$1
+
+       if [ $ip_mptcp -eq 1 ]; then
+               ip -n $ns mptcp endpoint show
+       else
+               ip netns exec $ns ./pm_nl_ctl dump
+       fi
+}
+
+pm_nl_change_endpoint()
+{
+       local ns=$1
+       local flags=$2
+       local id=$3
+       local addr=$4
+       local port=""
+
+       if [ $ip_mptcp -eq 1 ]; then
+               ip -n $ns mptcp endpoint change id $id ${flags//","/" "}
+       else
+               if [ $5 -ne 0 ]; then port="port $5"; fi
+               ip netns exec $ns ./pm_nl_ctl set $addr flags $flags $port
+       fi
+}
+
 do_transfer()
 {
        listener_ns="$1"
@@ -289,7 +403,7 @@ do_transfer()
        addr_nr_ns1="$7"
        addr_nr_ns2="$8"
        speed="$9"
-       bkup="${10}"
+       sflags="${10}"
 
        port=$((10000+$TEST_COUNT))
        TEST_COUNT=$((TEST_COUNT+1))
@@ -378,31 +492,36 @@ do_transfer()
                        else
                                addr="10.0.$counter.1"
                        fi
-                       ip netns exec $ns1 ./pm_nl_ctl add $addr flags signal
+                       pm_nl_add_endpoint $ns1 $addr flags signal
                        let counter+=1
                        let add_nr_ns1-=1
                done
        elif [ $addr_nr_ns1 -lt 0 ]; then
                let rm_nr_ns1=-addr_nr_ns1
                if [ $rm_nr_ns1 -lt 8 ]; then
-                       counter=1
-                       pos=1
-                       dump=(`ip netns exec ${listener_ns} ./pm_nl_ctl dump`)
-                       if [ ${#dump[@]} -gt 0 ]; then
-                               while [ $counter -le $rm_nr_ns1 ]
-                               do
-                                       id=${dump[$pos]}
-                                       rm_addr=$(rm_addr_count ${connector_ns})
-                                       ip netns exec ${listener_ns} ./pm_nl_ctl del $id
-                                       wait_rm_addr ${connector_ns} ${rm_addr}
-                                       let counter+=1
-                                       let pos+=5
+                       counter=0
+                       pm_nl_show_endpoints ${listener_ns} | while read line; do
+                               local arr=($line)
+                               local nr=0
+
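+                               # Scan the dump for "id <n>" pairs and delete
+                               # the first $rm_nr_ns1 endpoints found.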
+                               for i in ${arr[@]}; do
+                                       if [ $i = "id" ]; then
+                                               if [ $counter -eq $rm_nr_ns1 ]; then
+                                                       break
+                                               fi
+                                               id=${arr[$nr+1]}
+                                               rm_addr=$(rm_addr_count ${connector_ns})
+                                               pm_nl_del_endpoint ${listener_ns} $id
+                                               wait_rm_addr ${connector_ns} ${rm_addr}
+                                               let counter+=1
+                                       fi
+                                       let nr+=1
                                done
-                       fi
+                       done
                elif [ $rm_nr_ns1 -eq 8 ]; then
-                       ip netns exec ${listener_ns} ./pm_nl_ctl flush
+                       pm_nl_flush_endpoint ${listener_ns}
                elif [ $rm_nr_ns1 -eq 9 ]; then
-                       ip netns exec ${listener_ns} ./pm_nl_ctl del 0 ${connect_addr}
+                       pm_nl_del_endpoint ${listener_ns} 0 ${connect_addr}
                fi
        fi
 
@@ -426,30 +545,36 @@ do_transfer()
                        else
                                addr="10.0.$counter.2"
                        fi
-                       ip netns exec $ns2 ./pm_nl_ctl add $addr flags $flags
+                       pm_nl_add_endpoint $ns2 $addr flags $flags
                        let counter+=1
                        let add_nr_ns2-=1
                done
        elif [ $addr_nr_ns2 -lt 0 ]; then
                let rm_nr_ns2=-addr_nr_ns2
                if [ $rm_nr_ns2 -lt 8 ]; then
-                       counter=1
-                       pos=1
-                       dump=(`ip netns exec ${connector_ns} ./pm_nl_ctl dump`)
-                       if [ ${#dump[@]} -gt 0 ]; then
-                               while [ $counter -le $rm_nr_ns2 ]
-                               do
-                                       # rm_addr are serialized, allow the previous one to complete
-                                       id=${dump[$pos]}
-                                       rm_addr=$(rm_addr_count ${listener_ns})
-                                       ip netns exec ${connector_ns} ./pm_nl_ctl del $id
-                                       wait_rm_addr ${listener_ns} ${rm_addr}
-                                       let counter+=1
-                                       let pos+=5
+                       counter=0
+                       pm_nl_show_endpoints ${connector_ns} | while read line; do
+                               local arr=($line)
+                               local nr=0
+
+                               for i in ${arr[@]}; do
+                                       if [ $i = "id" ]; then
+                                               if [ $counter -eq $rm_nr_ns2 ]; then
+                                                       break
+                                               fi
+                                               # rm_addr are serialized, allow the previous one to
+                                               # complete
+                                               id=${arr[$nr+1]}
+                                               rm_addr=$(rm_addr_count ${listener_ns})
+                                               pm_nl_del_endpoint ${connector_ns} $id
+                                               wait_rm_addr ${listener_ns} ${rm_addr}
+                                               let counter+=1
+                                       fi
+                                       let nr+=1
                                done
-                       fi
+                       done
                elif [ $rm_nr_ns2 -eq 8 ]; then
-                       ip netns exec ${connector_ns} ./pm_nl_ctl flush
+                       pm_nl_flush_endpoint ${connector_ns}
                elif [ $rm_nr_ns2 -eq 9 ]; then
                        local addr
                        if is_v6 "${connect_addr}"; then
@@ -457,19 +582,34 @@ do_transfer()
                        else
                                addr="10.0.1.2"
                        fi
-                       ip netns exec ${connector_ns} ./pm_nl_ctl del 0 $addr
+                       pm_nl_del_endpoint ${connector_ns} 0 $addr
                fi
        fi
 
-       if [ ! -z $bkup ]; then
+       if [ ! -z $sflags ]; then
                sleep 1
                for netns in "$ns1" "$ns2"; do
-                       dump=(`ip netns exec $netns ./pm_nl_ctl dump`)
-                       if [ ${#dump[@]} -gt 0 ]; then
-                               addr=${dump[${#dump[@]} - 1]}
-                               backup="ip netns exec $netns ./pm_nl_ctl set $addr flags $bkup"
-                               $backup
-                       fi
+                       pm_nl_show_endpoints $netns | while read line; do
+                               local arr=($line)
+                               local addr
+                               local port=0
+                               local id
+
+                               for i in ${arr[@]}; do
+                                       if is_addr $i; then
+                                               addr=$i
+                                       elif is_number $i; then
+                                               # The minimum expected port number is 10000
+                                               if [ $i -gt 10000 ]; then
+                                                       port=$i
+                                               # The id is a small number, below 255
+                                               elif [ $i -lt 255 ]; then
+                                                       id=$i
+                                               fi
+                                       fi
+                               done
+                               pm_nl_change_endpoint $netns $sflags $id $addr $port
+                       done
                done
        fi
 
@@ -545,7 +685,7 @@ run_tests()
        addr_nr_ns1="${5:-0}"
        addr_nr_ns2="${6:-0}"
        speed="${7:-fast}"
-       bkup="${8:-""}"
+       sflags="${8:-""}"
        lret=0
        oldin=""
 
@@ -574,7 +714,7 @@ run_tests()
        fi
 
        do_transfer ${listener_ns} ${connector_ns} MPTCP MPTCP ${connect_addr} \
-               ${test_linkfail} ${addr_nr_ns1} ${addr_nr_ns2} ${speed} ${bkup}
+               ${test_linkfail} ${addr_nr_ns1} ${addr_nr_ns2} ${speed} ${sflags}
        lret=$?
 }
 
@@ -978,51 +1118,51 @@ subflows_tests()
 
        # subflow limited by client
        reset
-       ip netns exec $ns1 ./pm_nl_ctl limits 0 0
-       ip netns exec $ns2 ./pm_nl_ctl limits 0 0
-       ip netns exec $ns2 ./pm_nl_ctl add 10.0.3.2 flags subflow
+       pm_nl_set_limits $ns1 0 0
+       pm_nl_set_limits $ns2 0 0
+       pm_nl_add_endpoint $ns2 10.0.3.2 flags subflow
        run_tests $ns1 $ns2 10.0.1.1
        chk_join_nr "single subflow, limited by client" 0 0 0
 
        # subflow limited by server
        reset
-       ip netns exec $ns1 ./pm_nl_ctl limits 0 0
-       ip netns exec $ns2 ./pm_nl_ctl limits 0 1
-       ip netns exec $ns2 ./pm_nl_ctl add 10.0.3.2 flags subflow
+       pm_nl_set_limits $ns1 0 0
+       pm_nl_set_limits $ns2 0 1
+       pm_nl_add_endpoint $ns2 10.0.3.2 flags subflow
        run_tests $ns1 $ns2 10.0.1.1
        chk_join_nr "single subflow, limited by server" 1 1 0
 
        # subflow
        reset
-       ip netns exec $ns1 ./pm_nl_ctl limits 0 1
-       ip netns exec $ns2 ./pm_nl_ctl limits 0 1
-       ip netns exec $ns2 ./pm_nl_ctl add 10.0.3.2 flags subflow
+       pm_nl_set_limits $ns1 0 1
+       pm_nl_set_limits $ns2 0 1
+       pm_nl_add_endpoint $ns2 10.0.3.2 flags subflow
        run_tests $ns1 $ns2 10.0.1.1
        chk_join_nr "single subflow" 1 1 1
 
        # multiple subflows
        reset
-       ip netns exec $ns1 ./pm_nl_ctl limits 0 2
-       ip netns exec $ns2 ./pm_nl_ctl limits 0 2
-       ip netns exec $ns2 ./pm_nl_ctl add 10.0.3.2 flags subflow
-       ip netns exec $ns2 ./pm_nl_ctl add 10.0.2.2 flags subflow
+       pm_nl_set_limits $ns1 0 2
+       pm_nl_set_limits $ns2 0 2
+       pm_nl_add_endpoint $ns2 10.0.3.2 flags subflow
+       pm_nl_add_endpoint $ns2 10.0.2.2 flags subflow
        run_tests $ns1 $ns2 10.0.1.1
        chk_join_nr "multiple subflows" 2 2 2
 
        # multiple subflows limited by server
        reset
-       ip netns exec $ns1 ./pm_nl_ctl limits 0 1
-       ip netns exec $ns2 ./pm_nl_ctl limits 0 2
-       ip netns exec $ns2 ./pm_nl_ctl add 10.0.3.2 flags subflow
-       ip netns exec $ns2 ./pm_nl_ctl add 10.0.2.2 flags subflow
+       pm_nl_set_limits $ns1 0 1
+       pm_nl_set_limits $ns2 0 2
+       pm_nl_add_endpoint $ns2 10.0.3.2 flags subflow
+       pm_nl_add_endpoint $ns2 10.0.2.2 flags subflow
        run_tests $ns1 $ns2 10.0.1.1
        chk_join_nr "multiple subflows, limited by server" 2 2 1
 
        # single subflow, dev
        reset
-       ip netns exec $ns1 ./pm_nl_ctl limits 0 1
-       ip netns exec $ns2 ./pm_nl_ctl limits 0 1
-       ip netns exec $ns2 ./pm_nl_ctl add 10.0.3.2 flags subflow dev ns2eth3
+       pm_nl_set_limits $ns1 0 1
+       pm_nl_set_limits $ns2 0 1
+       pm_nl_add_endpoint $ns2 10.0.3.2 flags subflow dev ns2eth3
        run_tests $ns1 $ns2 10.0.1.1
        chk_join_nr "single subflow, dev" 1 1 1
 }
@@ -1032,28 +1172,28 @@ subflows_error_tests()
        # If a single subflow is configured, and matches the MPC src
        # address, no additional subflow should be created
        reset
-       ip netns exec $ns1 ./pm_nl_ctl limits 0 1
-       ip netns exec $ns2 ./pm_nl_ctl limits 0 1
-       ip netns exec $ns2 ./pm_nl_ctl add 10.0.1.2 flags subflow
+       pm_nl_set_limits $ns1 0 1
+       pm_nl_set_limits $ns2 0 1
+       pm_nl_add_endpoint $ns2 10.0.1.2 flags subflow
        run_tests $ns1 $ns2 10.0.1.1 0 0 0 slow
        chk_join_nr "no MPC reuse with single endpoint" 0 0 0
 
        # multiple subflows, with subflow creation error
        reset
-       ip netns exec $ns1 ./pm_nl_ctl limits 0 2
-       ip netns exec $ns2 ./pm_nl_ctl limits 0 2
-       ip netns exec $ns2 ./pm_nl_ctl add 10.0.3.2 flags subflow
-       ip netns exec $ns2 ./pm_nl_ctl add 10.0.2.2 flags subflow
+       pm_nl_set_limits $ns1 0 2
+       pm_nl_set_limits $ns2 0 2
+       pm_nl_add_endpoint $ns2 10.0.3.2 flags subflow
+       pm_nl_add_endpoint $ns2 10.0.2.2 flags subflow
        ip netns exec $ns1 iptables -A INPUT -s 10.0.3.2 -p tcp -j REJECT
        run_tests $ns1 $ns2 10.0.1.1 0 0 0 slow
        chk_join_nr "multi subflows, with failing subflow" 1 1 1
 
        # multiple subflows, with subflow timeout on MPJ
        reset
-       ip netns exec $ns1 ./pm_nl_ctl limits 0 2
-       ip netns exec $ns2 ./pm_nl_ctl limits 0 2
-       ip netns exec $ns2 ./pm_nl_ctl add 10.0.3.2 flags subflow
-       ip netns exec $ns2 ./pm_nl_ctl add 10.0.2.2 flags subflow
+       pm_nl_set_limits $ns1 0 2
+       pm_nl_set_limits $ns2 0 2
+       pm_nl_add_endpoint $ns2 10.0.3.2 flags subflow
+       pm_nl_add_endpoint $ns2 10.0.2.2 flags subflow
        ip netns exec $ns1 iptables -A INPUT -s 10.0.3.2 -p tcp -j DROP
        run_tests $ns1 $ns2 10.0.1.1 0 0 0 slow
        chk_join_nr "multi subflows, with subflow timeout" 1 1 1
@@ -1062,9 +1202,9 @@ subflows_error_tests()
        # closed subflow (due to reset) is not reused if additional
        # subflows are added later
        reset
-       ip netns exec $ns1 ./pm_nl_ctl limits 0 1
-       ip netns exec $ns2 ./pm_nl_ctl limits 0 1
-       ip netns exec $ns2 ./pm_nl_ctl add 10.0.3.2 flags subflow
+       pm_nl_set_limits $ns1 0 1
+       pm_nl_set_limits $ns2 0 1
+       pm_nl_add_endpoint $ns2 10.0.3.2 flags subflow
        ip netns exec $ns1 iptables -A INPUT -s 10.0.3.2 -p tcp -j REJECT
        run_tests $ns1 $ns2 10.0.1.1 0 0 0 slow &
 
@@ -1074,7 +1214,7 @@ subflows_error_tests()
 
        # mpj subflow will be in TW after the reset
        wait_for_tw $ns2
-       ip netns exec $ns2 ./pm_nl_ctl add 10.0.2.2 flags subflow
+       pm_nl_add_endpoint $ns2 10.0.2.2 flags subflow
        wait
 
        # additional subflow could be created only if the PM select
@@ -1086,16 +1226,16 @@ signal_address_tests()
 {
        # add_address, unused
        reset
-       ip netns exec $ns1 ./pm_nl_ctl add 10.0.2.1 flags signal
+       pm_nl_add_endpoint $ns1 10.0.2.1 flags signal
        run_tests $ns1 $ns2 10.0.1.1
        chk_join_nr "unused signal address" 0 0 0
        chk_add_nr 1 1
 
        # accept and use add_addr
        reset
-       ip netns exec $ns1 ./pm_nl_ctl limits 0 1
-       ip netns exec $ns2 ./pm_nl_ctl limits 1 1
-       ip netns exec $ns1 ./pm_nl_ctl add 10.0.2.1 flags signal
+       pm_nl_set_limits $ns1 0 1
+       pm_nl_set_limits $ns2 1 1
+       pm_nl_add_endpoint $ns1 10.0.2.1 flags signal
        run_tests $ns1 $ns2 10.0.1.1
        chk_join_nr "signal address" 1 1 1
        chk_add_nr 1 1
@@ -1105,59 +1245,59 @@ signal_address_tests()
        # belong to different subnets or one of the listed local address could be
        # used for 'add_addr' subflow
        reset
-       ip netns exec $ns1 ./pm_nl_ctl add 10.0.2.1 flags signal
-       ip netns exec $ns1 ./pm_nl_ctl limits 0 2
-       ip netns exec $ns2 ./pm_nl_ctl limits 1 2
-       ip netns exec $ns2 ./pm_nl_ctl add 10.0.3.2 flags subflow
+       pm_nl_add_endpoint $ns1 10.0.2.1 flags signal
+       pm_nl_set_limits $ns1 0 2
+       pm_nl_set_limits $ns2 1 2
+       pm_nl_add_endpoint $ns2 10.0.3.2 flags subflow
        run_tests $ns1 $ns2 10.0.1.1
        chk_join_nr "subflow and signal" 2 2 2
        chk_add_nr 1 1
 
        # accept and use add_addr with additional subflows
        reset
-       ip netns exec $ns1 ./pm_nl_ctl limits 0 3
-       ip netns exec $ns1 ./pm_nl_ctl add 10.0.2.1 flags signal
-       ip netns exec $ns2 ./pm_nl_ctl limits 1 3
-       ip netns exec $ns2 ./pm_nl_ctl add 10.0.3.2 flags subflow
-       ip netns exec $ns2 ./pm_nl_ctl add 10.0.4.2 flags subflow
+       pm_nl_set_limits $ns1 0 3
+       pm_nl_add_endpoint $ns1 10.0.2.1 flags signal
+       pm_nl_set_limits $ns2 1 3
+       pm_nl_add_endpoint $ns2 10.0.3.2 flags subflow
+       pm_nl_add_endpoint $ns2 10.0.4.2 flags subflow
        run_tests $ns1 $ns2 10.0.1.1
        chk_join_nr "multiple subflows and signal" 3 3 3
        chk_add_nr 1 1
 
        # signal addresses
        reset
-       ip netns exec $ns1 ./pm_nl_ctl limits 3 3
-       ip netns exec $ns1 ./pm_nl_ctl add 10.0.2.1 flags signal
-       ip netns exec $ns1 ./pm_nl_ctl add 10.0.3.1 flags signal
-       ip netns exec $ns1 ./pm_nl_ctl add 10.0.4.1 flags signal
-       ip netns exec $ns2 ./pm_nl_ctl limits 3 3
+       pm_nl_set_limits $ns1 3 3
+       pm_nl_add_endpoint $ns1 10.0.2.1 flags signal
+       pm_nl_add_endpoint $ns1 10.0.3.1 flags signal
+       pm_nl_add_endpoint $ns1 10.0.4.1 flags signal
+       pm_nl_set_limits $ns2 3 3
        run_tests $ns1 $ns2 10.0.1.1
        chk_join_nr "signal addresses" 3 3 3
        chk_add_nr 3 3
 
        # signal invalid addresses
        reset
-       ip netns exec $ns1 ./pm_nl_ctl limits 3 3
-       ip netns exec $ns1 ./pm_nl_ctl add 10.0.12.1 flags signal
-       ip netns exec $ns1 ./pm_nl_ctl add 10.0.3.1 flags signal
-       ip netns exec $ns1 ./pm_nl_ctl add 10.0.14.1 flags signal
-       ip netns exec $ns2 ./pm_nl_ctl limits 3 3
+       pm_nl_set_limits $ns1 3 3
+       pm_nl_add_endpoint $ns1 10.0.12.1 flags signal
+       pm_nl_add_endpoint $ns1 10.0.3.1 flags signal
+       pm_nl_add_endpoint $ns1 10.0.14.1 flags signal
+       pm_nl_set_limits $ns2 3 3
        run_tests $ns1 $ns2 10.0.1.1
        chk_join_nr "signal invalid addresses" 1 1 1
        chk_add_nr 3 3
 
        # signal addresses race test
        reset
-       ip netns exec $ns1 ./pm_nl_ctl limits 4 4
-       ip netns exec $ns2 ./pm_nl_ctl limits 4 4
-       ip netns exec $ns1 ./pm_nl_ctl add 10.0.1.1 flags signal
-       ip netns exec $ns1 ./pm_nl_ctl add 10.0.2.1 flags signal
-       ip netns exec $ns1 ./pm_nl_ctl add 10.0.3.1 flags signal
-       ip netns exec $ns1 ./pm_nl_ctl add 10.0.4.1 flags signal
-       ip netns exec $ns2 ./pm_nl_ctl add 10.0.1.2 flags signal
-       ip netns exec $ns2 ./pm_nl_ctl add 10.0.2.2 flags signal
-       ip netns exec $ns2 ./pm_nl_ctl add 10.0.3.2 flags signal
-       ip netns exec $ns2 ./pm_nl_ctl add 10.0.4.2 flags signal
+       pm_nl_set_limits $ns1 4 4
+       pm_nl_set_limits $ns2 4 4
+       pm_nl_add_endpoint $ns1 10.0.1.1 flags signal
+       pm_nl_add_endpoint $ns1 10.0.2.1 flags signal
+       pm_nl_add_endpoint $ns1 10.0.3.1 flags signal
+       pm_nl_add_endpoint $ns1 10.0.4.1 flags signal
+       pm_nl_add_endpoint $ns2 10.0.1.2 flags signal
+       pm_nl_add_endpoint $ns2 10.0.2.2 flags signal
+       pm_nl_add_endpoint $ns2 10.0.3.2 flags signal
+       pm_nl_add_endpoint $ns2 10.0.4.2 flags signal
        run_tests $ns1 $ns2 10.0.1.1
        chk_join_nr "signal addresses race test" 3 3 3
 
@@ -1178,11 +1318,11 @@ link_failure_tests()
        # active backup and link switch-over.
        # Let's set some arbitrary (low) virtual link limits.
        init_shapers
-       ip netns exec $ns1 ./pm_nl_ctl limits 0 3
-       ip netns exec $ns1 ./pm_nl_ctl add 10.0.2.1 dev ns1eth2 flags signal
-       ip netns exec $ns2 ./pm_nl_ctl limits 1 3
-       ip netns exec $ns2 ./pm_nl_ctl add 10.0.3.2 dev ns2eth3 flags subflow
-       ip netns exec $ns2 ./pm_nl_ctl add 10.0.4.2 dev ns2eth4 flags subflow
+       pm_nl_set_limits $ns1 0 3
+       pm_nl_add_endpoint $ns1 10.0.2.1 dev ns1eth2 flags signal
+       pm_nl_set_limits $ns2 1 3
+       pm_nl_add_endpoint $ns2 10.0.3.2 dev ns2eth3 flags subflow
+       pm_nl_add_endpoint $ns2 10.0.4.2 dev ns2eth4 flags subflow
        run_tests $ns1 $ns2 10.0.1.1 1
        chk_join_nr "multiple flows, signal, link failure" 3 3 3
        chk_add_nr 1 1
@@ -1192,11 +1332,11 @@ link_failure_tests()
        # for bidirectional transfer
        reset
        init_shapers
-       ip netns exec $ns1 ./pm_nl_ctl limits 0 3
-       ip netns exec $ns1 ./pm_nl_ctl add 10.0.2.1 dev ns1eth2 flags signal
-       ip netns exec $ns2 ./pm_nl_ctl limits 1 3
-       ip netns exec $ns2 ./pm_nl_ctl add 10.0.3.2 dev ns2eth3 flags subflow
-       ip netns exec $ns2 ./pm_nl_ctl add 10.0.4.2 dev ns2eth4 flags subflow
+       pm_nl_set_limits $ns1 0 3
+       pm_nl_add_endpoint $ns1 10.0.2.1 dev ns1eth2 flags signal
+       pm_nl_set_limits $ns2 1 3
+       pm_nl_add_endpoint $ns2 10.0.3.2 dev ns2eth3 flags subflow
+       pm_nl_add_endpoint $ns2 10.0.4.2 dev ns2eth4 flags subflow
        run_tests $ns1 $ns2 10.0.1.1 2
        chk_join_nr "multi flows, signal, bidi, link fail" 3 3 3
        chk_add_nr 1 1
@@ -1206,11 +1346,11 @@ link_failure_tests()
        # will never be used
        reset
        init_shapers
-       ip netns exec $ns1 ./pm_nl_ctl limits 0 2
-       ip netns exec $ns1 ./pm_nl_ctl add 10.0.2.1 dev ns1eth2 flags signal
-       ip netns exec $ns2 ./pm_nl_ctl limits 1 2
+       pm_nl_set_limits $ns1 0 2
+       pm_nl_add_endpoint $ns1 10.0.2.1 dev ns1eth2 flags signal
+       pm_nl_set_limits $ns2 1 2
        export FAILING_LINKS="1"
-       ip netns exec $ns2 ./pm_nl_ctl add 10.0.3.2 dev ns2eth3 flags subflow,backup
+       pm_nl_add_endpoint $ns2 10.0.3.2 dev ns2eth3 flags subflow,backup
        run_tests $ns1 $ns2 10.0.1.1 1
        chk_join_nr "backup subflow unused, link failure" 2 2 2
        chk_add_nr 1 1
@@ -1220,10 +1360,10 @@ link_failure_tests()
        # the traffic
        reset
        init_shapers
-       ip netns exec $ns1 ./pm_nl_ctl limits 0 2
-       ip netns exec $ns1 ./pm_nl_ctl add 10.0.2.1 dev ns1eth2 flags signal
-       ip netns exec $ns2 ./pm_nl_ctl limits 1 2
-       ip netns exec $ns2 ./pm_nl_ctl add 10.0.3.2 dev ns2eth3 flags subflow,backup
+       pm_nl_set_limits $ns1 0 2
+       pm_nl_add_endpoint $ns1 10.0.2.1 dev ns1eth2 flags signal
+       pm_nl_set_limits $ns2 1 2
+       pm_nl_add_endpoint $ns2 10.0.3.2 dev ns2eth3 flags subflow,backup
        export FAILING_LINKS="1 2"
        run_tests $ns1 $ns2 10.0.1.1 1
        chk_join_nr "backup flow used, multi links fail" 2 2 2
@@ -1235,10 +1375,10 @@ link_failure_tests()
        # for bidirectional transfer
        reset
        init_shapers
-       ip netns exec $ns1 ./pm_nl_ctl limits 0 2
-       ip netns exec $ns1 ./pm_nl_ctl add 10.0.2.1 dev ns1eth2 flags signal
-       ip netns exec $ns2 ./pm_nl_ctl limits 1 3
-       ip netns exec $ns2 ./pm_nl_ctl add 10.0.3.2 dev ns2eth3 flags subflow,backup
+       pm_nl_set_limits $ns1 0 2
+       pm_nl_add_endpoint $ns1 10.0.2.1 dev ns1eth2 flags signal
+       pm_nl_set_limits $ns2 1 3
+       pm_nl_add_endpoint $ns2 10.0.3.2 dev ns2eth3 flags subflow,backup
        run_tests $ns1 $ns2 10.0.1.1 2
        chk_join_nr "backup flow used, bidi, link failure" 2 2 2
        chk_add_nr 1 1
@@ -1250,38 +1390,38 @@ add_addr_timeout_tests()
 {
        # add_addr timeout
        reset_with_add_addr_timeout
-       ip netns exec $ns1 ./pm_nl_ctl limits 0 1
-       ip netns exec $ns2 ./pm_nl_ctl limits 1 1
-       ip netns exec $ns1 ./pm_nl_ctl add 10.0.2.1 flags signal
+       pm_nl_set_limits $ns1 0 1
+       pm_nl_set_limits $ns2 1 1
+       pm_nl_add_endpoint $ns1 10.0.2.1 flags signal
        run_tests $ns1 $ns2 10.0.1.1 0 0 0 slow
        chk_join_nr "signal address, ADD_ADDR timeout" 1 1 1
        chk_add_nr 4 0
 
        # add_addr timeout IPv6
        reset_with_add_addr_timeout 6
-       ip netns exec $ns1 ./pm_nl_ctl limits 0 1
-       ip netns exec $ns2 ./pm_nl_ctl limits 1 1
-       ip netns exec $ns1 ./pm_nl_ctl add dead:beef:2::1 flags signal
+       pm_nl_set_limits $ns1 0 1
+       pm_nl_set_limits $ns2 1 1
+       pm_nl_add_endpoint $ns1 dead:beef:2::1 flags signal
        run_tests $ns1 $ns2 dead:beef:1::1 0 0 0 slow
        chk_join_nr "signal address, ADD_ADDR6 timeout" 1 1 1
        chk_add_nr 4 0
 
        # signal addresses timeout
        reset_with_add_addr_timeout
-       ip netns exec $ns1 ./pm_nl_ctl limits 2 2
-       ip netns exec $ns1 ./pm_nl_ctl add 10.0.2.1 flags signal
-       ip netns exec $ns1 ./pm_nl_ctl add 10.0.3.1 flags signal
-       ip netns exec $ns2 ./pm_nl_ctl limits 2 2
+       pm_nl_set_limits $ns1 2 2
+       pm_nl_add_endpoint $ns1 10.0.2.1 flags signal
+       pm_nl_add_endpoint $ns1 10.0.3.1 flags signal
+       pm_nl_set_limits $ns2 2 2
        run_tests $ns1 $ns2 10.0.1.1 0 0 0 least
        chk_join_nr "signal addresses, ADD_ADDR timeout" 2 2 2
        chk_add_nr 8 0
 
        # signal invalid addresses timeout
        reset_with_add_addr_timeout
-       ip netns exec $ns1 ./pm_nl_ctl limits 2 2
-       ip netns exec $ns1 ./pm_nl_ctl add 10.0.12.1 flags signal
-       ip netns exec $ns1 ./pm_nl_ctl add 10.0.3.1 flags signal
-       ip netns exec $ns2 ./pm_nl_ctl limits 2 2
+       pm_nl_set_limits $ns1 2 2
+       pm_nl_add_endpoint $ns1 10.0.12.1 flags signal
+       pm_nl_add_endpoint $ns1 10.0.3.1 flags signal
+       pm_nl_set_limits $ns2 2 2
        run_tests $ns1 $ns2 10.0.1.1 0 0 0 least
        chk_join_nr "invalid address, ADD_ADDR timeout" 1 1 1
        chk_add_nr 8 0
@@ -1291,28 +1431,28 @@ remove_tests()
 {
        # single subflow, remove
        reset
-       ip netns exec $ns1 ./pm_nl_ctl limits 0 1
-       ip netns exec $ns2 ./pm_nl_ctl limits 0 1
-       ip netns exec $ns2 ./pm_nl_ctl add 10.0.3.2 flags subflow
+       pm_nl_set_limits $ns1 0 1
+       pm_nl_set_limits $ns2 0 1
+       pm_nl_add_endpoint $ns2 10.0.3.2 flags subflow
        run_tests $ns1 $ns2 10.0.1.1 0 0 -1 slow
        chk_join_nr "remove single subflow" 1 1 1
        chk_rm_nr 1 1
 
        # multiple subflows, remove
        reset
-       ip netns exec $ns1 ./pm_nl_ctl limits 0 2
-       ip netns exec $ns2 ./pm_nl_ctl limits 0 2
-       ip netns exec $ns2 ./pm_nl_ctl add 10.0.2.2 flags subflow
-       ip netns exec $ns2 ./pm_nl_ctl add 10.0.3.2 flags subflow
+       pm_nl_set_limits $ns1 0 2
+       pm_nl_set_limits $ns2 0 2
+       pm_nl_add_endpoint $ns2 10.0.2.2 flags subflow
+       pm_nl_add_endpoint $ns2 10.0.3.2 flags subflow
        run_tests $ns1 $ns2 10.0.1.1 0 0 -2 slow
        chk_join_nr "remove multiple subflows" 2 2 2
        chk_rm_nr 2 2
 
        # single address, remove
        reset
-       ip netns exec $ns1 ./pm_nl_ctl limits 0 1
-       ip netns exec $ns1 ./pm_nl_ctl add 10.0.2.1 flags signal
-       ip netns exec $ns2 ./pm_nl_ctl limits 1 1
+       pm_nl_set_limits $ns1 0 1
+       pm_nl_add_endpoint $ns1 10.0.2.1 flags signal
+       pm_nl_set_limits $ns2 1 1
        run_tests $ns1 $ns2 10.0.1.1 0 -1 0 slow
        chk_join_nr "remove single address" 1 1 1
        chk_add_nr 1 1
@@ -1320,10 +1460,10 @@ remove_tests()
 
        # subflow and signal, remove
        reset
-       ip netns exec $ns1 ./pm_nl_ctl limits 0 2
-       ip netns exec $ns1 ./pm_nl_ctl add 10.0.2.1 flags signal
-       ip netns exec $ns2 ./pm_nl_ctl limits 1 2
-       ip netns exec $ns2 ./pm_nl_ctl add 10.0.3.2 flags subflow
+       pm_nl_set_limits $ns1 0 2
+       pm_nl_add_endpoint $ns1 10.0.2.1 flags signal
+       pm_nl_set_limits $ns2 1 2
+       pm_nl_add_endpoint $ns2 10.0.3.2 flags subflow
        run_tests $ns1 $ns2 10.0.1.1 0 -1 -1 slow
        chk_join_nr "remove subflow and signal" 2 2 2
        chk_add_nr 1 1
@@ -1331,11 +1471,11 @@ remove_tests()
 
        # subflows and signal, remove
        reset
-       ip netns exec $ns1 ./pm_nl_ctl limits 0 3
-       ip netns exec $ns1 ./pm_nl_ctl add 10.0.2.1 flags signal
-       ip netns exec $ns2 ./pm_nl_ctl limits 1 3
-       ip netns exec $ns2 ./pm_nl_ctl add 10.0.3.2 flags subflow
-       ip netns exec $ns2 ./pm_nl_ctl add 10.0.4.2 flags subflow
+       pm_nl_set_limits $ns1 0 3
+       pm_nl_add_endpoint $ns1 10.0.2.1 flags signal
+       pm_nl_set_limits $ns2 1 3
+       pm_nl_add_endpoint $ns2 10.0.3.2 flags subflow
+       pm_nl_add_endpoint $ns2 10.0.4.2 flags subflow
        run_tests $ns1 $ns2 10.0.1.1 0 -1 -2 slow
        chk_join_nr "remove subflows and signal" 3 3 3
        chk_add_nr 1 1
@@ -1343,11 +1483,11 @@ remove_tests()
 
        # addresses remove
        reset
-       ip netns exec $ns1 ./pm_nl_ctl limits 3 3
-       ip netns exec $ns1 ./pm_nl_ctl add 10.0.2.1 flags signal id 250
-       ip netns exec $ns1 ./pm_nl_ctl add 10.0.3.1 flags signal
-       ip netns exec $ns1 ./pm_nl_ctl add 10.0.4.1 flags signal
-       ip netns exec $ns2 ./pm_nl_ctl limits 3 3
+       pm_nl_set_limits $ns1 3 3
+       pm_nl_add_endpoint $ns1 10.0.2.1 flags signal id 250
+       pm_nl_add_endpoint $ns1 10.0.3.1 flags signal
+       pm_nl_add_endpoint $ns1 10.0.4.1 flags signal
+       pm_nl_set_limits $ns2 3 3
        run_tests $ns1 $ns2 10.0.1.1 0 -3 0 slow
        chk_join_nr "remove addresses" 3 3 3
        chk_add_nr 3 3
@@ -1355,11 +1495,11 @@ remove_tests()
 
        # invalid addresses remove
        reset
-       ip netns exec $ns1 ./pm_nl_ctl limits 3 3
-       ip netns exec $ns1 ./pm_nl_ctl add 10.0.12.1 flags signal
-       ip netns exec $ns1 ./pm_nl_ctl add 10.0.3.1 flags signal
-       ip netns exec $ns1 ./pm_nl_ctl add 10.0.14.1 flags signal
-       ip netns exec $ns2 ./pm_nl_ctl limits 3 3
+       pm_nl_set_limits $ns1 3 3
+       pm_nl_add_endpoint $ns1 10.0.12.1 flags signal
+       pm_nl_add_endpoint $ns1 10.0.3.1 flags signal
+       pm_nl_add_endpoint $ns1 10.0.14.1 flags signal
+       pm_nl_set_limits $ns2 3 3
        run_tests $ns1 $ns2 10.0.1.1 0 -3 0 slow
        chk_join_nr "remove invalid addresses" 1 1 1
        chk_add_nr 3 3
@@ -1367,11 +1507,11 @@ remove_tests()
 
        # subflows and signal, flush
        reset
-       ip netns exec $ns1 ./pm_nl_ctl limits 0 3
-       ip netns exec $ns1 ./pm_nl_ctl add 10.0.2.1 flags signal
-       ip netns exec $ns2 ./pm_nl_ctl limits 1 3
-       ip netns exec $ns2 ./pm_nl_ctl add 10.0.3.2 flags subflow
-       ip netns exec $ns2 ./pm_nl_ctl add 10.0.4.2 flags subflow
+       pm_nl_set_limits $ns1 0 3
+       pm_nl_add_endpoint $ns1 10.0.2.1 flags signal
+       pm_nl_set_limits $ns2 1 3
+       pm_nl_add_endpoint $ns2 10.0.3.2 flags subflow
+       pm_nl_add_endpoint $ns2 10.0.4.2 flags subflow
        run_tests $ns1 $ns2 10.0.1.1 0 -8 -8 slow
        chk_join_nr "flush subflows and signal" 3 3 3
        chk_add_nr 1 1
@@ -1379,22 +1519,22 @@ remove_tests()
 
        # subflows flush
        reset
-       ip netns exec $ns1 ./pm_nl_ctl limits 3 3
-       ip netns exec $ns2 ./pm_nl_ctl limits 3 3
-       ip netns exec $ns2 ./pm_nl_ctl add 10.0.2.2 flags subflow id 150
-       ip netns exec $ns2 ./pm_nl_ctl add 10.0.3.2 flags subflow
-       ip netns exec $ns2 ./pm_nl_ctl add 10.0.4.2 flags subflow
+       pm_nl_set_limits $ns1 3 3
+       pm_nl_set_limits $ns2 3 3
+       pm_nl_add_endpoint $ns2 10.0.2.2 flags subflow id 150
+       pm_nl_add_endpoint $ns2 10.0.3.2 flags subflow
+       pm_nl_add_endpoint $ns2 10.0.4.2 flags subflow
        run_tests $ns1 $ns2 10.0.1.1 0 -8 -8 slow
        chk_join_nr "flush subflows" 3 3 3
        chk_rm_nr 3 3
 
        # addresses flush
        reset
-       ip netns exec $ns1 ./pm_nl_ctl limits 3 3
-       ip netns exec $ns1 ./pm_nl_ctl add 10.0.2.1 flags signal id 250
-       ip netns exec $ns1 ./pm_nl_ctl add 10.0.3.1 flags signal
-       ip netns exec $ns1 ./pm_nl_ctl add 10.0.4.1 flags signal
-       ip netns exec $ns2 ./pm_nl_ctl limits 3 3
+       pm_nl_set_limits $ns1 3 3
+       pm_nl_add_endpoint $ns1 10.0.2.1 flags signal id 250
+       pm_nl_add_endpoint $ns1 10.0.3.1 flags signal
+       pm_nl_add_endpoint $ns1 10.0.4.1 flags signal
+       pm_nl_set_limits $ns2 3 3
        run_tests $ns1 $ns2 10.0.1.1 0 -8 -8 slow
        chk_join_nr "flush addresses" 3 3 3
        chk_add_nr 3 3
@@ -1402,11 +1542,11 @@ remove_tests()
 
        # invalid addresses flush
        reset
-       ip netns exec $ns1 ./pm_nl_ctl limits 3 3
-       ip netns exec $ns1 ./pm_nl_ctl add 10.0.12.1 flags signal
-       ip netns exec $ns1 ./pm_nl_ctl add 10.0.3.1 flags signal
-       ip netns exec $ns1 ./pm_nl_ctl add 10.0.14.1 flags signal
-       ip netns exec $ns2 ./pm_nl_ctl limits 3 3
+       pm_nl_set_limits $ns1 3 3
+       pm_nl_add_endpoint $ns1 10.0.12.1 flags signal
+       pm_nl_add_endpoint $ns1 10.0.3.1 flags signal
+       pm_nl_add_endpoint $ns1 10.0.14.1 flags signal
+       pm_nl_set_limits $ns2 3 3
        run_tests $ns1 $ns2 10.0.1.1 0 -8 0 slow
        chk_join_nr "flush invalid addresses" 1 1 1
        chk_add_nr 3 3
@@ -1414,18 +1554,18 @@ remove_tests()
 
        # remove id 0 subflow
        reset
-       ip netns exec $ns1 ./pm_nl_ctl limits 0 1
-       ip netns exec $ns2 ./pm_nl_ctl limits 0 1
-       ip netns exec $ns2 ./pm_nl_ctl add 10.0.3.2 flags subflow
+       pm_nl_set_limits $ns1 0 1
+       pm_nl_set_limits $ns2 0 1
+       pm_nl_add_endpoint $ns2 10.0.3.2 flags subflow
        run_tests $ns1 $ns2 10.0.1.1 0 0 -9 slow
        chk_join_nr "remove id 0 subflow" 1 1 1
        chk_rm_nr 1 1
 
        # remove id 0 address
        reset
-       ip netns exec $ns1 ./pm_nl_ctl limits 0 1
-       ip netns exec $ns1 ./pm_nl_ctl add 10.0.2.1 flags signal
-       ip netns exec $ns2 ./pm_nl_ctl limits 1 1
+       pm_nl_set_limits $ns1 0 1
+       pm_nl_add_endpoint $ns1 10.0.2.1 flags signal
+       pm_nl_set_limits $ns2 1 1
        run_tests $ns1 $ns2 10.0.1.1 0 -9 0 slow
        chk_join_nr "remove id 0 address" 1 1 1
        chk_add_nr 1 1
@@ -1436,37 +1576,37 @@ add_tests()
 {
        # add single subflow
        reset
-       ip netns exec $ns1 ./pm_nl_ctl limits 0 1
-       ip netns exec $ns2 ./pm_nl_ctl limits 0 1
+       pm_nl_set_limits $ns1 0 1
+       pm_nl_set_limits $ns2 0 1
        run_tests $ns1 $ns2 10.0.1.1 0 0 1 slow
        chk_join_nr "add single subflow" 1 1 1
 
        # add signal address
        reset
-       ip netns exec $ns1 ./pm_nl_ctl limits 0 1
-       ip netns exec $ns2 ./pm_nl_ctl limits 1 1
+       pm_nl_set_limits $ns1 0 1
+       pm_nl_set_limits $ns2 1 1
        run_tests $ns1 $ns2 10.0.1.1 0 1 0 slow
        chk_join_nr "add signal address" 1 1 1
        chk_add_nr 1 1
 
        # add multiple subflows
        reset
-       ip netns exec $ns1 ./pm_nl_ctl limits 0 2
-       ip netns exec $ns2 ./pm_nl_ctl limits 0 2
+       pm_nl_set_limits $ns1 0 2
+       pm_nl_set_limits $ns2 0 2
        run_tests $ns1 $ns2 10.0.1.1 0 0 2 slow
        chk_join_nr "add multiple subflows" 2 2 2
 
        # add multiple subflows IPv6
        reset
-       ip netns exec $ns1 ./pm_nl_ctl limits 0 2
-       ip netns exec $ns2 ./pm_nl_ctl limits 0 2
+       pm_nl_set_limits $ns1 0 2
+       pm_nl_set_limits $ns2 0 2
        run_tests $ns1 $ns2 dead:beef:1::1 0 0 2 slow
        chk_join_nr "add multiple subflows IPv6" 2 2 2
 
        # add multiple addresses IPv6
        reset
-       ip netns exec $ns1 ./pm_nl_ctl limits 0 2
-       ip netns exec $ns2 ./pm_nl_ctl limits 2 2
+       pm_nl_set_limits $ns1 0 2
+       pm_nl_set_limits $ns2 2 2
        run_tests $ns1 $ns2 dead:beef:1::1 0 2 0 slow
        chk_join_nr "add multiple addresses IPv6" 2 2 2
        chk_add_nr 2 2
@@ -1476,33 +1616,33 @@ ipv6_tests()
 {
        # subflow IPv6
        reset
-       ip netns exec $ns1 ./pm_nl_ctl limits 0 1
-       ip netns exec $ns2 ./pm_nl_ctl limits 0 1
-       ip netns exec $ns2 ./pm_nl_ctl add dead:beef:3::2 dev ns2eth3 flags subflow
+       pm_nl_set_limits $ns1 0 1
+       pm_nl_set_limits $ns2 0 1
+       pm_nl_add_endpoint $ns2 dead:beef:3::2 dev ns2eth3 flags subflow
        run_tests $ns1 $ns2 dead:beef:1::1 0 0 0 slow
        chk_join_nr "single subflow IPv6" 1 1 1
 
        # add_address, unused IPv6
        reset
-       ip netns exec $ns1 ./pm_nl_ctl add dead:beef:2::1 flags signal
+       pm_nl_add_endpoint $ns1 dead:beef:2::1 flags signal
        run_tests $ns1 $ns2 dead:beef:1::1 0 0 0 slow
        chk_join_nr "unused signal address IPv6" 0 0 0
        chk_add_nr 1 1
 
        # signal address IPv6
        reset
-       ip netns exec $ns1 ./pm_nl_ctl limits 0 1
-       ip netns exec $ns1 ./pm_nl_ctl add dead:beef:2::1 flags signal
-       ip netns exec $ns2 ./pm_nl_ctl limits 1 1
+       pm_nl_set_limits $ns1 0 1
+       pm_nl_add_endpoint $ns1 dead:beef:2::1 flags signal
+       pm_nl_set_limits $ns2 1 1
        run_tests $ns1 $ns2 dead:beef:1::1 0 0 0 slow
        chk_join_nr "single address IPv6" 1 1 1
        chk_add_nr 1 1
 
        # single address IPv6, remove
        reset
-       ip netns exec $ns1 ./pm_nl_ctl limits 0 1
-       ip netns exec $ns1 ./pm_nl_ctl add dead:beef:2::1 flags signal
-       ip netns exec $ns2 ./pm_nl_ctl limits 1 1
+       pm_nl_set_limits $ns1 0 1
+       pm_nl_add_endpoint $ns1 dead:beef:2::1 flags signal
+       pm_nl_set_limits $ns2 1 1
        run_tests $ns1 $ns2 dead:beef:1::1 0 -1 0 slow
        chk_join_nr "remove single address IPv6" 1 1 1
        chk_add_nr 1 1
@@ -1510,10 +1650,10 @@ ipv6_tests()
 
        # subflow and signal IPv6, remove
        reset
-       ip netns exec $ns1 ./pm_nl_ctl limits 0 2
-       ip netns exec $ns1 ./pm_nl_ctl add dead:beef:2::1 flags signal
-       ip netns exec $ns2 ./pm_nl_ctl limits 1 2
-       ip netns exec $ns2 ./pm_nl_ctl add dead:beef:3::2 dev ns2eth3 flags subflow
+       pm_nl_set_limits $ns1 0 2
+       pm_nl_add_endpoint $ns1 dead:beef:2::1 flags signal
+       pm_nl_set_limits $ns2 1 2
+       pm_nl_add_endpoint $ns2 dead:beef:3::2 dev ns2eth3 flags subflow
        run_tests $ns1 $ns2 dead:beef:1::1 0 -1 -1 slow
        chk_join_nr "remove subflow and signal IPv6" 2 2 2
        chk_add_nr 1 1
@@ -1524,76 +1664,76 @@ v4mapped_tests()
 {
        # subflow IPv4-mapped to IPv4-mapped
        reset
-       ip netns exec $ns1 ./pm_nl_ctl limits 0 1
-       ip netns exec $ns2 ./pm_nl_ctl limits 0 1
-       ip netns exec $ns2 ./pm_nl_ctl add "::ffff:10.0.3.2" flags subflow
+       pm_nl_set_limits $ns1 0 1
+       pm_nl_set_limits $ns2 0 1
+       pm_nl_add_endpoint $ns2 "::ffff:10.0.3.2" flags subflow
        run_tests $ns1 $ns2 "::ffff:10.0.1.1"
        chk_join_nr "single subflow IPv4-mapped" 1 1 1
 
        # signal address IPv4-mapped with IPv4-mapped sk
        reset
-       ip netns exec $ns1 ./pm_nl_ctl limits 0 1
-       ip netns exec $ns2 ./pm_nl_ctl limits 1 1
-       ip netns exec $ns1 ./pm_nl_ctl add "::ffff:10.0.2.1" flags signal
+       pm_nl_set_limits $ns1 0 1
+       pm_nl_set_limits $ns2 1 1
+       pm_nl_add_endpoint $ns1 "::ffff:10.0.2.1" flags signal
        run_tests $ns1 $ns2 "::ffff:10.0.1.1"
        chk_join_nr "signal address IPv4-mapped" 1 1 1
        chk_add_nr 1 1
 
        # subflow v4-map-v6
        reset
-       ip netns exec $ns1 ./pm_nl_ctl limits 0 1
-       ip netns exec $ns2 ./pm_nl_ctl limits 0 1
-       ip netns exec $ns2 ./pm_nl_ctl add 10.0.3.2 flags subflow
+       pm_nl_set_limits $ns1 0 1
+       pm_nl_set_limits $ns2 0 1
+       pm_nl_add_endpoint $ns2 10.0.3.2 flags subflow
        run_tests $ns1 $ns2 "::ffff:10.0.1.1"
        chk_join_nr "single subflow v4-map-v6" 1 1 1
 
        # signal address v4-map-v6
        reset
-       ip netns exec $ns1 ./pm_nl_ctl limits 0 1
-       ip netns exec $ns2 ./pm_nl_ctl limits 1 1
-       ip netns exec $ns1 ./pm_nl_ctl add 10.0.2.1 flags signal
+       pm_nl_set_limits $ns1 0 1
+       pm_nl_set_limits $ns2 1 1
+       pm_nl_add_endpoint $ns1 10.0.2.1 flags signal
        run_tests $ns1 $ns2 "::ffff:10.0.1.1"
        chk_join_nr "signal address v4-map-v6" 1 1 1
        chk_add_nr 1 1
 
        # subflow v6-map-v4
        reset
-       ip netns exec $ns1 ./pm_nl_ctl limits 0 1
-       ip netns exec $ns2 ./pm_nl_ctl limits 0 1
-       ip netns exec $ns2 ./pm_nl_ctl add "::ffff:10.0.3.2" flags subflow
+       pm_nl_set_limits $ns1 0 1
+       pm_nl_set_limits $ns2 0 1
+       pm_nl_add_endpoint $ns2 "::ffff:10.0.3.2" flags subflow
        run_tests $ns1 $ns2 10.0.1.1
        chk_join_nr "single subflow v6-map-v4" 1 1 1
 
        # signal address v6-map-v4
        reset
-       ip netns exec $ns1 ./pm_nl_ctl limits 0 1
-       ip netns exec $ns2 ./pm_nl_ctl limits 1 1
-       ip netns exec $ns1 ./pm_nl_ctl add "::ffff:10.0.2.1" flags signal
+       pm_nl_set_limits $ns1 0 1
+       pm_nl_set_limits $ns2 1 1
+       pm_nl_add_endpoint $ns1 "::ffff:10.0.2.1" flags signal
        run_tests $ns1 $ns2 10.0.1.1
        chk_join_nr "signal address v6-map-v4" 1 1 1
        chk_add_nr 1 1
 
        # no subflow IPv6 to v4 address
        reset
-       ip netns exec $ns1 ./pm_nl_ctl limits 0 1
-       ip netns exec $ns2 ./pm_nl_ctl limits 0 1
-       ip netns exec $ns2 ./pm_nl_ctl add dead:beef:2::2 flags subflow
+       pm_nl_set_limits $ns1 0 1
+       pm_nl_set_limits $ns2 0 1
+       pm_nl_add_endpoint $ns2 dead:beef:2::2 flags subflow
        run_tests $ns1 $ns2 10.0.1.1
        chk_join_nr "no JOIN with diff families v4-v6" 0 0 0
 
        # no subflow IPv6 to v4 address even if v6 has a valid v4 at the end
        reset
-       ip netns exec $ns1 ./pm_nl_ctl limits 0 1
-       ip netns exec $ns2 ./pm_nl_ctl limits 0 1
-       ip netns exec $ns2 ./pm_nl_ctl add dead:beef:2::10.0.3.2 flags subflow
+       pm_nl_set_limits $ns1 0 1
+       pm_nl_set_limits $ns2 0 1
+       pm_nl_add_endpoint $ns2 dead:beef:2::10.0.3.2 flags subflow
        run_tests $ns1 $ns2 10.0.1.1
        chk_join_nr "no JOIN with diff families v4-v6-2" 0 0 0
 
        # no subflow IPv4 to v6 address, no need to slow down too then
        reset
-       ip netns exec $ns1 ./pm_nl_ctl limits 0 1
-       ip netns exec $ns2 ./pm_nl_ctl limits 0 1
-       ip netns exec $ns2 ./pm_nl_ctl add 10.0.3.2 flags subflow
+       pm_nl_set_limits $ns1 0 1
+       pm_nl_set_limits $ns2 0 1
+       pm_nl_add_endpoint $ns2 10.0.3.2 flags subflow
        run_tests $ns1 $ns2 dead:beef:1::1
        chk_join_nr "no JOIN with diff families v6-v4" 0 0 0
 }
@@ -1602,50 +1742,60 @@ backup_tests()
 {
        # single subflow, backup
        reset
-       ip netns exec $ns1 ./pm_nl_ctl limits 0 1
-       ip netns exec $ns2 ./pm_nl_ctl limits 0 1
-       ip netns exec $ns2 ./pm_nl_ctl add 10.0.3.2 flags subflow,backup
+       pm_nl_set_limits $ns1 0 1
+       pm_nl_set_limits $ns2 0 1
+       pm_nl_add_endpoint $ns2 10.0.3.2 flags subflow,backup
        run_tests $ns1 $ns2 10.0.1.1 0 0 0 slow nobackup
        chk_join_nr "single subflow, backup" 1 1 1
        chk_prio_nr 0 1
 
        # single address, backup
        reset
-       ip netns exec $ns1 ./pm_nl_ctl limits 0 1
-       ip netns exec $ns1 ./pm_nl_ctl add 10.0.2.1 flags signal
-       ip netns exec $ns2 ./pm_nl_ctl limits 1 1
+       pm_nl_set_limits $ns1 0 1
+       pm_nl_add_endpoint $ns1 10.0.2.1 flags signal
+       pm_nl_set_limits $ns2 1 1
        run_tests $ns1 $ns2 10.0.1.1 0 0 0 slow backup
        chk_join_nr "single address, backup" 1 1 1
        chk_add_nr 1 1
        chk_prio_nr 1 0
+
+       # single address with port, backup
+       reset
+       pm_nl_set_limits $ns1 0 1
+       pm_nl_add_endpoint $ns1 10.0.2.1 flags signal port 10100
+       pm_nl_set_limits $ns2 1 1
+       run_tests $ns1 $ns2 10.0.1.1 0 0 0 slow backup
+       chk_join_nr "single address with port, backup" 1 1 1
+       chk_add_nr 1 1
+       chk_prio_nr 1 0
 }
 
 add_addr_ports_tests()
 {
        # signal address with port
        reset
-       ip netns exec $ns1 ./pm_nl_ctl limits 0 1
-       ip netns exec $ns2 ./pm_nl_ctl limits 1 1
-       ip netns exec $ns1 ./pm_nl_ctl add 10.0.2.1 flags signal port 10100
+       pm_nl_set_limits $ns1 0 1
+       pm_nl_set_limits $ns2 1 1
+       pm_nl_add_endpoint $ns1 10.0.2.1 flags signal port 10100
        run_tests $ns1 $ns2 10.0.1.1
        chk_join_nr "signal address with port" 1 1 1
        chk_add_nr 1 1 1
 
        # subflow and signal with port
        reset
-       ip netns exec $ns1 ./pm_nl_ctl add 10.0.2.1 flags signal port 10100
-       ip netns exec $ns1 ./pm_nl_ctl limits 0 2
-       ip netns exec $ns2 ./pm_nl_ctl limits 1 2
-       ip netns exec $ns2 ./pm_nl_ctl add 10.0.3.2 flags subflow
+       pm_nl_add_endpoint $ns1 10.0.2.1 flags signal port 10100
+       pm_nl_set_limits $ns1 0 2
+       pm_nl_set_limits $ns2 1 2
+       pm_nl_add_endpoint $ns2 10.0.3.2 flags subflow
        run_tests $ns1 $ns2 10.0.1.1
        chk_join_nr "subflow and signal with port" 2 2 2
        chk_add_nr 1 1 1
 
        # single address with port, remove
        reset
-       ip netns exec $ns1 ./pm_nl_ctl limits 0 1
-       ip netns exec $ns1 ./pm_nl_ctl add 10.0.2.1 flags signal port 10100
-       ip netns exec $ns2 ./pm_nl_ctl limits 1 1
+       pm_nl_set_limits $ns1 0 1
+       pm_nl_add_endpoint $ns1 10.0.2.1 flags signal port 10100
+       pm_nl_set_limits $ns2 1 1
        run_tests $ns1 $ns2 10.0.1.1 0 -1 0 slow
        chk_join_nr "remove single address with port" 1 1 1
        chk_add_nr 1 1 1
@@ -1653,10 +1803,10 @@ add_addr_ports_tests()
 
        # subflow and signal with port, remove
        reset
-       ip netns exec $ns1 ./pm_nl_ctl limits 0 2
-       ip netns exec $ns1 ./pm_nl_ctl add 10.0.2.1 flags signal port 10100
-       ip netns exec $ns2 ./pm_nl_ctl limits 1 2
-       ip netns exec $ns2 ./pm_nl_ctl add 10.0.3.2 flags subflow
+       pm_nl_set_limits $ns1 0 2
+       pm_nl_add_endpoint $ns1 10.0.2.1 flags signal port 10100
+       pm_nl_set_limits $ns2 1 2
+       pm_nl_add_endpoint $ns2 10.0.3.2 flags subflow
        run_tests $ns1 $ns2 10.0.1.1 0 -1 -1 slow
        chk_join_nr "remove subflow and signal with port" 2 2 2
        chk_add_nr 1 1 1
@@ -1664,11 +1814,11 @@ add_addr_ports_tests()
 
        # subflows and signal with port, flush
        reset
-       ip netns exec $ns1 ./pm_nl_ctl limits 0 3
-       ip netns exec $ns1 ./pm_nl_ctl add 10.0.2.1 flags signal port 10100
-       ip netns exec $ns2 ./pm_nl_ctl limits 1 3
-       ip netns exec $ns2 ./pm_nl_ctl add 10.0.3.2 flags subflow
-       ip netns exec $ns2 ./pm_nl_ctl add 10.0.4.2 flags subflow
+       pm_nl_set_limits $ns1 0 3
+       pm_nl_add_endpoint $ns1 10.0.2.1 flags signal port 10100
+       pm_nl_set_limits $ns2 1 3
+       pm_nl_add_endpoint $ns2 10.0.3.2 flags subflow
+       pm_nl_add_endpoint $ns2 10.0.4.2 flags subflow
        run_tests $ns1 $ns2 10.0.1.1 0 -8 -2 slow
        chk_join_nr "flush subflows and signal with port" 3 3 3
        chk_add_nr 1 1
@@ -1676,20 +1826,20 @@ add_addr_ports_tests()
 
        # multiple addresses with port
        reset
-       ip netns exec $ns1 ./pm_nl_ctl limits 2 2
-       ip netns exec $ns1 ./pm_nl_ctl add 10.0.2.1 flags signal port 10100
-       ip netns exec $ns1 ./pm_nl_ctl add 10.0.3.1 flags signal port 10100
-       ip netns exec $ns2 ./pm_nl_ctl limits 2 2
+       pm_nl_set_limits $ns1 2 2
+       pm_nl_add_endpoint $ns1 10.0.2.1 flags signal port 10100
+       pm_nl_add_endpoint $ns1 10.0.3.1 flags signal port 10100
+       pm_nl_set_limits $ns2 2 2
        run_tests $ns1 $ns2 10.0.1.1
        chk_join_nr "multiple addresses with port" 2 2 2
        chk_add_nr 2 2 2
 
        # multiple addresses with ports
        reset
-       ip netns exec $ns1 ./pm_nl_ctl limits 2 2
-       ip netns exec $ns1 ./pm_nl_ctl add 10.0.2.1 flags signal port 10100
-       ip netns exec $ns1 ./pm_nl_ctl add 10.0.3.1 flags signal port 10101
-       ip netns exec $ns2 ./pm_nl_ctl limits 2 2
+       pm_nl_set_limits $ns1 2 2
+       pm_nl_add_endpoint $ns1 10.0.2.1 flags signal port 10100
+       pm_nl_add_endpoint $ns1 10.0.3.1 flags signal port 10101
+       pm_nl_set_limits $ns2 2 2
        run_tests $ns1 $ns2 10.0.1.1
        chk_join_nr "multiple addresses with ports" 2 2 2
        chk_add_nr 2 2 2
@@ -1699,56 +1849,56 @@ syncookies_tests()
 {
        # single subflow, syncookies
        reset_with_cookies
-       ip netns exec $ns1 ./pm_nl_ctl limits 0 1
-       ip netns exec $ns2 ./pm_nl_ctl limits 0 1
-       ip netns exec $ns2 ./pm_nl_ctl add 10.0.3.2 flags subflow
+       pm_nl_set_limits $ns1 0 1
+       pm_nl_set_limits $ns2 0 1
+       pm_nl_add_endpoint $ns2 10.0.3.2 flags subflow
        run_tests $ns1 $ns2 10.0.1.1
        chk_join_nr "single subflow with syn cookies" 1 1 1
 
        # multiple subflows with syn cookies
        reset_with_cookies
-       ip netns exec $ns1 ./pm_nl_ctl limits 0 2
-       ip netns exec $ns2 ./pm_nl_ctl limits 0 2
-       ip netns exec $ns2 ./pm_nl_ctl add 10.0.3.2 flags subflow
-       ip netns exec $ns2 ./pm_nl_ctl add 10.0.2.2 flags subflow
+       pm_nl_set_limits $ns1 0 2
+       pm_nl_set_limits $ns2 0 2
+       pm_nl_add_endpoint $ns2 10.0.3.2 flags subflow
+       pm_nl_add_endpoint $ns2 10.0.2.2 flags subflow
        run_tests $ns1 $ns2 10.0.1.1
        chk_join_nr "multiple subflows with syn cookies" 2 2 2
 
        # multiple subflows limited by server
        reset_with_cookies
-       ip netns exec $ns1 ./pm_nl_ctl limits 0 1
-       ip netns exec $ns2 ./pm_nl_ctl limits 0 2
-       ip netns exec $ns2 ./pm_nl_ctl add 10.0.3.2 flags subflow
-       ip netns exec $ns2 ./pm_nl_ctl add 10.0.2.2 flags subflow
+       pm_nl_set_limits $ns1 0 1
+       pm_nl_set_limits $ns2 0 2
+       pm_nl_add_endpoint $ns2 10.0.3.2 flags subflow
+       pm_nl_add_endpoint $ns2 10.0.2.2 flags subflow
        run_tests $ns1 $ns2 10.0.1.1
        chk_join_nr "subflows limited by server w cookies" 2 1 1
 
        # test signal address with cookies
        reset_with_cookies
-       ip netns exec $ns1 ./pm_nl_ctl limits 0 1
-       ip netns exec $ns2 ./pm_nl_ctl limits 1 1
-       ip netns exec $ns1 ./pm_nl_ctl add 10.0.2.1 flags signal
+       pm_nl_set_limits $ns1 0 1
+       pm_nl_set_limits $ns2 1 1
+       pm_nl_add_endpoint $ns1 10.0.2.1 flags signal
        run_tests $ns1 $ns2 10.0.1.1
        chk_join_nr "signal address with syn cookies" 1 1 1
        chk_add_nr 1 1
 
        # test cookie with subflow and signal
        reset_with_cookies
-       ip netns exec $ns1 ./pm_nl_ctl add 10.0.2.1 flags signal
-       ip netns exec $ns1 ./pm_nl_ctl limits 0 2
-       ip netns exec $ns2 ./pm_nl_ctl limits 1 2
-       ip netns exec $ns2 ./pm_nl_ctl add 10.0.3.2 flags subflow
+       pm_nl_add_endpoint $ns1 10.0.2.1 flags signal
+       pm_nl_set_limits $ns1 0 2
+       pm_nl_set_limits $ns2 1 2
+       pm_nl_add_endpoint $ns2 10.0.3.2 flags subflow
        run_tests $ns1 $ns2 10.0.1.1
        chk_join_nr "subflow and signal w cookies" 2 2 2
        chk_add_nr 1 1
 
        # accept and use add_addr with additional subflows
        reset_with_cookies
-       ip netns exec $ns1 ./pm_nl_ctl limits 0 3
-       ip netns exec $ns1 ./pm_nl_ctl add 10.0.2.1 flags signal
-       ip netns exec $ns2 ./pm_nl_ctl limits 1 3
-       ip netns exec $ns2 ./pm_nl_ctl add 10.0.3.2 flags subflow
-       ip netns exec $ns2 ./pm_nl_ctl add 10.0.4.2 flags subflow
+       pm_nl_set_limits $ns1 0 3
+       pm_nl_add_endpoint $ns1 10.0.2.1 flags signal
+       pm_nl_set_limits $ns2 1 3
+       pm_nl_add_endpoint $ns2 10.0.3.2 flags subflow
+       pm_nl_add_endpoint $ns2 10.0.4.2 flags subflow
        run_tests $ns1 $ns2 10.0.1.1
        chk_join_nr "subflows and signal w. cookies" 3 3 3
        chk_add_nr 1 1
@@ -1758,29 +1908,29 @@ checksum_tests()
 {
        # checksum test 0 0
        reset_with_checksum 0 0
-       ip netns exec $ns1 ./pm_nl_ctl limits 0 1
-       ip netns exec $ns2 ./pm_nl_ctl limits 0 1
+       pm_nl_set_limits $ns1 0 1
+       pm_nl_set_limits $ns2 0 1
        run_tests $ns1 $ns2 10.0.1.1
        chk_csum_nr "checksum test 0 0"
 
        # checksum test 1 1
        reset_with_checksum 1 1
-       ip netns exec $ns1 ./pm_nl_ctl limits 0 1
-       ip netns exec $ns2 ./pm_nl_ctl limits 0 1
+       pm_nl_set_limits $ns1 0 1
+       pm_nl_set_limits $ns2 0 1
        run_tests $ns1 $ns2 10.0.1.1
        chk_csum_nr "checksum test 1 1"
 
        # checksum test 0 1
        reset_with_checksum 0 1
-       ip netns exec $ns1 ./pm_nl_ctl limits 0 1
-       ip netns exec $ns2 ./pm_nl_ctl limits 0 1
+       pm_nl_set_limits $ns1 0 1
+       pm_nl_set_limits $ns2 0 1
        run_tests $ns1 $ns2 10.0.1.1
        chk_csum_nr "checksum test 0 1"
 
        # checksum test 1 0
        reset_with_checksum 1 0
-       ip netns exec $ns1 ./pm_nl_ctl limits 0 1
-       ip netns exec $ns2 ./pm_nl_ctl limits 0 1
+       pm_nl_set_limits $ns1 0 1
+       pm_nl_set_limits $ns2 0 1
        run_tests $ns1 $ns2 10.0.1.1
        chk_csum_nr "checksum test 1 0"
 }
@@ -1789,26 +1939,26 @@ deny_join_id0_tests()
 {
        # subflow allow join id0 ns1
        reset_with_allow_join_id0 1 0
-       ip netns exec $ns1 ./pm_nl_ctl limits 1 1
-       ip netns exec $ns2 ./pm_nl_ctl limits 1 1
-       ip netns exec $ns2 ./pm_nl_ctl add 10.0.3.2 flags subflow
+       pm_nl_set_limits $ns1 1 1
+       pm_nl_set_limits $ns2 1 1
+       pm_nl_add_endpoint $ns2 10.0.3.2 flags subflow
        run_tests $ns1 $ns2 10.0.1.1
        chk_join_nr "single subflow allow join id0 ns1" 1 1 1
 
        # subflow allow join id0 ns2
        reset_with_allow_join_id0 0 1
-       ip netns exec $ns1 ./pm_nl_ctl limits 1 1
-       ip netns exec $ns2 ./pm_nl_ctl limits 1 1
-       ip netns exec $ns2 ./pm_nl_ctl add 10.0.3.2 flags subflow
+       pm_nl_set_limits $ns1 1 1
+       pm_nl_set_limits $ns2 1 1
+       pm_nl_add_endpoint $ns2 10.0.3.2 flags subflow
        run_tests $ns1 $ns2 10.0.1.1
        chk_join_nr "single subflow allow join id0 ns2" 0 0 0
 
        # signal address allow join id0 ns1
        # ADD_ADDRs are not affected by allow_join_id0 value.
        reset_with_allow_join_id0 1 0
-       ip netns exec $ns1 ./pm_nl_ctl limits 1 1
-       ip netns exec $ns2 ./pm_nl_ctl limits 1 1
-       ip netns exec $ns1 ./pm_nl_ctl add 10.0.2.1 flags signal
+       pm_nl_set_limits $ns1 1 1
+       pm_nl_set_limits $ns2 1 1
+       pm_nl_add_endpoint $ns1 10.0.2.1 flags signal
        run_tests $ns1 $ns2 10.0.1.1
        chk_join_nr "signal address allow join id0 ns1" 1 1 1
        chk_add_nr 1 1
@@ -1816,28 +1966,28 @@ deny_join_id0_tests()
        # signal address allow join id0 ns2
        # ADD_ADDRs are not affected by allow_join_id0 value.
        reset_with_allow_join_id0 0 1
-       ip netns exec $ns1 ./pm_nl_ctl limits 1 1
-       ip netns exec $ns2 ./pm_nl_ctl limits 1 1
-       ip netns exec $ns1 ./pm_nl_ctl add 10.0.2.1 flags signal
+       pm_nl_set_limits $ns1 1 1
+       pm_nl_set_limits $ns2 1 1
+       pm_nl_add_endpoint $ns1 10.0.2.1 flags signal
        run_tests $ns1 $ns2 10.0.1.1
        chk_join_nr "signal address allow join id0 ns2" 1 1 1
        chk_add_nr 1 1
 
        # subflow and address allow join id0 ns1
        reset_with_allow_join_id0 1 0
-       ip netns exec $ns1 ./pm_nl_ctl limits 2 2
-       ip netns exec $ns2 ./pm_nl_ctl limits 2 2
-       ip netns exec $ns1 ./pm_nl_ctl add 10.0.2.1 flags signal
-       ip netns exec $ns2 ./pm_nl_ctl add 10.0.3.2 flags subflow
+       pm_nl_set_limits $ns1 2 2
+       pm_nl_set_limits $ns2 2 2
+       pm_nl_add_endpoint $ns1 10.0.2.1 flags signal
+       pm_nl_add_endpoint $ns2 10.0.3.2 flags subflow
        run_tests $ns1 $ns2 10.0.1.1
        chk_join_nr "subflow and address allow join id0 1" 2 2 2
 
        # subflow and address allow join id0 ns2
        reset_with_allow_join_id0 0 1
-       ip netns exec $ns1 ./pm_nl_ctl limits 2 2
-       ip netns exec $ns2 ./pm_nl_ctl limits 2 2
-       ip netns exec $ns1 ./pm_nl_ctl add 10.0.2.1 flags signal
-       ip netns exec $ns2 ./pm_nl_ctl add 10.0.3.2 flags subflow
+       pm_nl_set_limits $ns1 2 2
+       pm_nl_set_limits $ns2 2 2
+       pm_nl_add_endpoint $ns1 10.0.2.1 flags signal
+       pm_nl_add_endpoint $ns2 10.0.3.2 flags subflow
        run_tests $ns1 $ns2 10.0.1.1
        chk_join_nr "subflow and address allow join id0 2" 1 1 1
 }
@@ -1848,10 +1998,10 @@ fullmesh_tests()
        # 2 fullmesh addrs in ns2, added before the connection,
        # 1 non-fullmesh addr in ns1, added during the connection.
        reset
-       ip netns exec $ns1 ./pm_nl_ctl limits 0 4
-       ip netns exec $ns2 ./pm_nl_ctl limits 1 4
-       ip netns exec $ns2 ./pm_nl_ctl add 10.0.2.2 flags subflow,fullmesh
-       ip netns exec $ns2 ./pm_nl_ctl add 10.0.3.2 flags subflow,fullmesh
+       pm_nl_set_limits $ns1 0 4
+       pm_nl_set_limits $ns2 1 4
+       pm_nl_add_endpoint $ns2 10.0.2.2 flags subflow,fullmesh
+       pm_nl_add_endpoint $ns2 10.0.3.2 flags subflow,fullmesh
        run_tests $ns1 $ns2 10.0.1.1 0 1 0 slow
        chk_join_nr "fullmesh test 2x1" 4 4 4
        chk_add_nr 1 1
@@ -1860,9 +2010,9 @@ fullmesh_tests()
        # 1 non-fullmesh addr in ns1, added before the connection,
        # 1 fullmesh addr in ns2, added during the connection.
        reset
-       ip netns exec $ns1 ./pm_nl_ctl limits 1 3
-       ip netns exec $ns2 ./pm_nl_ctl limits 1 3
-       ip netns exec $ns1 ./pm_nl_ctl add 10.0.2.1 flags signal
+       pm_nl_set_limits $ns1 1 3
+       pm_nl_set_limits $ns2 1 3
+       pm_nl_add_endpoint $ns1 10.0.2.1 flags signal
        run_tests $ns1 $ns2 10.0.1.1 0 0 fullmesh_1 slow
        chk_join_nr "fullmesh test 1x1" 3 3 3
        chk_add_nr 1 1
@@ -1871,9 +2021,9 @@ fullmesh_tests()
        # 1 non-fullmesh addr in ns1, added before the connection,
        # 2 fullmesh addrs in ns2, added during the connection.
        reset
-       ip netns exec $ns1 ./pm_nl_ctl limits 2 5
-       ip netns exec $ns2 ./pm_nl_ctl limits 1 5
-       ip netns exec $ns1 ./pm_nl_ctl add 10.0.2.1 flags signal
+       pm_nl_set_limits $ns1 2 5
+       pm_nl_set_limits $ns2 1 5
+       pm_nl_add_endpoint $ns1 10.0.2.1 flags signal
        run_tests $ns1 $ns2 10.0.1.1 0 0 fullmesh_2 slow
        chk_join_nr "fullmesh test 1x2" 5 5 5
        chk_add_nr 1 1
@@ -1883,12 +2033,50 @@ fullmesh_tests()
        # 2 fullmesh addrs in ns2, added during the connection,
        # limit max_subflows to 4.
        reset
-       ip netns exec $ns1 ./pm_nl_ctl limits 2 4
-       ip netns exec $ns2 ./pm_nl_ctl limits 1 4
-       ip netns exec $ns1 ./pm_nl_ctl add 10.0.2.1 flags signal
+       pm_nl_set_limits $ns1 2 4
+       pm_nl_set_limits $ns2 1 4
+       pm_nl_add_endpoint $ns1 10.0.2.1 flags signal
        run_tests $ns1 $ns2 10.0.1.1 0 0 fullmesh_2 slow
        chk_join_nr "fullmesh test 1x2, limited" 4 4 4
        chk_add_nr 1 1
+
+       # set fullmesh flag
+       reset
+       pm_nl_set_limits $ns1 4 4
+       pm_nl_add_endpoint $ns1 10.0.2.1 flags subflow
+       pm_nl_set_limits $ns2 4 4
+       run_tests $ns1 $ns2 10.0.1.1 0 0 1 slow fullmesh
+       chk_join_nr "set fullmesh flag test" 2 2 2
+       chk_rm_nr 0 1
+
+       # set nofullmesh flag
+       reset
+       pm_nl_set_limits $ns1 4 4
+       pm_nl_add_endpoint $ns1 10.0.2.1 flags subflow,fullmesh
+       pm_nl_set_limits $ns2 4 4
+       run_tests $ns1 $ns2 10.0.1.1 0 0 fullmesh_1 slow nofullmesh
+       chk_join_nr "set nofullmesh flag test" 2 2 2
+       chk_rm_nr 0 1
+
+       # set backup,fullmesh flags
+       reset
+       pm_nl_set_limits $ns1 4 4
+       pm_nl_add_endpoint $ns1 10.0.2.1 flags subflow
+       pm_nl_set_limits $ns2 4 4
+       run_tests $ns1 $ns2 10.0.1.1 0 0 1 slow backup,fullmesh
+       chk_join_nr "set backup,fullmesh flags test" 2 2 2
+       chk_prio_nr 0 1
+       chk_rm_nr 0 1
+
+       # set nobackup,nofullmesh flags
+       reset
+       pm_nl_set_limits $ns1 4 4
+       pm_nl_set_limits $ns2 4 4
+       pm_nl_add_endpoint $ns2 10.0.2.2 flags subflow,backup,fullmesh
+       run_tests $ns1 $ns2 10.0.1.1 0 0 0 slow nobackup,nofullmesh
+       chk_join_nr "set nobackup,nofullmesh flags test" 2 2 2
+       chk_prio_nr 0 1
+       chk_rm_nr 0 1
 }
 
 all_tests()
@@ -1930,6 +2118,7 @@ usage()
        echo "  -m fullmesh_tests"
        echo "  -c capture pcap files"
        echo "  -C enable data checksum"
+       echo "  -i use ip mptcp"
        echo "  -h help"
 }
 
@@ -1951,9 +2140,12 @@ for arg in "$@"; do
        if [[ "${arg}" =~ ^"-"[0-9a-zA-Z]*"C"[0-9a-zA-Z]*$ ]]; then
                checksum=1
        fi
+       if [[ "${arg}" =~ ^"-"[0-9a-zA-Z]*"i"[0-9a-zA-Z]*$ ]]; then
+               ip_mptcp=1
+       fi
 
-       # exception for the capture/checksum options, the rest means: a part of the tests
-       if [ "${arg}" != "-c" ] && [ "${arg}" != "-C" ]; then
+       # exception for the capture/checksum/ip_mptcp options, the rest means: a part of the tests
+       if [ "${arg}" != "-c" ] && [ "${arg}" != "-C" ] && [ "${arg}" != "-i" ]; then
                do_all_tests=0
        fi
 done
@@ -1963,7 +2155,7 @@ if [ $do_all_tests -eq 1 ]; then
        exit $ret
 fi
 
-while getopts 'fesltra64bpkdmchCS' opt; do
+while getopts 'fesltra64bpkdmchCSi' opt; do
        case $opt in
                f)
                        subflows_tests
@@ -2014,6 +2206,8 @@ while getopts 'fesltra64bpkdmchCS' opt; do
                        ;;
                C)
                        ;;
+               i)
+                       ;;
                h | *)
                        usage
                        ;;
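
  [note] The pm_nl_set_limits and pm_nl_add_endpoint calls used throughout the hunks above are wrapper functions defined in an earlier part of mptcp_join.sh that is not shown in this diff. Together with the new "-i" switch they let every test case drive the path manager either through the pm_nl_ctl helper binary or through the iproute2 "ip mptcp" front end. A minimal sketch of the dispatch pattern, assuming the ip_mptcp variable set by "-i" (illustrative only, not the verbatim upstream helper):

	pm_nl_set_limits()
	{
		local ns=$1
		local addrs=$2
		local subflows=$3

		if [ $ip_mptcp -eq 1 ]; then
			# iproute2 front end, executed inside the namespace
			ip -n $ns mptcp limits set add_addr_accepted $addrs subflows $subflows
		else
			# legacy dedicated helper binary
			ip netns exec $ns ./pm_nl_ctl limits $addrs $subflows
		fi
	}
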
index cbacf9f..89839d1 100755 (executable)
@@ -164,4 +164,22 @@ id 253 flags  10.0.0.5
 id 254 flags  10.0.0.2
 id 255 flags  10.0.0.3" "wrap-around ids"
 
+ip netns exec $ns1 ./pm_nl_ctl flush
+ip netns exec $ns1 ./pm_nl_ctl add 10.0.1.1 flags subflow
+ip netns exec $ns1 ./pm_nl_ctl set 10.0.1.1 flags backup
+check "ip netns exec $ns1 ./pm_nl_ctl dump" "id 1 flags \
+subflow,backup 10.0.1.1" "set flags (backup)"
+ip netns exec $ns1 ./pm_nl_ctl set 10.0.1.1 flags nobackup
+check "ip netns exec $ns1 ./pm_nl_ctl dump" "id 1 flags \
+subflow 10.0.1.1" "          (nobackup)"
+ip netns exec $ns1 ./pm_nl_ctl set id 1 flags fullmesh
+check "ip netns exec $ns1 ./pm_nl_ctl dump" "id 1 flags \
+subflow,fullmesh 10.0.1.1" "          (fullmesh)"
+ip netns exec $ns1 ./pm_nl_ctl set id 1 flags nofullmesh
+check "ip netns exec $ns1 ./pm_nl_ctl dump" "id 1 flags \
+subflow 10.0.1.1" "          (nofullmesh)"
+ip netns exec $ns1 ./pm_nl_ctl set id 1 flags backup,fullmesh
+check "ip netns exec $ns1 ./pm_nl_ctl dump" "id 1 flags \
+subflow,backup,fullmesh 10.0.1.1" "          (backup,fullmesh)"
+
 exit $ret
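
  [note] The checks appended above exercise the extended "set" subcommand end to end: an endpoint can be selected either by its address or, with the new "id" form, by its endpoint id, and the [no]backup/[no]fullmesh tokens toggle the matching per-endpoint flags. The same flow outside the harness, with the dump output the checks expect shown as comments:

	ip netns exec "$ns1" ./pm_nl_ctl add 10.0.1.1 flags subflow
	ip netns exec "$ns1" ./pm_nl_ctl set 10.0.1.1 flags backup   # lookup by address
	ip netns exec "$ns1" ./pm_nl_ctl dump
	# id 1 flags subflow,backup 10.0.1.1
	ip netns exec "$ns1" ./pm_nl_ctl set id 1 flags nobackup     # lookup by id
	ip netns exec "$ns1" ./pm_nl_ctl dump
	# id 1 flags subflow 10.0.1.1
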
index 3547845..22a5ec1 100644 (file)
@@ -28,7 +28,7 @@ static void syntax(char *argv[])
        fprintf(stderr, "\tadd [flags signal|subflow|backup|fullmesh] [id <nr>] [dev <name>] <ip>\n");
        fprintf(stderr, "\tdel <id> [<ip>]\n");
        fprintf(stderr, "\tget <id>\n");
-       fprintf(stderr, "\tset <ip> [flags backup|nobackup]\n");
+       fprintf(stderr, "\tset [<ip>] [id <nr>] flags [no]backup|[no]fullmesh [port <nr>]\n");
        fprintf(stderr, "\tflush\n");
        fprintf(stderr, "\tdump\n");
        fprintf(stderr, "\tlimits [<rcv addr max> <subflow max>]\n");
@@ -657,8 +657,10 @@ int set_flags(int fd, int pm_family, int argc, char *argv[])
        u_int32_t flags = 0;
        u_int16_t family;
        int nest_start;
+       int use_id = 0;
+       u_int8_t id;
        int off = 0;
-       int arg;
+       int arg = 2;
 
        memset(data, 0, sizeof(data));
        nh = (void *)data;
@@ -674,29 +676,45 @@ int set_flags(int fd, int pm_family, int argc, char *argv[])
        nest->rta_len = RTA_LENGTH(0);
        off += NLMSG_ALIGN(nest->rta_len);
 
-       /* addr data */
-       rta = (void *)(data + off);
-       if (inet_pton(AF_INET, argv[2], RTA_DATA(rta))) {
-               family = AF_INET;
-               rta->rta_type = MPTCP_PM_ADDR_ATTR_ADDR4;
-               rta->rta_len = RTA_LENGTH(4);
-       } else if (inet_pton(AF_INET6, argv[2], RTA_DATA(rta))) {
-               family = AF_INET6;
-               rta->rta_type = MPTCP_PM_ADDR_ATTR_ADDR6;
-               rta->rta_len = RTA_LENGTH(16);
+       if (!strcmp(argv[arg], "id")) {
+               if (++arg >= argc)
+                       error(1, 0, " missing id value");
+
+               use_id = 1;
+               id = atoi(argv[arg]);
+               rta = (void *)(data + off);
+               rta->rta_type = MPTCP_PM_ADDR_ATTR_ID;
+               rta->rta_len = RTA_LENGTH(1);
+               memcpy(RTA_DATA(rta), &id, 1);
+               off += NLMSG_ALIGN(rta->rta_len);
        } else {
-               error(1, errno, "can't parse ip %s", argv[2]);
+               /* addr data */
+               rta = (void *)(data + off);
+               if (inet_pton(AF_INET, argv[arg], RTA_DATA(rta))) {
+                       family = AF_INET;
+                       rta->rta_type = MPTCP_PM_ADDR_ATTR_ADDR4;
+                       rta->rta_len = RTA_LENGTH(4);
+               } else if (inet_pton(AF_INET6, argv[arg], RTA_DATA(rta))) {
+                       family = AF_INET6;
+                       rta->rta_type = MPTCP_PM_ADDR_ATTR_ADDR6;
+                       rta->rta_len = RTA_LENGTH(16);
+               } else {
+                       error(1, errno, "can't parse ip %s", argv[arg]);
+               }
+               off += NLMSG_ALIGN(rta->rta_len);
+
+               /* family */
+               rta = (void *)(data + off);
+               rta->rta_type = MPTCP_PM_ADDR_ATTR_FAMILY;
+               rta->rta_len = RTA_LENGTH(2);
+               memcpy(RTA_DATA(rta), &family, 2);
+               off += NLMSG_ALIGN(rta->rta_len);
        }
-       off += NLMSG_ALIGN(rta->rta_len);
 
-       /* family */
-       rta = (void *)(data + off);
-       rta->rta_type = MPTCP_PM_ADDR_ATTR_FAMILY;
-       rta->rta_len = RTA_LENGTH(2);
-       memcpy(RTA_DATA(rta), &family, 2);
-       off += NLMSG_ALIGN(rta->rta_len);
+       if (++arg >= argc)
+               error(1, 0, " missing flags keyword");
 
-       for (arg = 3; arg < argc; arg++) {
+       for (; arg < argc; arg++) {
                if (!strcmp(argv[arg], "flags")) {
                        char *tok, *str;
 
@@ -704,12 +722,14 @@ int set_flags(int fd, int pm_family, int argc, char *argv[])
                        if (++arg >= argc)
                                error(1, 0, " missing flags value");
 
-                       /* do not support flag list yet */
                        for (str = argv[arg]; (tok = strtok(str, ","));
                             str = NULL) {
                                if (!strcmp(tok, "backup"))
                                        flags |= MPTCP_PM_ADDR_FLAG_BACKUP;
-                               else if (strcmp(tok, "nobackup"))
+                               else if (!strcmp(tok, "fullmesh"))
+                                       flags |= MPTCP_PM_ADDR_FLAG_FULLMESH;
+                               else if (strcmp(tok, "nobackup") &&
+                                        strcmp(tok, "nofullmesh"))
                                        error(1, errno,
                                              "unknown flag %s", argv[arg]);
                        }
@@ -719,6 +739,21 @@ int set_flags(int fd, int pm_family, int argc, char *argv[])
                        rta->rta_len = RTA_LENGTH(4);
                        memcpy(RTA_DATA(rta), &flags, 4);
                        off += NLMSG_ALIGN(rta->rta_len);
+               } else if (!strcmp(argv[arg], "port")) {
+                       u_int16_t port;
+
+                       if (use_id)
+                               error(1, 0, " port can't be used with id");
+
+                       if (++arg >= argc)
+                               error(1, 0, " missing port value");
+
+                       port = atoi(argv[arg]);
+                       rta = (void *)(data + off);
+                       rta->rta_type = MPTCP_PM_ADDR_ATTR_PORT;
+                       rta->rta_len = RTA_LENGTH(2);
+                       memcpy(RTA_DATA(rta), &port, 2);
+                       off += NLMSG_ALIGN(rta->rta_len);
                } else {
                        error(1, 0, "unknown keyword %s", argv[arg]);
                }
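
  [note] The reworked set_flags() parser above accepts the three shapes advertised by the updated syntax string: the netlink request carries either an address attribute plus its family, or an id attribute, never both, and "port" is rejected when the endpoint is selected by id. A usage sketch of the new forms (addresses, id and port values are placeholders):

	./pm_nl_ctl set 10.0.1.1 flags backup             # select endpoint by address
	./pm_nl_ctl set id 1 flags nobackup,fullmesh      # select by id, comma-separated flag list
	./pm_nl_ctl set 10.0.2.1 flags backup port 10100  # port is only valid with an address
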
index c9ce3df..0900c54 100755 (executable)
@@ -216,9 +216,9 @@ kci_test_route_get()
        check_err $?
        ip route get fe80::1 dev "$devdummy" > /dev/null
        check_err $?
-       ip route get 127.0.0.1 from 127.0.0.1 oif lo tos 0x1 mark 0x1 > /dev/null
+       ip route get 127.0.0.1 from 127.0.0.1 oif lo tos 0x10 mark 0x1 > /dev/null
        check_err $?
-       ip route get ::1 from ::1 iif lo oif lo tos 0x1 mark 0x1 > /dev/null
+       ip route get ::1 from ::1 iif lo oif lo tos 0x10 mark 0x1 > /dev/null
        check_err $?
        ip addr add dev "$devdummy" 10.23.7.11/24
        check_err $?
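
  [note] On the tos change in the two route-get checks above: 0x1 sits entirely in the ECN bits, which the kernel masks out of route lookups, so the old invocation effectively asked for tos 0, while 0x10 lies in the TOS/DSCP field and survives the mask. Sketching the arithmetic (assuming the usual RT_TOS()/IPTOS_RT_MASK masking):

	# tos is masked with IPTOS_RT_MASK (0x1c) before the lookup:
	#   0x01 & 0x1c == 0x00   old value, ECN bits only, silently dropped
	#   0x10 & 0x1c == 0x10   new value, real TOS bits, actually honored
	ip route get 127.0.0.1 from 127.0.0.1 oif lo tos 0x10 mark 0x1
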
index aee631c..044bc0e 100644 (file)
@@ -325,8 +325,8 @@ int main(int argc, char **argv)
        struct ifreq device;
        struct ifreq hwtstamp;
        struct hwtstamp_config hwconfig, hwconfig_requested;
-       struct so_timestamping so_timestamping_get = { 0, -1 };
-       struct so_timestamping so_timestamping = { 0, -1 };
+       struct so_timestamping so_timestamping_get = { 0, 0 };
+       struct so_timestamping so_timestamping = { 0, 0 };
        struct sockaddr_in addr;
        struct ip_mreq imr;
        struct in_addr iaddr;