Merge tag 'mac80211-next-for-davem-2017-10-11' of git://git.kernel.org/pub/scm/linux...
author	David S. Miller <davem@davemloft.net>
	Wed, 11 Oct 2017 17:15:01 +0000 (10:15 -0700)
committer	David S. Miller <davem@davemloft.net>
	Wed, 11 Oct 2017 17:15:01 +0000 (10:15 -0700)
Johannes Berg says:

====================
Work continues in various areas:
 * port authorized event for 4-way-HS offload (Avi)
 * make MFP optional for such devices (Emmanuel)
 * Kees's timer setup patch for mac80211 mesh
   (the part that isn't trivially scripted)
 * improve VLAN vs. TXQ handling (myself)
 * load regulatory database as firmware file (myself)
 * with various other small improvements and cleanups

I merged net-next once in the meantime to allow Kees's
timer setup patch to go in.
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
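
(For reference: the "timer setup patch" mentioned above refers to the timer_setup()/from_timer() API conversion; the non-scripted part is changing the callback signature and recovering the containing structure. The sketch below only illustrates that pattern — the struct, field, and function names are made up for illustration and are not the actual mac80211 mesh code.)

#include <linux/timer.h>
#include <linux/workqueue.h>

struct mesh_state {
	struct timer_list housekeeping_timer;	/* illustrative field name */
	struct work_struct housekeeping_work;
};

static void mesh_housekeeping_work(struct work_struct *work)
{
	/* periodic housekeeping would go here */
}

/*
 * Old style: the callback took an unsigned long cookie passed at setup time,
 *   setup_timer(&ms->housekeeping_timer, fn, (unsigned long)ms);
 * New style: the callback receives the timer itself and recovers the
 * containing structure with from_timer(), a container_of() wrapper.
 */
static void housekeeping_timer_fn(struct timer_list *t)
{
	struct mesh_state *ms = from_timer(ms, t, housekeeping_timer);

	schedule_work(&ms->housekeeping_work);
}

static void mesh_state_init(struct mesh_state *ms)
{
	INIT_WORK(&ms->housekeeping_work, mesh_housekeeping_work);
	timer_setup(&ms->housekeeping_timer, housekeeping_timer_fn, 0);
}
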
368 files changed:
Documentation/devicetree/bindings/mmc/marvell,xenon-sdhci.txt
Documentation/filesystems/overlayfs.txt
Documentation/i2c/busses/i2c-i801
Documentation/networking/bonding.txt
Documentation/networking/netvsc.txt
MAINTAINERS
Makefile
arch/Kconfig
arch/arc/Kconfig
arch/arc/Makefile
arch/arc/boot/dts/axs10x_mb.dtsi
arch/arc/boot/dts/hsdk.dts
arch/arc/configs/axs101_defconfig
arch/arc/configs/axs103_defconfig
arch/arc/configs/axs103_smp_defconfig
arch/arc/configs/haps_hs_smp_defconfig
arch/arc/configs/hsdk_defconfig
arch/arc/configs/vdk_hs38_defconfig
arch/arc/configs/vdk_hs38_smp_defconfig
arch/arc/include/asm/arcregs.h
arch/arc/kernel/setup.c
arch/arc/plat-axs10x/axs10x.c
arch/arc/plat-hsdk/Kconfig
arch/arc/plat-hsdk/platform.c
arch/arm64/include/asm/memory.h
arch/arm64/kernel/armv8_deprecated.c
arch/arm64/kernel/cpufeature.c
arch/arm64/kernel/fpsimd.c
arch/arm64/mm/fault.c
arch/parisc/kernel/process.c
arch/powerpc/kernel/dt_cpu_ftrs.c
arch/powerpc/kernel/exceptions-64s.S
arch/powerpc/kernel/mce_power.c
arch/powerpc/kernel/setup-common.c
arch/powerpc/kernel/signal_64.c
arch/powerpc/kernel/watchdog.c
arch/powerpc/kvm/book3s_xive.c
arch/powerpc/kvm/book3s_xive.h
arch/powerpc/mm/pgtable_32.c
arch/powerpc/platforms/powernv/setup.c
arch/powerpc/sysdev/xive/common.c
arch/powerpc/sysdev/xive/spapr.c
arch/sparc/Kconfig
arch/x86/events/intel/core.c
arch/x86/include/asm/kvm_para.h
arch/x86/kernel/kvm.c
arch/x86/kvm/Kconfig
arch/x86/kvm/emulate.c
arch/x86/kvm/mmu.c
block/blk-mq-debugfs.c
block/blk-throttle.c
block/bsg-lib.c
drivers/acpi/arm64/iort.c
drivers/block/Kconfig
drivers/block/nbd.c
drivers/clk/clk-bulk.c
drivers/clk/rockchip/clk-rk3128.c
drivers/clk/samsung/clk-exynos4.c
drivers/gpu/drm/i915/intel_audio.c
drivers/gpu/drm/i915/intel_bios.c
drivers/gpu/drm/i915/intel_csr.c
drivers/gpu/drm/i915/intel_ddi.c
drivers/gpu/drm/i915/intel_display.c
drivers/gpu/drm/i915/intel_dpio_phy.c
drivers/gpu/drm/i915/intel_modes.c
drivers/gpu/drm/i915/intel_runtime_pm.c
drivers/gpu/drm/sun4i/sun4i_hdmi_enc.c
drivers/hwmon/xgene-hwmon.c
drivers/i2c/busses/Kconfig
drivers/i2c/busses/i2c-i801.c
drivers/i2c/busses/i2c-sprd.c
drivers/i2c/busses/i2c-stm32f7.c
drivers/ide/ide-probe.c
drivers/ide/ide-scan-pci.c
drivers/ide/setup-pci.c
drivers/infiniband/core/iwpm_msg.c
drivers/infiniband/core/iwpm_util.c
drivers/infiniband/hw/i40iw/i40iw_ctrl.c
drivers/infiniband/hw/i40iw/i40iw_p.h
drivers/infiniband/hw/i40iw/i40iw_puda.c
drivers/infiniband/hw/i40iw/i40iw_verbs.c
drivers/infiniband/hw/mlx5/main.c
drivers/infiniband/hw/qedr/qedr.h
drivers/infiniband/hw/qedr/qedr_cm.c
drivers/md/bcache/closure.c
drivers/misc/cxl/cxllib.c
drivers/mmc/core/block.c
drivers/mmc/core/mmc.c
drivers/mmc/core/queue.c
drivers/mmc/core/queue.h
drivers/mmc/host/cavium.c
drivers/mmc/host/meson-gx-mmc.c
drivers/mmc/host/pxamci.c
drivers/mmc/host/sdhci-xenon.c
drivers/mmc/host/sdhci-xenon.h
drivers/net/ethernet/broadcom/bnx2x/bnx2x_sriov.c
drivers/net/ethernet/broadcom/bnxt/Makefile
drivers/net/ethernet/broadcom/bnxt/bnxt_tc.c
drivers/net/ethernet/cavium/thunder/nicvf_main.c
drivers/net/ethernet/chelsio/cxgb4/t4_hw.c
drivers/net/ethernet/chelsio/cxgb4/t4_pci_id_tbl.h
drivers/net/ethernet/hisilicon/Kconfig
drivers/net/ethernet/hisilicon/hns3/hnae3.h
drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.c
drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h
drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h
drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c
drivers/net/ethernet/hisilicon/hns3/hns3pf/hns3_dcbnl.c
drivers/net/ethernet/hisilicon/hns3/hns3pf/hns3_enet.c
drivers/net/ethernet/hisilicon/hns3/hns3pf/hns3_enet.h
drivers/net/ethernet/hisilicon/hns3/hns3pf/hns3_ethtool.c
drivers/net/ethernet/intel/e1000e/defines.h
drivers/net/ethernet/intel/e1000e/e1000.h
drivers/net/ethernet/intel/e1000e/mac.c
drivers/net/ethernet/intel/e1000e/netdev.c
drivers/net/ethernet/intel/e1000e/param.c
drivers/net/ethernet/intel/e1000e/phy.c
drivers/net/ethernet/intel/i40e/i40e.h
drivers/net/ethernet/intel/i40e/i40e_adminq_cmd.h
drivers/net/ethernet/intel/i40e/i40e_common.c
drivers/net/ethernet/intel/i40e/i40e_debugfs.c
drivers/net/ethernet/intel/i40e/i40e_ethtool.c
drivers/net/ethernet/intel/i40e/i40e_main.c
drivers/net/ethernet/intel/i40e/i40e_nvm.c
drivers/net/ethernet/intel/i40e/i40e_register.h
drivers/net/ethernet/intel/i40e/i40e_txrx.c
drivers/net/ethernet/intel/i40e/i40e_txrx.h
drivers/net/ethernet/intel/i40e/i40e_type.h
drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c
drivers/net/ethernet/intel/i40evf/i40e_adminq_cmd.h
drivers/net/ethernet/intel/i40evf/i40e_txrx.c
drivers/net/ethernet/intel/i40evf/i40e_txrx.h
drivers/net/ethernet/intel/i40evf/i40e_type.h
drivers/net/ethernet/intel/i40evf/i40evf.h
drivers/net/ethernet/intel/i40evf/i40evf_main.c
drivers/net/ethernet/intel/igb/igb_main.c
drivers/net/ethernet/intel/ixgbe/ixgbe.h
drivers/net/ethernet/intel/ixgbe/ixgbe_82598.c
drivers/net/ethernet/intel/ixgbe/ixgbe_common.c
drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c
drivers/net/ethernet/intel/ixgbe/ixgbe_lib.c
drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
drivers/net/ethernet/intel/ixgbe/ixgbe_x540.c
drivers/net/ethernet/intel/ixgbe/ixgbe_x550.c
drivers/net/ethernet/mellanox/mlx4/en_ethtool.c
drivers/net/ethernet/mellanox/mlx4/en_main.c
drivers/net/ethernet/mellanox/mlx4/en_netdev.c
drivers/net/ethernet/mellanox/mlx4/en_resources.c
drivers/net/ethernet/mellanox/mlx4/en_rx.c
drivers/net/ethernet/mellanox/mlx4/en_tx.c
drivers/net/ethernet/mellanox/mlx4/fw.c
drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
drivers/net/ethernet/mellanox/mlx4/qp.c
drivers/net/ethernet/mellanox/mlx4/resource_tracker.c
drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
drivers/net/ethernet/mellanox/mlx5/core/fs_core.h
drivers/net/ethernet/mellanox/mlxsw/spectrum.c
drivers/net/ethernet/mellanox/mlxsw/spectrum.h
drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
drivers/net/ethernet/mellanox/mlxsw/spectrum_router.h
drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c
drivers/net/ethernet/netronome/nfp/Makefile
drivers/net/ethernet/netronome/nfp/bpf/jit.c
drivers/net/ethernet/netronome/nfp/bpf/main.c
drivers/net/ethernet/netronome/nfp/bpf/main.h
drivers/net/ethernet/netronome/nfp/flower/action.c
drivers/net/ethernet/netronome/nfp/flower/cmsg.h
drivers/net/ethernet/netronome/nfp/flower/match.c
drivers/net/ethernet/netronome/nfp/flower/offload.c
drivers/net/ethernet/netronome/nfp/nfp_app.h
drivers/net/ethernet/netronome/nfp/nfp_asm.c [new file with mode: 0644]
drivers/net/ethernet/netronome/nfp/nfp_asm.h
drivers/net/ethernet/netronome/nfp/nfp_net_ctrl.h
drivers/net/ethernet/qlogic/qed/qed_dcbx.c
drivers/net/ethernet/qlogic/qed/qed_iwarp.c
drivers/net/ethernet/qlogic/qed/qed_iwarp.h
drivers/net/ethernet/qlogic/qed/qed_ll2.c
drivers/net/ethernet/qlogic/qed/qed_ll2.h
drivers/net/ethernet/stmicro/stmmac/dwmac-ipq806x.c
drivers/net/hyperv/hyperv_net.h
drivers/net/hyperv/netvsc_drv.c
drivers/net/phy/Kconfig
drivers/net/phy/Makefile
drivers/net/phy/uPD60620.c [new file with mode: 0644]
drivers/net/ppp/ppp_generic.c
drivers/net/usb/cdc_ether.c
drivers/nvme/host/core.c
drivers/nvme/host/pci.c
drivers/scsi/ibmvscsi_tgt/ibmvscsi_tgt.c
drivers/scsi/libiscsi.c
drivers/scsi/scsi_scan.c
drivers/scsi/scsi_transport_iscsi.c
drivers/scsi/sd.c
drivers/thunderbolt/nhi.c
drivers/thunderbolt/xdomain.c
drivers/vhost/net.c
fs/btrfs/ctree.h
fs/btrfs/extent_io.c
fs/ceph/mds_client.c
fs/ceph/snap.c
fs/namespace.c
fs/nfs/client.c
fs/nfs/filelayout/filelayout.c
fs/nfs/nfs4idmap.c
fs/nfs/nfs4proc.c
fs/nfs/nfs4xdr.c
fs/overlayfs/copy_up.c
fs/overlayfs/dir.c
fs/overlayfs/namei.c
fs/overlayfs/overlayfs.h
fs/overlayfs/ovl_entry.h
fs/overlayfs/readdir.c
fs/overlayfs/super.c
fs/overlayfs/util.c
fs/xfs/xfs_bmap_util.c
fs/xfs/xfs_reflink.c
include/linux/bpf.h
include/linux/bpf_verifier.h
include/linux/if_bridge.h
include/linux/if_phonet.h
include/linux/mmc/host.h
include/linux/netfilter_bridge/ebtables.h
include/linux/nmi.h
include/linux/once.h
include/linux/perf_event.h
include/linux/qed/qed_ll2_if.h
include/linux/skbuff.h
include/linux/smpboot.h
include/net/dst.h
include/net/dst_metadata.h
include/net/ip6_fib.h
include/net/ip6_route.h
include/net/ipv6.h
include/net/phonet/phonet.h
include/net/sock.h
include/net/switchdev.h
include/net/tcp.h
include/scsi/scsi_device.h
include/scsi/scsi_devinfo.h
include/scsi/scsi_transport_iscsi.h
include/uapi/linux/bpf.h
include/uapi/linux/if_link.h
include/uapi/linux/if_tunnel.h
include/uapi/linux/netfilter/xt_bpf.h
include/uapi/linux/openvswitch.h
kernel/bpf/Makefile
kernel/bpf/arraymap.c
kernel/bpf/core.c
kernel/bpf/disasm.c [new file with mode: 0644]
kernel/bpf/disasm.h [new file with mode: 0644]
kernel/bpf/inode.c
kernel/bpf/syscall.c
kernel/bpf/verifier.c
kernel/cpu.c
kernel/events/core.c
kernel/smpboot.c
kernel/sysctl.c
kernel/trace/bpf_trace.c
kernel/watchdog.c
kernel/watchdog_hld.c
lib/once.c
net/batman-adv/bat_iv_ogm.c
net/batman-adv/bat_v.c
net/batman-adv/bat_v_elp.c
net/batman-adv/bat_v_ogm.c
net/batman-adv/distributed-arp-table.c
net/batman-adv/gateway_client.c
net/batman-adv/gateway_common.c
net/batman-adv/hard-interface.c
net/batman-adv/icmp_socket.c
net/batman-adv/main.c
net/batman-adv/main.h
net/batman-adv/multicast.c
net/batman-adv/originator.c
net/batman-adv/routing.c
net/batman-adv/send.c
net/batman-adv/soft-interface.c
net/batman-adv/sysfs.c
net/batman-adv/tp_meter.c
net/bridge/Makefile
net/bridge/br_arp_nd_proxy.c [new file with mode: 0644]
net/bridge/br_device.c
net/bridge/br_forward.c
net/bridge/br_if.c
net/bridge/br_input.c
net/bridge/br_multicast.c
net/bridge/br_netlink.c
net/bridge/br_private.h
net/bridge/br_sysfs_if.c
net/bridge/netfilter/ebtable_broute.c
net/bridge/netfilter/ebtable_filter.c
net/bridge/netfilter/ebtable_nat.c
net/bridge/netfilter/ebtables.c
net/core/dst.c
net/core/filter.c
net/core/rtnetlink.c
net/core/skbuff.c
net/ipv4/gre_offload.c
net/ipv4/ip_gre.c
net/ipv4/netfilter/ipt_SYNPROXY.c
net/ipv4/route.c
net/ipv4/tcp.c
net/ipv4/tcp_fastopen.c
net/ipv4/tcp_input.c
net/ipv4/tcp_ipv4.c
net/ipv4/tcp_output.c
net/ipv4/tcp_timer.c
net/ipv4/udp.c
net/ipv4/udp_offload.c
net/ipv6/addrconf.c
net/ipv6/addrlabel.c
net/ipv6/icmp.c
net/ipv6/ip6_fib.c
net/ipv6/ip6_offload.c
net/ipv6/netfilter/ip6t_SYNPROXY.c
net/ipv6/ping.c
net/ipv6/route.c
net/mpls/af_mpls.c
net/netfilter/ipset/ip_set_core.c
net/netfilter/ipset/ip_set_hash_ip.c
net/netfilter/ipset/ip_set_hash_ipmark.c
net/netfilter/ipset/ip_set_hash_ipport.c
net/netfilter/ipset/ip_set_hash_ipportip.c
net/netfilter/ipset/ip_set_hash_ipportnet.c
net/netfilter/ipset/ip_set_hash_net.c
net/netfilter/ipset/ip_set_hash_netiface.c
net/netfilter/ipset/ip_set_hash_netnet.c
net/netfilter/ipset/ip_set_hash_netport.c
net/netfilter/ipset/ip_set_hash_netportnet.c
net/netfilter/ipvs/ip_vs_xmit.c
net/netfilter/nf_tables_api.c
net/netfilter/x_tables.c
net/netfilter/xt_bpf.c
net/netfilter/xt_socket.c
net/netlink/af_netlink.c
net/openvswitch/actions.c
net/openvswitch/conntrack.c
net/openvswitch/conntrack.h
net/openvswitch/flow_netlink.c
net/phonet/af_phonet.c
net/phonet/datagram.c
net/phonet/pep.c
net/sched/sch_netem.c
net/sunrpc/xprtsock.c
net/tipc/bcast.c
net/tipc/msg.c
net/wireless/nl80211.c
net/xfrm/xfrm_device.c
net/xfrm/xfrm_input.c
net/xfrm/xfrm_state.c
net/xfrm/xfrm_user.c
samples/bpf/trace_event_kern.c
samples/bpf/trace_event_user.c
samples/bpf/tracex6_kern.c
samples/bpf/tracex6_user.c
samples/bpf/xdp_monitor_kern.c
samples/bpf/xdp_monitor_user.c
tools/bpf/bpftool/Documentation/bpftool-prog.rst
tools/bpf/bpftool/Makefile
tools/bpf/bpftool/main.h
tools/bpf/bpftool/prog.c
tools/include/uapi/linux/bpf.h
tools/testing/selftests/bpf/Makefile
tools/testing/selftests/bpf/bpf_helpers.h
tools/testing/selftests/bpf/test_verifier_log.c [new file with mode: 0644]
tools/testing/selftests/net/rtnetlink.sh
tools/testing/selftests/networking/timestamping/rxtimestamp.c

index b878a1e..ed1456f 100644 (file)
@@ -16,11 +16,13 @@ Required Properties:
 
 - clocks:
   Array of clocks required for SDHC.
-  Require at least input clock for Xenon IP core.
+  Require at least input clock for Xenon IP core. For Armada AP806 and
+  CP110, the AXI clock is also mandatory.
 
 - clock-names:
   Array of names corresponding to clocks property.
   The input clock for Xenon IP core should be named as "core".
+  The input clock for the AXI bus must be named as "axi".
 
 - reg:
   * For "marvell,armada-3700-sdhci", two register areas.
@@ -106,8 +108,8 @@ Example:
                compatible = "marvell,armada-ap806-sdhci";
                reg = <0xaa0000 0x1000>;
                interrupts = <GIC_SPI 13 IRQ_TYPE_LEVEL_HIGH>
-               clocks = <&emmc_clk>;
-               clock-names = "core";
+               clocks = <&emmc_clk>,<&axi_clk>;
+               clock-names = "core", "axi";
                bus-width = <4>;
                marvell,xenon-phy-slow-mode;
                marvell,xenon-tun-count = <11>;
@@ -126,8 +128,8 @@ Example:
                interrupts = <GIC_SPI 55 IRQ_TYPE_LEVEL_HIGH>
                vqmmc-supply = <&sd_vqmmc_regulator>;
                vmmc-supply = <&sd_vmmc_regulator>;
-               clocks = <&sdclk>;
-               clock-names = "core";
+               clocks = <&sdclk>, <&axi_clk>;
+               clock-names = "core", "axi";
                bus-width = <4>;
                marvell,xenon-tun-count = <9>;
        };
index 36f528a..8caa607 100644 (file)
@@ -210,8 +210,11 @@ path as another overlay mount and it may use a lower layer path that is
 beneath or above the path of another overlay lower layer path.
 
 Using an upper layer path and/or a workdir path that are already used by
-another overlay mount is not allowed and will fail with EBUSY.  Using
+another overlay mount is not allowed and may fail with EBUSY.  Using
 partially overlapping paths is not allowed but will not fail with EBUSY.
+If files are accessed from two overlayfs mounts which share or overlap the
+upper layer and/or workdir path, the behavior of the overlay is undefined,
+though it will not result in a crash or deadlock.
 
 Mounting an overlay using an upper layer path, where the upper layer path
 was previously used by another mounted overlay in combination with a
index 0500193..d477024 100644 (file)
@@ -36,6 +36,7 @@ Supported adapters:
   * Intel Gemini Lake (SOC)
   * Intel Cannon Lake-H (PCH)
   * Intel Cannon Lake-LP (PCH)
+  * Intel Cedar Fork (PCH)
    Datasheets: Publicly available at the Intel website
 
 On Intel Patsburg and later chipsets, both the normal host SMBus controller
index 57f52cd..9ba04c0 100644 (file)
@@ -2387,7 +2387,7 @@ broadcast: Like active-backup, there is not much advantage to this
        and packet type ID), so in a "gatewayed" configuration, all
        outgoing traffic will generally use the same device.  Incoming
        traffic may also end up on a single device, but that is
-       dependent upon the balancing policy of the peer's 8023.ad
+       dependent upon the balancing policy of the peer's 802.3ad
        implementation.  In a "local" configuration, traffic will be
        distributed across the devices in the bond.
 
index 93560fb..92f5b31 100644 (file)
@@ -19,12 +19,12 @@ Features
 
   Receive Side Scaling
   --------------------
-  Hyper-V supports receive side scaling. For TCP, packets are
-  distributed among available queues based on IP address and port
+  Hyper-V supports receive side scaling. For TCP & UDP, packets can
+  be distributed among available queues based on IP address and port
   number.
 
-  For UDP, we can switch UDP hash level between L3 and L4 by ethtool
-  command. UDP over IPv4 and v6 can be set differently. The default
+  For TCP & UDP, we can switch hash level between L3 and L4 by ethtool
+  command. TCP/UDP over IPv4 and v6 can be set differently. The default
   hash level is L4. We currently only allow switching TX hash level
   from within the guests.
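
(For reference: the hash-level switch described above is ethtool's receive flow hash configuration. A rough userspace sketch using the ETHTOOL_SRXFH ioctl is shown below; the interface name and the lack of capability checks are illustrative only and not part of this patch.)

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <net/if.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <linux/ethtool.h>
#include <linux/sockios.h>

int main(void)
{
	/* Hash UDP/IPv4 on src/dst IP plus src/dst port (L4), roughly the
	 * equivalent of "ethtool -N eth0 rx-flow-hash udp4 sdfn". */
	struct ethtool_rxnfc nfc = {
		.cmd = ETHTOOL_SRXFH,
		.flow_type = UDP_V4_FLOW,
		.data = RXH_IP_SRC | RXH_IP_DST | RXH_L4_B_0_1 | RXH_L4_B_2_3,
	};
	struct ifreq ifr;
	int fd = socket(AF_INET, SOCK_DGRAM, 0);

	if (fd < 0) {
		perror("socket");
		return 1;
	}

	memset(&ifr, 0, sizeof(ifr));
	strncpy(ifr.ifr_name, "eth0", IFNAMSIZ - 1);	/* example interface */
	ifr.ifr_data = (void *)&nfc;

	if (ioctl(fd, SIOCETHTOOL, &ifr) < 0)
		perror("ETHTOOL_SRXFH");

	close(fd);
	return 0;
}
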
 
index e90cdec..3944f16 100644 (file)
@@ -5264,7 +5264,8 @@ S:        Maintained
 F:     drivers/iommu/exynos-iommu.c
 
 EZchip NPS platform support
-M:     Noam Camus <noamc@ezchip.com>
+M:     Elad Kanfi <eladkan@mellanox.com>
+M:     Vineet Gupta <vgupta@synopsys.com>
 S:     Supported
 F:     arch/arc/plat-eznps
 F:     arch/arc/boot/dts/eznps.dts
@@ -9366,7 +9367,7 @@ NETWORK BLOCK DEVICE (NBD)
 M:     Josef Bacik <jbacik@fb.com>
 S:     Maintained
 L:     linux-block@vger.kernel.org
-L:     nbd-general@lists.sourceforge.net
+L:     nbd@other.debian.org
 F:     Documentation/blockdev/nbd.txt
 F:     drivers/block/nbd.c
 F:     include/uapi/linux/nbd.h
index cf007a3..2835863 100644 (file)
--- a/Makefile
+++ b/Makefile
@@ -1,7 +1,7 @@
 VERSION = 4
 PATCHLEVEL = 14
 SUBLEVEL = 0
-EXTRAVERSION = -rc3
+EXTRAVERSION = -rc4
 NAME = Fearless Coyote
 
 # *DOCUMENTATION*
index 1aafb4e..d789a89 100644 (file)
@@ -937,9 +937,6 @@ config STRICT_MODULE_RWX
          and non-text memory will be made non-executable. This provides
          protection against certain security exploits (e.g. writing to text)
 
-config ARCH_WANT_RELAX_ORDER
-       bool
-
 config ARCH_HAS_REFCOUNT
        bool
        help
index a598641..c84e67f 100644 (file)
@@ -24,7 +24,7 @@ config ARC
        select GENERIC_SMP_IDLE_THREAD
        select HAVE_ARCH_KGDB
        select HAVE_ARCH_TRACEHOOK
-       select HAVE_FUTEX_CMPXCHG
+       select HAVE_FUTEX_CMPXCHG if FUTEX
        select HAVE_IOREMAP_PROT
        select HAVE_KPROBES
        select HAVE_KRETPROBES
index 3a4b52b..d37f49d 100644 (file)
@@ -6,8 +6,6 @@
 # published by the Free Software Foundation.
 #
 
-UTS_MACHINE := arc
-
 ifeq ($(CROSS_COMPILE),)
 ifndef CONFIG_CPU_BIG_ENDIAN
 CROSS_COMPILE := arc-linux-
index 2367a67..e114000 100644 (file)
 
                        mmcclk: mmcclk {
                                compatible = "fixed-clock";
-                               clock-frequency = <50000000>;
+                               /*
+                                * DW sdio controller has external ciu clock divider
+                                * controlled via register in SDIO IP. It divides
+                                * sdio_ref_clk (which comes from CGU) by 16 by
+                                * default. So the default mmcclk clock (which comes
+                                * to sdk_in) is 25000000 Hz.
+                                */
+                               clock-frequency = <25000000>;
                                #clock-cells = <0>;
                        };
 
index 229d13a..8adde1b 100644 (file)
@@ -12,6 +12,7 @@
 /dts-v1/;
 
 #include <dt-bindings/net/ti-dp83867.h>
+#include <dt-bindings/reset/snps,hsdk-reset.h>
 
 / {
        model = "snps,hsdk";
                };
        };
 
-       core_clk: core-clk {
+       input_clk: input-clk {
                #clock-cells = <0>;
                compatible = "fixed-clock";
-               clock-frequency = <500000000>;
+               clock-frequency = <33333333>;
        };
 
        cpu_intc: cpu-interrupt-controller {
 
                ranges = <0x00000000 0xf0000000 0x10000000>;
 
+               cgu_rst: reset-controller@8a0 {
+                       compatible = "snps,hsdk-reset";
+                       #reset-cells = <1>;
+                       reg = <0x8A0 0x4>, <0xFF0 0x4>;
+               };
+
+               core_clk: core-clk@0 {
+                       compatible = "snps,hsdk-core-pll-clock";
+                       reg = <0x00 0x10>, <0x14B8 0x4>;
+                       #clock-cells = <0>;
+                       clocks = <&input_clk>;
+               };
+
                serial: serial@5000 {
                        compatible = "snps,dw-apb-uart";
                        reg = <0x5000 0x100>;
 
                mmcclk_ciu: mmcclk-ciu {
                        compatible = "fixed-clock";
-                       clock-frequency = <100000000>;
+                       /*
+                        * DW sdio controller has external ciu clock divider
+                        * controlled via register in SDIO IP. Due to its
+                        * unexpected default value (it should divide by 1
+                        * but it divides by 8) the SDIO IP uses the wrong
+                        * clock and works unreliably (see STAR 9001204800).
+                        * So add a temporary fix and change the clock frequency
+                        * from 100000000 to 12500000 Hz until we fix the dw sdio
+                        * driver itself.
+                        */
+                       clock-frequency = <12500000>;
                        #clock-cells = <0>;
                };
 
                        clocks = <&gmacclk>;
                        clock-names = "stmmaceth";
                        phy-handle = <&phy0>;
+                       resets = <&cgu_rst HSDK_ETH_RESET>;
+                       reset-names = "stmmaceth";
 
                        mdio {
                                #address-cells = <1>;
index 6980b96..ec7c849 100644 (file)
@@ -105,7 +105,7 @@ CONFIG_NLS_ISO8859_1=y
 # CONFIG_ENABLE_WARN_DEPRECATED is not set
 # CONFIG_ENABLE_MUST_CHECK is not set
 CONFIG_STRIP_ASM_SYMS=y
-CONFIG_LOCKUP_DETECTOR=y
+CONFIG_SOFTLOCKUP_DETECTOR=y
 CONFIG_DEFAULT_HUNG_TASK_TIMEOUT=10
 # CONFIG_SCHED_DEBUG is not set
 # CONFIG_DEBUG_PREEMPT is not set
index 2233f57..63d3cf6 100644 (file)
@@ -104,7 +104,7 @@ CONFIG_NLS_ISO8859_1=y
 # CONFIG_ENABLE_WARN_DEPRECATED is not set
 # CONFIG_ENABLE_MUST_CHECK is not set
 CONFIG_STRIP_ASM_SYMS=y
-CONFIG_LOCKUP_DETECTOR=y
+CONFIG_SOFTLOCKUP_DETECTOR=y
 CONFIG_DEFAULT_HUNG_TASK_TIMEOUT=10
 # CONFIG_SCHED_DEBUG is not set
 # CONFIG_DEBUG_PREEMPT is not set
index 30a3d4c..f613eca 100644 (file)
@@ -107,7 +107,7 @@ CONFIG_NLS_ISO8859_1=y
 # CONFIG_ENABLE_WARN_DEPRECATED is not set
 # CONFIG_ENABLE_MUST_CHECK is not set
 CONFIG_STRIP_ASM_SYMS=y
-CONFIG_LOCKUP_DETECTOR=y
+CONFIG_SOFTLOCKUP_DETECTOR=y
 CONFIG_DEFAULT_HUNG_TASK_TIMEOUT=10
 # CONFIG_SCHED_DEBUG is not set
 # CONFIG_DEBUG_PREEMPT is not set
index 821a2e5..3507be2 100644 (file)
@@ -84,5 +84,5 @@ CONFIG_TMPFS=y
 CONFIG_NFS_FS=y
 # CONFIG_ENABLE_WARN_DEPRECATED is not set
 # CONFIG_ENABLE_MUST_CHECK is not set
-CONFIG_LOCKUP_DETECTOR=y
+CONFIG_SOFTLOCKUP_DETECTOR=y
 # CONFIG_DEBUG_PREEMPT is not set
index 9a3fcf4..15f0f6b 100644 (file)
@@ -63,6 +63,7 @@ CONFIG_MMC_SDHCI=y
 CONFIG_MMC_SDHCI_PLTFM=y
 CONFIG_MMC_DW=y
 # CONFIG_IOMMU_SUPPORT is not set
+CONFIG_RESET_HSDK=y
 CONFIG_EXT3_FS=y
 CONFIG_VFAT_FS=y
 CONFIG_TMPFS=y
@@ -72,7 +73,7 @@ CONFIG_NLS_ISO8859_1=y
 # CONFIG_ENABLE_WARN_DEPRECATED is not set
 # CONFIG_ENABLE_MUST_CHECK is not set
 CONFIG_STRIP_ASM_SYMS=y
-CONFIG_LOCKUP_DETECTOR=y
+CONFIG_SOFTLOCKUP_DETECTOR=y
 CONFIG_DEFAULT_HUNG_TASK_TIMEOUT=10
 # CONFIG_SCHED_DEBUG is not set
 # CONFIG_DEBUG_PREEMPT is not set
index c0d6a01..4fcf4f2 100644 (file)
@@ -94,7 +94,7 @@ CONFIG_NLS_ISO8859_1=y
 # CONFIG_ENABLE_MUST_CHECK is not set
 CONFIG_STRIP_ASM_SYMS=y
 CONFIG_DEBUG_SHIRQ=y
-CONFIG_LOCKUP_DETECTOR=y
+CONFIG_SOFTLOCKUP_DETECTOR=y
 CONFIG_DEFAULT_HUNG_TASK_TIMEOUT=10
 # CONFIG_SCHED_DEBUG is not set
 # CONFIG_DEBUG_PREEMPT is not set
index 5c09717..7b71464 100644 (file)
@@ -98,7 +98,7 @@ CONFIG_NLS_ISO8859_1=y
 # CONFIG_ENABLE_MUST_CHECK is not set
 CONFIG_STRIP_ASM_SYMS=y
 CONFIG_DEBUG_SHIRQ=y
-CONFIG_LOCKUP_DETECTOR=y
+CONFIG_SOFTLOCKUP_DETECTOR=y
 CONFIG_DEFAULT_HUNG_TASK_TIMEOUT=10
 # CONFIG_SCHED_DEBUG is not set
 # CONFIG_DEBUG_PREEMPT is not set
index ba8e802..b1c56d3 100644 (file)
@@ -98,6 +98,7 @@
 
 /* Auxiliary registers */
 #define AUX_IDENTITY           4
+#define AUX_EXEC_CTRL          8
 #define AUX_INTR_VEC_BASE      0x25
 #define AUX_VOL                        0x5e
 
@@ -135,12 +136,12 @@ struct bcr_identity {
 #endif
 };
 
-struct bcr_isa {
+struct bcr_isa_arcv2 {
 #ifdef CONFIG_CPU_BIG_ENDIAN
        unsigned int div_rem:4, pad2:4, ldd:1, unalign:1, atomic:1, be:1,
-                    pad1:11, atomic1:1, ver:8;
+                    pad1:12, ver:8;
 #else
-       unsigned int ver:8, atomic1:1, pad1:11, be:1, atomic:1, unalign:1,
+       unsigned int ver:8, pad1:12, be:1, atomic:1, unalign:1,
                     ldd:1, pad2:4, div_rem:4;
 #endif
 };
@@ -263,13 +264,13 @@ struct cpuinfo_arc {
        struct cpuinfo_arc_mmu mmu;
        struct cpuinfo_arc_bpu bpu;
        struct bcr_identity core;
-       struct bcr_isa isa;
+       struct bcr_isa_arcv2 isa;
        const char *details, *name;
        unsigned int vec_base;
        struct cpuinfo_arc_ccm iccm, dccm;
        struct {
                unsigned int swap:1, norm:1, minmax:1, barrel:1, crc:1, swape:1, pad1:2,
-                            fpu_sp:1, fpu_dp:1, pad2:6,
+                            fpu_sp:1, fpu_dp:1, dual_iss_enb:1, dual_iss_exist:1, pad2:4,
                             debug:1, ap:1, smart:1, rtt:1, pad3:4,
                             timer0:1, timer1:1, rtc:1, gfrc:1, pad4:4;
        } extn;
index 877cec8..fb83844 100644 (file)
@@ -51,6 +51,7 @@ static const struct id_to_str arc_cpu_rel[] = {
        { 0x51, "R2.0" },
        { 0x52, "R2.1" },
        { 0x53, "R3.0" },
+       { 0x54, "R4.0" },
 #endif
        { 0x00, NULL   }
 };
@@ -62,6 +63,7 @@ static const struct id_to_str arc_cpu_nm[] = {
 #else
        { 0x40, "ARC EM"  },
        { 0x50, "ARC HS38"  },
+       { 0x54, "ARC HS48"  },
 #endif
        { 0x00, "Unknown"   }
 };
@@ -119,11 +121,11 @@ static void read_arc_build_cfg_regs(void)
        struct bcr_generic bcr;
        struct cpuinfo_arc *cpu = &cpuinfo_arc700[smp_processor_id()];
        const struct id_to_str *tbl;
+       struct bcr_isa_arcv2 isa;
 
        FIX_PTR(cpu);
 
        READ_BCR(AUX_IDENTITY, cpu->core);
-       READ_BCR(ARC_REG_ISA_CFG_BCR, cpu->isa);
 
        for (tbl = &arc_cpu_rel[0]; tbl->id != 0; tbl++) {
                if (cpu->core.family == tbl->id) {
@@ -133,7 +135,7 @@ static void read_arc_build_cfg_regs(void)
        }
 
        for (tbl = &arc_cpu_nm[0]; tbl->id != 0; tbl++) {
-               if ((cpu->core.family & 0xF0) == tbl->id)
+               if ((cpu->core.family & 0xF4) == tbl->id)
                        break;
        }
        cpu->name = tbl->str;
@@ -192,6 +194,14 @@ static void read_arc_build_cfg_regs(void)
                cpu->bpu.full = bpu.ft;
                cpu->bpu.num_cache = 256 << bpu.bce;
                cpu->bpu.num_pred = 2048 << bpu.pte;
+
+               if (cpu->core.family >= 0x54) {
+                       unsigned int exec_ctrl;
+
+                       READ_BCR(AUX_EXEC_CTRL, exec_ctrl);
+                       cpu->extn.dual_iss_exist = 1;
+                       cpu->extn.dual_iss_enb = exec_ctrl & 1;
+               }
        }
 
        READ_BCR(ARC_REG_AP_BCR, bcr);
@@ -205,18 +215,25 @@ static void read_arc_build_cfg_regs(void)
 
        cpu->extn.debug = cpu->extn.ap | cpu->extn.smart | cpu->extn.rtt;
 
+       READ_BCR(ARC_REG_ISA_CFG_BCR, isa);
+
        /* some hacks for lack of feature BCR info in old ARC700 cores */
        if (is_isa_arcompact()) {
-               if (!cpu->isa.ver)      /* ISA BCR absent, use Kconfig info */
+               if (!isa.ver)   /* ISA BCR absent, use Kconfig info */
                        cpu->isa.atomic = IS_ENABLED(CONFIG_ARC_HAS_LLSC);
-               else
-                       cpu->isa.atomic = cpu->isa.atomic1;
+               else {
+                       /* ARC700_BUILD only has 2 bits of isa info */
+                       struct bcr_generic bcr = *(struct bcr_generic *)&isa;
+                       cpu->isa.atomic = bcr.info & 1;
+               }
 
                cpu->isa.be = IS_ENABLED(CONFIG_CPU_BIG_ENDIAN);
 
                 /* there's no direct way to distinguish 750 vs. 770 */
                if (unlikely(cpu->core.family < 0x34 || cpu->mmu.ver < 3))
                        cpu->name = "ARC750";
+       } else {
+               cpu->isa = isa;
        }
 }
 
@@ -232,10 +249,11 @@ static char *arc_cpu_mumbojumbo(int cpu_id, char *buf, int len)
                       "\nIDENTITY\t: ARCVER [%#02x] ARCNUM [%#02x] CHIPID [%#4x]\n",
                       core->family, core->cpu_id, core->chip_id);
 
-       n += scnprintf(buf + n, len - n, "processor [%d]\t: %s %s (%s ISA) %s\n",
+       n += scnprintf(buf + n, len - n, "processor [%d]\t: %s %s (%s ISA) %s%s%s\n",
                       cpu_id, cpu->name, cpu->details,
                       is_isa_arcompact() ? "ARCompact" : "ARCv2",
-                      IS_AVAIL1(cpu->isa.be, "[Big-Endian]"));
+                      IS_AVAIL1(cpu->isa.be, "[Big-Endian]"),
+                      IS_AVAIL3(cpu->extn.dual_iss_exist, cpu->extn.dual_iss_enb, " Dual-Issue"));
 
        n += scnprintf(buf + n, len - n, "Timers\t\t: %s%s%s%s%s%s\nISA Extn\t: ",
                       IS_AVAIL1(cpu->extn.timer0, "Timer0 "),
index f1ac679..cf14ebc 100644 (file)
@@ -111,6 +111,13 @@ static void __init axs10x_early_init(void)
 
        axs10x_enable_gpio_intc_wire();
 
+       /*
+        * Reset ethernet IP core.
+        * TODO: get rid of this quirk once the axs10x reset driver (or a simple
+        * reset driver) is available upstream.
+        */
+       iowrite32((1 << 5), (void __iomem *) CREG_MB_SW_RESET);
+
        scnprintf(mb, 32, "MainBoard v%d", mb_rev);
        axs10x_print_board_ver(CREG_MB_VER, mb);
 }
index 5a6ed5a..bd08de4 100644 (file)
@@ -6,4 +6,5 @@
 #
 
 menuconfig ARC_SOC_HSDK
-       bool "ARC HS Development Kit SOC"
+       bool "ARC HS Development Kit SOC"
+       select CLK_HSDK
index a2e7fd1..744e62e 100644 (file)
@@ -38,6 +38,42 @@ static void __init hsdk_init_per_cpu(unsigned int cpu)
 #define CREG_PAE               (CREG_BASE + 0x180)
 #define CREG_PAE_UPDATE                (CREG_BASE + 0x194)
 
+#define CREG_CORE_IF_CLK_DIV   (CREG_BASE + 0x4B8)
+#define CREG_CORE_IF_CLK_DIV_2 0x1
+#define CGU_BASE               ARC_PERIPHERAL_BASE
+#define CGU_PLL_STATUS         (ARC_PERIPHERAL_BASE + 0x4)
+#define CGU_PLL_CTRL           (ARC_PERIPHERAL_BASE + 0x0)
+#define CGU_PLL_STATUS_LOCK    BIT(0)
+#define CGU_PLL_STATUS_ERR     BIT(1)
+#define CGU_PLL_CTRL_1GHZ      0x3A10
+#define HSDK_PLL_LOCK_TIMEOUT  500
+
+#define HSDK_PLL_LOCKED() \
+       !!(ioread32((void __iomem *) CGU_PLL_STATUS) & CGU_PLL_STATUS_LOCK)
+
+#define HSDK_PLL_ERR() \
+       !!(ioread32((void __iomem *) CGU_PLL_STATUS) & CGU_PLL_STATUS_ERR)
+
+static void __init hsdk_set_cpu_freq_1ghz(void)
+{
+       u32 timeout = HSDK_PLL_LOCK_TIMEOUT;
+
+       /*
+        * Since we set a CPU clock that exceeds 500MHz, the divider for the
+        * interface clock must be programmed to div-by-2.
+        */
+       iowrite32(CREG_CORE_IF_CLK_DIV_2, (void __iomem *) CREG_CORE_IF_CLK_DIV);
+
+       /* Set cpu clock to 1GHz */
+       iowrite32(CGU_PLL_CTRL_1GHZ, (void __iomem *) CGU_PLL_CTRL);
+
+       while (!HSDK_PLL_LOCKED() && timeout--)
+               cpu_relax();
+
+       if (!HSDK_PLL_LOCKED() || HSDK_PLL_ERR())
+               pr_err("Failed to set up CPU frequency to 1GHz!\n");
+}
+
 static void __init hsdk_init_early(void)
 {
        /*
@@ -52,6 +88,12 @@ static void __init hsdk_init_early(void)
 
        /* Really apply settings made above */
        writel(1, (void __iomem *) CREG_PAE_UPDATE);
+
+       /*
+        * Set the CPU frequency to 1GHz.
+        * TODO: remove this once the smart hsdk pll driver is introduced.
+        */
+       hsdk_set_cpu_freq_1ghz();
 }
 
 static const char *hsdk_compat[] __initconst = {
index 3585a5e..f7c4d21 100644 (file)
 #define KERNEL_END        _end
 
 /*
- * The size of the KASAN shadow region. This should be 1/8th of the
- * size of the entire kernel virtual address space.
+ * KASAN requires 1/8th of the kernel virtual address space for the shadow
+ * region. KASAN can bloat the stack significantly, so double the (minimum)
+ * stack size when KASAN is in use.
  */
 #ifdef CONFIG_KASAN
 #define KASAN_SHADOW_SIZE      (UL(1) << (VA_BITS - 3))
+#define KASAN_THREAD_SHIFT     1
 #else
 #define KASAN_SHADOW_SIZE      (0)
+#define KASAN_THREAD_SHIFT     0
 #endif
 
-#define MIN_THREAD_SHIFT       14
+#define MIN_THREAD_SHIFT       (14 + KASAN_THREAD_SHIFT)
 
 /*
  * VMAP'd stacks are allocated at page granularity, so we must ensure that such
index f0e6d71..d06fbe4 100644 (file)
@@ -649,4 +649,4 @@ static int __init armv8_deprecated_init(void)
        return 0;
 }
 
-late_initcall(armv8_deprecated_init);
+core_initcall(armv8_deprecated_init);
index cd52d36..21e2c95 100644 (file)
@@ -1307,4 +1307,4 @@ static int __init enable_mrs_emulation(void)
        return 0;
 }
 
-late_initcall(enable_mrs_emulation);
+core_initcall(enable_mrs_emulation);
index f444f37..5d547de 100644 (file)
@@ -444,4 +444,4 @@ static int __init fpsimd_init(void)
 
        return 0;
 }
-late_initcall(fpsimd_init);
+core_initcall(fpsimd_init);
index 2069e9b..b64958b 100644 (file)
@@ -97,7 +97,7 @@ static void data_abort_decode(unsigned int esr)
                         (esr & ESR_ELx_SF) >> ESR_ELx_SF_SHIFT,
                         (esr & ESR_ELx_AR) >> ESR_ELx_AR_SHIFT);
        } else {
-               pr_alert("  ISV = 0, ISS = 0x%08lu\n", esr & ESR_ELx_ISS_MASK);
+               pr_alert("  ISV = 0, ISS = 0x%08lx\n", esr & ESR_ELx_ISS_MASK);
        }
 
        pr_alert("  CM = %lu, WnR = %lu\n",
index a45a67d..30f9239 100644 (file)
@@ -146,7 +146,7 @@ void machine_power_off(void)
 
        /* prevent soft lockup/stalled CPU messages for endless loop. */
        rcu_sysrq_start();
-       lockup_detector_suspend();
+       lockup_detector_soft_poweroff();
        for (;;);
 }
 
index 1df770e..7275fed 100644 (file)
@@ -102,10 +102,10 @@ static void cpufeatures_flush_tlb(void)
        case PVR_POWER8:
        case PVR_POWER8E:
        case PVR_POWER8NVL:
-               __flush_tlb_power8(POWER8_TLB_SETS);
+               __flush_tlb_power8(TLB_INVAL_SCOPE_GLOBAL);
                break;
        case PVR_POWER9:
-               __flush_tlb_power9(POWER9_TLB_SETS_HASH);
+               __flush_tlb_power9(TLB_INVAL_SCOPE_GLOBAL);
                break;
        default:
                pr_err("unknown CPU version for boot TLB flush\n");
index 48da0f5..b82586c 100644 (file)
@@ -734,7 +734,29 @@ EXC_REAL(program_check, 0x700, 0x100)
 EXC_VIRT(program_check, 0x4700, 0x100, 0x700)
 TRAMP_KVM(PACA_EXGEN, 0x700)
 EXC_COMMON_BEGIN(program_check_common)
-       EXCEPTION_PROLOG_COMMON(0x700, PACA_EXGEN)
+       /*
+        * It's possible to receive a TM Bad Thing type program check with
+        * userspace register values (in particular r1), but with SRR1 reporting
+        * that we came from the kernel. Normally that would confuse the bad
+        * stack logic, and we would report a bad kernel stack pointer. Instead
+        * we switch to the emergency stack if we're taking a TM Bad Thing from
+        * the kernel.
+        */
+       li      r10,MSR_PR              /* Build a mask of MSR_PR ..    */
+       oris    r10,r10,0x200000@h      /* .. and SRR1_PROGTM           */
+       and     r10,r10,r12             /* Mask SRR1 with that.         */
+       srdi    r10,r10,8               /* Shift it so we can compare   */
+       cmpldi  r10,(0x200000 >> 8)     /* .. with an immediate.        */
+       bne 1f                          /* If != go to normal path.     */
+
+       /* SRR1 had PR=0 and SRR1_PROGTM=1, so use the emergency stack  */
+       andi.   r10,r12,MSR_PR;         /* Set CR0 correctly for label  */
+                                       /* 3 in EXCEPTION_PROLOG_COMMON */
+       mr      r10,r1                  /* Save r1                      */
+       ld      r1,PACAEMERGSP(r13)     /* Use emergency stack          */
+       subi    r1,r1,INT_FRAME_SIZE    /* alloc stack frame            */
+       b 3f                            /* Jump into the macro !!       */
+1:     EXCEPTION_PROLOG_COMMON(0x700, PACA_EXGEN)
        bl      save_nvgprs
        RECONCILE_IRQ_STATE(r10, r11)
        addi    r3,r1,STACK_FRAME_OVERHEAD
index b76ca19..72f153c 100644 (file)
@@ -624,5 +624,18 @@ long __machine_check_early_realmode_p8(struct pt_regs *regs)
 
 long __machine_check_early_realmode_p9(struct pt_regs *regs)
 {
+       /*
+        * On POWER9 DD2.1 and below, it's possible to get a machine check
+        * caused by a paste instruction where only DSISR bit 25 is set. This
+        * will result in the MCE handler seeing an unknown event and the kernel
+        * crashing. An MCE that occurs like this is spurious, so we don't need
+        * to do anything in terms of servicing it. If there is something that
+        * needs to be serviced, the CPU will raise the MCE again with the
+        * correct DSISR so that it can be serviced properly. So detect this
+        * case and mark it as handled.
+        */
+       if (SRR1_MC_LOADSTORE(regs->msr) && regs->dsisr == 0x02000000)
+               return 1;
+
        return mce_handle_error(regs, mce_p9_derror_table, mce_p9_ierror_table);
 }
index 0ac741f..2e3bc16 100644 (file)
@@ -904,9 +904,6 @@ void __init setup_arch(char **cmdline_p)
 #endif
 #endif
 
-#ifdef CONFIG_PPC_64K_PAGES
-       init_mm.context.pte_frag = NULL;
-#endif
 #ifdef CONFIG_SPAPR_TCE_IOMMU
        mm_iommu_init(&init_mm);
 #endif
index c83c115..b2c0029 100644 (file)
@@ -452,9 +452,20 @@ static long restore_tm_sigcontexts(struct task_struct *tsk,
        if (MSR_TM_RESV(msr))
                return -EINVAL;
 
-       /* pull in MSR TM from user context */
+       /* pull in MSR TS bits from user context */
        regs->msr = (regs->msr & ~MSR_TS_MASK) | (msr & MSR_TS_MASK);
 
+       /*
+        * Ensure that TM is enabled in regs->msr before we leave the signal
+        * handler. It could be the case that (a) user disabled the TM bit
+        * through the manipulation of the MSR bits in uc_mcontext or (b) the
+        * TM bit was disabled because a sufficient number of context switches
+        * happened whilst in the signal handler and load_tm overflowed,
+        * disabling the TM bit. In either case we can end up with an illegal
+        * TM state leading to a TM Bad Thing when we return to userspace.
+        */
+       regs->msr |= MSR_TM;
+
        /* pull in MSR LE from user context */
        regs->msr = (regs->msr & ~MSR_LE) | (msr & MSR_LE);
 
index 2f6eadd..c702a89 100644 (file)
@@ -310,9 +310,6 @@ static int start_wd_on_cpu(unsigned int cpu)
        if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED))
                return 0;
 
-       if (watchdog_suspended)
-               return 0;
-
        if (!cpumask_test_cpu(cpu, &watchdog_cpumask))
                return 0;
 
@@ -358,36 +355,39 @@ static void watchdog_calc_timeouts(void)
        wd_timer_period_ms = watchdog_thresh * 1000 * 2 / 5;
 }
 
-void watchdog_nmi_reconfigure(void)
+void watchdog_nmi_stop(void)
 {
        int cpu;
 
-       watchdog_calc_timeouts();
-
        for_each_cpu(cpu, &wd_cpus_enabled)
                stop_wd_on_cpu(cpu);
+}
 
+void watchdog_nmi_start(void)
+{
+       int cpu;
+
+       watchdog_calc_timeouts();
        for_each_cpu_and(cpu, cpu_online_mask, &watchdog_cpumask)
                start_wd_on_cpu(cpu);
 }
 
 /*
- * This runs after lockup_detector_init() which sets up watchdog_cpumask.
+ * Invoked from core watchdog init.
  */
-static int __init powerpc_watchdog_init(void)
+int __init watchdog_nmi_probe(void)
 {
        int err;
 
-       watchdog_calc_timeouts();
-
-       err = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "powerpc/watchdog:online",
-                               start_wd_on_cpu, stop_wd_on_cpu);
-       if (err < 0)
+       err = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN,
+                                       "powerpc/watchdog:online",
+                                       start_wd_on_cpu, stop_wd_on_cpu);
+       if (err < 0) {
                pr_warn("Watchdog could not be initialized");
-
+               return err;
+       }
        return 0;
 }
-arch_initcall(powerpc_watchdog_init);
 
 static void handle_backtrace_ipi(struct pt_regs *regs)
 {
index 1330462..bf45784 100644 (file)
@@ -622,7 +622,7 @@ int kvmppc_xive_get_xive(struct kvm *kvm, u32 irq, u32 *server,
                return -EINVAL;
        state = &sb->irq_state[idx];
        arch_spin_lock(&sb->lock);
-       *server = state->guest_server;
+       *server = state->act_server;
        *priority = state->guest_priority;
        arch_spin_unlock(&sb->lock);
 
@@ -1331,7 +1331,7 @@ static int xive_get_source(struct kvmppc_xive *xive, long irq, u64 addr)
        xive->saved_src_count++;
 
        /* Convert saved state into something compatible with xics */
-       val = state->guest_server;
+       val = state->act_server;
        prio = state->saved_scan_prio;
 
        if (prio == MASKED) {
@@ -1507,7 +1507,6 @@ static int xive_set_source(struct kvmppc_xive *xive, long irq, u64 addr)
        /* First convert prio and mark interrupt as untargetted */
        act_prio = xive_prio_from_guest(guest_prio);
        state->act_priority = MASKED;
-       state->guest_server = server;
 
        /*
         * We need to drop the lock due to the mutex below. Hopefully
index 5938f76..6ba63f8 100644 (file)
@@ -35,7 +35,6 @@ struct kvmppc_xive_irq_state {
        struct xive_irq_data *pt_data;  /* XIVE Pass-through associated data */
 
        /* Targetting as set by guest */
-       u32 guest_server;               /* Current guest selected target */
        u8 guest_priority;              /* Guest set priority */
        u8 saved_priority;              /* Saved priority when masking */
 
index 65eda19..f6c7f54 100644 (file)
@@ -361,9 +361,9 @@ static int change_page_attr(struct page *page, int numpages, pgprot_t prot)
                        break;
        }
        wmb();
+       local_irq_restore(flags);
        flush_tlb_kernel_range((unsigned long)page_address(start),
                               (unsigned long)page_address(page));
-       local_irq_restore(flags);
        return err;
 }
 
index 897aa14..bbb73aa 100644 (file)
@@ -272,7 +272,15 @@ static void pnv_kexec_cpu_down(int crash_shutdown, int secondary)
 #ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
 static unsigned long pnv_memory_block_size(void)
 {
-       return 256UL * 1024 * 1024;
+       /*
+        * We map the kernel linear region with 1GB large pages on radix. For
+        * memory hot unplug to work our memory block size must be at least
+        * this size.
+        */
+       if (radix_enabled())
+               return 1UL * 1024 * 1024 * 1024;
+       else
+               return 256UL * 1024 * 1024;
 }
 #endif
 
index f387318..a3b8d7d 100644 (file)
@@ -1402,6 +1402,14 @@ void xive_teardown_cpu(void)
 
        if (xive_ops->teardown_cpu)
                xive_ops->teardown_cpu(cpu, xc);
+
+#ifdef CONFIG_SMP
+       /* Get rid of IPI */
+       xive_cleanup_cpu_ipi(cpu, xc);
+#endif
+
+       /* Disable and free the queues */
+       xive_cleanup_cpu_queues(cpu, xc);
 }
 
 void xive_kexec_teardown_cpu(int secondary)
index f24a70b..d9c4c93 100644 (file)
@@ -431,7 +431,11 @@ static int xive_spapr_get_ipi(unsigned int cpu, struct xive_cpu *xc)
 
 static void xive_spapr_put_ipi(unsigned int cpu, struct xive_cpu *xc)
 {
+       if (!xc->hw_ipi)
+               return;
+
        xive_irq_bitmap_free(xc->hw_ipi);
+       xc->hw_ipi = 0;
 }
 #endif /* CONFIG_SMP */
 
index 0be3828..4e83f95 100644 (file)
@@ -44,7 +44,6 @@ config SPARC
        select ARCH_HAS_SG_CHAIN
        select CPU_NO_EFFICIENT_FFS
        select LOCKDEP_SMALL if LOCKDEP
-       select ARCH_WANT_RELAX_ORDER
 
 config SPARC32
        def_bool !64BIT
index 829e89c..9fb9a1f 100644 (file)
@@ -4409,10 +4409,9 @@ static __init int fixup_ht_bug(void)
                return 0;
        }
 
-       if (lockup_detector_suspend() != 0) {
-               pr_debug("failed to disable PMU erratum BJ122, BV98, HSD29 workaround\n");
-               return 0;
-       }
+       cpus_read_lock();
+
+       hardlockup_detector_perf_stop();
 
        x86_pmu.flags &= ~(PMU_FL_EXCL_CNTRS | PMU_FL_EXCL_ENABLED);
 
@@ -4420,9 +4419,7 @@ static __init int fixup_ht_bug(void)
        x86_pmu.commit_scheduling = NULL;
        x86_pmu.stop_scheduling = NULL;
 
-       lockup_detector_resume();
-
-       cpus_read_lock();
+       hardlockup_detector_perf_restart();
 
        for_each_online_cpu(c)
                free_excl_cntrs(c);
index bc62e7c..59ad3d1 100644 (file)
@@ -88,7 +88,7 @@ static inline long kvm_hypercall4(unsigned int nr, unsigned long p1,
 bool kvm_para_available(void);
 unsigned int kvm_arch_para_features(void);
 void __init kvm_guest_init(void);
-void kvm_async_pf_task_wait(u32 token);
+void kvm_async_pf_task_wait(u32 token, int interrupt_kernel);
 void kvm_async_pf_task_wake(u32 token);
 u32 kvm_read_and_reset_pf_reason(void);
 extern void kvm_disable_steal_time(void);
@@ -103,7 +103,7 @@ static inline void kvm_spinlock_init(void)
 
 #else /* CONFIG_KVM_GUEST */
 #define kvm_guest_init() do {} while (0)
-#define kvm_async_pf_task_wait(T) do {} while(0)
+#define kvm_async_pf_task_wait(T, I) do {} while(0)
 #define kvm_async_pf_task_wake(T) do {} while(0)
 
 static inline bool kvm_para_available(void)
index e675704..8bb9594 100644 (file)
@@ -117,7 +117,11 @@ static struct kvm_task_sleep_node *_find_apf_task(struct kvm_task_sleep_head *b,
        return NULL;
 }
 
-void kvm_async_pf_task_wait(u32 token)
+/*
+ * @interrupt_kernel: Is this called from a routine which interrupts the kernel
+ *                   (other than user space)?
+ */
+void kvm_async_pf_task_wait(u32 token, int interrupt_kernel)
 {
        u32 key = hash_32(token, KVM_TASK_SLEEP_HASHBITS);
        struct kvm_task_sleep_head *b = &async_pf_sleepers[key];
@@ -140,8 +144,10 @@ void kvm_async_pf_task_wait(u32 token)
 
        n.token = token;
        n.cpu = smp_processor_id();
-       n.halted = is_idle_task(current) || preempt_count() > 1 ||
-                  rcu_preempt_depth();
+       n.halted = is_idle_task(current) ||
+                  (IS_ENABLED(CONFIG_PREEMPT_COUNT)
+                   ? preempt_count() > 1 || rcu_preempt_depth()
+                   : interrupt_kernel);
        init_swait_queue_head(&n.wq);
        hlist_add_head(&n.link, &b->list);
        raw_spin_unlock(&b->lock);
@@ -269,7 +275,7 @@ do_async_page_fault(struct pt_regs *regs, unsigned long error_code)
        case KVM_PV_REASON_PAGE_NOT_PRESENT:
                /* page is swapped out by the host. */
                prev_state = exception_enter();
-               kvm_async_pf_task_wait((u32)read_cr2());
+               kvm_async_pf_task_wait((u32)read_cr2(), !user_mode(regs));
                exception_exit(prev_state);
                break;
        case KVM_PV_REASON_PAGE_READY:
index 3ea6244..3c48bc8 100644 (file)
@@ -23,6 +23,7 @@ config KVM
        depends on HIGH_RES_TIMERS
        # for TASKSTATS/TASK_DELAY_ACCT:
        depends on NET && MULTIUSER
+       depends on X86_LOCAL_APIC
        select PREEMPT_NOTIFIERS
        select MMU_NOTIFIER
        select ANON_INODES
index a36254c..d90cdc7 100644 (file)
@@ -425,8 +425,10 @@ static int fastop(struct x86_emulate_ctxt *ctxt, void (*fop)(struct fastop *));
        #op " %al \n\t" \
        FOP_RET
 
-asm(".global kvm_fastop_exception \n"
-    "kvm_fastop_exception: xor %esi, %esi; ret");
+asm(".pushsection .fixup, \"ax\"\n"
+    ".global kvm_fastop_exception \n"
+    "kvm_fastop_exception: xor %esi, %esi; ret\n"
+    ".popsection");
 
 FOP_START(setcc)
 FOP_SETCC(seto)
index eca30c1..106d4a0 100644 (file)
@@ -3837,7 +3837,7 @@ int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code,
        case KVM_PV_REASON_PAGE_NOT_PRESENT:
                vcpu->arch.apf.host_apf_reason = 0;
                local_irq_disable();
-               kvm_async_pf_task_wait(fault_address);
+               kvm_async_pf_task_wait(fault_address, 0);
                local_irq_enable();
                break;
        case KVM_PV_REASON_PAGE_READY:
index 980e730..de294d7 100644 (file)
@@ -815,10 +815,14 @@ int blk_mq_debugfs_register(struct request_queue *q)
                goto err;
 
        /*
-        * blk_mq_init_hctx() attempted to do this already, but q->debugfs_dir
+        * blk_mq_init_sched() attempted to do this already, but q->debugfs_dir
         * didn't exist yet (because we don't know what to name the directory
         * until the queue is registered to a gendisk).
         */
+       if (q->elevator && !q->sched_debugfs_dir)
+               blk_mq_debugfs_register_sched(q);
+
+       /* Similarly, blk_mq_init_hctx() couldn't do this previously. */
        queue_for_each_hw_ctx(q, hctx, i) {
                if (!hctx->debugfs_dir && blk_mq_debugfs_register_hctx(q, hctx))
                        goto err;
index 0fea76a..17816a0 100644 (file)
@@ -1911,11 +1911,11 @@ static void throtl_upgrade_state(struct throtl_data *td)
 
                tg->disptime = jiffies - 1;
                throtl_select_dispatch(sq);
-               throtl_schedule_next_dispatch(sq, false);
+               throtl_schedule_next_dispatch(sq, true);
        }
        rcu_read_unlock();
        throtl_select_dispatch(&td->service_queue);
-       throtl_schedule_next_dispatch(&td->service_queue, false);
+       throtl_schedule_next_dispatch(&td->service_queue, true);
        queue_work(kthrotld_workqueue, &td->dispatch_work);
 }
 
index dbddff8..15d25cc 100644 (file)
@@ -207,20 +207,34 @@ static int bsg_init_rq(struct request_queue *q, struct request *req, gfp_t gfp)
        struct bsg_job *job = blk_mq_rq_to_pdu(req);
        struct scsi_request *sreq = &job->sreq;
 
+       /* called right after the request is allocated for the request_queue */
+
+       sreq->sense = kzalloc(SCSI_SENSE_BUFFERSIZE, gfp);
+       if (!sreq->sense)
+               return -ENOMEM;
+
+       return 0;
+}
+
+static void bsg_initialize_rq(struct request *req)
+{
+       struct bsg_job *job = blk_mq_rq_to_pdu(req);
+       struct scsi_request *sreq = &job->sreq;
+       void *sense = sreq->sense;
+
+       /* called right before the request is given to the request_queue user */
+
        memset(job, 0, sizeof(*job));
 
        scsi_req_init(sreq);
+
+       sreq->sense = sense;
        sreq->sense_len = SCSI_SENSE_BUFFERSIZE;
-       sreq->sense = kzalloc(sreq->sense_len, gfp);
-       if (!sreq->sense)
-               return -ENOMEM;
 
        job->req = req;
-       job->reply = sreq->sense;
+       job->reply = sense;
        job->reply_len = sreq->sense_len;
        job->dd_data = job + 1;
-
-       return 0;
 }
 
 static void bsg_exit_rq(struct request_queue *q, struct request *req)
@@ -251,6 +265,7 @@ struct request_queue *bsg_setup_queue(struct device *dev, const char *name,
        q->cmd_size = sizeof(struct bsg_job) + dd_job_size;
        q->init_rq_fn = bsg_init_rq;
        q->exit_rq_fn = bsg_exit_rq;
+       q->initialize_rq_fn = bsg_initialize_rq;
        q->request_fn = bsg_request_fn;
 
        ret = blk_init_allocated_queue(q);
index 9565d57..de56394 100644 (file)
@@ -1178,12 +1178,44 @@ dev_put:
        return ret;
 }
 
+static bool __init iort_enable_acs(struct acpi_iort_node *iort_node)
+{
+       if (iort_node->type == ACPI_IORT_NODE_PCI_ROOT_COMPLEX) {
+               struct acpi_iort_node *parent;
+               struct acpi_iort_id_mapping *map;
+               int i;
+
+               map = ACPI_ADD_PTR(struct acpi_iort_id_mapping, iort_node,
+                                  iort_node->mapping_offset);
+
+               for (i = 0; i < iort_node->mapping_count; i++, map++) {
+                       if (!map->output_reference)
+                               continue;
+
+                       parent = ACPI_ADD_PTR(struct acpi_iort_node,
+                                       iort_table,  map->output_reference);
+                       /*
+                        * If we detect a RC->SMMU mapping, make sure
+                        * If we detect an RC->SMMU mapping, make sure
+                        */
+                       if ((parent->type == ACPI_IORT_NODE_SMMU) ||
+                               (parent->type == ACPI_IORT_NODE_SMMU_V3)) {
+                               pci_request_acs();
+                               return true;
+                       }
+               }
+       }
+
+       return false;
+}
+
 static void __init iort_init_platform_devices(void)
 {
        struct acpi_iort_node *iort_node, *iort_end;
        struct acpi_table_iort *iort;
        struct fwnode_handle *fwnode;
        int i, ret;
+       bool acs_enabled = false;
 
        /*
         * iort_table and iort both point to the start of IORT table, but
@@ -1203,6 +1235,9 @@ static void __init iort_init_platform_devices(void)
                        return;
                }
 
+               if (!acs_enabled)
+                       acs_enabled = iort_enable_acs(iort_node);
+
                if ((iort_node->type == ACPI_IORT_NODE_SMMU) ||
                        (iort_node->type == ACPI_IORT_NODE_SMMU_V3)) {
 
index 4a438b8..2dfe99b 100644 (file)
@@ -17,7 +17,7 @@ if BLK_DEV
 
 config BLK_DEV_NULL_BLK
        tristate "Null test block driver"
-       depends on CONFIGFS_FS
+       select CONFIGFS_FS
 
 config BLK_DEV_FD
        tristate "Normal floppy disk support"
index 3684e21..883dfeb 100644 (file)
@@ -820,9 +820,13 @@ static blk_status_t nbd_queue_rq(struct blk_mq_hw_ctx *hctx,
         * appropriate.
         */
        ret = nbd_handle_cmd(cmd, hctx->queue_num);
+       if (ret < 0)
+               ret = BLK_STS_IOERR;
+       else if (!ret)
+               ret = BLK_STS_OK;
        complete(&cmd->send_complete);
 
-       return ret < 0 ? BLK_STS_IOERR : BLK_STS_OK;
+       return ret;
 }
 
 static int nbd_add_socket(struct nbd_device *nbd, unsigned long arg,
index c834f5a..4c10456 100644 (file)
@@ -105,6 +105,7 @@ err:
 
        return  ret;
 }
+EXPORT_SYMBOL_GPL(clk_bulk_prepare);
 
 #endif /* CONFIG_HAVE_CLK_PREPARE */
 
index 62d7854..5970a50 100644 (file)
@@ -315,13 +315,13 @@ static struct rockchip_clk_branch common_clk_branches[] __initdata = {
                        RK2928_CLKGATE_CON(10), 8, GFLAGS),
 
        GATE(SCLK_PVTM_CORE, "clk_pvtm_core", "xin24m", 0,
-                       RK2928_CLKGATE_CON(10), 8, GFLAGS),
+                       RK2928_CLKGATE_CON(10), 0, GFLAGS),
        GATE(SCLK_PVTM_GPU, "clk_pvtm_gpu", "xin24m", 0,
-                       RK2928_CLKGATE_CON(10), 8, GFLAGS),
+                       RK2928_CLKGATE_CON(10), 1, GFLAGS),
        GATE(SCLK_PVTM_FUNC, "clk_pvtm_func", "xin24m", 0,
-                       RK2928_CLKGATE_CON(10), 8, GFLAGS),
+                       RK2928_CLKGATE_CON(10), 2, GFLAGS),
        GATE(SCLK_MIPI_24M, "clk_mipi_24m", "xin24m", CLK_IGNORE_UNUSED,
-                       RK2928_CLKGATE_CON(10), 8, GFLAGS),
+                       RK2928_CLKGATE_CON(2), 15, GFLAGS),
 
        COMPOSITE(SCLK_SDMMC, "sclk_sdmmc0", mux_mmc_src_p, 0,
                        RK2928_CLKSEL_CON(11), 6, 2, MFLAGS, 0, 6, DFLAGS,
@@ -541,7 +541,7 @@ static struct rockchip_clk_branch common_clk_branches[] __initdata = {
        GATE(0, "pclk_grf", "pclk_cpu", CLK_IGNORE_UNUSED, RK2928_CLKGATE_CON(5), 4, GFLAGS),
        GATE(0, "pclk_mipiphy", "pclk_cpu", CLK_IGNORE_UNUSED, RK2928_CLKGATE_CON(5), 0, GFLAGS),
 
-       GATE(0, "pclk_pmu", "pclk_pmu_pre", CLK_IGNORE_UNUSED, RK2928_CLKGATE_CON(9), 2, GFLAGS),
+       GATE(0, "pclk_pmu", "pclk_pmu_pre", 0, RK2928_CLKGATE_CON(9), 2, GFLAGS),
        GATE(0, "pclk_pmu_niu", "pclk_pmu_pre", CLK_IGNORE_UNUSED, RK2928_CLKGATE_CON(9), 3, GFLAGS),
 
        /* PD_MMC */
@@ -577,6 +577,8 @@ static const char *const rk3128_critical_clocks[] __initconst = {
        "aclk_peri",
        "hclk_peri",
        "pclk_peri",
+       "pclk_pmu",
+       "sclk_timer5",
 };
 
 static struct rockchip_clk_provider *__init rk3128_common_clk_init(struct device_node *np)
index e40b775..d8d3cb6 100644 (file)
@@ -294,6 +294,18 @@ static const struct samsung_clk_reg_dump src_mask_suspend_e4210[] = {
 #define PLL_ENABLED    (1 << 31)
 #define PLL_LOCKED     (1 << 29)
 
+static void exynos4_clk_enable_pll(u32 reg)
+{
+       u32 pll_con = readl(reg_base + reg);
+       pll_con |= PLL_ENABLED;
+       writel(pll_con, reg_base + reg);
+
+       while (!(pll_con & PLL_LOCKED)) {
+               cpu_relax();
+               pll_con = readl(reg_base + reg);
+       }
+}
+
 static void exynos4_clk_wait_for_pll(u32 reg)
 {
        u32 pll_con;
@@ -315,6 +327,9 @@ static int exynos4_clk_suspend(void)
        samsung_clk_save(reg_base, exynos4_save_pll,
                                ARRAY_SIZE(exynos4_clk_pll_regs));
 
+       exynos4_clk_enable_pll(EPLL_CON0);
+       exynos4_clk_enable_pll(VPLL_CON0);
+
        if (exynos4_soc == EXYNOS4210) {
                samsung_clk_save(reg_base, exynos4_save_soc,
                                        ARRAY_SIZE(exynos4210_clk_save));
index d805b6e..27743be 100644 (file)
@@ -606,11 +606,6 @@ void intel_audio_codec_enable(struct intel_encoder *intel_encoder,
                         connector->encoder->base.id,
                         connector->encoder->name);
 
-       /* ELD Conn_Type */
-       connector->eld[5] &= ~(3 << 2);
-       if (intel_crtc_has_dp_encoder(crtc_state))
-               connector->eld[5] |= (1 << 2);
-
        connector->eld[6] = drm_av_sync_delay(connector, adjusted_mode) / 2;
 
        if (dev_priv->display.audio_codec_enable)
index 183e87e..00c6aee 100644 (file)
@@ -1163,6 +1163,13 @@ static void parse_ddi_port(struct drm_i915_private *dev_priv, enum port port,
        is_hdmi = is_dvi && (child->common.device_type & DEVICE_TYPE_NOT_HDMI_OUTPUT) == 0;
        is_edp = is_dp && (child->common.device_type & DEVICE_TYPE_INTERNAL_CONNECTOR);
 
+       if (port == PORT_A && is_dvi) {
+               DRM_DEBUG_KMS("VBT claims port A supports DVI%s, ignoring\n",
+                             is_hdmi ? "/HDMI" : "");
+               is_dvi = false;
+               is_hdmi = false;
+       }
+
        info->supports_dvi = is_dvi;
        info->supports_hdmi = is_hdmi;
        info->supports_dp = is_dp;
index 965988f..92c1f8e 100644 (file)
@@ -216,7 +216,7 @@ static void gen9_set_dc_state_debugmask(struct drm_i915_private *dev_priv)
 
        mask = DC_STATE_DEBUG_MASK_MEMORY_UP;
 
-       if (IS_BROXTON(dev_priv))
+       if (IS_GEN9_LP(dev_priv))
                mask |= DC_STATE_DEBUG_MASK_CORES;
 
        /* The below bit doesn't need to be cleared ever afterwards */
index 4b4fd1f..476681d 100644 (file)
@@ -1655,7 +1655,8 @@ bool intel_ddi_get_hw_state(struct intel_encoder *encoder,
 out:
        if (ret && IS_GEN9_LP(dev_priv)) {
                tmp = I915_READ(BXT_PHY_CTL(port));
-               if ((tmp & (BXT_PHY_LANE_POWERDOWN_ACK |
+               if ((tmp & (BXT_PHY_CMNLANE_POWERDOWN_ACK |
+                           BXT_PHY_LANE_POWERDOWN_ACK |
                            BXT_PHY_LANE_ENABLED)) != BXT_PHY_LANE_ENABLED)
                        DRM_ERROR("Port %c enabled but PHY powered down? "
                                  "(PHY_CTL %08x)\n", port_name(port), tmp);
index 00cd17c..64f7b51 100644 (file)
@@ -12359,7 +12359,6 @@ static void intel_atomic_commit_tail(struct drm_atomic_state *state)
        struct drm_crtc_state *old_crtc_state, *new_crtc_state;
        struct drm_crtc *crtc;
        struct intel_crtc_state *intel_cstate;
-       bool hw_check = intel_state->modeset;
        u64 put_domains[I915_MAX_PIPES] = {};
        unsigned crtc_vblank_mask = 0;
        int i;
@@ -12376,7 +12375,6 @@ static void intel_atomic_commit_tail(struct drm_atomic_state *state)
 
                if (needs_modeset(new_crtc_state) ||
                    to_intel_crtc_state(new_crtc_state)->update_pipe) {
-                       hw_check = true;
 
                        put_domains[to_intel_crtc(crtc)->pipe] =
                                modeset_get_crtc_power_domains(crtc,
index 09b6709..de38d01 100644 (file)
@@ -208,12 +208,6 @@ static const struct bxt_ddi_phy_info glk_ddi_phy_info[] = {
        },
 };
 
-static u32 bxt_phy_port_mask(const struct bxt_ddi_phy_info *phy_info)
-{
-       return (phy_info->dual_channel * BIT(phy_info->channel[DPIO_CH1].port)) |
-               BIT(phy_info->channel[DPIO_CH0].port);
-}
-
 static const struct bxt_ddi_phy_info *
 bxt_get_phy_list(struct drm_i915_private *dev_priv, int *count)
 {
@@ -313,7 +307,6 @@ bool bxt_ddi_phy_is_enabled(struct drm_i915_private *dev_priv,
                            enum dpio_phy phy)
 {
        const struct bxt_ddi_phy_info *phy_info;
-       enum port port;
 
        phy_info = bxt_get_phy_info(dev_priv, phy);
 
@@ -335,19 +328,6 @@ bool bxt_ddi_phy_is_enabled(struct drm_i915_private *dev_priv,
                return false;
        }
 
-       for_each_port_masked(port, bxt_phy_port_mask(phy_info)) {
-               u32 tmp = I915_READ(BXT_PHY_CTL(port));
-
-               if (tmp & BXT_PHY_CMNLANE_POWERDOWN_ACK) {
-                       DRM_DEBUG_DRIVER("DDI PHY %d powered, but common lane "
-                                        "for port %c powered down "
-                                        "(PHY_CTL %08x)\n",
-                                        phy, port_name(port), tmp);
-
-                       return false;
-               }
-       }
-
        return true;
 }
 
index 951e834..28a778b 100644 (file)
 #include "intel_drv.h"
 #include "i915_drv.h"
 
+static void intel_connector_update_eld_conn_type(struct drm_connector *connector)
+{
+       u8 conn_type;
+
+       if (connector->connector_type == DRM_MODE_CONNECTOR_DisplayPort ||
+           connector->connector_type == DRM_MODE_CONNECTOR_eDP) {
+               conn_type = DRM_ELD_CONN_TYPE_DP;
+       } else {
+               conn_type = DRM_ELD_CONN_TYPE_HDMI;
+       }
+
+       connector->eld[DRM_ELD_SAD_COUNT_CONN_TYPE] &= ~DRM_ELD_CONN_TYPE_MASK;
+       connector->eld[DRM_ELD_SAD_COUNT_CONN_TYPE] |= conn_type;
+}
+
 /**
  * intel_connector_update_modes - update connector from edid
  * @connector: DRM connector device to use
@@ -44,6 +59,8 @@ int intel_connector_update_modes(struct drm_connector *connector,
        ret = drm_add_edid_modes(connector, edid);
        drm_edid_to_eld(connector, edid);
 
+       intel_connector_update_eld_conn_type(connector);
+
        return ret;
 }
 
index b66d8e1..b3a087c 100644 (file)
@@ -2782,6 +2782,9 @@ static void cnl_display_core_init(struct drm_i915_private *dev_priv, bool resume
 
        /* 6. Enable DBUF */
        gen9_dbuf_enable(dev_priv);
+
+       if (resume && dev_priv->csr.dmc_payload)
+               intel_csr_load_program(dev_priv);
 }
 
 #undef CNL_PROCMON_IDX
index 9ea6cd5..3cf1a69 100644 (file)
@@ -302,26 +302,29 @@ static int sun4i_hdmi_bind(struct device *dev, struct device *master,
        hdmi->mod_clk = devm_clk_get(dev, "mod");
        if (IS_ERR(hdmi->mod_clk)) {
                dev_err(dev, "Couldn't get the HDMI mod clock\n");
-               return PTR_ERR(hdmi->mod_clk);
+               ret = PTR_ERR(hdmi->mod_clk);
+               goto err_disable_bus_clk;
        }
        clk_prepare_enable(hdmi->mod_clk);
 
        hdmi->pll0_clk = devm_clk_get(dev, "pll-0");
        if (IS_ERR(hdmi->pll0_clk)) {
                dev_err(dev, "Couldn't get the HDMI PLL 0 clock\n");
-               return PTR_ERR(hdmi->pll0_clk);
+               ret = PTR_ERR(hdmi->pll0_clk);
+               goto err_disable_mod_clk;
        }
 
        hdmi->pll1_clk = devm_clk_get(dev, "pll-1");
        if (IS_ERR(hdmi->pll1_clk)) {
                dev_err(dev, "Couldn't get the HDMI PLL 1 clock\n");
-               return PTR_ERR(hdmi->pll1_clk);
+               ret = PTR_ERR(hdmi->pll1_clk);
+               goto err_disable_mod_clk;
        }
 
        ret = sun4i_tmds_create(hdmi);
        if (ret) {
                dev_err(dev, "Couldn't create the TMDS clock\n");
-               return ret;
+               goto err_disable_mod_clk;
        }
 
        writel(SUN4I_HDMI_CTRL_ENABLE, hdmi->base + SUN4I_HDMI_CTRL_REG);
@@ -362,7 +365,7 @@ static int sun4i_hdmi_bind(struct device *dev, struct device *master,
        ret = sun4i_hdmi_i2c_create(dev, hdmi);
        if (ret) {
                dev_err(dev, "Couldn't create the HDMI I2C adapter\n");
-               return ret;
+               goto err_disable_mod_clk;
        }
 
        drm_encoder_helper_add(&hdmi->encoder,
@@ -422,6 +425,10 @@ err_cleanup_connector:
        drm_encoder_cleanup(&hdmi->encoder);
 err_del_i2c_adapter:
        i2c_del_adapter(hdmi->i2c);
+err_disable_mod_clk:
+       clk_disable_unprepare(hdmi->mod_clk);
+err_disable_bus_clk:
+       clk_disable_unprepare(hdmi->bus_clk);
        return ret;
 }
 
@@ -434,6 +441,8 @@ static void sun4i_hdmi_unbind(struct device *dev, struct device *master,
        drm_connector_cleanup(&hdmi->connector);
        drm_encoder_cleanup(&hdmi->encoder);
        i2c_del_adapter(hdmi->i2c);
+       clk_disable_unprepare(hdmi->mod_clk);
+       clk_disable_unprepare(hdmi->bus_clk);
 }
 
 static const struct component_ops sun4i_hdmi_ops = {
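
The sun4i HDMI changes turn early returns into gotos so every clock that was already enabled is disabled again on failure, and unbind now drops the mod and bus clocks as well. A compact sketch of that acquire-in-order, unwind-in-reverse shape; the stub functions below are invented stand-ins, not the clock framework API:

#include <stdio.h>

static int get_clk(const char *name, int fail)
{
        printf("get %s\n", name);
        return fail ? -1 : 0;
}

static void put_clk(const char *name)
{
        printf("disable %s\n", name);
}

/* Acquire resources in order; on failure, release only what was taken. */
static int bind(int fail_mod, int fail_pll)
{
        int ret;

        ret = get_clk("bus", 0);
        if (ret)
                return ret;

        ret = get_clk("mod", fail_mod);
        if (ret)
                goto err_disable_bus;

        ret = get_clk("pll-0", fail_pll);
        if (ret)
                goto err_disable_mod;

        return 0;

err_disable_mod:
        put_clk("mod");
err_disable_bus:
        put_clk("bus");
        return ret;
}

int main(void)
{
        printf("ret=%d\n", bind(1, 0));
        return 0;
}
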
index 9c0dbb8..e1be610 100644 (file)
@@ -630,7 +630,7 @@ static int xgene_hwmon_probe(struct platform_device *pdev)
                         sizeof(struct slimpro_resp_msg) * ASYNC_MSG_FIFO_SIZE,
                         GFP_KERNEL);
        if (rc)
-               goto out_mbox_free;
+               return -ENOMEM;
 
        INIT_WORK(&ctx->workq, xgene_hwmon_evt_work);
 
@@ -646,7 +646,8 @@ static int xgene_hwmon_probe(struct platform_device *pdev)
                if (IS_ERR(ctx->mbox_chan)) {
                        dev_err(&pdev->dev,
                                "SLIMpro mailbox channel request failed\n");
-                       return -ENODEV;
+                       rc = -ENODEV;
+                       goto out_mbox_free;
                }
        } else {
                struct acpi_pcct_hw_reduced *cppc_ss;
@@ -654,7 +655,8 @@ static int xgene_hwmon_probe(struct platform_device *pdev)
                if (device_property_read_u32(&pdev->dev, "pcc-channel",
                                             &ctx->mbox_idx)) {
                        dev_err(&pdev->dev, "no pcc-channel property\n");
-                       return -ENODEV;
+                       rc = -ENODEV;
+                       goto out_mbox_free;
                }
 
                cl->rx_callback = xgene_hwmon_pcc_rx_cb;
@@ -662,7 +664,8 @@ static int xgene_hwmon_probe(struct platform_device *pdev)
                if (IS_ERR(ctx->mbox_chan)) {
                        dev_err(&pdev->dev,
                                "PPC channel request failed\n");
-                       return -ENODEV;
+                       rc = -ENODEV;
+                       goto out_mbox_free;
                }
 
                /*
@@ -675,13 +678,13 @@ static int xgene_hwmon_probe(struct platform_device *pdev)
                if (!cppc_ss) {
                        dev_err(&pdev->dev, "PPC subspace not found\n");
                        rc = -ENODEV;
-                       goto out_mbox_free;
+                       goto out;
                }
 
                if (!ctx->mbox_chan->mbox->txdone_irq) {
                        dev_err(&pdev->dev, "PCC IRQ not supported\n");
                        rc = -ENODEV;
-                       goto out_mbox_free;
+                       goto out;
                }
 
                /*
@@ -696,14 +699,14 @@ static int xgene_hwmon_probe(struct platform_device *pdev)
                } else {
                        dev_err(&pdev->dev, "Failed to get PCC comm region\n");
                        rc = -ENODEV;
-                       goto out_mbox_free;
+                       goto out;
                }
 
                if (!ctx->pcc_comm_addr) {
                        dev_err(&pdev->dev,
                                "Failed to ioremap PCC comm region\n");
                        rc = -ENOMEM;
-                       goto out_mbox_free;
+                       goto out;
                }
 
                /*
index c06dce2..45a3f3c 100644 (file)
@@ -131,6 +131,7 @@ config I2C_I801
            Gemini Lake (SOC)
            Cannon Lake-H (PCH)
            Cannon Lake-LP (PCH)
+           Cedar Fork (PCH)
 
          This driver can also be built as a module.  If so, the module
          will be called i2c-i801.
index e114e4e..9e12a53 100644 (file)
@@ -68,6 +68,7 @@
  * Gemini Lake (SOC)           0x31d4  32      hard    yes     yes     yes
  * Cannon Lake-H (PCH)         0xa323  32      hard    yes     yes     yes
  * Cannon Lake-LP (PCH)                0x9da3  32      hard    yes     yes     yes
+ * Cedar Fork (PCH)            0x18df  32      hard    yes     yes     yes
  *
  * Features supported by this driver:
  * Software PEC                                no
 
 /* Older devices have their ID defined in <linux/pci_ids.h> */
 #define PCI_DEVICE_ID_INTEL_BAYTRAIL_SMBUS             0x0f12
+#define PCI_DEVICE_ID_INTEL_CDF_SMBUS                  0x18df
 #define PCI_DEVICE_ID_INTEL_DNV_SMBUS                  0x19df
 #define PCI_DEVICE_ID_INTEL_COUGARPOINT_SMBUS          0x1c22
 #define PCI_DEVICE_ID_INTEL_PATSBURG_SMBUS             0x1d22
@@ -1025,6 +1027,7 @@ static const struct pci_device_id i801_ids[] = {
        { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_BRASWELL_SMBUS) },
        { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_SUNRISEPOINT_H_SMBUS) },
        { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_SUNRISEPOINT_LP_SMBUS) },
+       { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CDF_SMBUS) },
        { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_DNV_SMBUS) },
        { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_BROXTON_SMBUS) },
        { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_LEWISBURG_SMBUS) },
@@ -1513,6 +1516,7 @@ static int i801_probe(struct pci_dev *dev, const struct pci_device_id *id)
        case PCI_DEVICE_ID_INTEL_CANNONLAKE_LP_SMBUS:
        case PCI_DEVICE_ID_INTEL_LEWISBURG_SMBUS:
        case PCI_DEVICE_ID_INTEL_LEWISBURG_SSKU_SMBUS:
+       case PCI_DEVICE_ID_INTEL_CDF_SMBUS:
        case PCI_DEVICE_ID_INTEL_DNV_SMBUS:
        case PCI_DEVICE_ID_INTEL_KABYLAKE_PCH_H_SMBUS:
                priv->features |= FEATURE_I2C_BLOCK_READ;
index 22e08ae..25fcc3c 100644 (file)
@@ -627,6 +627,7 @@ static const struct dev_pm_ops sprd_i2c_pm_ops = {
 
 static const struct of_device_id sprd_i2c_of_match[] = {
        { .compatible = "sprd,sc9860-i2c", },
+       {},
 };
 
 static struct platform_driver sprd_i2c_driver = {
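
The sprd hunk adds the empty terminating entry that of_match_device()-style walkers rely on to find the end of the table. A tiny sentinel-terminated lookup in the same spirit (hypothetical table in plain C, not the OF core itself):

#include <stdio.h>
#include <string.h>

struct match { const char *compatible; };

static const struct match matches[] = {
        { .compatible = "sprd,sc9860-i2c" },
        { },    /* sentinel: the walker stops at the all-zero entry */
};

static const struct match *find(const char *compat)
{
        for (const struct match *m = matches; m->compatible; m++)
                if (!strcmp(m->compatible, compat))
                        return m;
        return NULL;
}

int main(void)
{
        printf("%s\n", find("sprd,sc9860-i2c") ? "found" : "missing");
        return 0;
}
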
index 47c67b0..d4a6e9c 100644 (file)
@@ -215,7 +215,7 @@ struct stm32f7_i2c_dev {
        unsigned int msg_num;
        unsigned int msg_id;
        struct stm32f7_i2c_msg f7_msg;
-       struct stm32f7_i2c_setup *setup;
+       struct stm32f7_i2c_setup setup;
        struct stm32f7_i2c_timings timing;
 };
 
@@ -265,7 +265,7 @@ static struct stm32f7_i2c_spec i2c_specs[] = {
        },
 };
 
-struct stm32f7_i2c_setup stm32f7_setup = {
+static const struct stm32f7_i2c_setup stm32f7_setup = {
        .rise_time = STM32F7_I2C_RISE_TIME_DEFAULT,
        .fall_time = STM32F7_I2C_FALL_TIME_DEFAULT,
        .dnf = STM32F7_I2C_DNF_DEFAULT,
@@ -537,7 +537,7 @@ static void stm32f7_i2c_hw_config(struct stm32f7_i2c_dev *i2c_dev)
        writel_relaxed(timing, i2c_dev->base + STM32F7_I2C_TIMINGR);
 
        /* Enable I2C */
-       if (i2c_dev->setup->analog_filter)
+       if (i2c_dev->setup.analog_filter)
                stm32f7_i2c_clr_bits(i2c_dev->base + STM32F7_I2C_CR1,
                                     STM32F7_I2C_CR1_ANFOFF);
        else
@@ -887,22 +887,19 @@ static int stm32f7_i2c_probe(struct platform_device *pdev)
        }
 
        setup = of_device_get_match_data(&pdev->dev);
-       i2c_dev->setup->rise_time = setup->rise_time;
-       i2c_dev->setup->fall_time = setup->fall_time;
-       i2c_dev->setup->dnf = setup->dnf;
-       i2c_dev->setup->analog_filter = setup->analog_filter;
+       i2c_dev->setup = *setup;
 
        ret = device_property_read_u32(i2c_dev->dev, "i2c-scl-rising-time-ns",
                                       &rise_time);
        if (!ret)
-               i2c_dev->setup->rise_time = rise_time;
+               i2c_dev->setup.rise_time = rise_time;
 
        ret = device_property_read_u32(i2c_dev->dev, "i2c-scl-falling-time-ns",
                                       &fall_time);
        if (!ret)
-               i2c_dev->setup->fall_time = fall_time;
+               i2c_dev->setup.fall_time = fall_time;
 
-       ret = stm32f7_i2c_setup_timing(i2c_dev, i2c_dev->setup);
+       ret = stm32f7_i2c_setup_timing(i2c_dev, &i2c_dev->setup);
        if (ret)
                goto clk_free;
 
index 01b2adf..eaf39e5 100644 (file)
@@ -1451,6 +1451,7 @@ int ide_host_register(struct ide_host *host, const struct ide_port_info *d,
                if (hwif_init(hwif) == 0) {
                        printk(KERN_INFO "%s: failed to initialize IDE "
                                         "interface\n", hwif->name);
+                       device_unregister(hwif->portdev);
                        device_unregister(&hwif->gendev);
                        ide_disable_port(hwif);
                        continue;
index 86aa88a..acf8748 100644 (file)
@@ -56,6 +56,7 @@ static int __init ide_scan_pcidev(struct pci_dev *dev)
 {
        struct list_head *l;
        struct pci_driver *d;
+       int ret;
 
        list_for_each(l, &ide_pci_drivers) {
                d = list_entry(l, struct pci_driver, node);
@@ -63,10 +64,14 @@ static int __init ide_scan_pcidev(struct pci_dev *dev)
                        const struct pci_device_id *id =
                                pci_match_id(d->id_table, dev);
 
-                       if (id != NULL && d->probe(dev, id) >= 0) {
-                               dev->driver = d;
-                               pci_dev_get(dev);
-                               return 1;
+                       if (id != NULL) {
+                               pci_assign_irq(dev);
+                               ret = d->probe(dev, id);
+                               if (ret >= 0) {
+                                       dev->driver = d;
+                                       pci_dev_get(dev);
+                                       return 1;
+                               }
                        }
                }
        }
index 112d2fe..fdc8e81 100644 (file)
@@ -179,6 +179,7 @@ EXPORT_SYMBOL_GPL(ide_setup_pci_noise);
 /**
  *     ide_pci_enable  -       do PCI enables
  *     @dev: PCI device
+ *     @bars: PCI BARs mask
  *     @d: IDE port info
  *
  *     Enable the IDE PCI device. We attempt to enable the device in full
@@ -189,9 +190,10 @@ EXPORT_SYMBOL_GPL(ide_setup_pci_noise);
  *     Returns zero on success or an error code
  */
 
-static int ide_pci_enable(struct pci_dev *dev, const struct ide_port_info *d)
+static int ide_pci_enable(struct pci_dev *dev, int bars,
+                         const struct ide_port_info *d)
 {
-       int ret, bars;
+       int ret;
 
        if (pci_enable_device(dev)) {
                ret = pci_enable_device_io(dev);
@@ -216,18 +218,6 @@ static int ide_pci_enable(struct pci_dev *dev, const struct ide_port_info *d)
                goto out;
        }
 
-       if (d->host_flags & IDE_HFLAG_SINGLE)
-               bars = (1 << 2) - 1;
-       else
-               bars = (1 << 4) - 1;
-
-       if ((d->host_flags & IDE_HFLAG_NO_DMA) == 0) {
-               if (d->host_flags & IDE_HFLAG_CS5520)
-                       bars |= (1 << 2);
-               else
-                       bars |= (1 << 4);
-       }
-
        ret = pci_request_selected_regions(dev, bars, d->name);
        if (ret < 0)
                printk(KERN_ERR "%s %s: can't reserve resources\n",
@@ -403,6 +393,7 @@ int ide_hwif_setup_dma(ide_hwif_t *hwif, const struct ide_port_info *d)
 /**
  *     ide_setup_pci_controller        -       set up IDE PCI
  *     @dev: PCI device
+ *     @bars: PCI BARs mask
  *     @d: IDE port info
  *     @noisy: verbose flag
  *
@@ -411,7 +402,7 @@ int ide_hwif_setup_dma(ide_hwif_t *hwif, const struct ide_port_info *d)
  *     and enables it if need be
  */
 
-static int ide_setup_pci_controller(struct pci_dev *dev,
+static int ide_setup_pci_controller(struct pci_dev *dev, int bars,
                                    const struct ide_port_info *d, int noisy)
 {
        int ret;
@@ -420,7 +411,7 @@ static int ide_setup_pci_controller(struct pci_dev *dev,
        if (noisy)
                ide_setup_pci_noise(dev, d);
 
-       ret = ide_pci_enable(dev, d);
+       ret = ide_pci_enable(dev, bars, d);
        if (ret < 0)
                goto out;
 
@@ -428,16 +419,20 @@ static int ide_setup_pci_controller(struct pci_dev *dev,
        if (ret < 0) {
                printk(KERN_ERR "%s %s: error accessing PCI regs\n",
                        d->name, pci_name(dev));
-               goto out;
+               goto out_free_bars;
        }
        if (!(pcicmd & PCI_COMMAND_IO)) {       /* is device disabled? */
                ret = ide_pci_configure(dev, d);
                if (ret < 0)
-                       goto out;
+                       goto out_free_bars;
                printk(KERN_INFO "%s %s: device enabled (Linux)\n",
                        d->name, pci_name(dev));
        }
 
+       goto out;
+
+out_free_bars:
+       pci_release_selected_regions(dev, bars);
 out:
        return ret;
 }
@@ -540,13 +535,28 @@ int ide_pci_init_two(struct pci_dev *dev1, struct pci_dev *dev2,
 {
        struct pci_dev *pdev[] = { dev1, dev2 };
        struct ide_host *host;
-       int ret, i, n_ports = dev2 ? 4 : 2;
+       int ret, i, n_ports = dev2 ? 4 : 2, bars;
        struct ide_hw hw[4], *hws[] = { NULL, NULL, NULL, NULL };
 
+       if (d->host_flags & IDE_HFLAG_SINGLE)
+               bars = (1 << 2) - 1;
+       else
+               bars = (1 << 4) - 1;
+
+       if ((d->host_flags & IDE_HFLAG_NO_DMA) == 0) {
+               if (d->host_flags & IDE_HFLAG_CS5520)
+                       bars |= (1 << 2);
+               else
+                       bars |= (1 << 4);
+       }
+
        for (i = 0; i < n_ports / 2; i++) {
-               ret = ide_setup_pci_controller(pdev[i], d, !i);
-               if (ret < 0)
+               ret = ide_setup_pci_controller(pdev[i], bars, d, !i);
+               if (ret < 0) {
+                       if (i == 1)
+                               pci_release_selected_regions(pdev[0], bars);
                        goto out;
+               }
 
                ide_pci_setup_ports(pdev[i], d, &hw[i*2], &hws[i*2]);
        }
@@ -554,7 +564,7 @@ int ide_pci_init_two(struct pci_dev *dev1, struct pci_dev *dev2,
        host = ide_host_alloc(d, hws, n_ports);
        if (host == NULL) {
                ret = -ENOMEM;
-               goto out;
+               goto out_free_bars;
        }
 
        host->dev[0] = &dev1->dev;
@@ -576,7 +586,7 @@ int ide_pci_init_two(struct pci_dev *dev1, struct pci_dev *dev2,
                 * do_ide_setup_pci_device() on the first device!
                 */
                if (ret < 0)
-                       goto out;
+                       goto out_free_bars;
 
                /* fixup IRQ */
                if (ide_pci_is_in_compatibility_mode(pdev[i])) {
@@ -589,6 +599,13 @@ int ide_pci_init_two(struct pci_dev *dev1, struct pci_dev *dev2,
        ret = ide_host_register(host, d, hws);
        if (ret)
                ide_host_free(host);
+       else
+               goto out;
+
+out_free_bars:
+       i = n_ports / 2;
+       while (i--)
+               pci_release_selected_regions(pdev[i], bars);
 out:
        return ret;
 }
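
The setup-pci changes hoist the BAR-mask computation out of ide_pci_enable() into ide_pci_init_two(), so the same mask feeds both pci_request_selected_regions() and the new pci_release_selected_regions() error paths. A standalone sketch of that mask computation; the flag names below are invented stand-ins for the IDE_HFLAG_* bits:

#include <stdio.h>

#define HFLAG_SINGLE  (1 << 0)
#define HFLAG_NO_DMA  (1 << 1)
#define HFLAG_CS5520  (1 << 2)

/* BARs 0-1 for single-channel controllers, BARs 0-3 otherwise, plus the
 * DMA BAR when DMA is used (BAR 2 for CS5520-style chips, BAR 4 for the rest). */
static int compute_bars(unsigned int host_flags)
{
        int bars;

        if (host_flags & HFLAG_SINGLE)
                bars = (1 << 2) - 1;            /* 0b0011 */
        else
                bars = (1 << 4) - 1;            /* 0b1111 */

        if (!(host_flags & HFLAG_NO_DMA)) {
                if (host_flags & HFLAG_CS5520)
                        bars |= (1 << 2);
                else
                        bars |= (1 << 4);
        }
        return bars;
}

int main(void)
{
        printf("%#x %#x\n", compute_bars(0), compute_bars(HFLAG_SINGLE));
        return 0;
}
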
index 30825bb..8861c05 100644 (file)
@@ -100,6 +100,8 @@ int iwpm_register_pid(struct iwpm_dev_data *pm_msg, u8 nl_client)
        if (ret)
                goto pid_query_error;
 
+       nlmsg_end(skb, nlh);
+
        pr_debug("%s: Multicasting a nlmsg (dev = %s ifname = %s iwpm = %s)\n",
                __func__, pm_msg->dev_name, pm_msg->if_name, iwpm_ulib_name);
 
@@ -170,6 +172,8 @@ int iwpm_add_mapping(struct iwpm_sa_data *pm_msg, u8 nl_client)
                                &pm_msg->loc_addr, IWPM_NLA_MANAGE_ADDR);
        if (ret)
                goto add_mapping_error;
+
+       nlmsg_end(skb, nlh);
        nlmsg_request->req_buffer = pm_msg;
 
        ret = rdma_nl_unicast_wait(skb, iwpm_user_pid);
@@ -246,6 +250,8 @@ int iwpm_add_and_query_mapping(struct iwpm_sa_data *pm_msg, u8 nl_client)
                                &pm_msg->rem_addr, IWPM_NLA_QUERY_REMOTE_ADDR);
        if (ret)
                goto query_mapping_error;
+
+       nlmsg_end(skb, nlh);
        nlmsg_request->req_buffer = pm_msg;
 
        ret = rdma_nl_unicast_wait(skb, iwpm_user_pid);
@@ -308,6 +314,8 @@ int iwpm_remove_mapping(struct sockaddr_storage *local_addr, u8 nl_client)
        if (ret)
                goto remove_mapping_error;
 
+       nlmsg_end(skb, nlh);
+
        ret = rdma_nl_unicast_wait(skb, iwpm_user_pid);
        if (ret) {
                skb = NULL; /* skb is freed in the netlink send-op handling */
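
The iwpm hunks call nlmsg_end() on each message before it is unicast or multicast, so the netlink header's total-length field reflects every attribute that was appended. A toy illustration of the append-then-finalize-length pattern, using plain structs rather than the netlink API:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct msg_hdr { uint32_t len; };

struct msg {
        struct msg_hdr hdr;
        unsigned char payload[60];
};

int main(void)
{
        struct msg m;
        size_t off = 0;
        const char attr[] = "mapping";

        memcpy(m.payload + off, attr, sizeof(attr));
        off += sizeof(attr);

        /* "nlmsg_end": only now is the total length valid; the hunks above
         * add exactly this finalize step before the message is sent. */
        m.hdr.len = (uint32_t)(sizeof(m.hdr) + off);

        printf("total length %u\n", (unsigned)m.hdr.len);
        return 0;
}
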
index c81c559..3c4faad 100644 (file)
@@ -597,6 +597,9 @@ static int send_mapinfo_num(u32 mapping_num, u8 nl_client, int iwpm_pid)
                                &mapping_num, IWPM_NLA_MAPINFO_SEND_NUM);
        if (ret)
                goto mapinfo_num_error;
+
+       nlmsg_end(skb, nlh);
+
        ret = rdma_nl_unicast(skb, iwpm_pid);
        if (ret) {
                skb = NULL;
@@ -678,6 +681,8 @@ int iwpm_send_mapinfo(u8 nl_client, int iwpm_pid)
                        if (ret)
                                goto send_mapping_info_unlock;
 
+                       nlmsg_end(skb, nlh);
+
                        iwpm_print_sockaddr(&map_info->local_sockaddr,
                                "send_mapping_info: Local sockaddr:");
                        iwpm_print_sockaddr(&map_info->mapped_sockaddr,
index d1f5345..42ca534 100644 (file)
@@ -48,7 +48,7 @@
  * @wqe: cqp wqe for header
  * @header: header for the cqp wqe
  */
-static inline void i40iw_insert_wqe_hdr(u64 *wqe, u64 header)
+void i40iw_insert_wqe_hdr(u64 *wqe, u64 header)
 {
        wmb();            /* make sure WQE is populated before polarity is set */
        set_64bit_val(wqe, 24, header);
index e217a12..5498ad0 100644 (file)
@@ -59,6 +59,8 @@ enum i40iw_status_code i40iw_sc_mr_fast_register(struct i40iw_sc_qp *qp,
                                                 struct i40iw_fast_reg_stag_info *info,
                                                 bool post_sq);
 
+void i40iw_insert_wqe_hdr(u64 *wqe, u64 header);
+
 /* HMC/FPM functions */
 enum i40iw_status_code i40iw_sc_init_iw_hmc(struct i40iw_sc_dev *dev,
                                            u8 hmc_fn_id);
index c2cab20..59f7067 100644 (file)
@@ -123,12 +123,11 @@ static void i40iw_puda_post_recvbuf(struct i40iw_puda_rsrc *rsrc, u32 wqe_idx,
                get_64bit_val(wqe, 24, &offset24);
 
        offset24 = (offset24) ? 0 : LS_64(1, I40IWQPSQ_VALID);
-       set_64bit_val(wqe, 24, offset24);
 
        set_64bit_val(wqe, 0, buf->mem.pa);
        set_64bit_val(wqe, 8,
                      LS_64(buf->mem.size, I40IWQPSQ_FRAG_LEN));
-       set_64bit_val(wqe, 24, offset24);
+       i40iw_insert_wqe_hdr(wqe, offset24);
 }
 
 /**
@@ -409,9 +408,7 @@ enum i40iw_status_code i40iw_puda_send(struct i40iw_sc_qp *qp,
        set_64bit_val(wqe, 8, LS_64(info->len, I40IWQPSQ_FRAG_LEN));
        set_64bit_val(wqe, 16, header[0]);
 
-       /* Ensure all data is written before writing valid bit */
-       wmb();
-       set_64bit_val(wqe, 24, header[1]);
+       i40iw_insert_wqe_hdr(wqe, header[1]);
 
        i40iw_debug_buf(qp->dev, I40IW_DEBUG_PUDA, "PUDA SEND WQE", wqe, 32);
        i40iw_qp_post_wr(&qp->qp_uk);
@@ -539,7 +536,7 @@ static enum i40iw_status_code i40iw_puda_qp_wqe(struct i40iw_sc_dev *dev, struct
                 LS_64(2, I40IW_CQPSQ_QP_NEXTIWSTATE) |
                 LS_64(cqp->polarity, I40IW_CQPSQ_WQEVALID);
 
-       set_64bit_val(wqe, 24, header);
+       i40iw_insert_wqe_hdr(wqe, header);
 
        i40iw_debug_buf(cqp->dev, I40IW_DEBUG_PUDA, "PUDA CQE", wqe, 32);
        i40iw_sc_cqp_post_sq(cqp);
@@ -655,7 +652,7 @@ static enum i40iw_status_code i40iw_puda_cq_wqe(struct i40iw_sc_dev *dev, struct
            LS_64(1, I40IW_CQPSQ_CQ_ENCEQEMASK) |
            LS_64(1, I40IW_CQPSQ_CQ_CEQIDVALID) |
            LS_64(cqp->polarity, I40IW_CQPSQ_WQEVALID);
-       set_64bit_val(wqe, 24, header);
+       i40iw_insert_wqe_hdr(wqe, header);
 
        i40iw_debug_buf(dev, I40IW_DEBUG_PUDA, "PUDA CQE",
                        wqe, I40IW_CQP_WQE_SIZE * 8);
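
The i40iw patches funnel every header write through i40iw_insert_wqe_hdr(), which puts a write barrier between filling the WQE payload and storing the header word that carries the valid bit, so the hardware can never observe a valid header over stale payload. A loose userspace analogue using a C11 release store in place of wmb(); the structure layout and names here are invented:

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

/* Toy "WQE": payload words plus a header word whose top bit is the valid flag. */
struct wqe {
        uint64_t data[3];
        _Atomic uint64_t header;
};

#define WQE_VALID (1ull << 63)

/* Publish pattern: fill the payload first, then release-store the header so
 * a consumer that acquire-loads the header also sees the payload writes. */
static void insert_wqe_hdr(struct wqe *w, uint64_t header)
{
        atomic_store_explicit(&w->header, header | WQE_VALID,
                              memory_order_release);
}

int main(void)
{
        struct wqe w = { .data = {1, 2, 3} };

        insert_wqe_hdr(&w, 0x42);
        printf("header = 0x%llx\n",
               (unsigned long long)atomic_load(&w.header));
        return 0;
}
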
index 28b3d02..62be0a4 100644 (file)
@@ -826,12 +826,14 @@ static int i40iw_query_qp(struct ib_qp *ibqp,
        attr->cap.max_inline_data = I40IW_MAX_INLINE_DATA_SIZE;
        attr->cap.max_send_sge = I40IW_MAX_WQ_FRAGMENT_COUNT;
        attr->cap.max_recv_sge = I40IW_MAX_WQ_FRAGMENT_COUNT;
+       attr->port_num = 1;
        init_attr->event_handler = iwqp->ibqp.event_handler;
        init_attr->qp_context = iwqp->ibqp.qp_context;
        init_attr->send_cq = iwqp->ibqp.send_cq;
        init_attr->recv_cq = iwqp->ibqp.recv_cq;
        init_attr->srq = iwqp->ibqp.srq;
        init_attr->cap = attr->cap;
+       init_attr->port_num = 1;
        return 0;
 }
 
index d6fbad8..552f7bd 100644 (file)
@@ -4174,9 +4174,9 @@ err_bfreg:
 err_uar_page:
        mlx5_put_uars_page(dev->mdev, dev->mdev->priv.uar);
 
-err_cnt:
-       mlx5_ib_cleanup_cong_debugfs(dev);
 err_cong:
+       mlx5_ib_cleanup_cong_debugfs(dev);
+err_cnt:
        if (MLX5_CAP_GEN(dev->mdev, max_qp_cnt))
                mlx5_ib_dealloc_counters(dev);
 
index b2bb42e..254083b 100644 (file)
@@ -387,7 +387,7 @@ struct qedr_qp {
                u8 wqe_size;
 
                u8 smac[ETH_ALEN];
-               u16 vlan_id;
+               u16 vlan;
                int rc;
        } *rqe_wr_id;
 
index 4689e80..ad89653 100644 (file)
@@ -105,7 +105,7 @@ void qedr_ll2_complete_rx_packet(void *cxt,
 
        qp->rqe_wr_id[qp->rq.gsi_cons].rc = data->u.data_length_error ?
                -EINVAL : 0;
-       qp->rqe_wr_id[qp->rq.gsi_cons].vlan_id = data->vlan;
+       qp->rqe_wr_id[qp->rq.gsi_cons].vlan = data->vlan;
        /* note: length stands for data length i.e. GRH is excluded */
        qp->rqe_wr_id[qp->rq.gsi_cons].sg_list[0].length =
                data->length.data_length;
@@ -694,6 +694,7 @@ int qedr_gsi_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc)
        struct qedr_cq *cq = get_qedr_cq(ibcq);
        struct qedr_qp *qp = dev->gsi_qp;
        unsigned long flags;
+       u16 vlan_id;
        int i = 0;
 
        spin_lock_irqsave(&cq->cq_lock, flags);
@@ -712,9 +713,14 @@ int qedr_gsi_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc)
                wc[i].wc_flags |= IB_WC_GRH | IB_WC_IP_CSUM_OK;
                ether_addr_copy(wc[i].smac, qp->rqe_wr_id[qp->rq.cons].smac);
                wc[i].wc_flags |= IB_WC_WITH_SMAC;
-               if (qp->rqe_wr_id[qp->rq.cons].vlan_id) {
+
+               vlan_id = qp->rqe_wr_id[qp->rq.cons].vlan &
+                         VLAN_VID_MASK;
+               if (vlan_id) {
                        wc[i].wc_flags |= IB_WC_WITH_VLAN;
-                       wc[i].vlan_id = qp->rqe_wr_id[qp->rq.cons].vlan_id;
+                       wc[i].vlan_id = vlan_id;
+                       wc[i].sl = (qp->rqe_wr_id[qp->rq.cons].vlan &
+                                   VLAN_PRIO_MASK) >> VLAN_PRIO_SHIFT;
                }
 
                qedr_inc_sw_cons(&qp->rq);
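
The qedr hunks keep the full 16-bit VLAN TCI from the completion and split it into VID and priority when filling the work completion. The masking and shifting is the standard 802.1Q layout; a self-contained example:

#include <stdint.h>
#include <stdio.h>

#define VLAN_VID_MASK   0x0fff
#define VLAN_PRIO_MASK  0xe000
#define VLAN_PRIO_SHIFT 13

int main(void)
{
        uint16_t tci  = 0x6064;         /* priority 3, VID 100 */
        uint16_t vid  = tci & VLAN_VID_MASK;
        uint16_t prio = (tci & VLAN_PRIO_MASK) >> VLAN_PRIO_SHIFT;

        printf("vid=%u prio=%u\n", (unsigned)vid, (unsigned)prio);
        return 0;
}
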
index 7d5286b..1841d03 100644 (file)
@@ -64,7 +64,7 @@ EXPORT_SYMBOL(closure_put);
 void __closure_wake_up(struct closure_waitlist *wait_list)
 {
        struct llist_node *list;
-       struct closure *cl;
+       struct closure *cl, *t;
        struct llist_node *reverse = NULL;
 
        list = llist_del_all(&wait_list->list);
@@ -73,7 +73,7 @@ void __closure_wake_up(struct closure_waitlist *wait_list)
        reverse = llist_reverse_order(list);
 
        /* Then do the wakeups */
-       llist_for_each_entry(cl, reverse, list) {
+       llist_for_each_entry_safe(cl, t, reverse, list) {
                closure_set_waiting(cl, 0);
                closure_sub(cl, CLOSURE_WAITING + 1);
        }
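
The closure fix switches to the _safe list iterator because closure_sub() may free the element while the walk is still in progress. The same hazard, and the fix of caching the next pointer before the loop body runs, shown on a plain singly linked list:

#include <stdio.h>
#include <stdlib.h>

struct node {
        struct node *next;
        int id;
};

int main(void)
{
        /* Build a small list: 2 -> 1 -> 0. */
        struct node *head = NULL;
        for (int i = 0; i < 3; i++) {
                struct node *n = malloc(sizeof(*n));
                n->id = i;
                n->next = head;
                head = n;
        }

        /* Safe traversal: grab ->next before the body, so freeing the current
         * node (what the wakeup callback can trigger) is harmless. */
        for (struct node *n = head, *t; n; n = t) {
                t = n->next;
                printf("waking %d\n", n->id);
                free(n);
        }
        return 0;
}
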
index 5dba23c..dc9bc18 100644 (file)
@@ -219,8 +219,17 @@ int cxllib_handle_fault(struct mm_struct *mm, u64 addr, u64 size, u64 flags)
 
        down_read(&mm->mmap_sem);
 
-       for (dar = addr; dar < addr + size; dar += page_size) {
-               if (!vma || dar < vma->vm_start || dar > vma->vm_end) {
+       vma = find_vma(mm, addr);
+       if (!vma) {
+               pr_err("Can't find vma for addr %016llx\n", addr);
+               rc = -EFAULT;
+               goto out;
+       }
+       /* get the size of the pages allocated */
+       page_size = vma_kernel_pagesize(vma);
+
+       for (dar = (addr & ~(page_size - 1)); dar < (addr + size); dar += page_size) {
+               if (dar < vma->vm_start || dar >= vma->vm_end) {
                        vma = find_vma(mm, addr);
                        if (!vma) {
                                pr_err("Can't find vma for addr %016llx\n", addr);
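
The cxllib change looks up the VMA once, takes its page size, and rounds the start address down to a page boundary before walking the range. The rounding is the usual power-of-two mask trick:

#include <stdint.h>
#include <stdio.h>

/* Round an address down to the start of its page; page_size must be a
 * power of two for the mask trick to be valid. */
static uint64_t page_align_down(uint64_t addr, uint64_t page_size)
{
        return addr & ~(page_size - 1);
}

int main(void)
{
        printf("%#llx\n",
               (unsigned long long)page_align_down(0x12345678, 0x10000));
        return 0;
}
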
index 29fc1e6..2ad7b5c 100644 (file)
@@ -1634,8 +1634,6 @@ static void mmc_blk_data_prep(struct mmc_queue *mq, struct mmc_queue_req *mqrq,
        }
 
        mqrq->areq.mrq = &brq->mrq;
-
-       mmc_queue_bounce_pre(mqrq);
 }
 
 static void mmc_blk_rw_rq_prep(struct mmc_queue_req *mqrq,
@@ -1829,7 +1827,6 @@ static void mmc_blk_issue_rw_rq(struct mmc_queue *mq, struct request *new_req)
                brq = &mq_rq->brq;
                old_req = mmc_queue_req_to_req(mq_rq);
                type = rq_data_dir(old_req) == READ ? MMC_BLK_READ : MMC_BLK_WRITE;
-               mmc_queue_bounce_post(mq_rq);
 
                switch (status) {
                case MMC_BLK_SUCCESS:
index a7eb623..36217ad 100644 (file)
@@ -1286,6 +1286,23 @@ out_err:
        return err;
 }
 
+static void mmc_select_driver_type(struct mmc_card *card)
+{
+       int card_drv_type, drive_strength, drv_type;
+
+       card_drv_type = card->ext_csd.raw_driver_strength |
+                       mmc_driver_type_mask(0);
+
+       drive_strength = mmc_select_drive_strength(card,
+                                                  card->ext_csd.hs200_max_dtr,
+                                                  card_drv_type, &drv_type);
+
+       card->drive_strength = drive_strength;
+
+       if (drv_type)
+               mmc_set_driver_type(card->host, drv_type);
+}
+
 static int mmc_select_hs400es(struct mmc_card *card)
 {
        struct mmc_host *host = card->host;
@@ -1341,6 +1358,8 @@ static int mmc_select_hs400es(struct mmc_card *card)
                goto out_err;
        }
 
+       mmc_select_driver_type(card);
+
        /* Switch card to HS400 */
        val = EXT_CSD_TIMING_HS400 |
              card->drive_strength << EXT_CSD_DRV_STR_SHIFT;
@@ -1374,23 +1393,6 @@ out_err:
        return err;
 }
 
-static void mmc_select_driver_type(struct mmc_card *card)
-{
-       int card_drv_type, drive_strength, drv_type;
-
-       card_drv_type = card->ext_csd.raw_driver_strength |
-                       mmc_driver_type_mask(0);
-
-       drive_strength = mmc_select_drive_strength(card,
-                                                  card->ext_csd.hs200_max_dtr,
-                                                  card_drv_type, &drv_type);
-
-       card->drive_strength = drive_strength;
-
-       if (drv_type)
-               mmc_set_driver_type(card->host, drv_type);
-}
-
 /*
  * For device supporting HS200 mode, the following sequence
  * should be done before executing the tuning process.
index 74c663b..0a4e77a 100644 (file)
@@ -23,8 +23,6 @@
 #include "core.h"
 #include "card.h"
 
-#define MMC_QUEUE_BOUNCESZ     65536
-
 /*
  * Prepare a MMC request. This just filters out odd stuff.
  */
@@ -150,26 +148,6 @@ static void mmc_queue_setup_discard(struct request_queue *q,
                queue_flag_set_unlocked(QUEUE_FLAG_SECERASE, q);
 }
 
-static unsigned int mmc_queue_calc_bouncesz(struct mmc_host *host)
-{
-       unsigned int bouncesz = MMC_QUEUE_BOUNCESZ;
-
-       if (host->max_segs != 1 || (host->caps & MMC_CAP_NO_BOUNCE_BUFF))
-               return 0;
-
-       if (bouncesz > host->max_req_size)
-               bouncesz = host->max_req_size;
-       if (bouncesz > host->max_seg_size)
-               bouncesz = host->max_seg_size;
-       if (bouncesz > host->max_blk_count * 512)
-               bouncesz = host->max_blk_count * 512;
-
-       if (bouncesz <= 512)
-               return 0;
-
-       return bouncesz;
-}
-
 /**
  * mmc_init_request() - initialize the MMC-specific per-request data
  * @q: the request queue
@@ -184,26 +162,9 @@ static int mmc_init_request(struct request_queue *q, struct request *req,
        struct mmc_card *card = mq->card;
        struct mmc_host *host = card->host;
 
-       if (card->bouncesz) {
-               mq_rq->bounce_buf = kmalloc(card->bouncesz, gfp);
-               if (!mq_rq->bounce_buf)
-                       return -ENOMEM;
-               if (card->bouncesz > 512) {
-                       mq_rq->sg = mmc_alloc_sg(1, gfp);
-                       if (!mq_rq->sg)
-                               return -ENOMEM;
-                       mq_rq->bounce_sg = mmc_alloc_sg(card->bouncesz / 512,
-                                                       gfp);
-                       if (!mq_rq->bounce_sg)
-                               return -ENOMEM;
-               }
-       } else {
-               mq_rq->bounce_buf = NULL;
-               mq_rq->bounce_sg = NULL;
-               mq_rq->sg = mmc_alloc_sg(host->max_segs, gfp);
-               if (!mq_rq->sg)
-                       return -ENOMEM;
-       }
+       mq_rq->sg = mmc_alloc_sg(host->max_segs, gfp);
+       if (!mq_rq->sg)
+               return -ENOMEM;
 
        return 0;
 }
@@ -212,13 +173,6 @@ static void mmc_exit_request(struct request_queue *q, struct request *req)
 {
        struct mmc_queue_req *mq_rq = req_to_mmc_queue_req(req);
 
-       /* It is OK to kfree(NULL) so this will be smooth */
-       kfree(mq_rq->bounce_sg);
-       mq_rq->bounce_sg = NULL;
-
-       kfree(mq_rq->bounce_buf);
-       mq_rq->bounce_buf = NULL;
-
        kfree(mq_rq->sg);
        mq_rq->sg = NULL;
 }
@@ -242,12 +196,6 @@ int mmc_init_queue(struct mmc_queue *mq, struct mmc_card *card,
        if (mmc_dev(host)->dma_mask && *mmc_dev(host)->dma_mask)
                limit = (u64)dma_max_pfn(mmc_dev(host)) << PAGE_SHIFT;
 
-       /*
-        * mmc_init_request() depends on card->bouncesz so it must be calculated
-        * before blk_init_allocated_queue() starts allocating requests.
-        */
-       card->bouncesz = mmc_queue_calc_bouncesz(host);
-
        mq->card = card;
        mq->queue = blk_alloc_queue(GFP_KERNEL);
        if (!mq->queue)
@@ -271,17 +219,11 @@ int mmc_init_queue(struct mmc_queue *mq, struct mmc_card *card,
        if (mmc_can_erase(card))
                mmc_queue_setup_discard(mq->queue, card);
 
-       if (card->bouncesz) {
-               blk_queue_max_hw_sectors(mq->queue, card->bouncesz / 512);
-               blk_queue_max_segments(mq->queue, card->bouncesz / 512);
-               blk_queue_max_segment_size(mq->queue, card->bouncesz);
-       } else {
-               blk_queue_bounce_limit(mq->queue, limit);
-               blk_queue_max_hw_sectors(mq->queue,
-                       min(host->max_blk_count, host->max_req_size / 512));
-               blk_queue_max_segments(mq->queue, host->max_segs);
-               blk_queue_max_segment_size(mq->queue, host->max_seg_size);
-       }
+       blk_queue_bounce_limit(mq->queue, limit);
+       blk_queue_max_hw_sectors(mq->queue,
+               min(host->max_blk_count, host->max_req_size / 512));
+       blk_queue_max_segments(mq->queue, host->max_segs);
+       blk_queue_max_segment_size(mq->queue, host->max_seg_size);
 
        sema_init(&mq->thread_sem, 1);
 
@@ -370,56 +312,7 @@ void mmc_queue_resume(struct mmc_queue *mq)
  */
 unsigned int mmc_queue_map_sg(struct mmc_queue *mq, struct mmc_queue_req *mqrq)
 {
-       unsigned int sg_len;
-       size_t buflen;
-       struct scatterlist *sg;
        struct request *req = mmc_queue_req_to_req(mqrq);
-       int i;
-
-       if (!mqrq->bounce_buf)
-               return blk_rq_map_sg(mq->queue, req, mqrq->sg);
-
-       sg_len = blk_rq_map_sg(mq->queue, req, mqrq->bounce_sg);
-
-       mqrq->bounce_sg_len = sg_len;
-
-       buflen = 0;
-       for_each_sg(mqrq->bounce_sg, sg, sg_len, i)
-               buflen += sg->length;
-
-       sg_init_one(mqrq->sg, mqrq->bounce_buf, buflen);
-
-       return 1;
-}
-
-/*
- * If writing, bounce the data to the buffer before the request
- * is sent to the host driver
- */
-void mmc_queue_bounce_pre(struct mmc_queue_req *mqrq)
-{
-       if (!mqrq->bounce_buf)
-               return;
-
-       if (rq_data_dir(mmc_queue_req_to_req(mqrq)) != WRITE)
-               return;
-
-       sg_copy_to_buffer(mqrq->bounce_sg, mqrq->bounce_sg_len,
-               mqrq->bounce_buf, mqrq->sg[0].length);
-}
-
-/*
- * If reading, bounce the data from the buffer after the request
- * has been handled by the host driver
- */
-void mmc_queue_bounce_post(struct mmc_queue_req *mqrq)
-{
-       if (!mqrq->bounce_buf)
-               return;
-
-       if (rq_data_dir(mmc_queue_req_to_req(mqrq)) != READ)
-               return;
 
-       sg_copy_from_buffer(mqrq->bounce_sg, mqrq->bounce_sg_len,
-               mqrq->bounce_buf, mqrq->sg[0].length);
+       return blk_rq_map_sg(mq->queue, req, mqrq->sg);
 }
index 04fc893..f18d3f6 100644 (file)
@@ -49,9 +49,6 @@ enum mmc_drv_op {
 struct mmc_queue_req {
        struct mmc_blk_request  brq;
        struct scatterlist      *sg;
-       char                    *bounce_buf;
-       struct scatterlist      *bounce_sg;
-       unsigned int            bounce_sg_len;
        struct mmc_async_req    areq;
        enum mmc_drv_op         drv_op;
        int                     drv_op_result;
@@ -81,11 +78,8 @@ extern int mmc_init_queue(struct mmc_queue *, struct mmc_card *, spinlock_t *,
 extern void mmc_cleanup_queue(struct mmc_queue *);
 extern void mmc_queue_suspend(struct mmc_queue *);
 extern void mmc_queue_resume(struct mmc_queue *);
-
 extern unsigned int mmc_queue_map_sg(struct mmc_queue *,
                                     struct mmc_queue_req *);
-extern void mmc_queue_bounce_pre(struct mmc_queue_req *);
-extern void mmc_queue_bounce_post(struct mmc_queue_req *);
 
 extern int mmc_access_rpmb(struct mmc_queue *);
 
index 27fb625..fbd29f0 100644 (file)
@@ -1038,7 +1038,7 @@ int cvm_mmc_of_slot_probe(struct device *dev, struct cvm_mmc_host *host)
         */
        mmc->caps |= MMC_CAP_MMC_HIGHSPEED | MMC_CAP_SD_HIGHSPEED |
                     MMC_CAP_ERASE | MMC_CAP_CMD23 | MMC_CAP_POWER_OFF_CARD |
-                    MMC_CAP_3_3V_DDR | MMC_CAP_NO_BOUNCE_BUFF;
+                    MMC_CAP_3_3V_DDR;
 
        if (host->use_sg)
                mmc->max_segs = 16;
index c885c2d..85745ef 100644 (file)
@@ -531,8 +531,7 @@ static int meson_mmc_clk_init(struct meson_host *host)
        div->shift = __ffs(CLK_DIV_MASK);
        div->width = __builtin_popcountl(CLK_DIV_MASK);
        div->hw.init = &init;
-       div->flags = (CLK_DIVIDER_ONE_BASED |
-                     CLK_DIVIDER_ROUND_CLOSEST);
+       div->flags = CLK_DIVIDER_ONE_BASED;
 
        clk = devm_clk_register(host->dev, &div->hw);
        if (WARN_ON(IS_ERR(clk)))
@@ -717,6 +716,22 @@ static int meson_mmc_clk_phase_tuning(struct mmc_host *mmc, u32 opcode,
 static int meson_mmc_execute_tuning(struct mmc_host *mmc, u32 opcode)
 {
        struct meson_host *host = mmc_priv(mmc);
+       int ret;
+
+       /*
+        * If this is the initial tuning, try to get a sane Rx starting
+        * phase before doing the actual tuning.
+        */
+       if (!mmc->doing_retune) {
+               ret = meson_mmc_clk_phase_tuning(mmc, opcode, host->rx_clk);
+
+               if (ret)
+                       return ret;
+       }
+
+       ret = meson_mmc_clk_phase_tuning(mmc, opcode, host->tx_clk);
+       if (ret)
+               return ret;
 
        return meson_mmc_clk_phase_tuning(mmc, opcode, host->rx_clk);
 }
@@ -746,6 +761,11 @@ static void meson_mmc_set_ios(struct mmc_host *mmc, struct mmc_ios *ios)
        case MMC_POWER_UP:
                if (!IS_ERR(mmc->supply.vmmc))
                        mmc_regulator_set_ocr(mmc, mmc->supply.vmmc, ios->vdd);
+
+               /* Reset phases */
+               clk_set_phase(host->rx_clk, 0);
+               clk_set_phase(host->tx_clk, 270);
+
                break;
 
        case MMC_POWER_ON:
@@ -759,8 +779,6 @@ static void meson_mmc_set_ios(struct mmc_host *mmc, struct mmc_ios *ios)
                                host->vqmmc_enabled = true;
                }
 
-               /* Reset rx phase */
-               clk_set_phase(host->rx_clk, 0);
                break;
        }
 
index 59ab194..c763b40 100644 (file)
@@ -702,11 +702,7 @@ static int pxamci_probe(struct platform_device *pdev)
 
        pxamci_init_ocr(host);
 
-       /*
-        * This architecture used to disable bounce buffers through its
-        * defconfig, now it is done at runtime as a host property.
-        */
-       mmc->caps = MMC_CAP_NO_BOUNCE_BUFF;
+       mmc->caps = 0;
        host->cmdat = 0;
        if (!cpu_is_pxa25x()) {
                mmc->caps |= MMC_CAP_4_BIT_DATA | MMC_CAP_SDIO_IRQ;
index 2eec2e6..0842bbc 100644 (file)
@@ -466,6 +466,7 @@ static int xenon_probe(struct platform_device *pdev)
 {
        struct sdhci_pltfm_host *pltfm_host;
        struct sdhci_host *host;
+       struct xenon_priv *priv;
        int err;
 
        host = sdhci_pltfm_init(pdev, &sdhci_xenon_pdata,
@@ -474,6 +475,7 @@ static int xenon_probe(struct platform_device *pdev)
                return PTR_ERR(host);
 
        pltfm_host = sdhci_priv(host);
+       priv = sdhci_pltfm_priv(pltfm_host);
 
        /*
         * Link Xenon specific mmc_host_ops function,
@@ -491,9 +493,20 @@ static int xenon_probe(struct platform_device *pdev)
        if (err)
                goto free_pltfm;
 
+       priv->axi_clk = devm_clk_get(&pdev->dev, "axi");
+       if (IS_ERR(priv->axi_clk)) {
+               err = PTR_ERR(priv->axi_clk);
+               if (err == -EPROBE_DEFER)
+                       goto err_clk;
+       } else {
+               err = clk_prepare_enable(priv->axi_clk);
+               if (err)
+                       goto err_clk;
+       }
+
        err = mmc_of_parse(host->mmc);
        if (err)
-               goto err_clk;
+               goto err_clk_axi;
 
        sdhci_get_of_property(pdev);
 
@@ -502,11 +515,11 @@ static int xenon_probe(struct platform_device *pdev)
        /* Xenon specific dt parse */
        err = xenon_probe_dt(pdev);
        if (err)
-               goto err_clk;
+               goto err_clk_axi;
 
        err = xenon_sdhc_prepare(host);
        if (err)
-               goto err_clk;
+               goto err_clk_axi;
 
        pm_runtime_get_noresume(&pdev->dev);
        pm_runtime_set_active(&pdev->dev);
@@ -527,6 +540,8 @@ remove_sdhc:
        pm_runtime_disable(&pdev->dev);
        pm_runtime_put_noidle(&pdev->dev);
        xenon_sdhc_unprepare(host);
+err_clk_axi:
+       clk_disable_unprepare(priv->axi_clk);
 err_clk:
        clk_disable_unprepare(pltfm_host->clk);
 free_pltfm:
@@ -538,6 +553,7 @@ static int xenon_remove(struct platform_device *pdev)
 {
        struct sdhci_host *host = platform_get_drvdata(pdev);
        struct sdhci_pltfm_host *pltfm_host = sdhci_priv(host);
+       struct xenon_priv *priv = sdhci_pltfm_priv(pltfm_host);
 
        pm_runtime_get_sync(&pdev->dev);
        pm_runtime_disable(&pdev->dev);
@@ -546,7 +562,7 @@ static int xenon_remove(struct platform_device *pdev)
        sdhci_remove_host(host, 0);
 
        xenon_sdhc_unprepare(host);
-
+       clk_disable_unprepare(priv->axi_clk);
        clk_disable_unprepare(pltfm_host->clk);
 
        sdhci_pltfm_free(pdev);
index 2bc0510..9994995 100644 (file)
@@ -83,6 +83,7 @@ struct xenon_priv {
        unsigned char   bus_width;
        unsigned char   timing;
        unsigned int    clock;
+       struct clk      *axi_clk;
 
        int             phy_type;
        /*
index 9ca994d..3591077 100644 (file)
@@ -1074,11 +1074,6 @@ static void bnx2x_vf_set_bars(struct bnx2x *bp, struct bnx2x_virtf *vf)
        }
 }
 
-static int bnx2x_ari_enabled(struct pci_dev *dev)
-{
-       return dev->bus->self && dev->bus->self->ari_enabled;
-}
-
 static int
 bnx2x_get_vf_igu_cam_info(struct bnx2x *bp)
 {
@@ -1212,7 +1207,7 @@ int bnx2x_iov_init_one(struct bnx2x *bp, int int_mode_param,
 
        err = -EIO;
        /* verify ari is enabled */
-       if (!bnx2x_ari_enabled(bp->pdev)) {
+       if (!pci_ari_enabled(bp->pdev->bus)) {
                BNX2X_ERR("ARI not supported (check pci bridge ARI forwarding), SRIOV can not be enabled\n");
                return 0;
        }
index 4f0cb8e..457201f 100644 (file)
@@ -1,3 +1,4 @@
 obj-$(CONFIG_BNXT) += bnxt_en.o
 
-bnxt_en-y := bnxt.o bnxt_sriov.o bnxt_ethtool.o bnxt_dcb.o bnxt_ulp.o bnxt_xdp.o bnxt_vfr.o bnxt_tc.o
+bnxt_en-y := bnxt.o bnxt_sriov.o bnxt_ethtool.o bnxt_dcb.o bnxt_ulp.o bnxt_xdp.o bnxt_vfr.o
+bnxt_en-$(CONFIG_BNXT_FLOWER_OFFLOAD) += bnxt_tc.o
index 7dd3d13..4730c04 100644 (file)
@@ -23,8 +23,6 @@
 #include "bnxt_tc.h"
 #include "bnxt_vfr.h"
 
-#ifdef CONFIG_BNXT_FLOWER_OFFLOAD
-
 #define BNXT_FID_INVALID                       0xffff
 #define VLAN_TCI(vid, prio)    ((vid) | ((prio) << VLAN_PRIO_SHIFT))
 
@@ -833,6 +831,3 @@ void bnxt_shutdown_tc(struct bnxt *bp)
        rhashtable_destroy(&tc_info->flow_table);
        rhashtable_destroy(&tc_info->l2_table);
 }
-
-#else
-#endif
index d68478a..71989e1 100644 (file)
@@ -566,8 +566,10 @@ static inline bool nicvf_xdp_rx(struct nicvf *nic, struct bpf_prog *prog,
                return true;
        default:
                bpf_warn_invalid_xdp_action(action);
+               /* fall through */
        case XDP_ABORTED:
                trace_xdp_exception(nic->netdev, prog, action);
+               /* fall through */
        case XDP_DROP:
                /* Check if it's a recycled page, if not
                 * unmap the DMA mapping.
index b65ce26..b3fd1f4 100644 (file)
@@ -8205,7 +8205,7 @@ struct flash_desc {
        u32 size_mb;
 };
 
-static int get_flash_params(struct adapter *adap)
+static int t4_get_flash_params(struct adapter *adap)
 {
        /* Table for non-Numonix supported flash parts.  Numonix parts are left
         * to the preexisting code.  All flash parts have 64KB sectors.
@@ -8214,40 +8214,136 @@ static int get_flash_params(struct adapter *adap)
                { 0x150201, 4 << 20 },       /* Spansion 4MB S25FL032P */
        };
 
+       unsigned int part, manufacturer;
+       unsigned int density, size;
+       u32 flashid = 0;
        int ret;
-       u32 info;
+
+       /* Issue a Read ID Command to the Flash part.  We decode supported
+        * Flash parts and their sizes from this.  There's a newer Query
+        * Command which can retrieve detailed geometry information but many
+        * Flash parts don't support it.
+        */
 
        ret = sf1_write(adap, 1, 1, 0, SF_RD_ID);
        if (!ret)
-               ret = sf1_read(adap, 3, 0, 1, &info);
+               ret = sf1_read(adap, 3, 0, 1, &flashid);
        t4_write_reg(adap, SF_OP_A, 0);                    /* unlock SF */
        if (ret)
                return ret;
 
-       for (ret = 0; ret < ARRAY_SIZE(supported_flash); ++ret)
-               if (supported_flash[ret].vendor_and_model_id == info) {
-                       adap->params.sf_size = supported_flash[ret].size_mb;
+       /* Check to see if it's one of our non-standard supported Flash parts.
+        */
+       for (part = 0; part < ARRAY_SIZE(supported_flash); part++)
+               if (supported_flash[part].vendor_and_model_id == flashid) {
+                       adap->params.sf_size = supported_flash[part].size_mb;
                        adap->params.sf_nsec =
                                adap->params.sf_size / SF_SEC_SIZE;
-                       return 0;
+                       goto found;
                }
 
-       if ((info & 0xff) != 0x20)             /* not a Numonix flash */
+       /* Decode Flash part size.  The code below looks repetitive with
+        * common encodings, but that's not guaranteed in the JEDEC
+        * specification for the Read JEDEC ID command.  The only thing that
+        * we're guaranteed by the JEDEC specification is where the
+        * Manufacturer ID is in the returned result.  After that each
+        * Manufacturer ~could~ encode things completely differently.
+        * Note, all Flash parts must have 64KB sectors.
+        */
+       manufacturer = flashid & 0xff;
+       switch (manufacturer) {
+       case 0x20: { /* Micron/Numonix */
+               /* This Density -> Size decoding table is taken from Micron
+                * Data Sheets.
+                */
+               density = (flashid >> 16) & 0xff;
+               switch (density) {
+               case 0x14: /* 1MB */
+                       size = 1 << 20;
+                       break;
+               case 0x15: /* 2MB */
+                       size = 1 << 21;
+                       break;
+               case 0x16: /* 4MB */
+                       size = 1 << 22;
+                       break;
+               case 0x17: /* 8MB */
+                       size = 1 << 23;
+                       break;
+               case 0x18: /* 16MB */
+                       size = 1 << 24;
+                       break;
+               case 0x19: /* 32MB */
+                       size = 1 << 25;
+                       break;
+               case 0x20: /* 64MB */
+                       size = 1 << 26;
+                       break;
+               case 0x21: /* 128MB */
+                       size = 1 << 27;
+                       break;
+               case 0x22: /* 256MB */
+                       size = 1 << 28;
+                       break;
+
+               default:
+                       dev_err(adap->pdev_dev, "Micron Flash Part has bad size, ID = %#x, Density code = %#x\n",
+                               flashid, density);
                return -EINVAL;
-       info >>= 16;                           /* log2 of size */
-       if (info >= 0x14 && info < 0x18)
-               adap->params.sf_nsec = 1 << (info - 16);
-       else if (info == 0x18)
-               adap->params.sf_nsec = 64;
-       else
+               }
+               break;
+       }
+       case 0xc2: { /* Macronix */
+               /* This Density -> Size decoding table is taken from Macronix
+                * Data Sheets.
+                */
+               density = (flashid >> 16) & 0xff;
+               switch (density) {
+               case 0x17: /* 8MB */
+                       size = 1 << 23;
+                       break;
+               case 0x18: /* 16MB */
+                       size = 1 << 24;
+                       break;
+               default:
+                       dev_err(adap->pdev_dev, "Macronix Flash Part has bad size, ID = %#x, Density code = %#x\n",
+                               flashid, density);
+               return -EINVAL;
+               }
+       }
+       case 0xef: { /* Winbond */
+               /* This Density -> Size decoding table is taken from Winbond
+                * Data Sheets.
+                */
+               density = (flashid >> 16) & 0xff;
+               switch (density) {
+               case 0x17: /* 8MB */
+                       size = 1 << 23;
+                       break;
+               case 0x18: /* 16MB */
+                       size = 1 << 24;
+                       break;
+               default:
+                       dev_err(adap->pdev_dev, "Winbond Flash Part has bad size, ID = %#x, Density code = %#x\n",
+                               flashid, density);
                return -EINVAL;
-       adap->params.sf_size = 1 << info;
-       adap->params.sf_fw_start =
-               t4_read_reg(adap, CIM_BOOT_CFG_A) & BOOTADDR_M;
+               }
+               break;
+       }
+       default:
+               dev_err(adap->pdev_dev, "Unsupported Flash Part, ID = %#x\n",
+                       flashid);
+               return -EINVAL;
+       }
+
+       /* Store decoded Flash size and fall through into vetting code. */
+       adap->params.sf_size = size;
+       adap->params.sf_nsec = size / SF_SEC_SIZE;
 
+found:
        if (adap->params.sf_size < FLASH_MIN_SIZE)
-               dev_warn(adap->pdev_dev, "WARNING!!! FLASH size %#x < %#x!!!\n",
-                        adap->params.sf_size, FLASH_MIN_SIZE);
+               dev_warn(adap->pdev_dev, "WARNING: Flash Part ID %#x, size %#x < %#x\n",
+                        flashid, adap->params.sf_size, FLASH_MIN_SIZE);
        return 0;
 }
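
For the Micron/Macronix/Winbond tables above, the density byte of the JEDEC ID is close to a log2-of-size encoding but diverges per vendor (Micron jumps from 0x19 = 32MB to 0x20 = 64MB), which is why the driver spells each mapping out explicitly instead of shifting by the raw code. A minimal sketch of the decode-and-store step, restricted to the density codes visible in the hunk above; the helper name and return convention are illustrative, not the driver's:

    /* Map a JEDEC density byte to a flash size in bytes and derive the
     * number of 64KB erase sectors, the same arithmetic the driver does
     * with SF_SEC_SIZE once a vendor table matches.
     */
    static int flash_density_to_size(unsigned char density,
                                     unsigned int *size, unsigned int *nsec)
    {
            switch (density) {
            case 0x17: *size = 1u << 23; break;     /*   8MB               */
            case 0x18: *size = 1u << 24; break;     /*  16MB               */
            case 0x19: *size = 1u << 25; break;     /*  32MB, Micron only  */
            case 0x20: *size = 1u << 26; break;     /*  64MB, Micron only  */
            case 0x21: *size = 1u << 27; break;     /* 128MB, Micron only  */
            case 0x22: *size = 1u << 28; break;     /* 256MB, Micron only  */
            default:
                    return -1;                      /* unsupported part    */
            }
            *nsec = *size / (64 * 1024);            /* 64KB erase sectors  */
            return 0;
    }

For an 8MB part this yields sf_size = 0x800000 and sf_nsec = 128, which the vetting code after the found: label then checks against FLASH_MIN_SIZE.
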
 
@@ -8285,7 +8381,7 @@ int t4_prep_adapter(struct adapter *adapter)
        get_pci_mode(adapter, &adapter->params.pci);
        pl_rev = REV_G(t4_read_reg(adapter, PL_REV_A));
 
-       ret = get_flash_params(adapter);
+       ret = t4_get_flash_params(adapter);
        if (ret < 0) {
                dev_err(adapter->pdev_dev, "error %d identifying flash\n", ret);
                return ret;
index 633e975..8c22bb8 100644 (file)
@@ -181,6 +181,8 @@ CH_PCI_DEVICE_ID_TABLE_DEFINE_BEGIN
        CH_PCI_ID_TABLE_FENTRY(0x50a7), /* Custom T580-CR */
        CH_PCI_ID_TABLE_FENTRY(0x50a8), /* Custom T580-KR */
        CH_PCI_ID_TABLE_FENTRY(0x50a9), /* Custom T580-KR */
+       CH_PCI_ID_TABLE_FENTRY(0x50aa), /* Custom T580-CR */
+       CH_PCI_ID_TABLE_FENTRY(0x50ab), /* Custom T520-CR */
 
        /* T6 adapters:
         */
index 9d7cb03..30000b6 100644 (file)
@@ -78,7 +78,7 @@ config HNS_ENET
 
 config HNS3
        tristate "Hisilicon Network Subsystem Support HNS3 (Framework)"
-    depends on PCI
+       depends on PCI
        ---help---
          This selects the framework support for Hisilicon Network Subsystem 3.
          This layer facilitates clients like ENET, RoCE and user-space ethernet
@@ -87,7 +87,7 @@ config HNS3
 
 config HNS3_HCLGE
        tristate "Hisilicon HNS3 HCLGE Acceleration Engine & Compatibility Layer Support"
-    depends on PCI_MSI
+       depends on PCI_MSI
        depends on HNS3
        ---help---
          This selects the HNS3_HCLGE network acceleration engine & its hardware
@@ -96,7 +96,7 @@ config HNS3_HCLGE
 
 config HNS3_ENET
        tristate "Hisilicon HNS3 Ethernet Device Support"
-    depends on 64BIT && PCI
+       depends on 64BIT && PCI
        depends on HNS3 && HNS3_HCLGE
        ---help---
          This selects the Ethernet Driver for Hisilicon Network Subsystem 3 for hip08
index c677530..575f50d 100644 (file)
@@ -339,6 +339,10 @@ struct hnae3_ae_ops {
                       u8 *hfunc);
        int (*set_rss)(struct hnae3_handle *handle, const u32 *indir,
                       const u8 *key, const u8 hfunc);
+       int (*set_rss_tuple)(struct hnae3_handle *handle,
+                            struct ethtool_rxnfc *cmd);
+       int (*get_rss_tuple)(struct hnae3_handle *handle,
+                            struct ethtool_rxnfc *cmd);
 
        int (*get_tc_size)(struct hnae3_handle *handle);
 
index 8b511e6..60960e5 100644 (file)
@@ -85,6 +85,15 @@ static int hclge_init_cmd_queue(struct hclge_dev *hdev, int ring_type)
        return 0;
 }
 
+void hclge_cmd_reuse_desc(struct hclge_desc *desc, bool is_read)
+{
+       desc->flag = cpu_to_le16(HCLGE_CMD_FLAG_NO_INTR | HCLGE_CMD_FLAG_IN);
+       if (is_read)
+               desc->flag |= cpu_to_le16(HCLGE_CMD_FLAG_WR);
+       else
+               desc->flag &= cpu_to_le16(~HCLGE_CMD_FLAG_WR);
+}
+
 void hclge_cmd_setup_basic_desc(struct hclge_desc *desc,
                                enum hclge_opcode_type opcode, bool is_read)
 {
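
hclge_cmd_reuse_desc() recycles a descriptor that has just completed: it rewrites only the direction flags and leaves the opcode and data[] words intact, whereas hclge_cmd_setup_basic_desc() would wipe the whole descriptor. That is what enables the read-modify-write sequence used by hclge_set_rss_tuple() further down in this patch; a trimmed sketch of that usage, with the same function and flag names as the hunks here:

    /* Read the current RSS input-tuple config, patch it, and write it
     * back with the same descriptor (error handling trimmed).
     */
    hclge_cmd_setup_basic_desc(&desc, HCLGE_OPC_RSS_INPUT_TUPLE, true);
    ret = hclge_cmd_send(&hdev->hw, &desc, 1);      /* fills desc.data[]      */
    if (ret)
            return ret;

    hclge_cmd_reuse_desc(&desc, false);             /* keep data, now a write */
    /* ... modify the tuple fields in desc.data ... */
    ret = hclge_cmd_send(&hdev->hw, &desc, 1);      /* push it back to IMP    */
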
@@ -208,7 +217,7 @@ int hclge_cmd_send(struct hclge_hw *hw, struct hclge_desc *desc, int num)
         * which will be used for hardware to write back
         */
        ntc = hw->cmq.csq.next_to_use;
-       opcode = desc[0].opcode;
+       opcode = le16_to_cpu(desc[0].opcode);
        while (handle < num) {
                desc_to_use = &hw->cmq.csq.desc[hw->cmq.csq.next_to_use];
                *desc_to_use = desc[handle];
@@ -225,7 +234,7 @@ int hclge_cmd_send(struct hclge_hw *hw, struct hclge_desc *desc, int num)
         * If the command is sync, wait for the firmware to write back,
         * if multi descriptors to be sent, use the first one to check
         */
-       if (HCLGE_SEND_SYNC(desc->flag)) {
+       if (HCLGE_SEND_SYNC(le16_to_cpu(desc->flag))) {
                do {
                        if (hclge_cmd_csq_done(hw))
                                break;
@@ -244,9 +253,9 @@ int hclge_cmd_send(struct hclge_hw *hw, struct hclge_desc *desc, int num)
                        pr_debug("Get cmd desc:\n");
 
                        if (likely(!hclge_is_special_opcode(opcode)))
-                               desc_ret = desc[handle].retval;
+                               desc_ret = le16_to_cpu(desc[handle].retval);
                        else
-                               desc_ret = desc[0].retval;
+                               desc_ret = le16_to_cpu(desc[0].retval);
 
                        if ((enum hclge_cmd_return_status)desc_ret ==
                            HCLGE_CMD_EXEC_SUCCESS)
@@ -276,15 +285,15 @@ int hclge_cmd_send(struct hclge_hw *hw, struct hclge_desc *desc, int num)
        return retval;
 }
 
-enum hclge_cmd_status hclge_cmd_query_firmware_version(struct hclge_hw *hw,
-                                                      u32 *version)
+static enum hclge_cmd_status hclge_cmd_query_firmware_version(
+               struct hclge_hw *hw, u32 *version)
 {
-       struct hclge_query_version *resp;
+       struct hclge_query_version_cmd *resp;
        struct hclge_desc desc;
        int ret;
 
        hclge_cmd_setup_basic_desc(&desc, HCLGE_OPC_QUERY_FW_VER, 1);
-       resp = (struct hclge_query_version *)desc.data;
+       resp = (struct hclge_query_version_cmd *)desc.data;
 
        ret = hclge_cmd_send(hw, &desc, 1);
        if (!ret)
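
The le16_to_cpu()/le32_to_cpu() additions in this file all follow one rule: the descriptor fields (flag, opcode, retval, data[]) are little-endian as exchanged with the IMP firmware, so they are converted exactly once at the boundary and handled in CPU order everywhere else. On little-endian hosts the old code happened to work; on big-endian it would mis-read opcode and retval. A minimal sketch of the pattern, assuming the struct hclge_desc layout used throughout the driver:

    __le32 wire = desc.data[0];             /* raw word as written by firmware */
    u32 val = le32_to_cpu(wire);            /* convert once, on load           */

    val |= BIT(3);                          /* manipulate in CPU byte order    */
    desc.data[0] = cpu_to_le32(val);        /* convert once, on store          */
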
index 6b6d28e..b437334 100644 (file)
@@ -221,12 +221,12 @@ enum hclge_opcode_type {
 #define HCLGE_RCB_INIT_QUERY_TIMEOUT   10
 #define HCLGE_RCB_INIT_FLAG_EN_B       0
 #define HCLGE_RCB_INIT_FLAG_FINI_B     8
-struct hclge_config_rcb_init {
+struct hclge_config_rcb_init_cmd {
        __le16 rcb_init_flag;
        u8 rsv[22];
 };
 
-struct hclge_tqp_map {
+struct hclge_tqp_map_cmd {
        __le16 tqp_id;  /* Absolute tqp id for in this pf */
        u8 tqp_vf;      /* VF id */
 #define HCLGE_TQP_MAP_TYPE_PF          0
@@ -246,15 +246,15 @@ enum hclge_int_type {
        HCLGE_INT_EVENT,
 };
 
-struct hclge_ctrl_vector_chain {
+struct hclge_ctrl_vector_chain_cmd {
        u8 int_vector_id;
        u8 int_cause_num;
 #define HCLGE_INT_TYPE_S       0
-#define HCLGE_INT_TYPE_M       0x3
+#define HCLGE_INT_TYPE_M       GENMASK(1, 0)
 #define HCLGE_TQP_ID_S         2
-#define HCLGE_TQP_ID_M         (0x7ff << HCLGE_TQP_ID_S)
+#define HCLGE_TQP_ID_M         GENMASK(12, 2)
 #define HCLGE_INT_GL_IDX_S     13
-#define HCLGE_INT_GL_IDX_M     (0x3 << HCLGE_INT_GL_IDX_S)
+#define HCLGE_INT_GL_IDX_M     GENMASK(14, 13)
        __le16 tqp_type_and_id[HCLGE_VECTOR_ELEMENTS_PER_CMD];
        u8 vfid;
        u8 rsv;
@@ -263,18 +263,18 @@ struct hclge_ctrl_vector_chain {
 #define HCLGE_TC_NUM           8
 #define HCLGE_TC0_PRI_BUF_EN_B 15 /* Bit 15 indicate enable or not */
 #define HCLGE_BUF_UNIT_S       7  /* Buf size is united by 128 bytes */
-struct hclge_tx_buff_alloc {
+struct hclge_tx_buff_alloc_cmd {
        __le16 tx_pkt_buff[HCLGE_TC_NUM];
        u8 tx_buff_rsv[8];
 };
 
-struct hclge_rx_priv_buff {
+struct hclge_rx_priv_buff_cmd {
        __le16 buf_num[HCLGE_TC_NUM];
        __le16 shared_buf;
        u8 rsv[6];
 };
 
-struct hclge_query_version {
+struct hclge_query_version_cmd {
        __le32 firmware;
        __le32 firmware_rsv[5];
 };
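
The mask conversions in this header are mechanical: GENMASK(h, l) produces a contiguous mask over bits l..h, so each new constant is bit-for-bit identical to the shifted literal it replaces while documenting the field width against its matching *_S shift. Spot checks for three of the conversions above, wrapped in a throwaway helper purely for illustration:

    /* Compile-time spot checks; GENMASK(h, l) covers bits l..h inclusive. */
    static inline void hclge_mask_spot_checks(void)
    {
            BUILD_BUG_ON(GENMASK(1, 0)   != 0x3);            /* HCLGE_INT_TYPE_M   */
            BUILD_BUG_ON(GENMASK(12, 2)  != (0x7ff << 2));   /* HCLGE_TQP_ID_M     */
            BUILD_BUG_ON(GENMASK(14, 13) != (0x3 << 13));    /* HCLGE_INT_GL_IDX_M */
    }
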
@@ -328,14 +328,14 @@ struct hclge_pkt_buf_alloc {
 };
 
 #define HCLGE_RX_COM_WL_EN_B   15
-struct hclge_rx_com_wl_buf {
+struct hclge_rx_com_wl_buf_cmd {
        __le16 high_wl;
        __le16 low_wl;
        u8 rsv[20];
 };
 
 #define HCLGE_RX_PKT_EN_B      15
-struct hclge_rx_pkt_buf {
+struct hclge_rx_pkt_buf_cmd {
        __le16 high_pkt;
        __le16 low_pkt;
        u8 rsv[20];
@@ -348,7 +348,7 @@ struct hclge_rx_pkt_buf {
 #define HCLGE_PF_MAC_NUM_MASK  0x3
 #define HCLGE_PF_STATE_MAIN    BIT(HCLGE_PF_STATE_MAIN_B)
 #define HCLGE_PF_STATE_DONE    BIT(HCLGE_PF_STATE_DONE_B)
-struct hclge_func_status {
+struct hclge_func_status_cmd {
        __le32  vf_rst_state[4];
        u8 pf_state;
        u8 mac_id;
@@ -359,7 +359,7 @@ struct hclge_func_status {
        u8 rsv[2];
 };
 
-struct hclge_pf_res {
+struct hclge_pf_res_cmd {
        __le16 tqp_num;
        __le16 buf_size;
        __le16 msixcap_localid_ba_nic;
@@ -372,30 +372,30 @@ struct hclge_pf_res {
 };
 
 #define HCLGE_CFG_OFFSET_S     0
-#define HCLGE_CFG_OFFSET_M     0xfffff /* Byte (8-10.3) */
+#define HCLGE_CFG_OFFSET_M     GENMASK(19, 0)
 #define HCLGE_CFG_RD_LEN_S     24
-#define HCLGE_CFG_RD_LEN_M     (0xf << HCLGE_CFG_RD_LEN_S)
+#define HCLGE_CFG_RD_LEN_M     GENMASK(27, 24)
 #define HCLGE_CFG_RD_LEN_BYTES 16
 #define HCLGE_CFG_RD_LEN_UNIT  4
 
 #define HCLGE_CFG_VMDQ_S       0
-#define HCLGE_CFG_VMDQ_M       (0xff << HCLGE_CFG_VMDQ_S)
+#define HCLGE_CFG_VMDQ_M       GENMASK(7, 0)
 #define HCLGE_CFG_TC_NUM_S     8
-#define HCLGE_CFG_TC_NUM_M     (0xff << HCLGE_CFG_TC_NUM_S)
+#define HCLGE_CFG_TC_NUM_M     GENMASK(15, 8)
 #define HCLGE_CFG_TQP_DESC_N_S 16
-#define HCLGE_CFG_TQP_DESC_N_M (0xffff << HCLGE_CFG_TQP_DESC_N_S)
+#define HCLGE_CFG_TQP_DESC_N_M GENMASK(31, 16)
 #define HCLGE_CFG_PHY_ADDR_S   0
-#define HCLGE_CFG_PHY_ADDR_M   (0x1f << HCLGE_CFG_PHY_ADDR_S)
+#define HCLGE_CFG_PHY_ADDR_M   GENMASK(4, 0)
 #define HCLGE_CFG_MEDIA_TP_S   8
-#define HCLGE_CFG_MEDIA_TP_M   (0xff << HCLGE_CFG_MEDIA_TP_S)
+#define HCLGE_CFG_MEDIA_TP_M   GENMASK(15, 8)
 #define HCLGE_CFG_RX_BUF_LEN_S 16
-#define HCLGE_CFG_RX_BUF_LEN_M (0xffff << HCLGE_CFG_RX_BUF_LEN_S)
+#define HCLGE_CFG_RX_BUF_LEN_M GENMASK(31, 16)
 #define HCLGE_CFG_MAC_ADDR_H_S 0
-#define HCLGE_CFG_MAC_ADDR_H_M (0xffff << HCLGE_CFG_MAC_ADDR_H_S)
+#define HCLGE_CFG_MAC_ADDR_H_M GENMASK(15, 0)
 #define HCLGE_CFG_DEFAULT_SPEED_S      16
-#define HCLGE_CFG_DEFAULT_SPEED_M      (0xff << HCLGE_CFG_DEFAULT_SPEED_S)
+#define HCLGE_CFG_DEFAULT_SPEED_M      GENMASK(23, 16)
 
-struct hclge_cfg_param {
+struct hclge_cfg_param_cmd {
        __le32 offset;
        __le32 rsv;
        __le32 param[4];
@@ -405,7 +405,7 @@ struct hclge_cfg_param {
 #define HCLGE_DESC_NUM         0x40
 
 #define HCLGE_ALLOC_VALID_B    0
-struct hclge_vf_num {
+struct hclge_vf_num_cmd {
        u8 alloc_valid;
        u8 rsv[23];
 };
@@ -413,13 +413,13 @@ struct hclge_vf_num {
 #define HCLGE_RSS_DEFAULT_OUTPORT_B    4
 #define HCLGE_RSS_HASH_KEY_OFFSET_B    4
 #define HCLGE_RSS_HASH_KEY_NUM         16
-struct hclge_rss_config {
+struct hclge_rss_config_cmd {
        u8 hash_config;
        u8 rsv[7];
        u8 hash_key[HCLGE_RSS_HASH_KEY_NUM];
 };
 
-struct hclge_rss_input_tuple {
+struct hclge_rss_input_tuple_cmd {
        u8 ipv4_tcp_en;
        u8 ipv4_udp_en;
        u8 ipv4_sctp_en;
@@ -433,26 +433,26 @@ struct hclge_rss_input_tuple {
 
 #define HCLGE_RSS_CFG_TBL_SIZE 16
 
-struct hclge_rss_indirection_table {
-       u16 start_table_index;
-       u16 rss_set_bitmap;
+struct hclge_rss_indirection_table_cmd {
+       __le16 start_table_index;
+       __le16 rss_set_bitmap;
        u8 rsv[4];
        u8 rss_result[HCLGE_RSS_CFG_TBL_SIZE];
 };
 
 #define HCLGE_RSS_TC_OFFSET_S          0
-#define HCLGE_RSS_TC_OFFSET_M          (0x3ff << HCLGE_RSS_TC_OFFSET_S)
+#define HCLGE_RSS_TC_OFFSET_M          GENMASK(9, 0)
 #define HCLGE_RSS_TC_SIZE_S            12
-#define HCLGE_RSS_TC_SIZE_M            (0x7 << HCLGE_RSS_TC_SIZE_S)
+#define HCLGE_RSS_TC_SIZE_M            GENMASK(14, 12)
 #define HCLGE_RSS_TC_VALID_B           15
-struct hclge_rss_tc_mode {
-       u16 rss_tc_mode[HCLGE_MAX_TC_NUM];
+struct hclge_rss_tc_mode_cmd {
+       __le16 rss_tc_mode[HCLGE_MAX_TC_NUM];
        u8 rsv[8];
 };
 
 #define HCLGE_LINK_STS_B       0
 #define HCLGE_LINK_STATUS      BIT(HCLGE_LINK_STS_B)
-struct hclge_link_status {
+struct hclge_link_status_cmd {
        u8 status;
        u8 rsv[23];
 };
@@ -467,7 +467,7 @@ struct hclge_promisc_param {
 #define HCLGE_PROMISC_EN_UC    0x1
 #define HCLGE_PROMISC_EN_MC    0x2
 #define HCLGE_PROMISC_EN_BC    0x4
-struct hclge_promisc_cfg {
+struct hclge_promisc_cfg_cmd {
        u8 flag;
        u8 vf_id;
        __le16 rsv0;
@@ -495,18 +495,18 @@ enum hclge_promisc_type {
 #define HCLGE_MAC_TX_UNDER_MIN_ERR_B           21
 #define HCLGE_MAC_TX_OVERSIZE_TRUNCATE_B       22
 
-struct hclge_config_mac_mode {
+struct hclge_config_mac_mode_cmd {
        __le32 txrx_pad_fcs_loop_en;
        u8 rsv[20];
 };
 
 #define HCLGE_CFG_SPEED_S              0
-#define HCLGE_CFG_SPEED_M              (0x3f << HCLGE_CFG_SPEED_S)
+#define HCLGE_CFG_SPEED_M              GENMASK(5, 0)
 
 #define HCLGE_CFG_DUPLEX_B             7
 #define HCLGE_CFG_DUPLEX_M             BIT(HCLGE_CFG_DUPLEX_B)
 
-struct hclge_config_mac_speed_dup {
+struct hclge_config_mac_speed_dup_cmd {
        u8 speed_dup;
 
 #define HCLGE_CFG_MAC_SPEED_CHANGE_EN_B        0
@@ -518,17 +518,17 @@ struct hclge_config_mac_speed_dup {
 #define HCLGE_QUERY_AN_B               0
 #define HCLGE_QUERY_DUPLEX_B           2
 
-#define HCLGE_QUERY_SPEED_M            (0x1f << HCLGE_QUERY_SPEED_S)
+#define HCLGE_QUERY_SPEED_M            GENMASK(4, 0)
 #define HCLGE_QUERY_AN_M               BIT(HCLGE_QUERY_AN_B)
 #define HCLGE_QUERY_DUPLEX_M           BIT(HCLGE_QUERY_DUPLEX_B)
 
-struct hclge_query_an_speed_dup {
+struct hclge_query_an_speed_dup_cmd {
        u8 an_syn_dup_speed;
        u8 pause;
        u8 rsv[23];
 };
 
-#define HCLGE_RING_ID_MASK             0x3ff
+#define HCLGE_RING_ID_MASK             GENMASK(9, 0)
 #define HCLGE_TQP_ENABLE_B             0
 
 #define HCLGE_MAC_CFG_AN_EN_B          0
@@ -539,7 +539,7 @@ struct hclge_query_an_speed_dup {
 
 #define HCLGE_MAC_CFG_AN_EN    BIT(HCLGE_MAC_CFG_AN_EN_B)
 
-struct hclge_config_auto_neg {
+struct hclge_config_auto_neg_cmd {
        __le32  cfg_an_cmd_flag;
        u8      rsv[20];
 };
@@ -548,7 +548,7 @@ struct hclge_config_auto_neg {
 #define HCLGE_MAC_MAX_MTU              9728
 #define HCLGE_MAC_UPLINK_PORT          0x100
 
-struct hclge_config_max_frm_size {
+struct hclge_config_max_frm_size_cmd {
        __le16  max_frm_size;
        u8      rsv[22];
 };
@@ -565,10 +565,10 @@ enum hclge_mac_vlan_tbl_opcode {
 #define HCLGE_MAC_EPORT_SW_EN_B                0xc
 #define HCLGE_MAC_EPORT_TYPE_B         0xb
 #define HCLGE_MAC_EPORT_VFID_S         0x3
-#define HCLGE_MAC_EPORT_VFID_M         (0xff << HCLGE_MAC_EPORT_VFID_S)
+#define HCLGE_MAC_EPORT_VFID_M         GENMASK(10, 3)
 #define HCLGE_MAC_EPORT_PFID_S         0x0
-#define HCLGE_MAC_EPORT_PFID_M         (0x7 << HCLGE_MAC_EPORT_PFID_S)
-struct hclge_mac_vlan_tbl_entry {
+#define HCLGE_MAC_EPORT_PFID_M         GENMASK(2, 0)
+struct hclge_mac_vlan_tbl_entry_cmd {
        u8      flags;
        u8      resp_code;
        __le16  vlan_tag;
@@ -583,15 +583,15 @@ struct hclge_mac_vlan_tbl_entry {
 };
 
 #define HCLGE_CFG_MTA_MAC_SEL_S                0x0
-#define HCLGE_CFG_MTA_MAC_SEL_M                (0x3 << HCLGE_CFG_MTA_MAC_SEL_S)
+#define HCLGE_CFG_MTA_MAC_SEL_M                GENMASK(1, 0)
 #define HCLGE_CFG_MTA_MAC_EN_B         0x7
-struct hclge_mta_filter_mode {
+struct hclge_mta_filter_mode_cmd {
        u8      dmac_sel_en; /* Use lowest 2 bit as sel_mode, bit 7 as enable */
        u8      rsv[23];
 };
 
 #define HCLGE_CFG_FUNC_MTA_ACCEPT_B    0x0
-struct hclge_cfg_func_mta_filter {
+struct hclge_cfg_func_mta_filter_cmd {
        u8      accept; /* Only used lowest 1 bit */
        u8      function_id;
        u8      rsv[22];
@@ -599,14 +599,14 @@ struct hclge_cfg_func_mta_filter {
 
 #define HCLGE_CFG_MTA_ITEM_ACCEPT_B    0x0
 #define HCLGE_CFG_MTA_ITEM_IDX_S       0x0
-#define HCLGE_CFG_MTA_ITEM_IDX_M       (0xfff << HCLGE_CFG_MTA_ITEM_IDX_S)
-struct hclge_cfg_func_mta_item {
-       u16     item_idx; /* Only used lowest 12 bit */
+#define HCLGE_CFG_MTA_ITEM_IDX_M       GENMASK(11, 0)
+struct hclge_cfg_func_mta_item_cmd {
+       __le16  item_idx; /* Only used lowest 12 bit */
        u8      accept;   /* Only used lowest 1 bit */
        u8      rsv[21];
 };
 
-struct hclge_mac_vlan_add {
+struct hclge_mac_vlan_add_cmd {
        __le16  flags;
        __le16  mac_addr_hi16;
        __le32  mac_addr_lo32;
@@ -619,7 +619,7 @@ struct hclge_mac_vlan_add {
 };
 
 #define HNS3_MAC_VLAN_CFG_FLAG_BIT 0
-struct hclge_mac_vlan_remove {
+struct hclge_mac_vlan_remove_cmd {
        __le16  flags;
        __le16  mac_addr_hi16;
        __le32  mac_addr_lo32;
@@ -631,21 +631,21 @@ struct hclge_mac_vlan_remove {
        u8      rsv[4];
 };
 
-struct hclge_vlan_filter_ctrl {
+struct hclge_vlan_filter_ctrl_cmd {
        u8 vlan_type;
        u8 vlan_fe;
        u8 rsv[22];
 };
 
-struct hclge_vlan_filter_pf_cfg {
+struct hclge_vlan_filter_pf_cfg_cmd {
        u8 vlan_offset;
        u8 vlan_cfg;
        u8 rsv[2];
        u8 vlan_offset_bitmap[20];
 };
 
-struct hclge_vlan_filter_vf_cfg {
-       u16 vlan_id;
+struct hclge_vlan_filter_vf_cfg_cmd {
+       __le16 vlan_id;
        u8  resp_code;
        u8  rsv;
        u8  vlan_cfg;
@@ -653,14 +653,14 @@ struct hclge_vlan_filter_vf_cfg {
        u8  vf_bitmap[16];
 };
 
-struct hclge_cfg_com_tqp_queue {
+struct hclge_cfg_com_tqp_queue_cmd {
        __le16 tqp_id;
        __le16 stream_id;
        u8 enable;
        u8 rsv[19];
 };
 
-struct hclge_cfg_tx_queue_pointer {
+struct hclge_cfg_tx_queue_pointer_cmd {
        __le16 tqp_id;
        __le16 tx_tail;
        __le16 tx_head;
@@ -670,12 +670,12 @@ struct hclge_cfg_tx_queue_pointer {
 };
 
 #define HCLGE_TSO_MSS_MIN_S    0
-#define HCLGE_TSO_MSS_MIN_M    (0x3FFF << HCLGE_TSO_MSS_MIN_S)
+#define HCLGE_TSO_MSS_MIN_M    GENMASK(13, 0)
 
 #define HCLGE_TSO_MSS_MAX_S    16
-#define HCLGE_TSO_MSS_MAX_M    (0x3FFF << HCLGE_TSO_MSS_MAX_S)
+#define HCLGE_TSO_MSS_MAX_M    GENMASK(29, 16)
 
-struct hclge_cfg_tso_status {
+struct hclge_cfg_tso_status_cmd {
        __le16 tso_mss_min;
        __le16 tso_mss_max;
        u8 rsv[20];
@@ -685,7 +685,7 @@ struct hclge_cfg_tso_status {
 #define HCLGE_TSO_MSS_MAX      9668
 
 #define HCLGE_TQP_RESET_B      0
-struct hclge_reset_tqp_queue {
+struct hclge_reset_tqp_queue_cmd {
        __le16 tqp_id;
        u8 reset_req;
        u8 ready_to_reset;
@@ -739,6 +739,7 @@ struct hclge_hw;
 int hclge_cmd_send(struct hclge_hw *hw, struct hclge_desc *desc, int num);
 void hclge_cmd_setup_basic_desc(struct hclge_desc *desc,
                                enum hclge_opcode_type opcode, bool is_read);
+void hclge_cmd_reuse_desc(struct hclge_desc *desc, bool is_read);
 
 int hclge_cmd_set_promisc_mode(struct hclge_dev *hdev,
                               struct hclge_promisc_param *param);
index 1a13614..c322b45 100644 (file)
@@ -362,7 +362,7 @@ static int hclge_64_bit_update_stats(struct hclge_dev *hdev)
 #define HCLGE_64_BIT_RTN_DATANUM 4
        u64 *data = (u64 *)(&hdev->hw_stats.all_64_bit_stats);
        struct hclge_desc desc[HCLGE_64_BIT_CMD_NUM];
-       u64 *desc_data;
+       __le64 *desc_data;
        int i, k, n;
        int ret;
 
@@ -376,14 +376,14 @@ static int hclge_64_bit_update_stats(struct hclge_dev *hdev)
 
        for (i = 0; i < HCLGE_64_BIT_CMD_NUM; i++) {
                if (unlikely(i == 0)) {
-                       desc_data = (u64 *)(&desc[i].data[0]);
+                       desc_data = (__le64 *)(&desc[i].data[0]);
                        n = HCLGE_64_BIT_RTN_DATANUM - 1;
                } else {
-                       desc_data = (u64 *)(&desc[i]);
+                       desc_data = (__le64 *)(&desc[i]);
                        n = HCLGE_64_BIT_RTN_DATANUM;
                }
                for (k = 0; k < n; k++) {
-                       *data++ += cpu_to_le64(*desc_data);
+                       *data++ += le64_to_cpu(*desc_data);
                        desc_data++;
                }
        }
@@ -411,7 +411,7 @@ static int hclge_32_bit_update_stats(struct hclge_dev *hdev)
 
        struct hclge_desc desc[HCLGE_32_BIT_CMD_NUM];
        struct hclge_32_bit_stats *all_32_bit_stats;
-       u32 *desc_data;
+       __le32 *desc_data;
        int i, k, n;
        u64 *data;
        int ret;
@@ -431,21 +431,27 @@ static int hclge_32_bit_update_stats(struct hclge_dev *hdev)
        hclge_reset_partial_32bit_counter(all_32_bit_stats);
        for (i = 0; i < HCLGE_32_BIT_CMD_NUM; i++) {
                if (unlikely(i == 0)) {
+                       __le16 *desc_data_16bit;
+
                        all_32_bit_stats->igu_rx_err_pkt +=
-                               cpu_to_le32(desc[i].data[0]);
+                               le32_to_cpu(desc[i].data[0]);
+
+                       desc_data_16bit = (__le16 *)&desc[i].data[1];
                        all_32_bit_stats->igu_rx_no_eof_pkt +=
-                               cpu_to_le32(desc[i].data[1] & 0xffff);
+                               le16_to_cpu(*desc_data_16bit);
+
+                       desc_data_16bit++;
                        all_32_bit_stats->igu_rx_no_sof_pkt +=
-                               cpu_to_le32((desc[i].data[1] >> 16) & 0xffff);
+                               le16_to_cpu(*desc_data_16bit);
 
-                       desc_data = (u32 *)(&desc[i].data[2]);
+                       desc_data = &desc[i].data[2];
                        n = HCLGE_32_BIT_RTN_DATANUM - 4;
                } else {
-                       desc_data = (u32 *)(&desc[i]);
+                       desc_data = (__le32 *)&desc[i];
                        n = HCLGE_32_BIT_RTN_DATANUM;
                }
                for (k = 0; k < n; k++) {
-                       *data++ += cpu_to_le32(*desc_data);
+                       *data++ += le32_to_cpu(*desc_data);
                        desc_data++;
                }
        }
@@ -460,7 +466,7 @@ static int hclge_mac_update_stats(struct hclge_dev *hdev)
 
        u64 *data = (u64 *)(&hdev->hw_stats.mac_stats);
        struct hclge_desc desc[HCLGE_MAC_CMD_NUM];
-       u64 *desc_data;
+       __le64 *desc_data;
        int i, k, n;
        int ret;
 
@@ -475,14 +481,14 @@ static int hclge_mac_update_stats(struct hclge_dev *hdev)
 
        for (i = 0; i < HCLGE_MAC_CMD_NUM; i++) {
                if (unlikely(i == 0)) {
-                       desc_data = (u64 *)(&desc[i].data[0]);
+                       desc_data = (__le64 *)(&desc[i].data[0]);
                        n = HCLGE_RTN_DATA_NUM - 2;
                } else {
-                       desc_data = (u64 *)(&desc[i]);
+                       desc_data = (__le64 *)(&desc[i]);
                        n = HCLGE_RTN_DATA_NUM;
                }
                for (k = 0; k < n; k++) {
-                       *data++ += cpu_to_le64(*desc_data);
+                       *data++ += le64_to_cpu(*desc_data);
                        desc_data++;
                }
        }
@@ -508,7 +514,7 @@ static int hclge_tqps_update_stats(struct hnae3_handle *handle)
                                           HCLGE_OPC_QUERY_RX_STATUS,
                                           true);
 
-               desc[0].data[0] = (tqp->index & 0x1ff);
+               desc[0].data[0] = cpu_to_le32((tqp->index & 0x1ff));
                ret = hclge_cmd_send(&hdev->hw, desc, 1);
                if (ret) {
                        dev_err(&hdev->pdev->dev,
@@ -517,7 +523,7 @@ static int hclge_tqps_update_stats(struct hnae3_handle *handle)
                        return ret;
                }
                tqp->tqp_stats.rcb_rx_ring_pktnum_rcd +=
-                       cpu_to_le32(desc[0].data[4]);
+                       le32_to_cpu(desc[0].data[4]);
        }
 
        for (i = 0; i < kinfo->num_tqps; i++) {
@@ -528,7 +534,7 @@ static int hclge_tqps_update_stats(struct hnae3_handle *handle)
                                           HCLGE_OPC_QUERY_TX_STATUS,
                                           true);
 
-               desc[0].data[0] = (tqp->index & 0x1ff);
+               desc[0].data[0] = cpu_to_le32((tqp->index & 0x1ff));
                ret = hclge_cmd_send(&hdev->hw, desc, 1);
                if (ret) {
                        dev_err(&hdev->pdev->dev,
@@ -537,7 +543,7 @@ static int hclge_tqps_update_stats(struct hnae3_handle *handle)
                        return ret;
                }
                tqp->tqp_stats.rcb_tx_ring_pktnum_rcd +=
-                       cpu_to_le32(desc[0].data[4]);
+                       le32_to_cpu(desc[0].data[4]);
        }
 
        return 0;
@@ -552,12 +558,12 @@ static u64 *hclge_tqps_get_stats(struct hnae3_handle *handle, u64 *data)
 
        for (i = 0; i < kinfo->num_tqps; i++) {
                tqp = container_of(kinfo->tqp[i], struct hclge_tqp, q);
-               *buff++ = cpu_to_le64(tqp->tqp_stats.rcb_tx_ring_pktnum_rcd);
+               *buff++ = tqp->tqp_stats.rcb_tx_ring_pktnum_rcd;
        }
 
        for (i = 0; i < kinfo->num_tqps; i++) {
                tqp = container_of(kinfo->tqp[i], struct hclge_tqp, q);
-               *buff++ = cpu_to_le64(tqp->tqp_stats.rcb_rx_ring_pktnum_rcd);
+               *buff++ = tqp->tqp_stats.rcb_rx_ring_pktnum_rcd;
        }
 
        return buff;
@@ -820,7 +826,7 @@ static void hclge_get_stats(struct hnae3_handle *handle, u64 *data)
 }
 
 static int hclge_parse_func_status(struct hclge_dev *hdev,
-                                  struct hclge_func_status *status)
+                                  struct hclge_func_status_cmd *status)
 {
        if (!(status->pf_state & HCLGE_PF_STATE_DONE))
                return -EINVAL;
@@ -837,13 +843,13 @@ static int hclge_parse_func_status(struct hclge_dev *hdev,
 
 static int hclge_query_function_status(struct hclge_dev *hdev)
 {
-       struct hclge_func_status *req;
+       struct hclge_func_status_cmd *req;
        struct hclge_desc desc;
        int timeout = 0;
        int ret;
 
        hclge_cmd_setup_basic_desc(&desc, HCLGE_OPC_QUERY_FUNC_STATUS, true);
-       req = (struct hclge_func_status *)desc.data;
+       req = (struct hclge_func_status_cmd *)desc.data;
 
        do {
                ret = hclge_cmd_send(&hdev->hw, &desc, 1);
@@ -868,7 +874,7 @@ static int hclge_query_function_status(struct hclge_dev *hdev)
 
 static int hclge_query_pf_resource(struct hclge_dev *hdev)
 {
-       struct hclge_pf_res *req;
+       struct hclge_pf_res_cmd *req;
        struct hclge_desc desc;
        int ret;
 
@@ -880,7 +886,7 @@ static int hclge_query_pf_resource(struct hclge_dev *hdev)
                return ret;
        }
 
-       req = (struct hclge_pf_res *)desc.data;
+       req = (struct hclge_pf_res_cmd *)desc.data;
        hdev->num_tqps = __le16_to_cpu(req->tqp_num);
        hdev->pkt_buf_size = __le16_to_cpu(req->buf_size) << HCLGE_BUF_UNIT_S;
 
@@ -938,12 +944,12 @@ static int hclge_parse_speed(int speed_cmd, int *speed)
 
 static void hclge_parse_cfg(struct hclge_cfg *cfg, struct hclge_desc *desc)
 {
-       struct hclge_cfg_param *req;
+       struct hclge_cfg_param_cmd *req;
        u64 mac_addr_tmp_high;
        u64 mac_addr_tmp;
        int i;
 
-       req = (struct hclge_cfg_param *)desc[0].data;
+       req = (struct hclge_cfg_param_cmd *)desc[0].data;
 
        /* get the configuration */
        cfg->vmdq_vport_num = hnae_get_field(__le32_to_cpu(req->param[0]),
@@ -978,7 +984,7 @@ static void hclge_parse_cfg(struct hclge_cfg *cfg, struct hclge_desc *desc)
        for (i = 0; i < ETH_ALEN; i++)
                cfg->mac_addr[i] = (mac_addr_tmp >> (8 * i)) & 0xff;
 
-       req = (struct hclge_cfg_param *)desc[1].data;
+       req = (struct hclge_cfg_param_cmd *)desc[1].data;
        cfg->numa_node_map = __le32_to_cpu(req->param[0]);
 }
 
@@ -989,20 +995,21 @@ static void hclge_parse_cfg(struct hclge_cfg *cfg, struct hclge_desc *desc)
 static int hclge_get_cfg(struct hclge_dev *hdev, struct hclge_cfg *hcfg)
 {
        struct hclge_desc desc[HCLGE_PF_CFG_DESC_NUM];
-       struct hclge_cfg_param *req;
+       struct hclge_cfg_param_cmd *req;
        int i, ret;
 
        for (i = 0; i < HCLGE_PF_CFG_DESC_NUM; i++) {
-               req = (struct hclge_cfg_param *)desc[i].data;
+               u32 offset = 0;
+
+               req = (struct hclge_cfg_param_cmd *)desc[i].data;
                hclge_cmd_setup_basic_desc(&desc[i], HCLGE_OPC_GET_CFG_PARAM,
                                           true);
-               hnae_set_field(req->offset, HCLGE_CFG_OFFSET_M,
+               hnae_set_field(offset, HCLGE_CFG_OFFSET_M,
                               HCLGE_CFG_OFFSET_S, i * HCLGE_CFG_RD_LEN_BYTES);
                /* Len should be in units of 4 bytes when sent to hardware */
-               hnae_set_field(req->offset, HCLGE_CFG_RD_LEN_M,
-                              HCLGE_CFG_RD_LEN_S,
+               hnae_set_field(offset, HCLGE_CFG_RD_LEN_M, HCLGE_CFG_RD_LEN_S,
                               HCLGE_CFG_RD_LEN_BYTES / HCLGE_CFG_RD_LEN_UNIT);
-               req->offset = cpu_to_le32(req->offset);
+               req->offset = cpu_to_le32(offset);
        }
 
        ret = hclge_cmd_send(&hdev->hw, desc, HCLGE_PF_CFG_DESC_NUM);
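
The local offset variable above is the same endianness discipline applied to the bit-field helpers: hnae_set_field() edits its first argument in place in CPU byte order (roughly "clear the mask, then or in the shifted value"), so it must never be pointed at a __le32 member directly. The rewritten loop builds the word in a CPU-order temporary and converts it once on store. A paraphrased sketch of the idea; the macro body below is illustrative rather than the driver's exact definition, and byte_off is a made-up variable:

    /* Roughly what a set-field helper does: clear the field, or in the value. */
    #define set_field(origin, mask, shift, val)                     \
            do {                                                    \
                    (origin) &= ~(mask);                            \
                    (origin) |= ((val) << (shift)) & (mask);        \
            } while (0)

    u32 offset = 0;                                 /* CPU-order scratch word     */

    set_field(offset, HCLGE_CFG_OFFSET_M, HCLGE_CFG_OFFSET_S, byte_off);
    req->offset = cpu_to_le32(offset);              /* single conversion on store */
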
@@ -1099,16 +1106,23 @@ static int hclge_configure(struct hclge_dev *hdev)
 static int hclge_config_tso(struct hclge_dev *hdev, int tso_mss_min,
                            int tso_mss_max)
 {
-       struct hclge_cfg_tso_status *req;
+       struct hclge_cfg_tso_status_cmd *req;
        struct hclge_desc desc;
+       u16 tso_mss;
 
        hclge_cmd_setup_basic_desc(&desc, HCLGE_OPC_TSO_GENERIC_CONFIG, false);
 
-       req = (struct hclge_cfg_tso_status *)desc.data;
-       hnae_set_field(req->tso_mss_min, HCLGE_TSO_MSS_MIN_M,
+       req = (struct hclge_cfg_tso_status_cmd *)desc.data;
+
+       tso_mss = 0;
+       hnae_set_field(tso_mss, HCLGE_TSO_MSS_MIN_M,
                       HCLGE_TSO_MSS_MIN_S, tso_mss_min);
-       hnae_set_field(req->tso_mss_max, HCLGE_TSO_MSS_MIN_M,
+       req->tso_mss_min = cpu_to_le16(tso_mss);
+
+       tso_mss = 0;
+       hnae_set_field(tso_mss, HCLGE_TSO_MSS_MIN_M,
                       HCLGE_TSO_MSS_MIN_S, tso_mss_max);
+       req->tso_mss_max = cpu_to_le16(tso_mss);
 
        return hclge_cmd_send(&hdev->hw, &desc, 1);
 }
@@ -1144,15 +1158,15 @@ static int hclge_alloc_tqps(struct hclge_dev *hdev)
 static int hclge_map_tqps_to_func(struct hclge_dev *hdev, u16 func_id,
                                  u16 tqp_pid, u16 tqp_vid, bool is_pf)
 {
-       struct hclge_tqp_map *req;
+       struct hclge_tqp_map_cmd *req;
        struct hclge_desc desc;
        int ret;
 
        hclge_cmd_setup_basic_desc(&desc, HCLGE_OPC_SET_TQP_MAP, false);
 
-       req = (struct hclge_tqp_map *)desc.data;
+       req = (struct hclge_tqp_map_cmd *)desc.data;
        req->tqp_id = cpu_to_le16(tqp_pid);
-       req->tqp_vf = cpu_to_le16(func_id);
+       req->tqp_vf = func_id;
        req->tqp_flag = !is_pf << HCLGE_TQP_MAP_TYPE_B |
                        1 << HCLGE_TQP_MAP_EN_B;
        req->tqp_vid = cpu_to_le16(tqp_vid);
@@ -1340,12 +1354,12 @@ static int  hclge_cmd_alloc_tx_buff(struct hclge_dev *hdev,
 /* TX buffer size is unit by 128 byte */
 #define HCLGE_BUF_SIZE_UNIT_SHIFT      7
 #define HCLGE_BUF_SIZE_UPDATE_EN_MSK   BIT(15)
-       struct hclge_tx_buff_alloc *req;
+       struct hclge_tx_buff_alloc_cmd *req;
        struct hclge_desc desc;
        int ret;
        u8 i;
 
-       req = (struct hclge_tx_buff_alloc *)desc.data;
+       req = (struct hclge_tx_buff_alloc_cmd *)desc.data;
 
        hclge_cmd_setup_basic_desc(&desc, HCLGE_OPC_TX_BUFF_ALLOC, 0);
        for (i = 0; i < HCLGE_TC_NUM; i++) {
@@ -1536,8 +1550,8 @@ static int hclge_tx_buffer_calc(struct hclge_dev *hdev,
  * @buf_alloc: pointer to buffer calculation data
  * @return: 0: calculation successful, negative: fail
  */
-int hclge_rx_buffer_calc(struct hclge_dev *hdev,
-                        struct hclge_pkt_buf_alloc *buf_alloc)
+static int hclge_rx_buffer_calc(struct hclge_dev *hdev,
+                               struct hclge_pkt_buf_alloc *buf_alloc)
 {
        u32 rx_all = hdev->pkt_buf_size;
        int no_pfc_priv_num, pfc_priv_num;
@@ -1672,13 +1686,13 @@ int hclge_rx_buffer_calc(struct hclge_dev *hdev,
 static int hclge_rx_priv_buf_alloc(struct hclge_dev *hdev,
                                   struct hclge_pkt_buf_alloc *buf_alloc)
 {
-       struct hclge_rx_priv_buff *req;
+       struct hclge_rx_priv_buff_cmd *req;
        struct hclge_desc desc;
        int ret;
        int i;
 
        hclge_cmd_setup_basic_desc(&desc, HCLGE_OPC_RX_PRIV_BUFF_ALLOC, false);
-       req = (struct hclge_rx_priv_buff *)desc.data;
+       req = (struct hclge_rx_priv_buff_cmd *)desc.data;
 
        /* Alloc private buffer TCs */
        for (i = 0; i < HCLGE_MAX_TC_NUM; i++) {
@@ -1687,7 +1701,7 @@ static int hclge_rx_priv_buf_alloc(struct hclge_dev *hdev,
                req->buf_num[i] =
                        cpu_to_le16(priv->buf_size >> HCLGE_BUF_UNIT_S);
                req->buf_num[i] |=
-                       cpu_to_le16(true << HCLGE_TC0_PRI_BUF_EN_B);
+                       cpu_to_le16(1 << HCLGE_TC0_PRI_BUF_EN_B);
        }
 
        req->shared_buf =
@@ -2000,11 +2014,11 @@ static void hclge_check_speed_dup(struct hclge_dev *hdev, int duplex, int speed)
 
 int hclge_cfg_mac_speed_dup(struct hclge_dev *hdev, int speed, u8 duplex)
 {
-       struct hclge_config_mac_speed_dup *req;
+       struct hclge_config_mac_speed_dup_cmd *req;
        struct hclge_desc desc;
        int ret;
 
-       req = (struct hclge_config_mac_speed_dup *)desc.data;
+       req = (struct hclge_config_mac_speed_dup_cmd *)desc.data;
 
        hclge_cmd_setup_basic_desc(&desc, HCLGE_OPC_CONFIG_SPEED_DUP, false);
 
@@ -2075,12 +2089,12 @@ static int hclge_cfg_mac_speed_dup_h(struct hnae3_handle *handle, int speed,
 static int hclge_query_mac_an_speed_dup(struct hclge_dev *hdev, int *speed,
                                        u8 *duplex)
 {
-       struct hclge_query_an_speed_dup *req;
+       struct hclge_query_an_speed_dup_cmd *req;
        struct hclge_desc desc;
        int speed_tmp;
        int ret;
 
-       req = (struct hclge_query_an_speed_dup *)desc.data;
+       req = (struct hclge_query_an_speed_dup_cmd *)desc.data;
 
        hclge_cmd_setup_basic_desc(&desc, HCLGE_OPC_QUERY_AN_RESULT, true);
        ret = hclge_cmd_send(&hdev->hw, &desc, 1);
@@ -2108,11 +2122,11 @@ static int hclge_query_mac_an_speed_dup(struct hclge_dev *hdev, int *speed,
 static int hclge_query_autoneg_result(struct hclge_dev *hdev)
 {
        struct hclge_mac *mac = &hdev->hw.mac;
-       struct hclge_query_an_speed_dup *req;
+       struct hclge_query_an_speed_dup_cmd *req;
        struct hclge_desc desc;
        int ret;
 
-       req = (struct hclge_query_an_speed_dup *)desc.data;
+       req = (struct hclge_query_an_speed_dup_cmd *)desc.data;
 
        hclge_cmd_setup_basic_desc(&desc, HCLGE_OPC_QUERY_AN_RESULT, true);
        ret = hclge_cmd_send(&hdev->hw, &desc, 1);
@@ -2129,14 +2143,16 @@ static int hclge_query_autoneg_result(struct hclge_dev *hdev)
 
 static int hclge_set_autoneg_en(struct hclge_dev *hdev, bool enable)
 {
-       struct hclge_config_auto_neg *req;
+       struct hclge_config_auto_neg_cmd *req;
        struct hclge_desc desc;
+       u32 flag = 0;
        int ret;
 
        hclge_cmd_setup_basic_desc(&desc, HCLGE_OPC_CONFIG_AN_MODE, false);
 
-       req = (struct hclge_config_auto_neg *)desc.data;
-       hnae_set_bit(req->cfg_an_cmd_flag, HCLGE_MAC_CFG_AN_EN_B, !!enable);
+       req = (struct hclge_config_auto_neg_cmd *)desc.data;
+       hnae_set_bit(flag, HCLGE_MAC_CFG_AN_EN_B, !!enable);
+       req->cfg_an_cmd_flag = cpu_to_le32(flag);
 
        ret = hclge_cmd_send(&hdev->hw, &desc, 1);
        if (ret) {
@@ -2214,7 +2230,7 @@ static void hclge_task_schedule(struct hclge_dev *hdev)
 
 static int hclge_get_mac_link_status(struct hclge_dev *hdev)
 {
-       struct hclge_link_status *req;
+       struct hclge_link_status_cmd *req;
        struct hclge_desc desc;
        int link_status;
        int ret;
@@ -2227,7 +2243,7 @@ static int hclge_get_mac_link_status(struct hclge_dev *hdev)
                return ret;
        }
 
-       req = (struct hclge_link_status *)desc.data;
+       req = (struct hclge_link_status_cmd *)desc.data;
        link_status = req->status & HCLGE_LINK_STATUS;
 
        return !!link_status;
@@ -2451,7 +2467,7 @@ static u32 hclge_get_rss_indir_size(struct hnae3_handle *handle)
 
 static int hclge_get_rss_algo(struct hclge_dev *hdev)
 {
-       struct hclge_rss_config *req;
+       struct hclge_rss_config_cmd *req;
        struct hclge_desc desc;
        int rss_hash_algo;
        int ret;
@@ -2465,7 +2481,7 @@ static int hclge_get_rss_algo(struct hclge_dev *hdev)
                return ret;
        }
 
-       req = (struct hclge_rss_config *)desc.data;
+       req = (struct hclge_rss_config_cmd *)desc.data;
        rss_hash_algo = (req->hash_config & HCLGE_RSS_HASH_ALGO_MASK);
 
        if (rss_hash_algo == HCLGE_RSS_HASH_ALGO_TOEPLITZ)
@@ -2477,13 +2493,13 @@ static int hclge_get_rss_algo(struct hclge_dev *hdev)
 static int hclge_set_rss_algo_key(struct hclge_dev *hdev,
                                  const u8 hfunc, const u8 *key)
 {
-       struct hclge_rss_config *req;
+       struct hclge_rss_config_cmd *req;
        struct hclge_desc desc;
        int key_offset;
        int key_size;
        int ret;
 
-       req = (struct hclge_rss_config *)desc.data;
+       req = (struct hclge_rss_config_cmd *)desc.data;
 
        for (key_offset = 0; key_offset < 3; key_offset++) {
                hclge_cmd_setup_basic_desc(&desc, HCLGE_OPC_RSS_GENERIC_CONFIG,
@@ -2514,19 +2530,20 @@ static int hclge_set_rss_algo_key(struct hclge_dev *hdev,
 
 static int hclge_set_rss_indir_table(struct hclge_dev *hdev, const u32 *indir)
 {
-       struct hclge_rss_indirection_table *req;
+       struct hclge_rss_indirection_table_cmd *req;
        struct hclge_desc desc;
        int i, j;
        int ret;
 
-       req = (struct hclge_rss_indirection_table *)desc.data;
+       req = (struct hclge_rss_indirection_table_cmd *)desc.data;
 
        for (i = 0; i < HCLGE_RSS_CFG_TBL_NUM; i++) {
                hclge_cmd_setup_basic_desc
                        (&desc, HCLGE_OPC_RSS_INDIR_TABLE, false);
 
-               req->start_table_index = i * HCLGE_RSS_CFG_TBL_SIZE;
-               req->rss_set_bitmap = HCLGE_RSS_SET_BITMAP_MSK;
+               req->start_table_index =
+                       cpu_to_le16(i * HCLGE_RSS_CFG_TBL_SIZE);
+               req->rss_set_bitmap = cpu_to_le16(HCLGE_RSS_SET_BITMAP_MSK);
 
                for (j = 0; j < HCLGE_RSS_CFG_TBL_SIZE; j++)
                        req->rss_result[j] =
@@ -2546,21 +2563,24 @@ static int hclge_set_rss_indir_table(struct hclge_dev *hdev, const u32 *indir)
 static int hclge_set_rss_tc_mode(struct hclge_dev *hdev, u16 *tc_valid,
                                 u16 *tc_size, u16 *tc_offset)
 {
-       struct hclge_rss_tc_mode *req;
+       struct hclge_rss_tc_mode_cmd *req;
        struct hclge_desc desc;
        int ret;
        int i;
 
        hclge_cmd_setup_basic_desc(&desc, HCLGE_OPC_RSS_TC_MODE, false);
-       req = (struct hclge_rss_tc_mode *)desc.data;
+       req = (struct hclge_rss_tc_mode_cmd *)desc.data;
 
        for (i = 0; i < HCLGE_MAX_TC_NUM; i++) {
-               hnae_set_bit(req->rss_tc_mode[i], HCLGE_RSS_TC_VALID_B,
-                            (tc_valid[i] & 0x1));
-               hnae_set_field(req->rss_tc_mode[i], HCLGE_RSS_TC_SIZE_M,
+               u16 mode = 0;
+
+               hnae_set_bit(mode, HCLGE_RSS_TC_VALID_B, (tc_valid[i] & 0x1));
+               hnae_set_field(mode, HCLGE_RSS_TC_SIZE_M,
                               HCLGE_RSS_TC_SIZE_S, tc_size[i]);
-               hnae_set_field(req->rss_tc_mode[i], HCLGE_RSS_TC_OFFSET_M,
+               hnae_set_field(mode, HCLGE_RSS_TC_OFFSET_M,
                               HCLGE_RSS_TC_OFFSET_S, tc_offset[i]);
+
+               req->rss_tc_mode[i] = cpu_to_le16(mode);
        }
 
        ret = hclge_cmd_send(&hdev->hw, &desc, 1);
@@ -2575,15 +2595,13 @@ static int hclge_set_rss_tc_mode(struct hclge_dev *hdev, u16 *tc_valid,
 
 static int hclge_set_rss_input_tuple(struct hclge_dev *hdev)
 {
-#define HCLGE_RSS_INPUT_TUPLE_OTHER            0xf
-#define HCLGE_RSS_INPUT_TUPLE_SCTP             0x1f
-       struct hclge_rss_input_tuple *req;
+       struct hclge_rss_input_tuple_cmd *req;
        struct hclge_desc desc;
        int ret;
 
        hclge_cmd_setup_basic_desc(&desc, HCLGE_OPC_RSS_INPUT_TUPLE, false);
 
-       req = (struct hclge_rss_input_tuple *)desc.data;
+       req = (struct hclge_rss_input_tuple_cmd *)desc.data;
        req->ipv4_tcp_en = HCLGE_RSS_INPUT_TUPLE_OTHER;
        req->ipv4_udp_en = HCLGE_RSS_INPUT_TUPLE_OTHER;
        req->ipv4_sctp_en = HCLGE_RSS_INPUT_TUPLE_SCTP;
@@ -2657,6 +2675,161 @@ static int hclge_set_rss(struct hnae3_handle *handle, const u32 *indir,
        return ret;
 }
 
+static u8 hclge_get_rss_hash_bits(struct ethtool_rxnfc *nfc)
+{
+       u8 hash_sets = nfc->data & RXH_L4_B_0_1 ? HCLGE_S_PORT_BIT : 0;
+
+       if (nfc->data & RXH_L4_B_2_3)
+               hash_sets |= HCLGE_D_PORT_BIT;
+       else
+               hash_sets &= ~HCLGE_D_PORT_BIT;
+
+       if (nfc->data & RXH_IP_SRC)
+               hash_sets |= HCLGE_S_IP_BIT;
+       else
+               hash_sets &= ~HCLGE_S_IP_BIT;
+
+       if (nfc->data & RXH_IP_DST)
+               hash_sets |= HCLGE_D_IP_BIT;
+       else
+               hash_sets &= ~HCLGE_D_IP_BIT;
+
+       if (nfc->flow_type == SCTP_V4_FLOW || nfc->flow_type == SCTP_V6_FLOW)
+               hash_sets |= HCLGE_V_TAG_BIT;
+
+       return hash_sets;
+}
+
+static int hclge_set_rss_tuple(struct hnae3_handle *handle,
+                              struct ethtool_rxnfc *nfc)
+{
+       struct hclge_vport *vport = hclge_get_vport(handle);
+       struct hclge_dev *hdev = vport->back;
+       struct hclge_rss_input_tuple_cmd *req;
+       struct hclge_desc desc;
+       u8 tuple_sets;
+       int ret;
+
+       if (nfc->data & ~(RXH_IP_SRC | RXH_IP_DST |
+                         RXH_L4_B_0_1 | RXH_L4_B_2_3))
+               return -EINVAL;
+
+       req = (struct hclge_rss_input_tuple_cmd *)desc.data;
+       hclge_cmd_setup_basic_desc(&desc, HCLGE_OPC_RSS_INPUT_TUPLE, true);
+       ret = hclge_cmd_send(&hdev->hw, &desc, 1);
+       if (ret) {
+               dev_err(&hdev->pdev->dev,
+                       "Read rss tuple fail, status = %d\n", ret);
+               return ret;
+       }
+
+       hclge_cmd_reuse_desc(&desc, false);
+
+       tuple_sets = hclge_get_rss_hash_bits(nfc);
+       switch (nfc->flow_type) {
+       case TCP_V4_FLOW:
+               req->ipv4_tcp_en = tuple_sets;
+               break;
+       case TCP_V6_FLOW:
+               req->ipv6_tcp_en = tuple_sets;
+               break;
+       case UDP_V4_FLOW:
+               req->ipv4_udp_en = tuple_sets;
+               break;
+       case UDP_V6_FLOW:
+               req->ipv6_udp_en = tuple_sets;
+               break;
+       case SCTP_V4_FLOW:
+               req->ipv4_sctp_en = tuple_sets;
+               break;
+       case SCTP_V6_FLOW:
+               if ((nfc->data & RXH_L4_B_0_1) ||
+                   (nfc->data & RXH_L4_B_2_3))
+                       return -EINVAL;
+
+               req->ipv6_sctp_en = tuple_sets;
+               break;
+       case IPV4_FLOW:
+               req->ipv4_fragment_en = HCLGE_RSS_INPUT_TUPLE_OTHER;
+               break;
+       case IPV6_FLOW:
+               req->ipv6_fragment_en = HCLGE_RSS_INPUT_TUPLE_OTHER;
+               break;
+       default:
+               return -EINVAL;
+       }
+
+       ret = hclge_cmd_send(&hdev->hw, &desc, 1);
+       if (ret)
+               dev_err(&hdev->pdev->dev,
+                       "Set rss tuple fail, status = %d\n", ret);
+
+       return ret;
+}
+
+static int hclge_get_rss_tuple(struct hnae3_handle *handle,
+                              struct ethtool_rxnfc *nfc)
+{
+       struct hclge_vport *vport = hclge_get_vport(handle);
+       struct hclge_dev *hdev = vport->back;
+       struct hclge_rss_input_tuple_cmd *req;
+       struct hclge_desc desc;
+       u8 tuple_sets;
+       int ret;
+
+       nfc->data = 0;
+
+       req = (struct hclge_rss_input_tuple_cmd *)desc.data;
+       hclge_cmd_setup_basic_desc(&desc, HCLGE_OPC_RSS_INPUT_TUPLE, true);
+       ret = hclge_cmd_send(&hdev->hw, &desc, 1);
+       if (ret) {
+               dev_err(&hdev->pdev->dev,
+                       "Read rss tuple fail, status = %d\n", ret);
+               return ret;
+       }
+
+       switch (nfc->flow_type) {
+       case TCP_V4_FLOW:
+               tuple_sets = req->ipv4_tcp_en;
+               break;
+       case UDP_V4_FLOW:
+               tuple_sets = req->ipv4_udp_en;
+               break;
+       case TCP_V6_FLOW:
+               tuple_sets = req->ipv6_tcp_en;
+               break;
+       case UDP_V6_FLOW:
+               tuple_sets = req->ipv6_udp_en;
+               break;
+       case SCTP_V4_FLOW:
+               tuple_sets = req->ipv4_sctp_en;
+               break;
+       case SCTP_V6_FLOW:
+               tuple_sets = req->ipv6_sctp_en;
+               break;
+       case IPV4_FLOW:
+       case IPV6_FLOW:
+               tuple_sets = HCLGE_S_IP_BIT | HCLGE_D_IP_BIT;
+               break;
+       default:
+               return -EINVAL;
+       }
+
+       if (!tuple_sets)
+               return 0;
+
+       if (tuple_sets & HCLGE_D_PORT_BIT)
+               nfc->data |= RXH_L4_B_2_3;
+       if (tuple_sets & HCLGE_S_PORT_BIT)
+               nfc->data |= RXH_L4_B_0_1;
+       if (tuple_sets & HCLGE_D_IP_BIT)
+               nfc->data |= RXH_IP_DST;
+       if (tuple_sets & HCLGE_S_IP_BIT)
+               nfc->data |= RXH_IP_SRC;
+
+       return 0;
+}
+
 static int hclge_get_tc_size(struct hnae3_handle *handle)
 {
        struct hclge_vport *vport = hclge_get_vport(handle);
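
hclge_get_rss_hash_bits() translates ethtool's RXH_* flag mask into the per-flow-type tuple bits the firmware understands (source/destination IP, source/destination L4 port, plus the VLAN-tag bit that is always set for SCTP flows), and the two new functions wire that into the set_rss_tuple/get_rss_tuple ops declared earlier. From user space this is driven by something like `ethtool -N eth0 rx-flow-hash tcp4 sdfn`; a sketch of the in-kernel request such a command produces, where the device name and exact flag set are only an example:

    /* Hash TCP/IPv4 flows on source/destination IP and L4 ports; this is
     * the ethtool_rxnfc that ends up in hclge_set_rss_tuple().
     */
    struct ethtool_rxnfc cmd = {
            .cmd       = ETHTOOL_SRXFH,
            .flow_type = TCP_V4_FLOW,
            .data      = RXH_IP_SRC | RXH_IP_DST |
                         RXH_L4_B_0_1 | RXH_L4_B_2_3,
    };
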
@@ -2750,7 +2923,7 @@ int hclge_map_vport_ring_to_vector(struct hclge_vport *vport, int vector_id,
                                   struct hnae3_ring_chain_node *ring_chain)
 {
        struct hclge_dev *hdev = vport->back;
-       struct hclge_ctrl_vector_chain *req;
+       struct hclge_ctrl_vector_chain_cmd *req;
        struct hnae3_ring_chain_node *node;
        struct hclge_desc desc;
        int ret;
@@ -2758,20 +2931,21 @@ int hclge_map_vport_ring_to_vector(struct hclge_vport *vport, int vector_id,
 
        hclge_cmd_setup_basic_desc(&desc, HCLGE_OPC_ADD_RING_TO_VECTOR, false);
 
-       req = (struct hclge_ctrl_vector_chain *)desc.data;
+       req = (struct hclge_ctrl_vector_chain_cmd *)desc.data;
        req->int_vector_id = vector_id;
 
        i = 0;
        for (node = ring_chain; node; node = node->next) {
-               hnae_set_field(req->tqp_type_and_id[i], HCLGE_INT_TYPE_M,
-                              HCLGE_INT_TYPE_S,
+               u16 type_and_id = 0;
+
+               hnae_set_field(type_and_id, HCLGE_INT_TYPE_M, HCLGE_INT_TYPE_S,
                               hnae_get_bit(node->flag, HNAE3_RING_TYPE_B));
-               hnae_set_field(req->tqp_type_and_id[i], HCLGE_TQP_ID_M,
-                              HCLGE_TQP_ID_S,  node->tqp_index);
-               hnae_set_field(req->tqp_type_and_id[i], HCLGE_INT_GL_IDX_M,
+               hnae_set_field(type_and_id, HCLGE_TQP_ID_M, HCLGE_TQP_ID_S,
+                              node->tqp_index);
+               hnae_set_field(type_and_id, HCLGE_INT_GL_IDX_M,
                               HCLGE_INT_GL_IDX_S,
                               hnae_get_bit(node->flag, HNAE3_RING_TYPE_B));
-               req->tqp_type_and_id[i] = cpu_to_le16(req->tqp_type_and_id[i]);
+               req->tqp_type_and_id[i] = cpu_to_le16(type_and_id);
                req->vfid = vport->vport_id;
 
                if (++i >= HCLGE_VECTOR_ELEMENTS_PER_CMD) {
@@ -2807,9 +2981,9 @@ int hclge_map_vport_ring_to_vector(struct hclge_vport *vport, int vector_id,
        return 0;
 }
 
-int hclge_map_handle_ring_to_vector(struct hnae3_handle *handle,
-                                   int vector,
-                                   struct hnae3_ring_chain_node *ring_chain)
+static int hclge_map_handle_ring_to_vector(
+               struct hnae3_handle *handle, int vector,
+               struct hnae3_ring_chain_node *ring_chain)
 {
        struct hclge_vport *vport = hclge_get_vport(handle);
        struct hclge_dev *hdev = vport->back;
@@ -2831,7 +3005,7 @@ static int hclge_unmap_ring_from_vector(
 {
        struct hclge_vport *vport = hclge_get_vport(handle);
        struct hclge_dev *hdev = vport->back;
-       struct hclge_ctrl_vector_chain *req;
+       struct hclge_ctrl_vector_chain_cmd *req;
        struct hnae3_ring_chain_node *node;
        struct hclge_desc desc;
        int i, vector_id;
@@ -2846,21 +3020,22 @@ static int hclge_unmap_ring_from_vector(
 
        hclge_cmd_setup_basic_desc(&desc, HCLGE_OPC_DEL_RING_TO_VECTOR, false);
 
-       req = (struct hclge_ctrl_vector_chain *)desc.data;
+       req = (struct hclge_ctrl_vector_chain_cmd *)desc.data;
        req->int_vector_id = vector_id;
 
        i = 0;
        for (node = ring_chain; node; node = node->next) {
-               hnae_set_field(req->tqp_type_and_id[i], HCLGE_INT_TYPE_M,
-                              HCLGE_INT_TYPE_S,
+               u16 type_and_id = 0;
+
+               hnae_set_field(type_and_id, HCLGE_INT_TYPE_M, HCLGE_INT_TYPE_S,
                               hnae_get_bit(node->flag, HNAE3_RING_TYPE_B));
-               hnae_set_field(req->tqp_type_and_id[i], HCLGE_TQP_ID_M,
-                              HCLGE_TQP_ID_S,  node->tqp_index);
-               hnae_set_field(req->tqp_type_and_id[i], HCLGE_INT_GL_IDX_M,
+               hnae_set_field(type_and_id, HCLGE_TQP_ID_M, HCLGE_TQP_ID_S,
+                              node->tqp_index);
+               hnae_set_field(type_and_id, HCLGE_INT_GL_IDX_M,
                               HCLGE_INT_GL_IDX_S,
                               hnae_get_bit(node->flag, HNAE3_RING_TYPE_B));
 
-               req->tqp_type_and_id[i] = cpu_to_le16(req->tqp_type_and_id[i]);
+               req->tqp_type_and_id[i] = cpu_to_le16(type_and_id);
                req->vfid = vport->vport_id;
 
                if (++i >= HCLGE_VECTOR_ELEMENTS_PER_CMD) {
@@ -2898,13 +3073,13 @@ static int hclge_unmap_ring_from_vector(
 int hclge_cmd_set_promisc_mode(struct hclge_dev *hdev,
                               struct hclge_promisc_param *param)
 {
-       struct hclge_promisc_cfg *req;
+       struct hclge_promisc_cfg_cmd *req;
        struct hclge_desc desc;
        int ret;
 
        hclge_cmd_setup_basic_desc(&desc, HCLGE_OPC_CFG_PROMISC_MODE, false);
 
-       req = (struct hclge_promisc_cfg *)desc.data;
+       req = (struct hclge_promisc_cfg_cmd *)desc.data;
        req->vf_id = param->vf_id;
        req->flag = (param->enable << HCLGE_PROMISC_EN_B);
 
@@ -2946,29 +3121,27 @@ static void hclge_set_promisc_mode(struct hnae3_handle *handle, u32 en)
 static void hclge_cfg_mac_mode(struct hclge_dev *hdev, bool enable)
 {
        struct hclge_desc desc;
-       struct hclge_config_mac_mode *req =
-               (struct hclge_config_mac_mode *)desc.data;
+       struct hclge_config_mac_mode_cmd *req =
+               (struct hclge_config_mac_mode_cmd *)desc.data;
+       u32 loop_en = 0;
        int ret;
 
        hclge_cmd_setup_basic_desc(&desc, HCLGE_OPC_CONFIG_MAC_MODE, false);
-       hnae_set_bit(req->txrx_pad_fcs_loop_en, HCLGE_MAC_TX_EN_B, enable);
-       hnae_set_bit(req->txrx_pad_fcs_loop_en, HCLGE_MAC_RX_EN_B, enable);
-       hnae_set_bit(req->txrx_pad_fcs_loop_en, HCLGE_MAC_PAD_TX_B, enable);
-       hnae_set_bit(req->txrx_pad_fcs_loop_en, HCLGE_MAC_PAD_RX_B, enable);
-       hnae_set_bit(req->txrx_pad_fcs_loop_en, HCLGE_MAC_1588_TX_B, 0);
-       hnae_set_bit(req->txrx_pad_fcs_loop_en, HCLGE_MAC_1588_RX_B, 0);
-       hnae_set_bit(req->txrx_pad_fcs_loop_en, HCLGE_MAC_APP_LP_B, 0);
-       hnae_set_bit(req->txrx_pad_fcs_loop_en, HCLGE_MAC_LINE_LP_B, 0);
-       hnae_set_bit(req->txrx_pad_fcs_loop_en, HCLGE_MAC_FCS_TX_B, enable);
-       hnae_set_bit(req->txrx_pad_fcs_loop_en, HCLGE_MAC_RX_FCS_B, enable);
-       hnae_set_bit(req->txrx_pad_fcs_loop_en,
-                    HCLGE_MAC_RX_FCS_STRIP_B, enable);
-       hnae_set_bit(req->txrx_pad_fcs_loop_en,
-                    HCLGE_MAC_TX_OVERSIZE_TRUNCATE_B, enable);
-       hnae_set_bit(req->txrx_pad_fcs_loop_en,
-                    HCLGE_MAC_RX_OVERSIZE_TRUNCATE_B, enable);
-       hnae_set_bit(req->txrx_pad_fcs_loop_en,
-                    HCLGE_MAC_TX_UNDER_MIN_ERR_B, enable);
+       hnae_set_bit(loop_en, HCLGE_MAC_TX_EN_B, enable);
+       hnae_set_bit(loop_en, HCLGE_MAC_RX_EN_B, enable);
+       hnae_set_bit(loop_en, HCLGE_MAC_PAD_TX_B, enable);
+       hnae_set_bit(loop_en, HCLGE_MAC_PAD_RX_B, enable);
+       hnae_set_bit(loop_en, HCLGE_MAC_1588_TX_B, 0);
+       hnae_set_bit(loop_en, HCLGE_MAC_1588_RX_B, 0);
+       hnae_set_bit(loop_en, HCLGE_MAC_APP_LP_B, 0);
+       hnae_set_bit(loop_en, HCLGE_MAC_LINE_LP_B, 0);
+       hnae_set_bit(loop_en, HCLGE_MAC_FCS_TX_B, enable);
+       hnae_set_bit(loop_en, HCLGE_MAC_RX_FCS_B, enable);
+       hnae_set_bit(loop_en, HCLGE_MAC_RX_FCS_STRIP_B, enable);
+       hnae_set_bit(loop_en, HCLGE_MAC_TX_OVERSIZE_TRUNCATE_B, enable);
+       hnae_set_bit(loop_en, HCLGE_MAC_RX_OVERSIZE_TRUNCATE_B, enable);
+       hnae_set_bit(loop_en, HCLGE_MAC_TX_UNDER_MIN_ERR_B, enable);
+       req->txrx_pad_fcs_loop_en = cpu_to_le32(loop_en);
 
        ret = hclge_cmd_send(&hdev->hw, &desc, 1);
        if (ret)
@@ -2980,8 +3153,8 @@ static int hclge_tqp_enable(struct hclge_dev *hdev, int tqp_id,
                            int stream_id, bool enable)
 {
        struct hclge_desc desc;
-       struct hclge_cfg_com_tqp_queue *req =
-               (struct hclge_cfg_com_tqp_queue *)desc.data;
+       struct hclge_cfg_com_tqp_queue_cmd *req =
+               (struct hclge_cfg_com_tqp_queue_cmd *)desc.data;
        int ret;
 
        hclge_cmd_setup_basic_desc(&desc, HCLGE_OPC_CFG_COM_TQP_QUEUE, false);
@@ -3145,16 +3318,16 @@ static int hclge_update_desc_vfid(struct hclge_desc *desc, int vfid, bool clr)
                word_num = vfid / 32;
                bit_num  = vfid % 32;
                if (clr)
-                       desc[1].data[word_num] &= ~(1 << bit_num);
+                       desc[1].data[word_num] &= cpu_to_le32(~(1 << bit_num));
                else
-                       desc[1].data[word_num] |= (1 << bit_num);
+                       desc[1].data[word_num] |= cpu_to_le32(1 << bit_num);
        } else {
                word_num = (vfid - 192) / 32;
                bit_num  = vfid % 32;
                if (clr)
-                       desc[2].data[word_num] &= ~(1 << bit_num);
+                       desc[2].data[word_num] &= cpu_to_le32(~(1 << bit_num));
                else
-                       desc[2].data[word_num] |= (1 << bit_num);
+                       desc[2].data[word_num] |= cpu_to_le32(1 << bit_num);
        }
 
        return 0;
@@ -3174,7 +3347,7 @@ static bool hclge_is_all_function_id_zero(struct hclge_desc *desc)
        return true;
 }
 
-static void hclge_prepare_mac_addr(struct hclge_mac_vlan_tbl_entry *new_req,
+static void hclge_prepare_mac_addr(struct hclge_mac_vlan_tbl_entry_cmd *new_req,
                                   const u8 *addr)
 {
        const unsigned char *mac_addr = addr;
@@ -3186,8 +3359,8 @@ static void hclge_prepare_mac_addr(struct hclge_mac_vlan_tbl_entry *new_req,
        new_req->mac_addr_lo16 = cpu_to_le16(low_val & 0xffff);
 }
 
-u16 hclge_get_mac_addr_to_mta_index(struct hclge_vport *vport,
-                                   const u8 *addr)
+static u16 hclge_get_mac_addr_to_mta_index(struct hclge_vport *vport,
+                                          const u8 *addr)
 {
        u16 high_val = addr[1] | (addr[0] << 8);
        struct hclge_dev *hdev = vport->back;
@@ -3201,11 +3374,11 @@ static int hclge_set_mta_filter_mode(struct hclge_dev *hdev,
                                     enum hclge_mta_dmac_sel_type mta_mac_sel,
                                     bool enable)
 {
-       struct hclge_mta_filter_mode *req;
+       struct hclge_mta_filter_mode_cmd *req;
        struct hclge_desc desc;
        int ret;
 
-       req = (struct hclge_mta_filter_mode *)desc.data;
+       req = (struct hclge_mta_filter_mode_cmd *)desc.data;
        hclge_cmd_setup_basic_desc(&desc, HCLGE_OPC_MTA_MAC_MODE_CFG, false);
 
        hnae_set_bit(req->dmac_sel_en, HCLGE_CFG_MTA_MAC_EN_B,
@@ -3228,11 +3401,11 @@ int hclge_cfg_func_mta_filter(struct hclge_dev *hdev,
                              u8 func_id,
                              bool enable)
 {
-       struct hclge_cfg_func_mta_filter *req;
+       struct hclge_cfg_func_mta_filter_cmd *req;
        struct hclge_desc desc;
        int ret;
 
-       req = (struct hclge_cfg_func_mta_filter *)desc.data;
+       req = (struct hclge_cfg_func_mta_filter_cmd *)desc.data;
        hclge_cmd_setup_basic_desc(&desc, HCLGE_OPC_MTA_MAC_FUNC_CFG, false);
 
        hnae_set_bit(req->accept, HCLGE_CFG_FUNC_MTA_ACCEPT_B,
@@ -3255,17 +3428,18 @@ static int hclge_set_mta_table_item(struct hclge_vport *vport,
                                    bool enable)
 {
        struct hclge_dev *hdev = vport->back;
-       struct hclge_cfg_func_mta_item *req;
+       struct hclge_cfg_func_mta_item_cmd *req;
        struct hclge_desc desc;
+       u16 item_idx = 0;
        int ret;
 
-       req = (struct hclge_cfg_func_mta_item *)desc.data;
+       req = (struct hclge_cfg_func_mta_item_cmd *)desc.data;
        hclge_cmd_setup_basic_desc(&desc, HCLGE_OPC_MTA_TBL_ITEM_CFG, false);
        hnae_set_bit(req->accept, HCLGE_CFG_MTA_ITEM_ACCEPT_B, enable);
 
-       hnae_set_field(req->item_idx, HCLGE_CFG_MTA_ITEM_IDX_M,
+       hnae_set_field(item_idx, HCLGE_CFG_MTA_ITEM_IDX_M,
                       HCLGE_CFG_MTA_ITEM_IDX_S, idx);
-       req->item_idx = cpu_to_le16(req->item_idx);
+       req->item_idx = cpu_to_le16(item_idx);
 
        ret = hclge_cmd_send(&hdev->hw, &desc, 1);
        if (ret) {
@@ -3279,16 +3453,17 @@ static int hclge_set_mta_table_item(struct hclge_vport *vport,
 }
 
 static int hclge_remove_mac_vlan_tbl(struct hclge_vport *vport,
-                                    struct hclge_mac_vlan_tbl_entry *req)
+                                    struct hclge_mac_vlan_tbl_entry_cmd *req)
 {
        struct hclge_dev *hdev = vport->back;
        struct hclge_desc desc;
        u8 resp_code;
+       u16 retval;
        int ret;
 
        hclge_cmd_setup_basic_desc(&desc, HCLGE_OPC_MAC_VLAN_REMOVE, false);
 
-       memcpy(desc.data, req, sizeof(struct hclge_mac_vlan_tbl_entry));
+       memcpy(desc.data, req, sizeof(struct hclge_mac_vlan_tbl_entry_cmd));
 
        ret = hclge_cmd_send(&hdev->hw, &desc, 1);
        if (ret) {
@@ -3297,19 +3472,21 @@ static int hclge_remove_mac_vlan_tbl(struct hclge_vport *vport,
                        ret);
                return ret;
        }
-       resp_code = (desc.data[0] >> 8) & 0xff;
+       resp_code = (le32_to_cpu(desc.data[0]) >> 8) & 0xff;
+       retval = le16_to_cpu(desc.retval);
 
-       return hclge_get_mac_vlan_cmd_status(vport, desc.retval, resp_code,
+       return hclge_get_mac_vlan_cmd_status(vport, retval, resp_code,
                                             HCLGE_MAC_VLAN_REMOVE);
 }
 
 static int hclge_lookup_mac_vlan_tbl(struct hclge_vport *vport,
-                                    struct hclge_mac_vlan_tbl_entry *req,
+                                    struct hclge_mac_vlan_tbl_entry_cmd *req,
                                     struct hclge_desc *desc,
                                     bool is_mc)
 {
        struct hclge_dev *hdev = vport->back;
        u8 resp_code;
+       u16 retval;
        int ret;
 
        hclge_cmd_setup_basic_desc(&desc[0], HCLGE_OPC_MAC_VLAN_ADD, true);
@@ -3317,7 +3494,7 @@ static int hclge_lookup_mac_vlan_tbl(struct hclge_vport *vport,
                desc[0].flag |= cpu_to_le16(HCLGE_CMD_FLAG_NEXT);
                memcpy(desc[0].data,
                       req,
-                      sizeof(struct hclge_mac_vlan_tbl_entry));
+                      sizeof(struct hclge_mac_vlan_tbl_entry_cmd));
                hclge_cmd_setup_basic_desc(&desc[1],
                                           HCLGE_OPC_MAC_VLAN_ADD,
                                           true);
@@ -3329,7 +3506,7 @@ static int hclge_lookup_mac_vlan_tbl(struct hclge_vport *vport,
        } else {
                memcpy(desc[0].data,
                       req,
-                      sizeof(struct hclge_mac_vlan_tbl_entry));
+                      sizeof(struct hclge_mac_vlan_tbl_entry_cmd));
                ret = hclge_cmd_send(&hdev->hw, desc, 1);
        }
        if (ret) {
@@ -3338,19 +3515,21 @@ static int hclge_lookup_mac_vlan_tbl(struct hclge_vport *vport,
                        ret);
                return ret;
        }
-       resp_code = (desc[0].data[0] >> 8) & 0xff;
+       resp_code = (le32_to_cpu(desc[0].data[0]) >> 8) & 0xff;
+       retval = le16_to_cpu(desc[0].retval);
 
-       return hclge_get_mac_vlan_cmd_status(vport, desc[0].retval, resp_code,
+       return hclge_get_mac_vlan_cmd_status(vport, retval, resp_code,
                                             HCLGE_MAC_VLAN_LKUP);
 }
 
 static int hclge_add_mac_vlan_tbl(struct hclge_vport *vport,
-                                 struct hclge_mac_vlan_tbl_entry *req,
+                                 struct hclge_mac_vlan_tbl_entry_cmd *req,
                                  struct hclge_desc *mc_desc)
 {
        struct hclge_dev *hdev = vport->back;
        int cfg_status;
        u8 resp_code;
+       u16 retval;
        int ret;
 
        if (!mc_desc) {
@@ -3359,10 +3538,13 @@ static int hclge_add_mac_vlan_tbl(struct hclge_vport *vport,
                hclge_cmd_setup_basic_desc(&desc,
                                           HCLGE_OPC_MAC_VLAN_ADD,
                                           false);
-               memcpy(desc.data, req, sizeof(struct hclge_mac_vlan_tbl_entry));
+               memcpy(desc.data, req,
+                      sizeof(struct hclge_mac_vlan_tbl_entry_cmd));
                ret = hclge_cmd_send(&hdev->hw, &desc, 1);
-               resp_code = (desc.data[0] >> 8) & 0xff;
-               cfg_status = hclge_get_mac_vlan_cmd_status(vport, desc.retval,
+               resp_code = (le32_to_cpu(desc.data[0]) >> 8) & 0xff;
+               retval = le16_to_cpu(desc.retval);
+
+               cfg_status = hclge_get_mac_vlan_cmd_status(vport, retval,
                                                           resp_code,
                                                           HCLGE_MAC_VLAN_ADD);
        } else {
@@ -3373,11 +3555,12 @@ static int hclge_add_mac_vlan_tbl(struct hclge_vport *vport,
                mc_desc[2].flag &= cpu_to_le16(~HCLGE_CMD_FLAG_WR);
                mc_desc[2].flag &= cpu_to_le16(~HCLGE_CMD_FLAG_NEXT);
                memcpy(mc_desc[0].data, req,
-                      sizeof(struct hclge_mac_vlan_tbl_entry));
+                      sizeof(struct hclge_mac_vlan_tbl_entry_cmd));
                ret = hclge_cmd_send(&hdev->hw, mc_desc, 3);
-               resp_code = (mc_desc[0].data[0] >> 8) & 0xff;
-               cfg_status = hclge_get_mac_vlan_cmd_status(vport,
-                                                          mc_desc[0].retval,
+               resp_code = (le32_to_cpu(mc_desc[0].data[0]) >> 8) & 0xff;
+               retval = le16_to_cpu(mc_desc[0].retval);
+
+               cfg_status = hclge_get_mac_vlan_cmd_status(vport, retval,
                                                           resp_code,
                                                           HCLGE_MAC_VLAN_ADD);
        }
@@ -3404,8 +3587,9 @@ int hclge_add_uc_addr_common(struct hclge_vport *vport,
                             const unsigned char *addr)
 {
        struct hclge_dev *hdev = vport->back;
-       struct hclge_mac_vlan_tbl_entry req;
+       struct hclge_mac_vlan_tbl_entry_cmd req;
        enum hclge_cmd_status status;
+       u16 egress_port = 0;
 
        /* mac addr check */
        if (is_zero_ether_addr(addr) ||
@@ -3425,15 +3609,15 @@ int hclge_add_uc_addr_common(struct hclge_vport *vport,
        hnae_set_bit(req.entry_type, HCLGE_MAC_VLAN_BIT0_EN_B, 0);
        hnae_set_bit(req.entry_type, HCLGE_MAC_VLAN_BIT1_EN_B, 0);
        hnae_set_bit(req.mc_mac_en, HCLGE_MAC_VLAN_BIT0_EN_B, 0);
-       hnae_set_bit(req.egress_port,
-                    HCLGE_MAC_EPORT_SW_EN_B, 0);
-       hnae_set_bit(req.egress_port,
-                    HCLGE_MAC_EPORT_TYPE_B, 0);
-       hnae_set_field(req.egress_port, HCLGE_MAC_EPORT_VFID_M,
+
+       hnae_set_bit(egress_port, HCLGE_MAC_EPORT_SW_EN_B, 0);
+       hnae_set_bit(egress_port, HCLGE_MAC_EPORT_TYPE_B, 0);
+       hnae_set_field(egress_port, HCLGE_MAC_EPORT_VFID_M,
                       HCLGE_MAC_EPORT_VFID_S, vport->vport_id);
-       hnae_set_field(req.egress_port, HCLGE_MAC_EPORT_PFID_M,
+       hnae_set_field(egress_port, HCLGE_MAC_EPORT_PFID_M,
                       HCLGE_MAC_EPORT_PFID_S, 0);
-       req.egress_port = cpu_to_le16(req.egress_port);
+
+       req.egress_port = cpu_to_le16(egress_port);
 
        hclge_prepare_mac_addr(&req, addr);
 
@@ -3454,7 +3638,7 @@ int hclge_rm_uc_addr_common(struct hclge_vport *vport,
                            const unsigned char *addr)
 {
        struct hclge_dev *hdev = vport->back;
-       struct hclge_mac_vlan_tbl_entry req;
+       struct hclge_mac_vlan_tbl_entry_cmd req;
        enum hclge_cmd_status status;
 
        /* mac addr check */
@@ -3488,7 +3672,7 @@ int hclge_add_mc_addr_common(struct hclge_vport *vport,
                             const unsigned char *addr)
 {
        struct hclge_dev *hdev = vport->back;
-       struct hclge_mac_vlan_tbl_entry req;
+       struct hclge_mac_vlan_tbl_entry_cmd req;
        struct hclge_desc desc[3];
        u16 tbl_idx;
        int status;
@@ -3539,7 +3723,7 @@ int hclge_rm_mc_addr_common(struct hclge_vport *vport,
                            const unsigned char *addr)
 {
        struct hclge_dev *hdev = vport->back;
-       struct hclge_mac_vlan_tbl_entry req;
+       struct hclge_mac_vlan_tbl_entry_cmd req;
        enum hclge_cmd_status status;
        struct hclge_desc desc[3];
        u16 tbl_idx;
@@ -3622,13 +3806,13 @@ static int hclge_set_mac_addr(struct hnae3_handle *handle, void *p)
 static int hclge_set_vlan_filter_ctrl(struct hclge_dev *hdev, u8 vlan_type,
                                      bool filter_en)
 {
-       struct hclge_vlan_filter_ctrl *req;
+       struct hclge_vlan_filter_ctrl_cmd *req;
        struct hclge_desc desc;
        int ret;
 
        hclge_cmd_setup_basic_desc(&desc, HCLGE_OPC_VLAN_FILTER_CTRL, false);
 
-       req = (struct hclge_vlan_filter_ctrl *)desc.data;
+       req = (struct hclge_vlan_filter_ctrl_cmd *)desc.data;
        req->vlan_type = vlan_type;
        req->vlan_fe = filter_en;
 
@@ -3646,8 +3830,8 @@ int hclge_set_vf_vlan_common(struct hclge_dev *hdev, int vfid,
                             bool is_kill, u16 vlan, u8 qos, __be16 proto)
 {
 #define HCLGE_MAX_VF_BYTES  16
-       struct hclge_vlan_filter_vf_cfg *req0;
-       struct hclge_vlan_filter_vf_cfg *req1;
+       struct hclge_vlan_filter_vf_cfg_cmd *req0;
+       struct hclge_vlan_filter_vf_cfg_cmd *req1;
        struct hclge_desc desc[2];
        u8 vf_byte_val;
        u8 vf_byte_off;
@@ -3663,10 +3847,10 @@ int hclge_set_vf_vlan_common(struct hclge_dev *hdev, int vfid,
        vf_byte_off = vfid / 8;
        vf_byte_val = 1 << (vfid % 8);
 
-       req0 = (struct hclge_vlan_filter_vf_cfg *)desc[0].data;
-       req1 = (struct hclge_vlan_filter_vf_cfg *)desc[1].data;
+       req0 = (struct hclge_vlan_filter_vf_cfg_cmd *)desc[0].data;
+       req1 = (struct hclge_vlan_filter_vf_cfg_cmd *)desc[1].data;
 
-       req0->vlan_id  = vlan;
+       req0->vlan_id  = cpu_to_le16(vlan);
        req0->vlan_cfg = is_kill;
 
        if (vf_byte_off < HCLGE_MAX_VF_BYTES)
@@ -3707,7 +3891,7 @@ static int hclge_set_port_vlan_filter(struct hnae3_handle *handle,
 {
        struct hclge_vport *vport = hclge_get_vport(handle);
        struct hclge_dev *hdev = vport->back;
-       struct hclge_vlan_filter_pf_cfg *req;
+       struct hclge_vlan_filter_pf_cfg_cmd *req;
        struct hclge_desc desc;
        u8 vlan_offset_byte_val;
        u8 vlan_offset_byte;
@@ -3720,7 +3904,7 @@ static int hclge_set_port_vlan_filter(struct hnae3_handle *handle,
        vlan_offset_byte = (vlan_id % 160) / 8;
        vlan_offset_byte_val = 1 << (vlan_id % 8);
 
-       req = (struct hclge_vlan_filter_pf_cfg *)desc.data;
+       req = (struct hclge_vlan_filter_pf_cfg_cmd *)desc.data;
        req->vlan_offset = vlan_offset_160;
        req->vlan_cfg = is_kill;
        req->vlan_offset_bitmap[vlan_offset_byte] = vlan_offset_byte_val;
@@ -3782,7 +3966,7 @@ static int hclge_init_vlan_config(struct hclge_dev *hdev)
 static int hclge_set_mtu(struct hnae3_handle *handle, int new_mtu)
 {
        struct hclge_vport *vport = hclge_get_vport(handle);
-       struct hclge_config_max_frm_size *req;
+       struct hclge_config_max_frm_size_cmd *req;
        struct hclge_dev *hdev = vport->back;
        struct hclge_desc desc;
        int ret;
@@ -3793,7 +3977,7 @@ static int hclge_set_mtu(struct hnae3_handle *handle, int new_mtu)
        hdev->mps = new_mtu;
        hclge_cmd_setup_basic_desc(&desc, HCLGE_OPC_CONFIG_MAX_FRM_SIZE, false);
 
-       req = (struct hclge_config_max_frm_size *)desc.data;
+       req = (struct hclge_config_max_frm_size_cmd *)desc.data;
        req->max_frm_size = cpu_to_le16(new_mtu);
 
        ret = hclge_cmd_send(&hdev->hw, &desc, 1);
@@ -3808,13 +3992,13 @@ static int hclge_set_mtu(struct hnae3_handle *handle, int new_mtu)
 static int hclge_send_reset_tqp_cmd(struct hclge_dev *hdev, u16 queue_id,
                                    bool enable)
 {
-       struct hclge_reset_tqp_queue *req;
+       struct hclge_reset_tqp_queue_cmd *req;
        struct hclge_desc desc;
        int ret;
 
        hclge_cmd_setup_basic_desc(&desc, HCLGE_OPC_RESET_TQP_QUEUE, false);
 
-       req = (struct hclge_reset_tqp_queue *)desc.data;
+       req = (struct hclge_reset_tqp_queue_cmd *)desc.data;
        req->tqp_id = cpu_to_le16(queue_id & HCLGE_RING_ID_MASK);
        hnae_set_bit(req->reset_req, HCLGE_TQP_RESET_B, enable);
 
@@ -3830,13 +4014,13 @@ static int hclge_send_reset_tqp_cmd(struct hclge_dev *hdev, u16 queue_id,
 
 static int hclge_get_reset_status(struct hclge_dev *hdev, u16 queue_id)
 {
-       struct hclge_reset_tqp_queue *req;
+       struct hclge_reset_tqp_queue_cmd *req;
        struct hclge_desc desc;
        int ret;
 
        hclge_cmd_setup_basic_desc(&desc, HCLGE_OPC_RESET_TQP_QUEUE, true);
 
-       req = (struct hclge_reset_tqp_queue *)desc.data;
+       req = (struct hclge_reset_tqp_queue_cmd *)desc.data;
        req->tqp_id = cpu_to_le16(queue_id & HCLGE_RING_ID_MASK);
 
        ret = hclge_cmd_send(&hdev->hw, &desc, 1);
@@ -4313,6 +4497,8 @@ static const struct hnae3_ae_ops hclge_ops = {
        .get_rss_indir_size = hclge_get_rss_indir_size,
        .get_rss = hclge_get_rss,
        .set_rss = hclge_set_rss,
+       .set_rss_tuple = hclge_set_rss_tuple,
+       .get_rss_tuple = hclge_get_rss_tuple,
        .get_tc_size = hclge_get_tc_size,
        .get_mac_addr = hclge_get_mac_addr,
        .set_mac_addr = hclge_set_mac_addr,
index 7c66c00..a7c018c 100644 (file)
@@ -32,7 +32,7 @@
 #define HCLGE_VECTOR_VF_OFFSET         0x100000
 
 #define HCLGE_RSS_IND_TBL_SIZE         512
-#define HCLGE_RSS_SET_BITMAP_MSK       0xffff
+#define HCLGE_RSS_SET_BITMAP_MSK       GENMASK(15, 0)
 #define HCLGE_RSS_KEY_SIZE             40
 #define HCLGE_RSS_HASH_ALGO_TOEPLITZ   0
 #define HCLGE_RSS_HASH_ALGO_SIMPLE     1
 #define HCLGE_RSS_CFG_TBL_NUM \
        (HCLGE_RSS_IND_TBL_SIZE / HCLGE_RSS_CFG_TBL_SIZE)
 
+#define HCLGE_RSS_INPUT_TUPLE_OTHER    GENMASK(3, 0)
+#define HCLGE_RSS_INPUT_TUPLE_SCTP     GENMASK(4, 0)
+#define HCLGE_D_PORT_BIT               BIT(0)
+#define HCLGE_S_PORT_BIT               BIT(1)
+#define HCLGE_D_IP_BIT                 BIT(2)
+#define HCLGE_S_IP_BIT                 BIT(3)
+#define HCLGE_V_TAG_BIT                        BIT(4)
+
 #define HCLGE_RSS_TC_SIZE_0            1
 #define HCLGE_RSS_TC_SIZE_1            2
 #define HCLGE_RSS_TC_SIZE_2            4
@@ -65,7 +73,7 @@
 #define HCLGE_PHY_CSS_REG              17
 
 #define HCLGE_PHY_MDIX_CTRL_S          (5)
-#define HCLGE_PHY_MDIX_CTRL_M          (3 << HCLGE_PHY_MDIX_CTRL_S)
+#define HCLGE_PHY_MDIX_CTRL_M          GENMASK(6, 5)
 
 #define HCLGE_PHY_MDIX_STATUS_B        (6)
 #define HCLGE_PHY_SPEED_DUP_RESOLVE_B  (11)
index 359ee67..1ae6eae 100644 (file)
@@ -283,6 +283,7 @@ static int hclge_tm_pg_shapping_cfg(struct hclge_dev *hdev,
        struct hclge_pg_shapping_cmd *shap_cfg_cmd;
        enum hclge_opcode_type opcode;
        struct hclge_desc desc;
+       u32 shapping_para = 0;
 
        opcode = bucket ? HCLGE_OPC_TM_PG_P_SHAPPING :
                HCLGE_OPC_TM_PG_C_SHAPPING;
@@ -292,11 +293,13 @@ static int hclge_tm_pg_shapping_cfg(struct hclge_dev *hdev,
 
        shap_cfg_cmd->pg_id = pg_id;
 
-       hclge_tm_set_field(shap_cfg_cmd->pg_shapping_para, IR_B, ir_b);
-       hclge_tm_set_field(shap_cfg_cmd->pg_shapping_para, IR_U, ir_u);
-       hclge_tm_set_field(shap_cfg_cmd->pg_shapping_para, IR_S, ir_s);
-       hclge_tm_set_field(shap_cfg_cmd->pg_shapping_para, BS_B, bs_b);
-       hclge_tm_set_field(shap_cfg_cmd->pg_shapping_para, BS_S, bs_s);
+       hclge_tm_set_field(shapping_para, IR_B, ir_b);
+       hclge_tm_set_field(shapping_para, IR_U, ir_u);
+       hclge_tm_set_field(shapping_para, IR_S, ir_s);
+       hclge_tm_set_field(shapping_para, BS_B, bs_b);
+       hclge_tm_set_field(shapping_para, BS_S, bs_s);
+
+       shap_cfg_cmd->pg_shapping_para = cpu_to_le32(shapping_para);
 
        return hclge_cmd_send(&hdev->hw, &desc, 1);
 }
@@ -337,6 +340,7 @@ static int hclge_tm_pri_shapping_cfg(struct hclge_dev *hdev,
        struct hclge_pri_shapping_cmd *shap_cfg_cmd;
        enum hclge_opcode_type opcode;
        struct hclge_desc desc;
+       u32 shapping_para = 0;
 
        opcode = bucket ? HCLGE_OPC_TM_PRI_P_SHAPPING :
                HCLGE_OPC_TM_PRI_C_SHAPPING;
@@ -347,11 +351,13 @@ static int hclge_tm_pri_shapping_cfg(struct hclge_dev *hdev,
 
        shap_cfg_cmd->pri_id = pri_id;
 
-       hclge_tm_set_field(shap_cfg_cmd->pri_shapping_para, IR_B, ir_b);
-       hclge_tm_set_field(shap_cfg_cmd->pri_shapping_para, IR_U, ir_u);
-       hclge_tm_set_field(shap_cfg_cmd->pri_shapping_para, IR_S, ir_s);
-       hclge_tm_set_field(shap_cfg_cmd->pri_shapping_para, BS_B, bs_b);
-       hclge_tm_set_field(shap_cfg_cmd->pri_shapping_para, BS_S, bs_s);
+       hclge_tm_set_field(shapping_para, IR_B, ir_b);
+       hclge_tm_set_field(shapping_para, IR_U, ir_u);
+       hclge_tm_set_field(shapping_para, IR_S, ir_s);
+       hclge_tm_set_field(shapping_para, BS_B, bs_b);
+       hclge_tm_set_field(shapping_para, BS_S, bs_s);
+
+       shap_cfg_cmd->pri_shapping_para = cpu_to_le32(shapping_para);
 
        return hclge_cmd_send(&hdev->hw, &desc, 1);
 }
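
These two shaper hunks apply the same pattern as the hclge_main.c changes above: the bit-fields are assembled in a host-order local (shapping_para) and the __le32 descriptor field is written once via cpu_to_le32(), instead of read-modify-writing the little-endian field directly. A standalone sketch of that build-locally, convert-on-store pattern (set_field() and the shift/width values are hypothetical stand-ins for hclge_tm_set_field() and its field definitions):

#include <stdint.h>
#include <stdio.h>

/* Place 'val' into bits [shift, shift + width) of *para, host byte order. */
static void set_field(uint32_t *para, unsigned int shift, unsigned int width,
                      uint32_t val)
{
        uint32_t mask = ((1u << width) - 1) << shift;

        *para = (*para & ~mask) | ((val << shift) & mask);
}

int main(void)
{
        uint32_t shapping_para = 0;     /* host-order scratch value */

        set_field(&shapping_para, 0, 8, 0x12);  /* e.g. an IR_B-like field */
        set_field(&shapping_para, 8, 4, 0x3);   /* e.g. an IR_U-like field */

        /* In the driver the single conversion happens on store:
         * shap_cfg_cmd->pg_shapping_para = cpu_to_le32(shapping_para);
         */
        printf("host-order value: 0x%08x\n", (unsigned int)shapping_para);
        return 0;
}
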
index 9832172..925619a 100644 (file)
@@ -13,8 +13,7 @@
 static
 int hns3_dcbnl_ieee_getets(struct net_device *ndev, struct ieee_ets *ets)
 {
-       struct hns3_nic_priv *priv = netdev_priv(ndev);
-       struct hnae3_handle *h = priv->ae_handle;
+       struct hnae3_handle *h = hns3_get_handle(ndev);
 
        if (h->kinfo.dcb_ops->ieee_getets)
                return h->kinfo.dcb_ops->ieee_getets(h, ets);
@@ -25,8 +24,7 @@ int hns3_dcbnl_ieee_getets(struct net_device *ndev, struct ieee_ets *ets)
 static
 int hns3_dcbnl_ieee_setets(struct net_device *ndev, struct ieee_ets *ets)
 {
-       struct hns3_nic_priv *priv = netdev_priv(ndev);
-       struct hnae3_handle *h = priv->ae_handle;
+       struct hnae3_handle *h = hns3_get_handle(ndev);
 
        if (h->kinfo.dcb_ops->ieee_setets)
                return h->kinfo.dcb_ops->ieee_setets(h, ets);
@@ -37,8 +35,7 @@ int hns3_dcbnl_ieee_setets(struct net_device *ndev, struct ieee_ets *ets)
 static
 int hns3_dcbnl_ieee_getpfc(struct net_device *ndev, struct ieee_pfc *pfc)
 {
-       struct hns3_nic_priv *priv = netdev_priv(ndev);
-       struct hnae3_handle *h = priv->ae_handle;
+       struct hnae3_handle *h = hns3_get_handle(ndev);
 
        if (h->kinfo.dcb_ops->ieee_getpfc)
                return h->kinfo.dcb_ops->ieee_getpfc(h, pfc);
@@ -49,8 +46,7 @@ int hns3_dcbnl_ieee_getpfc(struct net_device *ndev, struct ieee_pfc *pfc)
 static
 int hns3_dcbnl_ieee_setpfc(struct net_device *ndev, struct ieee_pfc *pfc)
 {
-       struct hns3_nic_priv *priv = netdev_priv(ndev);
-       struct hnae3_handle *h = priv->ae_handle;
+       struct hnae3_handle *h = hns3_get_handle(ndev);
 
        if (h->kinfo.dcb_ops->ieee_setpfc)
                return h->kinfo.dcb_ops->ieee_setpfc(h, pfc);
@@ -61,8 +57,7 @@ int hns3_dcbnl_ieee_setpfc(struct net_device *ndev, struct ieee_pfc *pfc)
 /* DCBX configuration */
 static u8 hns3_dcbnl_getdcbx(struct net_device *ndev)
 {
-       struct hns3_nic_priv *priv = netdev_priv(ndev);
-       struct hnae3_handle *h = priv->ae_handle;
+       struct hnae3_handle *h = hns3_get_handle(ndev);
 
        if (h->kinfo.dcb_ops->getdcbx)
                return h->kinfo.dcb_ops->getdcbx(h);
@@ -73,8 +68,7 @@ static u8 hns3_dcbnl_getdcbx(struct net_device *ndev)
 /* return 0 if successful, otherwise fail */
 static u8 hns3_dcbnl_setdcbx(struct net_device *ndev, u8 mode)
 {
-       struct hns3_nic_priv *priv = netdev_priv(ndev);
-       struct hnae3_handle *h = priv->ae_handle;
+       struct hnae3_handle *h = hns3_get_handle(ndev);
 
        if (h->kinfo.dcb_ops->setdcbx)
                return h->kinfo.dcb_ops->setdcbx(h, mode);
index c315065..ba550c1 100644 (file)
@@ -24,7 +24,7 @@
 #include "hnae3.h"
 #include "hns3_enet.h"
 
-const char hns3_driver_name[] = "hns3";
+static const char hns3_driver_name[] = "hns3";
 const char hns3_driver_version[] = VERMAGIC_STRING;
 static const char hns3_driver_string[] =
                        "Hisilicon Ethernet Network Driver for Hip08 Family";
@@ -198,8 +198,7 @@ static void hns3_vector_gl_rl_init(struct hns3_enet_tqp_vector *tqp_vector)
 
 static int hns3_nic_set_real_num_queue(struct net_device *netdev)
 {
-       struct hns3_nic_priv *priv = netdev_priv(netdev);
-       struct hnae3_handle *h = priv->ae_handle;
+       struct hnae3_handle *h = hns3_get_handle(netdev);
        struct hnae3_knic_private_info *kinfo = &h->kinfo;
        unsigned int queue_size = kinfo->rss_size * kinfo->num_tc;
        int ret;
@@ -305,24 +304,10 @@ static int hns3_nic_net_stop(struct net_device *netdev)
        return 0;
 }
 
-void hns3_set_multicast_list(struct net_device *netdev)
-{
-       struct hns3_nic_priv *priv = netdev_priv(netdev);
-       struct hnae3_handle *h = priv->ae_handle;
-       struct netdev_hw_addr *ha = NULL;
-
-       if (h->ae_algo->ops->set_mc_addr) {
-               netdev_for_each_mc_addr(ha, netdev)
-                       if (h->ae_algo->ops->set_mc_addr(h, ha->addr))
-                               netdev_err(netdev, "set multicast fail\n");
-       }
-}
-
 static int hns3_nic_uc_sync(struct net_device *netdev,
                            const unsigned char *addr)
 {
-       struct hns3_nic_priv *priv = netdev_priv(netdev);
-       struct hnae3_handle *h = priv->ae_handle;
+       struct hnae3_handle *h = hns3_get_handle(netdev);
 
        if (h->ae_algo->ops->add_uc_addr)
                return h->ae_algo->ops->add_uc_addr(h, addr);
@@ -333,8 +318,7 @@ static int hns3_nic_uc_sync(struct net_device *netdev,
 static int hns3_nic_uc_unsync(struct net_device *netdev,
                              const unsigned char *addr)
 {
-       struct hns3_nic_priv *priv = netdev_priv(netdev);
-       struct hnae3_handle *h = priv->ae_handle;
+       struct hnae3_handle *h = hns3_get_handle(netdev);
 
        if (h->ae_algo->ops->rm_uc_addr)
                return h->ae_algo->ops->rm_uc_addr(h, addr);
@@ -345,8 +329,7 @@ static int hns3_nic_uc_unsync(struct net_device *netdev,
 static int hns3_nic_mc_sync(struct net_device *netdev,
                            const unsigned char *addr)
 {
-       struct hns3_nic_priv *priv = netdev_priv(netdev);
-       struct hnae3_handle *h = priv->ae_handle;
+       struct hnae3_handle *h = hns3_get_handle(netdev);
 
        if (h->ae_algo->ops->add_mc_addr)
                return h->ae_algo->ops->add_mc_addr(h, addr);
@@ -357,8 +340,7 @@ static int hns3_nic_mc_sync(struct net_device *netdev,
 static int hns3_nic_mc_unsync(struct net_device *netdev,
                              const unsigned char *addr)
 {
-       struct hns3_nic_priv *priv = netdev_priv(netdev);
-       struct hnae3_handle *h = priv->ae_handle;
+       struct hnae3_handle *h = hns3_get_handle(netdev);
 
        if (h->ae_algo->ops->rm_mc_addr)
                return h->ae_algo->ops->rm_mc_addr(h, addr);
@@ -366,10 +348,9 @@ static int hns3_nic_mc_unsync(struct net_device *netdev,
        return 0;
 }
 
-void hns3_nic_set_rx_mode(struct net_device *netdev)
+static void hns3_nic_set_rx_mode(struct net_device *netdev)
 {
-       struct hns3_nic_priv *priv = netdev_priv(netdev);
-       struct hnae3_handle *h = priv->ae_handle;
+       struct hnae3_handle *h = hns3_get_handle(netdev);
 
        if (h->ae_algo->ops->set_promisc_mode) {
                if (netdev->flags & IFF_PROMISC)
@@ -768,7 +749,7 @@ static int hns3_fill_desc(struct hns3_enet_ring *ring, void *priv,
 
        if (type == DESC_TYPE_SKB) {
                skb = (struct sk_buff *)priv;
-               paylen = cpu_to_le16(skb->len);
+               paylen = skb->len;
 
                if (skb->ip_summed == CHECKSUM_PARTIAL) {
                        skb_reset_mac_len(skb);
@@ -802,7 +783,7 @@ static int hns3_fill_desc(struct hns3_enet_ring *ring, void *priv,
                        cpu_to_le32(ol_type_vlan_len_msec);
                desc->tx.type_cs_vlan_tso_len =
                        cpu_to_le32(type_cs_vlan_tso);
-               desc->tx.paylen = cpu_to_le16(paylen);
+               desc->tx.paylen = cpu_to_le32(paylen);
                desc->tx.mss = cpu_to_le16(mss);
        }
 
@@ -1025,8 +1006,7 @@ out_net_tx_busy:
 
 static int hns3_nic_net_set_mac_address(struct net_device *netdev, void *p)
 {
-       struct hns3_nic_priv *priv = netdev_priv(netdev);
-       struct hnae3_handle *h = priv->ae_handle;
+       struct hnae3_handle *h = hns3_get_handle(netdev);
        struct sockaddr *mac_addr = p;
        int ret;
 
@@ -1208,8 +1188,7 @@ static void hns3_nic_udp_tunnel_del(struct net_device *netdev,
 
 static int hns3_setup_tc(struct net_device *netdev, u8 tc)
 {
-       struct hns3_nic_priv *priv = netdev_priv(netdev);
-       struct hnae3_handle *h = priv->ae_handle;
+       struct hnae3_handle *h = hns3_get_handle(netdev);
        struct hnae3_knic_private_info *kinfo = &h->kinfo;
        unsigned int i;
        int ret;
@@ -1259,8 +1238,7 @@ static int hns3_nic_setup_tc(struct net_device *dev, enum tc_setup_type type,
 static int hns3_vlan_rx_add_vid(struct net_device *netdev,
                                __be16 proto, u16 vid)
 {
-       struct hns3_nic_priv *priv = netdev_priv(netdev);
-       struct hnae3_handle *h = priv->ae_handle;
+       struct hnae3_handle *h = hns3_get_handle(netdev);
        int ret = -EIO;
 
        if (h->ae_algo->ops->set_vlan_filter)
@@ -1272,8 +1250,7 @@ static int hns3_vlan_rx_add_vid(struct net_device *netdev,
 static int hns3_vlan_rx_kill_vid(struct net_device *netdev,
                                 __be16 proto, u16 vid)
 {
-       struct hns3_nic_priv *priv = netdev_priv(netdev);
-       struct hnae3_handle *h = priv->ae_handle;
+       struct hnae3_handle *h = hns3_get_handle(netdev);
        int ret = -EIO;
 
        if (h->ae_algo->ops->set_vlan_filter)
@@ -1285,8 +1262,7 @@ static int hns3_vlan_rx_kill_vid(struct net_device *netdev,
 static int hns3_ndo_set_vf_vlan(struct net_device *netdev, int vf, u16 vlan,
                                u8 qos, __be16 vlan_proto)
 {
-       struct hns3_nic_priv *priv = netdev_priv(netdev);
-       struct hnae3_handle *h = priv->ae_handle;
+       struct hnae3_handle *h = hns3_get_handle(netdev);
        int ret = -EIO;
 
        if (h->ae_algo->ops->set_vf_vlan_filter)
@@ -1298,8 +1274,7 @@ static int hns3_ndo_set_vf_vlan(struct net_device *netdev, int vf, u16 vlan,
 
 static int hns3_nic_change_mtu(struct net_device *netdev, int new_mtu)
 {
-       struct hns3_nic_priv *priv = netdev_priv(netdev);
-       struct hnae3_handle *h = priv->ae_handle;
+       struct hnae3_handle *h = hns3_get_handle(netdev);
        bool if_running = netif_running(netdev);
        int ret;
 
@@ -2609,7 +2584,7 @@ static void hns3_fini_ring(struct hns3_enet_ring *ring)
        ring->next_to_use = 0;
 }
 
-int hns3_buf_size2type(u32 buf_size)
+static int hns3_buf_size2type(u32 buf_size)
 {
        int bd_size_type;
 
@@ -2662,7 +2637,7 @@ static void hns3_init_ring_hw(struct hns3_enet_ring *ring)
        }
 }
 
-static int hns3_init_all_ring(struct hns3_nic_priv *priv)
+int hns3_init_all_ring(struct hns3_nic_priv *priv)
 {
        struct hnae3_handle *h = priv->ae_handle;
        int ring_num = h->kinfo.num_tqps * 2;
@@ -2686,12 +2661,12 @@ static int hns3_init_all_ring(struct hns3_nic_priv *priv)
 
 out_when_alloc_ring_memory:
        for (j = i - 1; j >= 0; j--)
-               hns3_fini_ring(priv->ring_data[i].ring);
+               hns3_fini_ring(priv->ring_data[j].ring);
 
        return -ENOMEM;
 }
 
-static int hns3_uninit_all_ring(struct hns3_nic_priv *priv)
+int hns3_uninit_all_ring(struct hns3_nic_priv *priv)
 {
        struct hnae3_handle *h = priv->ae_handle;
        int i;
@@ -2921,7 +2896,7 @@ err_out:
        return ret;
 }
 
-const struct hnae3_client_ops client_ops = {
+static const struct hnae3_client_ops client_ops = {
        .init_instance = hns3_client_init,
        .uninit_instance = hns3_client_uninit,
        .link_status_change = hns3_link_status_change,
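
Besides the hns3_get_handle() conversions, one hunk above fixes the error unwind in hns3_init_all_ring(): when ring i fails to set up, the cleanup loop now walks back over j = i - 1 .. 0, where the old code freed ring i repeatedly. A standalone sketch of that unwind pattern (plain malloc/free stand-ins, not the ring code):

#include <stdio.h>
#include <stdlib.h>

/* Initialise n objects; on failure, release only the ones already set up. */
static int init_all(void **objs, int n)
{
        int i, j;

        for (i = 0; i < n; i++) {
                objs[i] = malloc(16);
                if (!objs[i])
                        goto unwind;
        }
        return 0;

unwind:
        for (j = i - 1; j >= 0; j--) {  /* walk back over what succeeded */
                free(objs[j]);
                objs[j] = NULL;
        }
        return -1;
}

int main(void)
{
        void *objs[4] = { NULL };
        int k;

        if (!init_all(objs, 4))
                for (k = 0; k < 4; k++)
                        free(objs[k]);
        return 0;
}
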
index 481eada..6659989 100644 (file)
@@ -76,6 +76,8 @@ enum hns3_nic_state {
 #define HNS3_RING_NAME_LEN                     16
 #define HNS3_BUFFER_SIZE_2048                  2048
 #define HNS3_RING_MAX_PENDING                  32768
+#define HNS3_RING_MIN_PENDING                  8
+#define HNS3_RING_BD_MULTIPLE                  8
 #define HNS3_MAX_MTU                           9728
 
 #define HNS3_BD_SIZE_512_TYPE                  0
@@ -587,9 +589,14 @@ static inline void hns3_write_reg(void __iomem *base, u32 reg, u32 value)
 #define hns3_for_each_ring(pos, head) \
        for (pos = (head).ring; pos; pos = pos->next)
 
+#define hns3_get_handle(ndev) \
+       (((struct hns3_nic_priv *)netdev_priv(ndev))->ae_handle)
+
 void hns3_ethtool_set_ops(struct net_device *netdev);
 
 int hns3_clean_tx_ring(struct hns3_enet_ring *ring, int budget);
+int hns3_init_all_ring(struct hns3_nic_priv *priv);
+int hns3_uninit_all_ring(struct hns3_nic_priv *priv);
 
 #ifdef CONFIG_HNS3_DCB
 void hns3_dcbnl_setup(struct hnae3_handle *handle);
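
Most of the churn in hns3_enet.c, hns3_dcbnl.c and hns3_ethtool.c is this macro replacing the repeated two-step netdev_priv()-then-ae_handle dereference. A standalone sketch of the accessor-macro pattern, with hypothetical stand-in types rather than the driver's structures:

#include <stdio.h>

struct handle { int id; };
struct priv   { struct handle *ae_handle; };

/* One macro hides the cast plus member access, like hns3_get_handle(). */
#define get_handle(p) (((struct priv *)(p))->ae_handle)

int main(void)
{
        struct handle h = { .id = 42 };
        struct priv pr = { .ae_handle = &h };
        void *opaque = &pr;             /* plays the role of netdev_priv() */

        printf("handle id = %d\n", get_handle(opaque)->id);
        return 0;
}
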
index d636399..9b36ce0 100644 (file)
@@ -102,8 +102,7 @@ static void hns3_driv_to_eth_caps(u32 caps, struct ethtool_link_ksettings *cmd,
 
 static int hns3_get_sset_count(struct net_device *netdev, int stringset)
 {
-       struct hns3_nic_priv *priv = netdev_priv(netdev);
-       struct hnae3_handle *h = priv->ae_handle;
+       struct hnae3_handle *h = hns3_get_handle(netdev);
        const struct hnae3_ae_ops *ops = h->ae_algo->ops;
 
        if (!ops->get_sset_count)
@@ -164,8 +163,7 @@ static u8 *hns3_get_strings_tqps(struct hnae3_handle *handle, u8 *data)
 
 static void hns3_get_strings(struct net_device *netdev, u32 stringset, u8 *data)
 {
-       struct hns3_nic_priv *priv = netdev_priv(netdev);
-       struct hnae3_handle *h = priv->ae_handle;
+       struct hnae3_handle *h = hns3_get_handle(netdev);
        const struct hnae3_ae_ops *ops = h->ae_algo->ops;
        char *buff = (char *)data;
 
@@ -217,11 +215,10 @@ static u64 *hns3_get_stats_tqps(struct hnae3_handle *handle, u64 *data)
  * @stats: statistics info.
  * @data: statistics data.
  */
-void hns3_get_stats(struct net_device *netdev, struct ethtool_stats *stats,
-                   u64 *data)
+static void hns3_get_stats(struct net_device *netdev,
+                          struct ethtool_stats *stats, u64 *data)
 {
-       struct hns3_nic_priv *priv = netdev_priv(netdev);
-       struct hnae3_handle *h = priv->ae_handle;
+       struct hnae3_handle *h = hns3_get_handle(netdev);
        u64 *p = data;
 
        if (!h->ae_algo->ops->get_stats || !h->ae_algo->ops->update_stats) {
@@ -262,10 +259,7 @@ static void hns3_get_drvinfo(struct net_device *netdev,
 
 static u32 hns3_get_link(struct net_device *netdev)
 {
-       struct hns3_nic_priv *priv = netdev_priv(netdev);
-       struct hnae3_handle *h;
-
-       h = priv->ae_handle;
+       struct hnae3_handle *h = hns3_get_handle(netdev);
 
        if (h->ae_algo && h->ae_algo->ops && h->ae_algo->ops->get_status)
                return h->ae_algo->ops->get_status(h);
@@ -277,7 +271,8 @@ static void hns3_get_ringparam(struct net_device *netdev,
                               struct ethtool_ringparam *param)
 {
        struct hns3_nic_priv *priv = netdev_priv(netdev);
-       int queue_num = priv->ae_handle->kinfo.num_tqps;
+       struct hnae3_handle *h = priv->ae_handle;
+       int queue_num = h->kinfo.num_tqps;
 
        param->tx_max_pending = HNS3_RING_MAX_PENDING;
        param->rx_max_pending = HNS3_RING_MAX_PENDING;
@@ -289,8 +284,7 @@ static void hns3_get_ringparam(struct net_device *netdev,
 static void hns3_get_pauseparam(struct net_device *netdev,
                                struct ethtool_pauseparam *param)
 {
-       struct hns3_nic_priv *priv = netdev_priv(netdev);
-       struct hnae3_handle *h = priv->ae_handle;
+       struct hnae3_handle *h = hns3_get_handle(netdev);
 
        if (h->ae_algo && h->ae_algo->ops && h->ae_algo->ops->get_pauseparam)
                h->ae_algo->ops->get_pauseparam(h, &param->autoneg,
@@ -300,8 +294,7 @@ static void hns3_get_pauseparam(struct net_device *netdev,
 static int hns3_get_link_ksettings(struct net_device *netdev,
                                   struct ethtool_link_ksettings *cmd)
 {
-       struct hns3_nic_priv *priv = netdev_priv(netdev);
-       struct hnae3_handle *h = priv->ae_handle;
+       struct hnae3_handle *h = hns3_get_handle(netdev);
        u32 supported_caps;
        u32 advertised_caps;
        u8 media_type = HNAE3_MEDIA_TYPE_UNKNOWN;
@@ -392,8 +385,7 @@ static int hns3_get_link_ksettings(struct net_device *netdev,
 
 static u32 hns3_get_rss_key_size(struct net_device *netdev)
 {
-       struct hns3_nic_priv *priv = netdev_priv(netdev);
-       struct hnae3_handle *h = priv->ae_handle;
+       struct hnae3_handle *h = hns3_get_handle(netdev);
 
        if (!h->ae_algo || !h->ae_algo->ops ||
            !h->ae_algo->ops->get_rss_key_size)
@@ -404,8 +396,7 @@ static u32 hns3_get_rss_key_size(struct net_device *netdev)
 
 static u32 hns3_get_rss_indir_size(struct net_device *netdev)
 {
-       struct hns3_nic_priv *priv = netdev_priv(netdev);
-       struct hnae3_handle *h = priv->ae_handle;
+       struct hnae3_handle *h = hns3_get_handle(netdev);
 
        if (!h->ae_algo || !h->ae_algo->ops ||
            !h->ae_algo->ops->get_rss_indir_size)
@@ -417,8 +408,7 @@ static u32 hns3_get_rss_indir_size(struct net_device *netdev)
 static int hns3_get_rss(struct net_device *netdev, u32 *indir, u8 *key,
                        u8 *hfunc)
 {
-       struct hns3_nic_priv *priv = netdev_priv(netdev);
-       struct hnae3_handle *h = priv->ae_handle;
+       struct hnae3_handle *h = hns3_get_handle(netdev);
 
        if (!h->ae_algo || !h->ae_algo->ops || !h->ae_algo->ops->get_rss)
                return -EOPNOTSUPP;
@@ -429,8 +419,7 @@ static int hns3_get_rss(struct net_device *netdev, u32 *indir, u8 *key,
 static int hns3_set_rss(struct net_device *netdev, const u32 *indir,
                        const u8 *key, const u8 hfunc)
 {
-       struct hns3_nic_priv *priv = netdev_priv(netdev);
-       struct hnae3_handle *h = priv->ae_handle;
+       struct hnae3_handle *h = hns3_get_handle(netdev);
 
        if (!h->ae_algo || !h->ae_algo->ops || !h->ae_algo->ops->set_rss)
                return -EOPNOTSUPP;
@@ -454,16 +443,17 @@ static int hns3_get_rxnfc(struct net_device *netdev,
                          struct ethtool_rxnfc *cmd,
                          u32 *rule_locs)
 {
-       struct hns3_nic_priv *priv = netdev_priv(netdev);
-       struct hnae3_handle *h = priv->ae_handle;
+       struct hnae3_handle *h = hns3_get_handle(netdev);
 
        if (!h->ae_algo || !h->ae_algo->ops || !h->ae_algo->ops->get_tc_size)
                return -EOPNOTSUPP;
 
        switch (cmd->cmd) {
        case ETHTOOL_GRXRINGS:
-               cmd->data = h->ae_algo->ops->get_tc_size(h);
+               cmd->data = h->kinfo.num_tc * h->kinfo.rss_size;
                break;
+       case ETHTOOL_GRXFH:
+               return h->ae_algo->ops->get_rss_tuple(h, cmd);
        default:
                return -EOPNOTSUPP;
        }
@@ -471,15 +461,106 @@ static int hns3_get_rxnfc(struct net_device *netdev,
        return 0;
 }
 
+int hns3_change_all_ring_bd_num(struct hns3_nic_priv *priv, u32 new_desc_num)
+{
+       struct hnae3_handle *h = priv->ae_handle;
+       int i;
+
+       h->kinfo.num_desc = new_desc_num;
+
+       for (i = 0; i < h->kinfo.num_tqps * 2; i++)
+               priv->ring_data[i].ring->desc_num = new_desc_num;
+
+       return hns3_init_all_ring(priv);
+}
+
+int hns3_set_ringparam(struct net_device *ndev, struct ethtool_ringparam *param)
+{
+       struct hns3_nic_priv *priv = netdev_priv(ndev);
+       struct hnae3_handle *h = priv->ae_handle;
+       bool if_running = netif_running(ndev);
+       u32 old_desc_num, new_desc_num;
+       int ret;
+
+       if (param->rx_mini_pending || param->rx_jumbo_pending)
+               return -EINVAL;
+
+       if (param->tx_pending != param->rx_pending) {
+               netdev_err(ndev,
+                          "Descriptors of tx and rx must be equal\n");
+               return -EINVAL;
+       }
+
+       if (param->tx_pending > HNS3_RING_MAX_PENDING ||
+           param->tx_pending < HNS3_RING_MIN_PENDING) {
+               netdev_err(ndev,
+                          "Descriptors requested (Tx/Rx: %d) out of range [%d-%d]\n",
+                          param->tx_pending, HNS3_RING_MIN_PENDING,
+                          HNS3_RING_MAX_PENDING);
+               return -EINVAL;
+       }
+
+       new_desc_num = param->tx_pending;
+
+       /* Hardware requires the descriptor count to be a multiple of eight */
+       new_desc_num = ALIGN(new_desc_num, HNS3_RING_BD_MULTIPLE);
+       old_desc_num = h->kinfo.num_desc;
+       if (old_desc_num == new_desc_num)
+               return 0;
+
+       netdev_info(ndev,
+                   "Changing descriptor count from %d to %d.\n",
+                   old_desc_num, new_desc_num);
+
+       if (if_running)
+               dev_close(ndev);
+
+       ret = hns3_uninit_all_ring(priv);
+       if (ret)
+               return ret;
+
+       ret = hns3_change_all_ring_bd_num(priv, new_desc_num);
+       if (ret) {
+               ret = hns3_change_all_ring_bd_num(priv, old_desc_num);
+               if (ret) {
+                       netdev_err(ndev,
+                                  "Reverting to old bd num failed, ret=%d.\n", ret);
+                       return ret;
+               }
+       }
+
+       if (if_running)
+               ret = dev_open(ndev);
+
+       return ret;
+}
+
+static int hns3_set_rxnfc(struct net_device *netdev, struct ethtool_rxnfc *cmd)
+{
+       struct hnae3_handle *h = hns3_get_handle(netdev);
+
+       if (!h->ae_algo || !h->ae_algo->ops || !h->ae_algo->ops->set_rss_tuple)
+               return -EOPNOTSUPP;
+
+       switch (cmd->cmd) {
+       case ETHTOOL_SRXFH:
+               return h->ae_algo->ops->set_rss_tuple(h, cmd);
+       default:
+               return -EOPNOTSUPP;
+       }
+}
+
 static const struct ethtool_ops hns3_ethtool_ops = {
        .get_drvinfo = hns3_get_drvinfo,
        .get_link = hns3_get_link,
        .get_ringparam = hns3_get_ringparam,
+       .set_ringparam = hns3_set_ringparam,
        .get_pauseparam = hns3_get_pauseparam,
        .get_strings = hns3_get_strings,
        .get_ethtool_stats = hns3_get_stats,
        .get_sset_count = hns3_get_sset_count,
        .get_rxnfc = hns3_get_rxnfc,
+       .set_rxnfc = hns3_set_rxnfc,
        .get_rxfh_key_size = hns3_get_rss_key_size,
        .get_rxfh_indir_size = hns3_get_rss_indir_size,
        .get_rxfh = hns3_get_rss,
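
hns3_set_ringparam() above validates the requested descriptor count, rounds it up to a multiple of HNS3_RING_BD_MULTIPLE with ALIGN(), and only then tears down and rebuilds the rings (reverting to the old count if the rebuild fails). A standalone sketch of the rounding step (align_up() is a hypothetical helper that assumes a power-of-two multiple, like the kernel's ALIGN() macro):

#include <stdio.h>
#include <stdint.h>

/* Round v up to the next multiple of a (a must be a power of two). */
static uint32_t align_up(uint32_t v, uint32_t a)
{
        return (v + a - 1) & ~(a - 1);
}

int main(void)
{
        printf("%u -> %u\n", 1000u, (unsigned int)align_up(1000, 8)); /* 1000 */
        printf("%u -> %u\n", 1021u, (unsigned int)align_up(1021, 8)); /* 1024 */
        return 0;
}
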
index 0641c00..afb7ebe 100644 (file)
 #define E1000_ICR_LSC           0x00000004 /* Link Status Change */
 #define E1000_ICR_RXSEQ         0x00000008 /* Rx sequence error */
 #define E1000_ICR_RXDMT0        0x00000010 /* Rx desc min. threshold (0) */
+#define E1000_ICR_RXO           0x00000040 /* Receiver Overrun */
 #define E1000_ICR_RXT0          0x00000080 /* Rx timer intr (ring 0) */
 #define E1000_ICR_ECCER         0x00400000 /* Uncorrectable ECC Error */
 /* If this bit asserted, the driver should claim the interrupt */
index 98e6888..2311b31 100644 (file)
@@ -94,10 +94,6 @@ struct e1000_info;
  */
 #define E1000_CHECK_RESET_COUNT                25
 
-#define DEFAULT_RDTR                   0
-#define DEFAULT_RADV                   8
-#define BURST_RDTR                     0x20
-#define BURST_RADV                     0x20
 #define PCICFG_DESC_RING_STATUS                0xe4
 #define FLUSH_DESC_REQUIRED            0x100
 
index b322011..f457c57 100644 (file)
@@ -410,6 +410,9 @@ void e1000e_clear_hw_cntrs_base(struct e1000_hw *hw)
 *  Checks to see if the link status of the hardware has changed.  If a
  *  change in link status has been detected, then we read the PHY registers
  *  to get the current speed/duplex if link exists.
+ *
+ *  Returns a negative error code (-E1000_ERR_*) or 0 (link down) or 1 (link
+ *  up).
  **/
 s32 e1000e_check_for_copper_link(struct e1000_hw *hw)
 {
@@ -423,7 +426,7 @@ s32 e1000e_check_for_copper_link(struct e1000_hw *hw)
         * Change or Rx Sequence Error interrupt.
         */
        if (!mac->get_link_status)
-               return 0;
+               return 1;
 
        /* First we want to see if the MII Status Register reports
         * link.  If so, then we want to get the current speed/duplex
@@ -461,10 +464,12 @@ s32 e1000e_check_for_copper_link(struct e1000_hw *hw)
         * different link partner.
         */
        ret_val = e1000e_config_fc_after_link_up(hw);
-       if (ret_val)
+       if (ret_val) {
                e_dbg("Error configuring flow control\n");
+               return ret_val;
+       }
 
-       return ret_val;
+       return 1;
 }
 
 /**
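
With this hunk e1000e_check_for_copper_link() gains a three-way contract: a negative -E1000_ERR_* code on failure, 0 when there is nothing to do or link is down, and 1 when link is up; e1000e_has_link() in netdev.c correspondingly switches to testing ret_val > 0. A standalone sketch of interpreting such a return value (the error constant and check_for_link() body are hypothetical stand-ins):

#include <stdio.h>
#include <stdbool.h>

#define DEMO_ERR_PHY 2                  /* stand-in for an E1000_ERR_* value */

/* Pretend link check: < 0 on error, 0 for link down, 1 for link up. */
static int check_for_link(bool phy_ok, bool link_up)
{
        if (!phy_ok)
                return -DEMO_ERR_PHY;
        return link_up ? 1 : 0;
}

int main(void)
{
        int ret = check_for_link(true, true);

        if (ret < 0)
                printf("link check failed: %d\n", ret);
        else
                printf("link_active = %d\n", ret > 0);  /* as e1000e_has_link() */
        return 0;
}
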
index 8436c5f..bf8f38f 100644 (file)
@@ -1071,7 +1071,8 @@ next_desc:
 }
 
 static void e1000_put_txbuf(struct e1000_ring *tx_ring,
-                           struct e1000_buffer *buffer_info)
+                           struct e1000_buffer *buffer_info,
+                           bool drop)
 {
        struct e1000_adapter *adapter = tx_ring->adapter;
 
@@ -1085,7 +1086,10 @@ static void e1000_put_txbuf(struct e1000_ring *tx_ring,
                buffer_info->dma = 0;
        }
        if (buffer_info->skb) {
-               dev_kfree_skb_any(buffer_info->skb);
+               if (drop)
+                       dev_kfree_skb_any(buffer_info->skb);
+               else
+                       dev_consume_skb_any(buffer_info->skb);
                buffer_info->skb = NULL;
        }
        buffer_info->time_stamp = 0;
@@ -1199,7 +1203,7 @@ static void e1000e_tx_hwtstamp_work(struct work_struct *work)
                wmb(); /* force write prior to skb_tstamp_tx */
 
                skb_tstamp_tx(skb, &shhwtstamps);
-               dev_kfree_skb_any(skb);
+               dev_consume_skb_any(skb);
        } else if (time_after(jiffies, adapter->tx_hwtstamp_start
                              + adapter->tx_timeout_factor * HZ)) {
                dev_kfree_skb_any(adapter->tx_hwtstamp_skb);
@@ -1254,7 +1258,7 @@ static bool e1000_clean_tx_irq(struct e1000_ring *tx_ring)
                                }
                        }
 
-                       e1000_put_txbuf(tx_ring, buffer_info);
+                       e1000_put_txbuf(tx_ring, buffer_info, false);
                        tx_desc->upper.data = 0;
 
                        i++;
@@ -1910,14 +1914,30 @@ static irqreturn_t e1000_msix_other(int __always_unused irq, void *data)
        struct net_device *netdev = data;
        struct e1000_adapter *adapter = netdev_priv(netdev);
        struct e1000_hw *hw = &adapter->hw;
+       u32 icr;
+       bool enable = true;
+
+       icr = er32(ICR);
+       if (icr & E1000_ICR_RXO) {
+               ew32(ICR, E1000_ICR_RXO);
+               enable = false;
+               /* napi poll will re-enable Other, make sure it runs */
+               if (napi_schedule_prep(&adapter->napi)) {
+                       adapter->total_rx_bytes = 0;
+                       adapter->total_rx_packets = 0;
+                       __napi_schedule(&adapter->napi);
+               }
+       }
+       if (icr & E1000_ICR_LSC) {
+               ew32(ICR, E1000_ICR_LSC);
+               hw->mac.get_link_status = true;
+               /* guard against interrupt when we're going down */
+               if (!test_bit(__E1000_DOWN, &adapter->state))
+                       mod_timer(&adapter->watchdog_timer, jiffies + 1);
+       }
 
-       hw->mac.get_link_status = true;
-
-       /* guard against interrupt when we're going down */
-       if (!test_bit(__E1000_DOWN, &adapter->state)) {
-               mod_timer(&adapter->watchdog_timer, jiffies + 1);
+       if (enable && !test_bit(__E1000_DOWN, &adapter->state))
                ew32(IMS, E1000_IMS_OTHER);
-       }
 
        return IRQ_HANDLED;
 }
@@ -2421,7 +2441,7 @@ static void e1000_clean_tx_ring(struct e1000_ring *tx_ring)
 
        for (i = 0; i < tx_ring->count; i++) {
                buffer_info = &tx_ring->buffer_info[i];
-               e1000_put_txbuf(tx_ring, buffer_info);
+               e1000_put_txbuf(tx_ring, buffer_info, false);
        }
 
        netdev_reset_queue(adapter->netdev);
@@ -2687,7 +2707,8 @@ static int e1000e_poll(struct napi_struct *napi, int weight)
                napi_complete_done(napi, work_done);
                if (!test_bit(__E1000_DOWN, &adapter->state)) {
                        if (adapter->msix_entries)
-                               ew32(IMS, adapter->rx_ring->ims_val);
+                               ew32(IMS, adapter->rx_ring->ims_val |
+                                    E1000_IMS_OTHER);
                        else
                                e1000_irq_enable(adapter);
                }
@@ -3004,8 +3025,8 @@ static void e1000_configure_tx(struct e1000_adapter *adapter)
 
        hw->mac.ops.config_collision_dist(hw);
 
-       /* SPT and CNP Si errata workaround to avoid data corruption */
-       if (hw->mac.type >= e1000_pch_spt) {
+       /* SPT and KBL Si errata workaround to avoid data corruption */
+       if (hw->mac.type == e1000_pch_spt) {
                u32 reg_val;
 
                reg_val = er32(IOSFPC);
@@ -3013,7 +3034,9 @@ static void e1000_configure_tx(struct e1000_adapter *adapter)
                ew32(IOSFPC, reg_val);
 
                reg_val = er32(TARC(0));
-               reg_val |= E1000_TARC0_CB_MULTIQ_3_REQ;
+               /* SPT and KBL Si errata workaround to avoid Tx hang */
+               reg_val &= ~BIT(28);
+               reg_val |= BIT(29);
                ew32(TARC(0), reg_val);
        }
 }
@@ -3223,14 +3246,6 @@ static void e1000_configure_rx(struct e1000_adapter *adapter)
                 */
                ew32(RXDCTL(0), E1000_RXDCTL_DMA_BURST_ENABLE);
                ew32(RXDCTL(1), E1000_RXDCTL_DMA_BURST_ENABLE);
-
-               /* override the delay timers for enabling bursting, only if
-                * the value was not set by the user via module options
-                */
-               if (adapter->rx_int_delay == DEFAULT_RDTR)
-                       adapter->rx_int_delay = BURST_RDTR;
-               if (adapter->rx_abs_int_delay == DEFAULT_RADV)
-                       adapter->rx_abs_int_delay = BURST_RADV;
        }
 
        /* set the Receive Delay Timer Register */
@@ -4204,7 +4219,7 @@ static void e1000e_trigger_lsc(struct e1000_adapter *adapter)
        struct e1000_hw *hw = &adapter->hw;
 
        if (adapter->msix_entries)
-               ew32(ICS, E1000_ICS_OTHER);
+               ew32(ICS, E1000_ICS_LSC | E1000_ICS_OTHER);
        else
                ew32(ICS, E1000_ICS_LSC);
 }
@@ -5074,14 +5089,14 @@ static bool e1000e_has_link(struct e1000_adapter *adapter)
 
        /* get_link_status is set on LSC (link status) interrupt or
         * Rx sequence error interrupt.  get_link_status will stay
-        * false until the check_for_link establishes link
+        * true until the check_for_link establishes link
         * for copper adapters ONLY
         */
        switch (hw->phy.media_type) {
        case e1000_media_type_copper:
                if (hw->mac.get_link_status) {
                        ret_val = hw->mac.ops.check_for_link(hw);
-                       link_active = !hw->mac.get_link_status;
+                       link_active = ret_val > 0;
                } else {
                        link_active = true;
                }
@@ -5092,14 +5107,14 @@ static bool e1000e_has_link(struct e1000_adapter *adapter)
                break;
        case e1000_media_type_internal_serdes:
                ret_val = hw->mac.ops.check_for_link(hw);
-               link_active = adapter->hw.mac.serdes_has_link;
+               link_active = hw->mac.serdes_has_link;
                break;
        default:
        case e1000_media_type_unknown:
                break;
        }
 
-       if ((ret_val == E1000_ERR_PHY) && (hw->phy.type == e1000_phy_igp_3) &&
+       if ((ret_val == -E1000_ERR_PHY) && (hw->phy.type == e1000_phy_igp_3) &&
            (er32(CTRL) & E1000_PHY_CTRL_GBE_DISABLE)) {
                /* See e1000_kmrn_lock_loss_workaround_ich8lan() */
                e_info("Gigabit has been disabled, downgrading speed\n");
@@ -5614,7 +5629,7 @@ dma_error:
                        i += tx_ring->count;
                i--;
                buffer_info = &tx_ring->buffer_info[i];
-               e1000_put_txbuf(tx_ring, buffer_info);
+               e1000_put_txbuf(tx_ring, buffer_info, true);
        }
 
        return 0;
@@ -7408,7 +7423,7 @@ static void e1000_remove(struct pci_dev *pdev)
        if (adapter->flags & FLAG_HAS_HW_TIMESTAMP) {
                cancel_work_sync(&adapter->tx_hwtstamp_work);
                if (adapter->tx_hwtstamp_skb) {
-                       dev_kfree_skb_any(adapter->tx_hwtstamp_skb);
+                       dev_consume_skb_any(adapter->tx_hwtstamp_skb);
                        adapter->tx_hwtstamp_skb = NULL;
                }
        }
index 6d8c39a..47da518 100644 (file)
@@ -73,17 +73,25 @@ E1000_PARAM(TxAbsIntDelay, "Transmit Absolute Interrupt Delay");
 /* Receive Interrupt Delay in units of 1.024 microseconds
  * hardware will likely hang if you set this to anything but zero.
  *
+ * Burst variant is used as default if device has FLAG2_DMA_BURST.
+ *
  * Valid Range: 0-65535
  */
 E1000_PARAM(RxIntDelay, "Receive Interrupt Delay");
+#define DEFAULT_RDTR   0
+#define BURST_RDTR     0x20
 #define MAX_RXDELAY 0xFFFF
 #define MIN_RXDELAY 0
 
 /* Receive Absolute Interrupt Delay in units of 1.024 microseconds
  *
+ * Burst variant is used as default if device has FLAG2_DMA_BURST.
+ *
  * Valid Range: 0-65535
  */
 E1000_PARAM(RxAbsIntDelay, "Receive Absolute Interrupt Delay");
+#define DEFAULT_RADV   8
+#define BURST_RADV     0x20
 #define MAX_RXABSDELAY 0xFFFF
 #define MIN_RXABSDELAY 0
 
@@ -297,6 +305,9 @@ void e1000e_check_options(struct e1000_adapter *adapter)
                                         .max = MAX_RXDELAY } }
                };
 
+               if (adapter->flags2 & FLAG2_DMA_BURST)
+                       opt.def = BURST_RDTR;
+
                if (num_RxIntDelay > bd) {
                        adapter->rx_int_delay = RxIntDelay[bd];
                        e1000_validate_option(&adapter->rx_int_delay, &opt,
@@ -307,7 +318,7 @@ void e1000e_check_options(struct e1000_adapter *adapter)
        }
        /* Receive Absolute Interrupt Delay */
        {
-               static const struct e1000_option opt = {
+               static struct e1000_option opt = {
                        .type = range_option,
                        .name = "Receive Absolute Interrupt Delay",
                        .err  = "using default of "
@@ -317,6 +328,9 @@ void e1000e_check_options(struct e1000_adapter *adapter)
                                         .max = MAX_RXABSDELAY } }
                };
 
+               if (adapter->flags2 & FLAG2_DMA_BURST)
+                       opt.def = BURST_RADV;
+
                if (num_RxAbsIntDelay > bd) {
                        adapter->rx_abs_int_delay = RxAbsIntDelay[bd];
                        e1000_validate_option(&adapter->rx_abs_int_delay, &opt,
index d78d47b..86ff096 100644 (file)
@@ -1744,6 +1744,7 @@ s32 e1000e_phy_has_link_generic(struct e1000_hw *hw, u32 iterations,
        s32 ret_val = 0;
        u16 i, phy_status;
 
+       *success = false;
        for (i = 0; i < iterations; i++) {
                /* Some PHYs require the MII_BMSR register to be read
                 * twice due to the link bit being sticky.  No harm doing
@@ -1763,16 +1764,16 @@ s32 e1000e_phy_has_link_generic(struct e1000_hw *hw, u32 iterations,
                ret_val = e1e_rphy(hw, MII_BMSR, &phy_status);
                if (ret_val)
                        break;
-               if (phy_status & BMSR_LSTATUS)
+               if (phy_status & BMSR_LSTATUS) {
+                       *success = true;
                        break;
+               }
                if (usec_interval >= 1000)
                        msleep(usec_interval / 1000);
                else
                        udelay(usec_interval);
        }
 
-       *success = (i < iterations);
-
        return ret_val;
 }
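
The phy.c hunk initialises *success to false up front and sets it to true only when BMSR actually reports link, instead of inferring success from the loop counter after the fact. A standalone sketch of that output-parameter pattern (the polling body is a stand-in, not the PHY access):

#include <stdio.h>
#include <stdbool.h>

/* Poll up to 'iterations' times; report link via *success, errors via return. */
static int wait_for_link(int iterations, bool *success)
{
        int i;

        *success = false;               /* pessimistic default, as in the hunk */

        for (i = 0; i < iterations; i++) {
                bool link_up = (i == 2);        /* stand-in for reading BMSR */

                if (link_up) {
                        *success = true;
                        break;
                }
        }
        return 0;
}

int main(void)
{
        bool up;

        wait_for_link(5, &up);
        printf("link %s\n", up ? "up" : "down");
        return 0;
}
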
 
index 439c63c..8139b4e 100644 (file)
@@ -350,7 +350,7 @@ struct i40e_pf {
        u16 num_vmdq_vsis;         /* num vmdq vsis this PF has set up */
        u16 num_vmdq_qps;          /* num queue pairs per vmdq pool */
        u16 num_vmdq_msix;         /* num queue vectors per vmdq pool */
-       u16 num_req_vfs;           /* num VFs requested for this VF */
+       u16 num_req_vfs;           /* num VFs requested for this PF */
        u16 num_vf_qps;            /* num queue pairs per VF */
        u16 num_lan_qps;           /* num lan queues this PF has set up */
        u16 num_lan_msix;          /* num queue vectors for the base PF vsi */
@@ -403,55 +403,57 @@ struct i40e_pf {
        struct timer_list service_timer;
        struct work_struct service_task;
 
-       u64 hw_features;
-#define I40E_HW_RSS_AQ_CAPABLE                 BIT_ULL(0)
-#define I40E_HW_128_QP_RSS_CAPABLE             BIT_ULL(1)
-#define I40E_HW_ATR_EVICT_CAPABLE              BIT_ULL(2)
-#define I40E_HW_WB_ON_ITR_CAPABLE              BIT_ULL(3)
-#define I40E_HW_MULTIPLE_TCP_UDP_RSS_PCTYPE    BIT_ULL(4)
-#define I40E_HW_NO_PCI_LINK_CHECK              BIT_ULL(5)
-#define I40E_HW_100M_SGMII_CAPABLE             BIT_ULL(6)
-#define I40E_HW_NO_DCB_SUPPORT                 BIT_ULL(7)
-#define I40E_HW_USE_SET_LLDP_MIB               BIT_ULL(8)
-#define I40E_HW_GENEVE_OFFLOAD_CAPABLE         BIT_ULL(9)
-#define I40E_HW_PTP_L4_CAPABLE                 BIT_ULL(10)
-#define I40E_HW_WOL_MC_MAGIC_PKT_WAKE          BIT_ULL(11)
-#define I40E_HW_MPLS_HDR_OFFLOAD_CAPABLE       BIT_ULL(12)
-#define I40E_HW_HAVE_CRT_RETIMER               BIT_ULL(13)
-#define I40E_HW_OUTER_UDP_CSUM_CAPABLE         BIT_ULL(14)
-#define I40E_HW_PHY_CONTROLS_LEDS              BIT_ULL(15)
-#define I40E_HW_STOP_FW_LLDP                   BIT_ULL(16)
-#define I40E_HW_PORT_ID_VALID                  BIT_ULL(17)
-#define I40E_HW_RESTART_AUTONEG                        BIT_ULL(18)
-
-       u64 flags;
-#define I40E_FLAG_RX_CSUM_ENABLED              BIT_ULL(1)
-#define I40E_FLAG_MSI_ENABLED                  BIT_ULL(2)
-#define I40E_FLAG_MSIX_ENABLED                 BIT_ULL(3)
-#define I40E_FLAG_HW_ATR_EVICT_ENABLED         BIT_ULL(4)
-#define I40E_FLAG_RSS_ENABLED                  BIT_ULL(6)
-#define I40E_FLAG_VMDQ_ENABLED                 BIT_ULL(7)
-#define I40E_FLAG_IWARP_ENABLED                        BIT_ULL(10)
-#define I40E_FLAG_FILTER_SYNC                  BIT_ULL(15)
-#define I40E_FLAG_SERVICE_CLIENT_REQUESTED     BIT_ULL(16)
-#define I40E_FLAG_SRIOV_ENABLED                        BIT_ULL(19)
-#define I40E_FLAG_DCB_ENABLED                  BIT_ULL(20)
-#define I40E_FLAG_FD_SB_ENABLED                        BIT_ULL(21)
-#define I40E_FLAG_FD_ATR_ENABLED               BIT_ULL(22)
-#define I40E_FLAG_FD_SB_AUTO_DISABLED          BIT_ULL(23)
-#define I40E_FLAG_FD_ATR_AUTO_DISABLED         BIT_ULL(24)
-#define I40E_FLAG_PTP                          BIT_ULL(25)
-#define I40E_FLAG_MFP_ENABLED                  BIT_ULL(26)
-#define I40E_FLAG_UDP_FILTER_SYNC              BIT_ULL(27)
-#define I40E_FLAG_DCB_CAPABLE                  BIT_ULL(29)
-#define I40E_FLAG_VEB_STATS_ENABLED            BIT_ULL(37)
-#define I40E_FLAG_LINK_POLLING_ENABLED         BIT_ULL(39)
-#define I40E_FLAG_VEB_MODE_ENABLED             BIT_ULL(40)
-#define I40E_FLAG_TRUE_PROMISC_SUPPORT         BIT_ULL(51)
-#define I40E_FLAG_CLIENT_RESET                 BIT_ULL(54)
-#define I40E_FLAG_TEMP_LINK_POLLING            BIT_ULL(55)
-#define I40E_FLAG_CLIENT_L2_CHANGE             BIT_ULL(56)
-#define I40E_FLAG_LEGACY_RX                    BIT_ULL(58)
+       u32 hw_features;
+#define I40E_HW_RSS_AQ_CAPABLE                 BIT(0)
+#define I40E_HW_128_QP_RSS_CAPABLE             BIT(1)
+#define I40E_HW_ATR_EVICT_CAPABLE              BIT(2)
+#define I40E_HW_WB_ON_ITR_CAPABLE              BIT(3)
+#define I40E_HW_MULTIPLE_TCP_UDP_RSS_PCTYPE    BIT(4)
+#define I40E_HW_NO_PCI_LINK_CHECK              BIT(5)
+#define I40E_HW_100M_SGMII_CAPABLE             BIT(6)
+#define I40E_HW_NO_DCB_SUPPORT                 BIT(7)
+#define I40E_HW_USE_SET_LLDP_MIB               BIT(8)
+#define I40E_HW_GENEVE_OFFLOAD_CAPABLE         BIT(9)
+#define I40E_HW_PTP_L4_CAPABLE                 BIT(10)
+#define I40E_HW_WOL_MC_MAGIC_PKT_WAKE          BIT(11)
+#define I40E_HW_MPLS_HDR_OFFLOAD_CAPABLE       BIT(12)
+#define I40E_HW_HAVE_CRT_RETIMER               BIT(13)
+#define I40E_HW_OUTER_UDP_CSUM_CAPABLE         BIT(14)
+#define I40E_HW_PHY_CONTROLS_LEDS              BIT(15)
+#define I40E_HW_STOP_FW_LLDP                   BIT(16)
+#define I40E_HW_PORT_ID_VALID                  BIT(17)
+#define I40E_HW_RESTART_AUTONEG                        BIT(18)
+
+       u32 flags;
+#define I40E_FLAG_RX_CSUM_ENABLED              BIT(0)
+#define I40E_FLAG_MSI_ENABLED                  BIT(1)
+#define I40E_FLAG_MSIX_ENABLED                 BIT(2)
+#define I40E_FLAG_RSS_ENABLED                  BIT(3)
+#define I40E_FLAG_VMDQ_ENABLED                 BIT(4)
+#define I40E_FLAG_FILTER_SYNC                  BIT(5)
+#define I40E_FLAG_SRIOV_ENABLED                        BIT(6)
+#define I40E_FLAG_DCB_CAPABLE                  BIT(7)
+#define I40E_FLAG_DCB_ENABLED                  BIT(8)
+#define I40E_FLAG_FD_SB_ENABLED                        BIT(9)
+#define I40E_FLAG_FD_ATR_ENABLED               BIT(10)
+#define I40E_FLAG_FD_SB_AUTO_DISABLED          BIT(11)
+#define I40E_FLAG_FD_ATR_AUTO_DISABLED         BIT(12)
+#define I40E_FLAG_MFP_ENABLED                  BIT(13)
+#define I40E_FLAG_UDP_FILTER_SYNC              BIT(14)
+#define I40E_FLAG_HW_ATR_EVICT_ENABLED         BIT(15)
+#define I40E_FLAG_VEB_MODE_ENABLED             BIT(16)
+#define I40E_FLAG_VEB_STATS_ENABLED            BIT(17)
+#define I40E_FLAG_LINK_POLLING_ENABLED         BIT(18)
+#define I40E_FLAG_TRUE_PROMISC_SUPPORT         BIT(19)
+#define I40E_FLAG_TEMP_LINK_POLLING            BIT(20)
+#define I40E_FLAG_LEGACY_RX                    BIT(21)
+#define I40E_FLAG_PTP                          BIT(22)
+#define I40E_FLAG_IWARP_ENABLED                        BIT(23)
+#define I40E_FLAG_SERVICE_CLIENT_REQUESTED     BIT(24)
+#define I40E_FLAG_CLIENT_L2_CHANGE             BIT(25)
+#define I40E_FLAG_CLIENT_RESET                 BIT(26)
+#define I40E_FLAG_LINK_DOWN_ON_CLOSE_ENABLED   BIT(27)
+#define I40E_FLAG_SOURCE_PRUNING_DISABLED      BIT(28)
 
        struct i40e_client_instance *cinst;
        bool stat_offsets_loaded;
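The hunk above shrinks pf->flags from a u64 to a u32 and renumbers its bits densely with BIT() instead of BIT_ULL(), keeping the read-only capability bits in the separate hw_features word. The narrower type lets runtime flag updates use plain cmpxchg(), which is more widely usable than cmpxchg64() (see the ethtool private-flags hunk further down). A minimal sketch of such an update, assuming the i40e headers are in scope; the helper name is hypothetical, not part of the driver:

/* Atomically set a runtime flag in the 32-bit pf->flags word. */
static void i40e_set_flag_atomic(struct i40e_pf *pf, u32 flag)
{
        u32 old, new;

        do {
                old = READ_ONCE(pf->flags);
                new = old | flag;
        } while (cmpxchg(&pf->flags, old, new) != old);
}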
@@ -947,9 +949,6 @@ static inline void i40e_irq_dynamic_enable(struct i40e_vsi *vsi, int vector)
        struct i40e_hw *hw = &pf->hw;
        u32 val;
 
-       /* definitely clear the PBA here, as this function is meant to
-        * clean out all previous interrupts AND enable the interrupt
-        */
        val = I40E_PFINT_DYN_CTLN_INTENA_MASK |
              I40E_PFINT_DYN_CTLN_CLEARPBA_MASK |
              (I40E_ITR_NONE << I40E_PFINT_DYN_CTLN_ITR_INDX_SHIFT);
@@ -958,7 +957,7 @@ static inline void i40e_irq_dynamic_enable(struct i40e_vsi *vsi, int vector)
 }
 
 void i40e_irq_dynamic_disable_icr0(struct i40e_pf *pf);
-void i40e_irq_dynamic_enable_icr0(struct i40e_pf *pf, bool clearpba);
+void i40e_irq_dynamic_enable_icr0(struct i40e_pf *pf);
 int i40e_ioctl(struct net_device *netdev, struct ifreq *ifr, int cmd);
 int i40e_open(struct net_device *netdev);
 int i40e_close(struct net_device *netdev);
index 4c85ea9..a8f65ae 100644
@@ -1771,9 +1771,10 @@ enum i40e_aq_phy_type {
        I40E_PHY_TYPE_25GBASE_CR                = 0x20,
        I40E_PHY_TYPE_25GBASE_SR                = 0x21,
        I40E_PHY_TYPE_25GBASE_LR                = 0x22,
+       I40E_PHY_TYPE_MAX,
+       I40E_PHY_TYPE_NOT_SUPPORTED_HIGH_TEMP   = 0xFD,
        I40E_PHY_TYPE_EMPTY                     = 0xFE,
        I40E_PHY_TYPE_DEFAULT                   = 0xFF,
-       I40E_PHY_TYPE_MAX
 };
 
 #define I40E_LINK_SPEED_100MB_SHIFT    0x1
index 60542be..53aad37 100644
@@ -1567,30 +1567,46 @@ i40e_status i40e_aq_get_phy_capabilities(struct i40e_hw *hw,
        struct i40e_aq_desc desc;
        i40e_status status;
        u16 abilities_size = sizeof(struct i40e_aq_get_phy_abilities_resp);
+       u16 max_delay = I40E_MAX_PHY_TIMEOUT, total_delay = 0;
 
        if (!abilities)
                return I40E_ERR_PARAM;
 
-       i40e_fill_default_direct_cmd_desc(&desc,
-                                         i40e_aqc_opc_get_phy_abilities);
+       do {
+               i40e_fill_default_direct_cmd_desc(&desc,
+                                              i40e_aqc_opc_get_phy_abilities);
 
-       desc.flags |= cpu_to_le16((u16)I40E_AQ_FLAG_BUF);
-       if (abilities_size > I40E_AQ_LARGE_BUF)
-               desc.flags |= cpu_to_le16((u16)I40E_AQ_FLAG_LB);
+               desc.flags |= cpu_to_le16((u16)I40E_AQ_FLAG_BUF);
+               if (abilities_size > I40E_AQ_LARGE_BUF)
+                       desc.flags |= cpu_to_le16((u16)I40E_AQ_FLAG_LB);
 
-       if (qualified_modules)
-               desc.params.external.param0 |=
+               if (qualified_modules)
+                       desc.params.external.param0 |=
                        cpu_to_le32(I40E_AQ_PHY_REPORT_QUALIFIED_MODULES);
 
-       if (report_init)
-               desc.params.external.param0 |=
+               if (report_init)
+                       desc.params.external.param0 |=
                        cpu_to_le32(I40E_AQ_PHY_REPORT_INITIAL_VALUES);
 
-       status = i40e_asq_send_command(hw, &desc, abilities, abilities_size,
-                                      cmd_details);
+               status = i40e_asq_send_command(hw, &desc, abilities,
+                                              abilities_size, cmd_details);
 
-       if (hw->aq.asq_last_status == I40E_AQ_RC_EIO)
-               status = I40E_ERR_UNKNOWN_PHY;
+               if (status)
+                       break;
+
+               if (hw->aq.asq_last_status == I40E_AQ_RC_EIO) {
+                       status = I40E_ERR_UNKNOWN_PHY;
+                       break;
+               } else if (hw->aq.asq_last_status == I40E_AQ_RC_EAGAIN) {
+                       usleep_range(1000, 2000);
+                       total_delay++;
+                       status = I40E_ERR_TIMEOUT;
+               }
+       } while ((hw->aq.asq_last_status != I40E_AQ_RC_OK) &&
+                (total_delay < max_delay));
+
+       if (status)
+               return status;
 
        if (report_init) {
                if (hw->mac.type ==  I40E_MAC_XL710 &&
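The loop above bounds the retries: I40E_MAX_PHY_TIMEOUT is 500, and each I40E_AQ_RC_EAGAIN response adds a 1-2 ms sleep, so the command is abandoned with I40E_ERR_TIMEOUT after roughly half a second to a second at worst. The retry shape, as a hedged sketch (issue_aq_command() is a placeholder, not a driver function):

static i40e_status i40e_aq_retry_eagain(struct i40e_hw *hw)
{
        i40e_status status;
        u16 retries = 0;

        do {
                status = issue_aq_command(hw);  /* placeholder for the real send */
                if (status)
                        break;                  /* transport error: give up */
                if (hw->aq.asq_last_status != I40E_AQ_RC_EAGAIN)
                        break;                  /* OK or a hard error code */
                usleep_range(1000, 2000);       /* let the firmware settle */
        } while (++retries < I40E_MAX_PHY_TIMEOUT);

        return status;
}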
index 8f326f8..6f2725f 100644
@@ -278,8 +278,8 @@ static void i40e_dbg_dump_vsi_seid(struct i40e_pf *pf, int seid)
                         rx_ring->netdev,
                         rx_ring->rx_bi);
                dev_info(&pf->pdev->dev,
-                        "    rx_rings[%i]: state = %li, queue_index = %d, reg_idx = %d\n",
-                        i, rx_ring->state,
+                        "    rx_rings[%i]: state = %lu, queue_index = %d, reg_idx = %d\n",
+                        i, *rx_ring->state,
                         rx_ring->queue_index,
                         rx_ring->reg_idx);
                dev_info(&pf->pdev->dev,
@@ -334,8 +334,8 @@ static void i40e_dbg_dump_vsi_seid(struct i40e_pf *pf, int seid)
                         tx_ring->netdev,
                         tx_ring->tx_bi);
                dev_info(&pf->pdev->dev,
-                        "    tx_rings[%i]: state = %li, queue_index = %d, reg_idx = %d\n",
-                        i, tx_ring->state,
+                        "    tx_rings[%i]: state = %lu, queue_index = %d, reg_idx = %d\n",
+                        i, *tx_ring->state,
                         tx_ring->queue_index,
                         tx_ring->reg_idx);
                dev_info(&pf->pdev->dev,
index 1136d02..afd3ca8 100644
@@ -227,6 +227,8 @@ static const struct i40e_priv_flags i40e_gstrings_priv_flags[] = {
        I40E_PRIV_FLAG("veb-stats", I40E_FLAG_VEB_STATS_ENABLED, 0),
        I40E_PRIV_FLAG("hw-atr-eviction", I40E_FLAG_HW_ATR_EVICT_ENABLED, 0),
        I40E_PRIV_FLAG("legacy-rx", I40E_FLAG_LEGACY_RX, 0),
+       I40E_PRIV_FLAG("disable-source-pruning",
+                      I40E_FLAG_SOURCE_PRUNING_DISABLED, 0),
 };
 
 #define I40E_PRIV_FLAGS_STR_LEN ARRAY_SIZE(i40e_gstrings_priv_flags)
@@ -2008,7 +2010,9 @@ static int i40e_set_phys_id(struct net_device *netdev,
                if (!(pf->hw_features & I40E_HW_PHY_CONTROLS_LEDS)) {
                        pf->led_status = i40e_led_get(hw);
                } else {
-                       i40e_aq_set_phy_debug(hw, I40E_PHY_DEBUG_ALL, NULL);
+                       if (!(hw->flags & I40E_HW_FLAG_AQ_PHY_ACCESS_CAPABLE))
+                               i40e_aq_set_phy_debug(hw, I40E_PHY_DEBUG_ALL,
+                                                     NULL);
                        ret = i40e_led_get_phy(hw, &temp_status,
                                               &pf->phy_led_val);
                        pf->led_status = temp_status;
@@ -2033,7 +2037,8 @@ static int i40e_set_phys_id(struct net_device *netdev,
                        ret = i40e_led_set_phy(hw, false, pf->led_status,
                                               (pf->phy_led_val |
                                               I40E_PHY_LED_MODE_ORIG));
-                       i40e_aq_set_phy_debug(hw, 0, NULL);
+                       if (!(hw->flags & I40E_HW_FLAG_AQ_PHY_ACCESS_CAPABLE))
+                               i40e_aq_set_phy_debug(hw, 0, NULL);
                }
                break;
        default:
@@ -4090,7 +4095,7 @@ static int i40e_set_priv_flags(struct net_device *dev, u32 flags)
        struct i40e_netdev_priv *np = netdev_priv(dev);
        struct i40e_vsi *vsi = np->vsi;
        struct i40e_pf *pf = vsi->back;
-       u64 orig_flags, new_flags, changed_flags;
+       u32 orig_flags, new_flags, changed_flags;
        u32 i, j;
 
        orig_flags = READ_ONCE(pf->flags);
@@ -4142,12 +4147,12 @@ flags_complete:
                return -EOPNOTSUPP;
 
        /* Compare and exchange the new flags into place. If we failed, that
-        * is if cmpxchg64 returns anything but the old value, this means that
+        * is if cmpxchg returns anything but the old value, this means that
         * something else has modified the flags variable since we copied it
         * originally. We'll just punt with an error and log something in the
         * message buffer.
         */
-       if (cmpxchg64(&pf->flags, orig_flags, new_flags) != orig_flags) {
+       if (cmpxchg(&pf->flags, orig_flags, new_flags) != orig_flags) {
                dev_warn(&pf->pdev->dev,
                         "Unable to update pf->flags as it was modified by another thread...\n");
                return -EAGAIN;
@@ -4189,8 +4194,9 @@ flags_complete:
        /* Issue reset to cause things to take effect, as additional bits
         * are added we will need to create a mask of bits requiring reset
         */
-       if ((changed_flags & I40E_FLAG_VEB_STATS_ENABLED) ||
-           ((changed_flags & I40E_FLAG_LEGACY_RX) && netif_running(dev)))
+       if (changed_flags & (I40E_FLAG_VEB_STATS_ENABLED |
+                            I40E_FLAG_LEGACY_RX |
+                            I40E_FLAG_SOURCE_PRUNING_DISABLED))
                i40e_do_reset(pf, BIT(__I40E_PF_RESET_REQUESTED), true);
 
        return 0;
index 3f9e89b..4de5200 100644
@@ -1776,11 +1776,6 @@ static void i40e_set_rx_mode(struct net_device *netdev)
                vsi->flags |= I40E_VSI_FLAG_FILTER_CHANGED;
                vsi->back->flags |= I40E_FLAG_FILTER_SYNC;
        }
-
-       /* schedule our worker thread which will take care of
-        * applying the new filter changes
-        */
-       i40e_service_event_schedule(vsi->back);
 }
 
 /**
@@ -2884,22 +2879,18 @@ static void i40e_vsi_free_rx_resources(struct i40e_vsi *vsi)
  **/
 static void i40e_config_xps_tx_ring(struct i40e_ring *ring)
 {
-       struct i40e_vsi *vsi = ring->vsi;
+       int cpu;
 
        if (!ring->q_vector || !ring->netdev)
                return;
 
-       if ((vsi->tc_config.numtc <= 1) &&
-           !test_and_set_bit(__I40E_TX_XPS_INIT_DONE, &ring->state)) {
-               netif_set_xps_queue(ring->netdev,
-                                   get_cpu_mask(ring->q_vector->v_idx),
-                                   ring->queue_index);
-       }
+       /* We only initialize XPS once, so as not to overwrite user settings */
+       if (test_and_set_bit(__I40E_TX_XPS_INIT_DONE, ring->state))
+               return;
 
-       /* schedule our worker thread which will take care of
-        * applying the new filter changes
-        */
-       i40e_service_event_schedule(vsi->back);
+       cpu = cpumask_local_spread(ring->q_vector->v_idx, -1);
+       netif_set_xps_queue(ring->netdev, get_cpu_mask(cpu),
+                           ring->queue_index);
 }
 
 /**
@@ -3009,7 +3000,7 @@ static int i40e_configure_rx_ring(struct i40e_ring *ring)
        struct i40e_hmc_obj_rxq rx_ctx;
        i40e_status err = 0;
 
-       ring->state = 0;
+       bitmap_zero(ring->state, __I40E_RING_STATE_NBITS);
 
        /* clear the context structure first */
        memset(&rx_ctx, 0, sizeof(rx_ctx));
@@ -3034,7 +3025,7 @@ static int i40e_configure_rx_ring(struct i40e_ring *ring)
        if (hw->revision_id == 0)
                rx_ctx.lrxqthresh = 0;
        else
-               rx_ctx.lrxqthresh = 2;
+               rx_ctx.lrxqthresh = 1;
        rx_ctx.crcstrip = 1;
        rx_ctx.l2tsel = 1;
        /* this controls whether VLAN is stripped from inner headers */
@@ -3407,15 +3398,14 @@ void i40e_irq_dynamic_disable_icr0(struct i40e_pf *pf)
 /**
  * i40e_irq_dynamic_enable_icr0 - Enable default interrupt generation for icr0
  * @pf: board private structure
- * @clearpba: true when all pending interrupt events should be cleared
  **/
-void i40e_irq_dynamic_enable_icr0(struct i40e_pf *pf, bool clearpba)
+void i40e_irq_dynamic_enable_icr0(struct i40e_pf *pf)
 {
        struct i40e_hw *hw = &pf->hw;
        u32 val;
 
        val = I40E_PFINT_DYN_CTL0_INTENA_MASK   |
-             (clearpba ? I40E_PFINT_DYN_CTL0_CLEARPBA_MASK : 0) |
+             I40E_PFINT_DYN_CTL0_CLEARPBA_MASK |
              (I40E_ITR_NONE << I40E_PFINT_DYN_CTL0_ITR_INDX_SHIFT);
 
        wr32(hw, I40E_PFINT_DYN_CTL0, val);
@@ -3482,6 +3472,7 @@ static int i40e_vsi_request_irq_msix(struct i40e_vsi *vsi, char *basename)
        int tx_int_idx = 0;
        int vector, err;
        int irq_num;
+       int cpu;
 
        for (vector = 0; vector < q_vectors; vector++) {
                struct i40e_q_vector *q_vector = vsi->q_vectors[vector];
@@ -3517,10 +3508,14 @@ static int i40e_vsi_request_irq_msix(struct i40e_vsi *vsi, char *basename)
                q_vector->affinity_notify.notify = i40e_irq_affinity_notify;
                q_vector->affinity_notify.release = i40e_irq_affinity_release;
                irq_set_affinity_notifier(irq_num, &q_vector->affinity_notify);
-               /* get_cpu_mask returns a static constant mask with
-                * a permanent lifetime so it's ok to use here.
+               /* Spread affinity hints out across online CPUs.
+                *
+                * get_cpu_mask returns a static constant mask with
+                * a permanent lifetime so it's ok to pass to
+                * irq_set_affinity_hint without making a copy.
                 */
-               irq_set_affinity_hint(irq_num, get_cpu_mask(q_vector->v_idx));
+               cpu = cpumask_local_spread(q_vector->v_idx, -1);
+               irq_set_affinity_hint(irq_num, get_cpu_mask(cpu));
        }
 
        vsi->irqs_ready = true;
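cpumask_local_spread() is what makes the hint spreading above work: it maps the vector index to the i-th online CPU, preferring CPUs on the given NUMA node (-1 means no preference), rather than assuming CPU numbering matches vector numbering. A small sketch with a hypothetical helper name:

static void i40e_hint_vector_affinity(int irq_num, int v_idx)
{
        int cpu = cpumask_local_spread(v_idx, -1);

        /* get_cpu_mask() returns a static constant mask, so it can be
         * handed to irq_set_affinity_hint() without copying.
         */
        irq_set_affinity_hint(irq_num, get_cpu_mask(cpu));
}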
@@ -3596,7 +3591,7 @@ static int i40e_vsi_enable_irq(struct i40e_vsi *vsi)
                for (i = 0; i < vsi->num_q_vectors; i++)
                        i40e_irq_dynamic_enable(vsi, i);
        } else {
-               i40e_irq_dynamic_enable_icr0(pf, true);
+               i40e_irq_dynamic_enable_icr0(pf);
        }
 
        i40e_flush(&pf->hw);
@@ -3745,7 +3740,7 @@ enable_intr:
        wr32(hw, I40E_PFINT_ICR0_ENA, ena_mask);
        if (!test_bit(__I40E_DOWN, pf->state)) {
                i40e_service_event_schedule(pf);
-               i40e_irq_dynamic_enable_icr0(pf, false);
+               i40e_irq_dynamic_enable_icr0(pf);
        }
 
        return ret;
@@ -6231,6 +6226,7 @@ void i40e_fdir_check_and_reenable(struct i40e_pf *pf)
                                hlist_del(&filter->fdir_node);
                                kfree(filter);
                                pf->fdir_pf_active_filters--;
+                               pf->fd_inv = 0;
                        }
                }
        }
@@ -6557,12 +6553,26 @@ static void i40e_handle_link_event(struct i40e_pf *pf,
         */
        i40e_link_event(pf);
 
-       /* check for unqualified module, if link is down */
-       if ((status->link_info & I40E_AQ_MEDIA_AVAILABLE) &&
-           (!(status->an_info & I40E_AQ_QUALIFIED_MODULE)) &&
-           (!(status->link_info & I40E_AQ_LINK_UP)))
+       /* Check if module meets thermal requirements */
+       if (status->phy_type == I40E_PHY_TYPE_NOT_SUPPORTED_HIGH_TEMP) {
+               dev_err(&pf->pdev->dev,
+                       "Rx/Tx is disabled on this device because the module does not meet thermal requirements.\n");
                dev_err(&pf->pdev->dev,
-                       "The driver failed to link because an unqualified module was detected.\n");
+                       "Refer to the Intel(R) Ethernet Adapters and Devices User Guide for a list of supported modules.\n");
+       } else {
+               /* check for unqualified module, if link is down, suppress
+                * the message if link was forced to be down.
+                */
+               if ((status->link_info & I40E_AQ_MEDIA_AVAILABLE) &&
+                   (!(status->an_info & I40E_AQ_QUALIFIED_MODULE)) &&
+                   (!(status->link_info & I40E_AQ_LINK_UP)) &&
+                   (!(pf->flags & I40E_FLAG_LINK_DOWN_ON_CLOSE_ENABLED))) {
+                       dev_err(&pf->pdev->dev,
+                               "Rx/Tx is disabled on this device because an unsupported SFP module type was detected.\n");
+                       dev_err(&pf->pdev->dev,
+                               "Refer to the Intel(R) Ethernet Adapters and Devices User Guide for a list of supported modules.\n");
+               }
+       }
 }
 
 /**
@@ -7678,7 +7688,7 @@ static int i40e_set_num_rings_in_vsi(struct i40e_vsi *vsi)
 
 /**
  * i40e_vsi_alloc_arrays - Allocate queue and vector pointer arrays for the vsi
- * @type: VSI pointer
+ * @vsi: VSI pointer
  * @alloc_qvectors: a bool to specify if q_vectors need to be allocated.
  *
  * On error: returns error code (negative)
@@ -8439,7 +8449,7 @@ static int i40e_setup_misc_vector(struct i40e_pf *pf)
 
        i40e_flush(hw);
 
-       i40e_irq_dynamic_enable_icr0(pf, true);
+       i40e_irq_dynamic_enable_icr0(pf);
 
        return err;
 }
@@ -8967,8 +8977,8 @@ static int i40e_sw_init(struct i40e_pf *pf)
                    I40E_FLAG_MSIX_ENABLED;
 
        /* Set default ITR */
-       pf->rx_itr_default = I40E_ITR_DYNAMIC | I40E_ITR_RX_DEF;
-       pf->tx_itr_default = I40E_ITR_DYNAMIC | I40E_ITR_TX_DEF;
+       pf->rx_itr_default = I40E_ITR_RX_DEF;
+       pf->tx_itr_default = I40E_ITR_TX_DEF;
 
        /* Depending on PF configurations, it is possible that the RSS
         * maximum might end up larger than the available queues
@@ -9068,6 +9078,11 @@ static int i40e_sw_init(struct i40e_pf *pf)
            (pf->hw.aq.fw_maj_ver >= 5)))
                pf->hw_features |= I40E_HW_USE_SET_LLDP_MIB;
 
+       /* Enable PTP L4 if FW > v6.0 */
+       if (pf->hw.mac.type == I40E_MAC_XL710 &&
+           pf->hw.aq.fw_maj_ver >= 6)
+               pf->hw_features |= I40E_HW_PTP_L4_CAPABLE;
+
        if (pf->hw.func_caps.vmdq) {
                pf->num_vmdq_vsis = I40E_DEFAULT_NUM_VMDQ_VSI;
                pf->flags |= I40E_FLAG_VMDQ_ENABLED;
@@ -9903,6 +9918,31 @@ static int i40e_add_vsi(struct i40e_vsi *vsi)
 
                enabled_tc = i40e_pf_get_tc_map(pf);
 
+               /* Source pruning is enabled by default, so the flag is
+                * negative logic - if it's set, we need to fiddle with
+                * the VSI to disable source pruning.
+                */
+               if (pf->flags & I40E_FLAG_SOURCE_PRUNING_DISABLED) {
+                       memset(&ctxt, 0, sizeof(ctxt));
+                       ctxt.seid = pf->main_vsi_seid;
+                       ctxt.pf_num = pf->hw.pf_id;
+                       ctxt.vf_num = 0;
+                       ctxt.info.valid_sections |=
+                                    cpu_to_le16(I40E_AQ_VSI_PROP_SWITCH_VALID);
+                       ctxt.info.switch_id =
+                                  cpu_to_le16(I40E_AQ_VSI_SW_ID_FLAG_LOCAL_LB);
+                       ret = i40e_aq_update_vsi_params(hw, &ctxt, NULL);
+                       if (ret) {
+                               dev_info(&pf->pdev->dev,
+                                        "update vsi failed, err %s aq_err %s\n",
+                                        i40e_stat_str(&pf->hw, ret),
+                                        i40e_aq_str(&pf->hw,
+                                                    pf->hw.aq.asq_last_status));
+                               ret = -ENOENT;
+                               goto err;
+                       }
+               }
+
                /* MFP mode setup queue map and update VSI */
                if ((pf->flags & I40E_FLAG_MFP_ENABLED) &&
                    !(pf->hw.func_caps.iscsi)) { /* NIC type PF */
@@ -12000,6 +12040,28 @@ static pci_ers_result_t i40e_pci_error_slot_reset(struct pci_dev *pdev)
 }
 
 /**
+ * i40e_pci_error_reset_prepare - prepare device driver for pci reset
+ * @pdev: PCI device information struct
+ */
+static void i40e_pci_error_reset_prepare(struct pci_dev *pdev)
+{
+       struct i40e_pf *pf = pci_get_drvdata(pdev);
+
+       i40e_prep_for_reset(pf, false);
+}
+
+/**
+ * i40e_pci_error_reset_done - pci reset done, device driver reset can begin
+ * @pdev: PCI device information struct
+ */
+static void i40e_pci_error_reset_done(struct pci_dev *pdev)
+{
+       struct i40e_pf *pf = pci_get_drvdata(pdev);
+
+       i40e_reset_and_rebuild(pf, false, false);
+}
+
+/**
  * i40e_pci_error_resume - restart operations after PCI error recovery
  * @pdev: PCI device information struct
  *
@@ -12189,6 +12251,8 @@ static int i40e_resume(struct device *dev)
 static const struct pci_error_handlers i40e_err_handler = {
        .error_detected = i40e_pci_error_detected,
        .slot_reset = i40e_pci_error_slot_reset,
+       .reset_prepare = i40e_pci_error_reset_prepare,
+       .reset_done = i40e_pci_error_reset_done,
        .resume = i40e_pci_error_resume,
 };
 
index 57505b1..151d9cf 100644
@@ -311,13 +311,10 @@ static i40e_status i40e_read_nvm_word_aq(struct i40e_hw *hw, u16 offset,
 static i40e_status __i40e_read_nvm_word(struct i40e_hw *hw,
                                        u16 offset, u16 *data)
 {
-       i40e_status ret_code = 0;
-
        if (hw->flags & I40E_HW_FLAG_AQ_SRCTL_ACCESS_ENABLE)
-               ret_code = i40e_read_nvm_word_aq(hw, offset, data);
-       else
-               ret_code = i40e_read_nvm_word_srctl(hw, offset, data);
-       return ret_code;
+               return i40e_read_nvm_word_aq(hw, offset, data);
+
+       return i40e_read_nvm_word_srctl(hw, offset, data);
 }
 
 /**
@@ -331,7 +328,7 @@ static i40e_status __i40e_read_nvm_word(struct i40e_hw *hw,
 i40e_status i40e_read_nvm_word(struct i40e_hw *hw, u16 offset,
                               u16 *data)
 {
-       i40e_status ret_code = 0;
+       i40e_status ret_code;
 
        ret_code = i40e_acquire_nvm(hw, I40E_RESOURCE_READ);
        if (ret_code)
@@ -446,13 +443,10 @@ static i40e_status __i40e_read_nvm_buffer(struct i40e_hw *hw,
                                          u16 offset, u16 *words,
                                          u16 *data)
 {
-       i40e_status ret_code = 0;
-
        if (hw->flags & I40E_HW_FLAG_AQ_SRCTL_ACCESS_ENABLE)
-               ret_code = i40e_read_nvm_buffer_aq(hw, offset, words, data);
-       else
-               ret_code = i40e_read_nvm_buffer_srctl(hw, offset, words, data);
-       return ret_code;
+               return i40e_read_nvm_buffer_aq(hw, offset, words, data);
+
+       return i40e_read_nvm_buffer_srctl(hw, offset, words, data);
 }
 
 /**
index 86ca27f..c234758 100644
 #define I40E_GLV_RUPP_MAX_INDEX 383
 #define I40E_GLV_RUPP_RUPP_SHIFT 0
 #define I40E_GLV_RUPP_RUPP_MASK I40E_MASK(0xFFFFFFFF, I40E_GLV_RUPP_RUPP_SHIFT)
-#define I40E_GLV_TEPC(_VSI) (0x00344000 + ((_VSI) * 4)) /* _i=0...383 */ /* Reset: CORER */
+#define I40E_GLV_TEPC(_i) (0x00344000 + ((_i) * 8)) /* _i=0...383 */ /* Reset: CORER */
 #define I40E_GLV_TEPC_MAX_INDEX 383
 #define I40E_GLV_TEPC_TEPC_SHIFT 0
 #define I40E_GLV_TEPC_TEPC_MASK I40E_MASK(0xFFFFFFFF, I40E_GLV_TEPC_TEPC_SHIFT)
index d9fdf69..a23306f 100644
@@ -1372,6 +1372,15 @@ bool i40e_alloc_rx_buffers(struct i40e_ring *rx_ring, u16 cleaned_count)
        union i40e_rx_desc *rx_desc;
        struct i40e_rx_buffer *bi;
 
+       /* Hardware only fetches new descriptors in cache lines of 8,
+        * essentially ignoring the lower 3 bits of the tail register. We want
+        * to ensure our tail writes are aligned to avoid unnecessary work. We
+        * can't simply round down the cleaned count, since we might fail to
+        * allocate some buffers. What we really want is to ensure that
+        * next_to_use + cleaned_count produces an aligned value.
+        */
+       cleaned_count -= (ntu + cleaned_count) & 0x7;
+
        /* do nothing if no valid netdev defined */
        if (!rx_ring->netdev || !cleaned_count)
                return false;
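A worked example of the adjustment above, with made-up values: if next_to_use is 13 and 20 buffers are to be refilled, (13 + 20) & 0x7 is 1, so one buffer is held back; the refill then ends at descriptor 32, a multiple of the 8-descriptor granularity the hardware fetches, and the withheld buffer is simply picked up on the next pass. As a tiny helper (hypothetical, the real code is inline):

static u16 i40e_align_refill(u16 ntu, u16 cleaned_count)
{
        /* trim so (ntu + cleaned_count) becomes a multiple of 8 */
        return cleaned_count - ((ntu + cleaned_count) & 0x7);
}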
@@ -2202,9 +2211,7 @@ static u32 i40e_buildreg_itr(const int type, const u16 itr)
        u32 val;
 
        val = I40E_PFINT_DYN_CTLN_INTENA_MASK |
-             /* Don't clear PBA because that can cause lost interrupts that
-              * came in while we were cleaning/polling
-              */
+             I40E_PFINT_DYN_CTLN_CLEARPBA_MASK |
              (type << I40E_PFINT_DYN_CTLN_ITR_INDX_SHIFT) |
              (itr << I40E_PFINT_DYN_CTLN_INTERVAL_SHIFT);
 
@@ -2241,7 +2248,7 @@ static inline void i40e_update_enable_itr(struct i40e_vsi *vsi,
 
        /* If we don't have MSIX, then we only need to re-enable icr0 */
        if (!(vsi->back->flags & I40E_FLAG_MSIX_ENABLED)) {
-               i40e_irq_dynamic_enable_icr0(vsi->back, false);
+               i40e_irq_dynamic_enable_icr0(vsi->back);
                return;
        }
 
@@ -3167,38 +3174,12 @@ static inline int i40e_tx_map(struct i40e_ring *tx_ring, struct sk_buff *skb,
        /* write last descriptor with EOP bit */
        td_cmd |= I40E_TX_DESC_CMD_EOP;
 
-       /* We can OR these values together as they both are checked against
-        * 4 below and at this point desc_count will be used as a boolean value
-        * after this if/else block.
+       /* We OR these values together to check both against 4 (WB_STRIDE)
+        * below. This is safe since we don't re-use desc_count afterwards.
         */
        desc_count |= ++tx_ring->packet_stride;
 
-       /* Algorithm to optimize tail and RS bit setting:
-        * if queue is stopped
-        *      mark RS bit
-        *      reset packet counter
-        * else if xmit_more is supported and is true
-        *      advance packet counter to 4
-        *      reset desc_count to 0
-        *
-        * if desc_count >= 4
-        *      mark RS bit
-        *      reset packet counter
-        * if desc_count > 0
-        *      update tail
-        *
-        * Note: If there are less than 4 descriptors
-        * pending and interrupts were disabled the service task will
-        * trigger a force WB.
-        */
-       if (netif_xmit_stopped(txring_txq(tx_ring))) {
-               goto do_rs;
-       } else if (skb->xmit_more) {
-               /* set stride to arm on next packet and reset desc_count */
-               tx_ring->packet_stride = WB_STRIDE;
-               desc_count = 0;
-       } else if (desc_count >= WB_STRIDE) {
-do_rs:
+       if (desc_count >= WB_STRIDE) {
                /* write last descriptor with RS bit set */
                td_cmd |= I40E_TX_DESC_CMD_RS;
                tx_ring->packet_stride = 0;
@@ -3219,7 +3200,7 @@ do_rs:
        first->next_to_watch = tx_desc;
 
        /* notify HW of packet */
-       if (desc_count) {
+       if (netif_xmit_stopped(txring_txq(tx_ring)) || !skb->xmit_more) {
                writel(i, tx_ring->tail);
 
                /* we need this if more than one processor can write to our tail
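The simplified doorbell rule above reads: keep batching tail writes while the stack promises more packets via skb->xmit_more, but always flush when the queue has been stopped so pending descriptors are not stranded. As a sketch (hypothetical helper; the real check is open-coded):

static bool i40e_should_write_tail(struct i40e_ring *tx_ring,
                                   const struct sk_buff *skb)
{
        return netif_xmit_stopped(txring_txq(tx_ring)) || !skb->xmit_more;
}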
index 2f848bc..ff57ae4 100644
 #define I40E_ITR_8K                0x003E
 #define I40E_ITR_4K                0x007A
 #define I40E_MAX_INTRL             0x3B    /* reg uses 4 usec resolution */
-#define I40E_ITR_RX_DEF            I40E_ITR_20K
-#define I40E_ITR_TX_DEF            I40E_ITR_20K
+#define I40E_ITR_RX_DEF            (ITR_REG_TO_USEC(I40E_ITR_20K) | \
+                                   I40E_ITR_DYNAMIC)
+#define I40E_ITR_TX_DEF            (ITR_REG_TO_USEC(I40E_ITR_20K) | \
+                                   I40E_ITR_DYNAMIC)
 #define I40E_ITR_DYNAMIC           0x8000  /* use top bit as a flag */
 #define I40E_MIN_INT_RATE          250     /* ~= 1000000 / (I40E_MAX_ITR * 2) */
 #define I40E_MAX_INT_RATE          500000  /* == 1000000 / (I40E_MIN_ITR * 2) */
@@ -206,7 +208,7 @@ static inline bool i40e_test_staterr(union i40e_rx_desc *rx_desc,
 }
 
 /* How many Rx Buffers do we bundle into one write to the hardware ? */
-#define I40E_RX_BUFFER_WRITE   16      /* Must be power of 2 */
+#define I40E_RX_BUFFER_WRITE   32      /* Must be power of 2 */
 #define I40E_RX_INCREMENT(r, i) \
        do {                                    \
                (i)++;                          \
@@ -342,6 +344,7 @@ struct i40e_rx_queue_stats {
 enum i40e_ring_state_t {
        __I40E_TX_FDIR_INIT_DONE,
        __I40E_TX_XPS_INIT_DONE,
+       __I40E_RING_STATE_NBITS /* must be last */
 };
 
 /* some useful defines for virtchannel interface, which
@@ -366,7 +369,7 @@ struct i40e_ring {
                struct i40e_tx_buffer *tx_bi;
                struct i40e_rx_buffer *rx_bi;
        };
-       unsigned long state;
+       DECLARE_BITMAP(state, __I40E_RING_STATE_NBITS);
        u16 queue_index;                /* Queue number of ring */
        u8 dcb_tc;                      /* Traffic class of ring */
        u8 __iomem *tail;
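With the conversion above, ring->state is a DECLARE_BITMAP() array rather than a bare unsigned long, so the bit helpers take it directly (the array decays to unsigned long *), and the debugfs dump earlier in this series prints the raw first word via *ring->state. A minimal sketch (hypothetical helper):

static bool i40e_xps_init_once(struct i40e_ring *ring)
{
        /* true only for the first caller; later calls see the bit set */
        return !test_and_set_bit(__I40E_TX_XPS_INIT_DONE, ring->state);
}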
index 4b32b1d..0410fcb 100644
@@ -46,6 +46,9 @@
 /* Max default timeout in ms, */
 #define I40E_MAX_NVM_TIMEOUT           18000
 
+/* Max timeout in ms for the phy to respond */
+#define I40E_MAX_PHY_TIMEOUT           500
+
 /* Switch from ms to the 1usec global time (this is the GTIME resolution) */
 #define I40E_MS_TO_GTIME(time)         ((time) * 1000)
 
index 0456813..0c4fa22 100644
@@ -273,7 +273,7 @@ static void i40e_config_irq_link_list(struct i40e_vf *vf, u16 vsi_id,
        struct i40e_hw *hw = &pf->hw;
        u16 vsi_queue_id, pf_queue_id;
        enum i40e_queue_type qtype;
-       u16 next_q, vector_id;
+       u16 next_q, vector_id, size;
        u32 reg, reg_idx;
        u16 itr_idx = 0;
 
@@ -303,9 +303,11 @@ static void i40e_config_irq_link_list(struct i40e_vf *vf, u16 vsi_id,
                                     vsi_queue_id + 1));
        }
 
-       next_q = find_first_bit(&linklistmap,
-                               (I40E_MAX_VSI_QP *
-                                I40E_VIRTCHNL_SUPPORTED_QTYPES));
+       size = I40E_MAX_VSI_QP * I40E_VIRTCHNL_SUPPORTED_QTYPES;
+       next_q = find_first_bit(&linklistmap, size);
+       if (unlikely(next_q == size))
+               goto irq_list_done;
+
        vsi_queue_id = next_q / I40E_VIRTCHNL_SUPPORTED_QTYPES;
        qtype = next_q % I40E_VIRTCHNL_SUPPORTED_QTYPES;
        pf_queue_id = i40e_vc_get_pf_queue_id(vf, vsi_id, vsi_queue_id);
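The range check added above guards against an empty bitmap: find_first_bit() and find_next_bit() return the bitmap size when no bit is set, so the result must be validated before it is used as a queue index. The idiom, with placeholder names:

static bool first_set_queue(const unsigned long *map, unsigned int size,
                            unsigned int *idx)
{
        *idx = find_first_bit(map, size);

        /* find_first_bit() returns @size when @map has no bits set */
        return *idx < size;
}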
@@ -313,7 +315,7 @@ static void i40e_config_irq_link_list(struct i40e_vf *vf, u16 vsi_id,
 
        wr32(hw, reg_idx, reg);
 
-       while (next_q < (I40E_MAX_VSI_QP * I40E_VIRTCHNL_SUPPORTED_QTYPES)) {
+       while (next_q < size) {
                switch (qtype) {
                case I40E_QUEUE_TYPE_RX:
                        reg_idx = I40E_QINT_RQCTL(pf_queue_id);
@@ -327,12 +329,8 @@ static void i40e_config_irq_link_list(struct i40e_vf *vf, u16 vsi_id,
                        break;
                }
 
-               next_q = find_next_bit(&linklistmap,
-                                      (I40E_MAX_VSI_QP *
-                                       I40E_VIRTCHNL_SUPPORTED_QTYPES),
-                                      next_q + 1);
-               if (next_q <
-                   (I40E_MAX_VSI_QP * I40E_VIRTCHNL_SUPPORTED_QTYPES)) {
+               next_q = find_next_bit(&linklistmap, size, next_q + 1);
+               if (next_q < size) {
                        vsi_queue_id = next_q / I40E_VIRTCHNL_SUPPORTED_QTYPES;
                        qtype = next_q % I40E_VIRTCHNL_SUPPORTED_QTYPES;
                        pf_queue_id = i40e_vc_get_pf_queue_id(vf, vsi_id,
@@ -639,7 +637,7 @@ static int i40e_config_vsi_rx_queue(struct i40e_vf *vf, u16 vsi_id,
        rx_ctx.dsize = 1;
 
        /* default values */
-       rx_ctx.lrxqthresh = 2;
+       rx_ctx.lrxqthresh = 1;
        rx_ctx.crcstrip = 1;
        rx_ctx.prefena = 1;
        rx_ctx.l2tsel = 1;
@@ -1358,7 +1356,7 @@ err_alloc:
                i40e_free_vfs(pf);
 err_iov:
        /* Re-enable interrupt 0. */
-       i40e_irq_dynamic_enable_icr0(pf, false);
+       i40e_irq_dynamic_enable_icr0(pf);
        return ret;
 }
 
@@ -2883,6 +2881,7 @@ int i40e_ndo_set_vf_mac(struct net_device *netdev, int vf_id, u8 *mac)
        struct i40e_mac_filter *f;
        struct i40e_vf *vf;
        int ret = 0;
+       struct hlist_node *h;
        int bkt;
 
        /* validate the request */
@@ -2921,7 +2920,7 @@ int i40e_ndo_set_vf_mac(struct net_device *netdev, int vf_id, u8 *mac)
        /* Delete all the filters for this VSI - we're going to kill it
         * anyway.
         */
-       hash_for_each(vsi->mac_filter_hash, bkt, f, hlist)
+       hash_for_each_safe(vsi->mac_filter_hash, bkt, h, f, hlist)
                __i40e_del_filter(vsi, f);
 
        spin_unlock_bh(&vsi->mac_filter_hash_lock);
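hash_for_each_safe() is needed above because the loop body frees the current entry: the _safe variant stashes the next node in the extra hlist_node cursor before the body runs, so the walk never follows a freed pointer. A self-contained sketch of the pattern (types and names are illustrative only):

#include <linux/hashtable.h>
#include <linux/slab.h>

struct item {
        struct hlist_node node;
        int key;
};

static DEFINE_HASHTABLE(items, 4);

static void drop_all_items(void)
{
        struct hlist_node *tmp;
        struct item *it;
        int bkt;

        hash_for_each_safe(items, bkt, tmp, it, node) {
                hash_del(&it->node);
                kfree(it);
        }
}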
index ed5602f..60c892f 100644
@@ -1767,9 +1767,10 @@ enum i40e_aq_phy_type {
        I40E_PHY_TYPE_25GBASE_CR                = 0x20,
        I40E_PHY_TYPE_25GBASE_SR                = 0x21,
        I40E_PHY_TYPE_25GBASE_LR                = 0x22,
+       I40E_PHY_TYPE_MAX,
+       I40E_PHY_TYPE_NOT_SUPPORTED_HIGH_TEMP   = 0xFD,
        I40E_PHY_TYPE_EMPTY                     = 0xFE,
        I40E_PHY_TYPE_DEFAULT                   = 0xFF,
-       I40E_PHY_TYPE_MAX
 };
 
 #define I40E_LINK_SPEED_100MB_SHIFT    0x1
index 37e1de8..6806ada 100644
@@ -711,6 +711,15 @@ bool i40evf_alloc_rx_buffers(struct i40e_ring *rx_ring, u16 cleaned_count)
        union i40e_rx_desc *rx_desc;
        struct i40e_rx_buffer *bi;
 
+       /* Hardware only fetches new descriptors in cache lines of 8,
+        * essentially ignoring the lower 3 bits of the tail register. We want
+        * to ensure our tail writes are aligned to avoid unnecessary work. We
+        * can't simply round down the cleaned count, since we might fail to
+        * allocate some buffers. What we really want is to ensure that
+        * next_to_use + cleaned_count produces an aligned value.
+        */
+       cleaned_count -= (ntu + cleaned_count) & 0x7;
+
        /* do nothing if no valid netdev defined */
        if (!rx_ring->netdev || !cleaned_count)
                return false;
@@ -1409,9 +1418,7 @@ static u32 i40e_buildreg_itr(const int type, const u16 itr)
        u32 val;
 
        val = I40E_VFINT_DYN_CTLN1_INTENA_MASK |
-             /* Don't clear PBA because that can cause lost interrupts that
-              * came in while we were cleaning/polling
-              */
+             I40E_VFINT_DYN_CTLN1_CLEARPBA_MASK |
              (type << I40E_VFINT_DYN_CTLN1_ITR_INDX_SHIFT) |
              (itr << I40E_VFINT_DYN_CTLN1_INTERVAL_SHIFT);
 
index 0d9f98b..8d26c85 100644
 #define I40E_ITR_8K                0x003E
 #define I40E_ITR_4K                0x007A
 #define I40E_MAX_INTRL             0x3B    /* reg uses 4 usec resolution */
-#define I40E_ITR_RX_DEF            I40E_ITR_20K
-#define I40E_ITR_TX_DEF            I40E_ITR_20K
+#define I40E_ITR_RX_DEF            (ITR_REG_TO_USEC(I40E_ITR_20K) | \
+                                   I40E_ITR_DYNAMIC)
+#define I40E_ITR_TX_DEF            (ITR_REG_TO_USEC(I40E_ITR_20K) | \
+                                   I40E_ITR_DYNAMIC)
 #define I40E_ITR_DYNAMIC           0x8000  /* use top bit as a flag */
 #define I40E_MIN_INT_RATE          250     /* ~= 1000000 / (I40E_MAX_ITR * 2) */
 #define I40E_MAX_INT_RATE          500000  /* == 1000000 / (I40E_MIN_ITR * 2) */
@@ -189,7 +191,7 @@ static inline bool i40e_test_staterr(union i40e_rx_desc *rx_desc,
 }
 
 /* How many Rx Buffers do we bundle into one write to the hardware ? */
-#define I40E_RX_BUFFER_WRITE   16      /* Must be power of 2 */
+#define I40E_RX_BUFFER_WRITE   32      /* Must be power of 2 */
 #define I40E_RX_INCREMENT(r, i) \
        do {                                    \
                (i)++;                          \
@@ -325,6 +327,7 @@ struct i40e_rx_queue_stats {
 enum i40e_ring_state_t {
        __I40E_TX_FDIR_INIT_DONE,
        __I40E_TX_XPS_INIT_DONE,
+       __I40E_RING_STATE_NBITS /* must be last */
 };
 
 /* some useful defines for virtchannel interface, which
@@ -348,7 +351,7 @@ struct i40e_ring {
                struct i40e_tx_buffer *tx_bi;
                struct i40e_rx_buffer *rx_bi;
        };
-       unsigned long state;
+       DECLARE_BITMAP(state, __I40E_RING_STATE_NBITS);
        u16 queue_index;                /* Queue number of ring */
        u8 dcb_tc;                      /* Traffic class of ring */
        u8 __iomem *tail;
index 9364b67..213b773 100644
@@ -46,6 +46,9 @@
 /* Max default timeout in ms, */
 #define I40E_MAX_NVM_TIMEOUT           18000
 
+/* Max timeout in ms for the phy to respond */
+#define I40E_MAX_PHY_TIMEOUT           500
+
 /* Switch from ms to the 1usec global time (this is the GTIME resolution) */
 #define I40E_MS_TO_GTIME(time)         ((time) * 1000)
 
index 5982362..de0af52 100644
@@ -222,22 +222,22 @@ struct i40evf_adapter {
 
        u32 flags;
 #define I40EVF_FLAG_RX_CSUM_ENABLED            BIT(0)
-#define I40EVF_FLAG_IMIR_ENABLED               BIT(5)
-#define I40EVF_FLAG_MQ_CAPABLE                 BIT(6)
-#define I40EVF_FLAG_PF_COMMS_FAILED            BIT(8)
-#define I40EVF_FLAG_RESET_PENDING              BIT(9)
-#define I40EVF_FLAG_RESET_NEEDED               BIT(10)
-#define I40EVF_FLAG_WB_ON_ITR_CAPABLE          BIT(11)
-#define I40EVF_FLAG_OUTER_UDP_CSUM_CAPABLE     BIT(12)
-#define I40EVF_FLAG_ADDR_SET_BY_PF             BIT(13)
-#define I40EVF_FLAG_SERVICE_CLIENT_REQUESTED   BIT(14)
-#define I40EVF_FLAG_CLIENT_NEEDS_OPEN          BIT(15)
-#define I40EVF_FLAG_CLIENT_NEEDS_CLOSE         BIT(16)
-#define I40EVF_FLAG_CLIENT_NEEDS_L2_PARAMS     BIT(17)
-#define I40EVF_FLAG_PROMISC_ON                 BIT(18)
-#define I40EVF_FLAG_ALLMULTI_ON                        BIT(19)
-#define I40EVF_FLAG_LEGACY_RX                  BIT(20)
-#define I40EVF_FLAG_REINIT_ITR_NEEDED          BIT(21)
+#define I40EVF_FLAG_IMIR_ENABLED               BIT(1)
+#define I40EVF_FLAG_MQ_CAPABLE                 BIT(2)
+#define I40EVF_FLAG_PF_COMMS_FAILED            BIT(3)
+#define I40EVF_FLAG_RESET_PENDING              BIT(4)
+#define I40EVF_FLAG_RESET_NEEDED               BIT(5)
+#define I40EVF_FLAG_WB_ON_ITR_CAPABLE          BIT(6)
+#define I40EVF_FLAG_OUTER_UDP_CSUM_CAPABLE     BIT(7)
+#define I40EVF_FLAG_ADDR_SET_BY_PF             BIT(8)
+#define I40EVF_FLAG_SERVICE_CLIENT_REQUESTED   BIT(9)
+#define I40EVF_FLAG_CLIENT_NEEDS_OPEN          BIT(10)
+#define I40EVF_FLAG_CLIENT_NEEDS_CLOSE         BIT(11)
+#define I40EVF_FLAG_CLIENT_NEEDS_L2_PARAMS     BIT(12)
+#define I40EVF_FLAG_PROMISC_ON                 BIT(13)
+#define I40EVF_FLAG_ALLMULTI_ON                        BIT(14)
+#define I40EVF_FLAG_LEGACY_RX                  BIT(15)
+#define I40EVF_FLAG_REINIT_ITR_NEEDED          BIT(16)
 /* duplicates for common code */
 #define I40E_FLAG_DCB_ENABLED                  0
 #define I40E_FLAG_RX_CSUM_ENABLED              I40EVF_FLAG_RX_CSUM_ENABLED
index f2f1e75..5bcbd46 100644
@@ -515,6 +515,7 @@ i40evf_request_traffic_irqs(struct i40evf_adapter *adapter, char *basename)
        unsigned int vector, q_vectors;
        unsigned int rx_int_idx = 0, tx_int_idx = 0;
        int irq_num, err;
+       int cpu;
 
        i40evf_irq_disable(adapter);
        /* Decrement for Other and TCP Timer vectors */
@@ -553,10 +554,12 @@ i40evf_request_traffic_irqs(struct i40evf_adapter *adapter, char *basename)
                q_vector->affinity_notify.release =
                                                   i40evf_irq_affinity_release;
                irq_set_affinity_notifier(irq_num, &q_vector->affinity_notify);
-               /* get_cpu_mask returns a static constant mask with
-                * a permanent lifetime so it's ok to use here.
+               /* Spread the IRQ affinity hints across online CPUs. Note that
+                * get_cpu_mask returns a mask with a permanent lifetime so
+                * it's safe to use as a hint for irq_set_affinity_hint.
                 */
-               irq_set_affinity_hint(irq_num, get_cpu_mask(q_vector->v_idx));
+               cpu = cpumask_local_spread(q_vector->v_idx, -1);
+               irq_set_affinity_hint(irq_num, get_cpu_mask(cpu));
        }
 
        return 0;
@@ -877,6 +880,8 @@ i40evf_mac_filter *i40evf_add_filter(struct i40evf_adapter *adapter,
                list_add_tail(&f->list, &adapter->mac_filter_list);
                f->add = true;
                adapter->aq_required |= I40EVF_FLAG_AQ_ADD_MAC_FILTER;
+       } else {
+               f->remove = false;
        }
 
        clear_bit(__I40EVF_IN_CRITICAL_TASK, &adapter->crit_section);
@@ -1218,7 +1223,7 @@ static int i40evf_alloc_queues(struct i40evf_adapter *adapter)
                tx_ring->netdev = adapter->netdev;
                tx_ring->dev = &adapter->pdev->dev;
                tx_ring->count = adapter->tx_desc_count;
-               tx_ring->tx_itr_setting = (I40E_ITR_DYNAMIC | I40E_ITR_TX_DEF);
+               tx_ring->tx_itr_setting = I40E_ITR_TX_DEF;
                if (adapter->flags & I40EVF_FLAG_WB_ON_ITR_CAPABLE)
                        tx_ring->flags |= I40E_TXR_FLAGS_WB_ON_ITR;
 
@@ -1227,7 +1232,7 @@ static int i40evf_alloc_queues(struct i40evf_adapter *adapter)
                rx_ring->netdev = adapter->netdev;
                rx_ring->dev = &adapter->pdev->dev;
                rx_ring->count = adapter->rx_desc_count;
-               rx_ring->rx_itr_setting = (I40E_ITR_DYNAMIC | I40E_ITR_RX_DEF);
+               rx_ring->rx_itr_setting = I40E_ITR_RX_DEF;
        }
 
        adapter->num_active_queues = num_active_queues;
@@ -2420,10 +2425,6 @@ out_err:
        return features & ~(NETIF_F_CSUM_MASK | NETIF_F_GSO_MASK);
 }
 
-#define I40EVF_VLAN_FEATURES (NETIF_F_HW_VLAN_CTAG_TX |\
-                             NETIF_F_HW_VLAN_CTAG_RX |\
-                             NETIF_F_HW_VLAN_CTAG_FILTER)
-
 /**
  * i40evf_fix_features - fix up the netdev feature bits
  * @netdev: our net device
@@ -2436,9 +2437,11 @@ static netdev_features_t i40evf_fix_features(struct net_device *netdev,
 {
        struct i40evf_adapter *adapter = netdev_priv(netdev);
 
-       features &= ~I40EVF_VLAN_FEATURES;
-       if (adapter->vf_res->vf_cap_flags & VIRTCHNL_VF_OFFLOAD_VLAN)
-               features |= I40EVF_VLAN_FEATURES;
+       if (!(adapter->vf_res->vf_cap_flags & VIRTCHNL_VF_OFFLOAD_VLAN))
+               features &= ~(NETIF_F_HW_VLAN_CTAG_TX |
+                             NETIF_F_HW_VLAN_CTAG_RX |
+                             NETIF_F_HW_VLAN_CTAG_FILTER);
+
        return features;
 }
 
@@ -2569,9 +2572,17 @@ int i40evf_process_config(struct i40evf_adapter *adapter)
         */
        hw_features = hw_enc_features;
 
+       /* Enable VLAN features if supported */
+       if (vfres->vf_cap_flags & VIRTCHNL_VF_OFFLOAD_VLAN)
+               hw_features |= (NETIF_F_HW_VLAN_CTAG_TX |
+                               NETIF_F_HW_VLAN_CTAG_RX);
+
        netdev->hw_features |= hw_features;
 
-       netdev->features |= hw_features | I40EVF_VLAN_FEATURES;
+       netdev->features |= hw_features;
+
+       if (vfres->vf_cap_flags & VIRTCHNL_VF_OFFLOAD_VLAN)
+               netdev->features |= NETIF_F_HW_VLAN_CTAG_FILTER;
 
        adapter->vsi.id = adapter->vsi_res->vsi_id;
 
index fd4a46b..837d9b4 100644
@@ -3162,6 +3162,8 @@ static int igb_sw_init(struct igb_adapter *adapter)
        /* Setup and initialize a copy of the hw vlan table array */
        adapter->shadow_vfta = kcalloc(E1000_VLAN_FILTER_TBL_SIZE, sizeof(u32),
                                       GFP_ATOMIC);
+       if (!adapter->shadow_vfta)
+               return -ENOMEM;
 
        /* This call may decrease the number of queues */
        if (igb_init_interrupt_scheme(adapter, true)) {
index dd55787..468c355 100644
@@ -275,6 +275,7 @@ struct ixgbe_rx_queue_stats {
        u64 rsc_count;
        u64 rsc_flush;
        u64 non_eop_descs;
+       u64 alloc_rx_page;
        u64 alloc_rx_page_failed;
        u64 alloc_rx_buff_failed;
        u64 csum_err;
@@ -434,8 +435,15 @@ static inline unsigned int ixgbe_rx_pg_order(struct ixgbe_ring *ring)
 }
 #define ixgbe_rx_pg_size(_ring) (PAGE_SIZE << ixgbe_rx_pg_order(_ring))
 
+#define IXGBE_ITR_ADAPTIVE_MIN_INC     2
+#define IXGBE_ITR_ADAPTIVE_MIN_USECS   10
+#define IXGBE_ITR_ADAPTIVE_MAX_USECS   126
+#define IXGBE_ITR_ADAPTIVE_LATENCY     0x80
+#define IXGBE_ITR_ADAPTIVE_BULK                0x00
+
 struct ixgbe_ring_container {
        struct ixgbe_ring *ring;        /* pointer to linked list of rings */
+       unsigned long next_update;      /* jiffies value of last update */
        unsigned int total_bytes;       /* total bytes processed this int */
        unsigned int total_packets;     /* total packets processed this int */
        u16 work_limit;                 /* total work allowed per interrupt */
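next_update added above holds a jiffies deadline (set to jiffies + 1 when a ring is attached, in the ixgbe_add_ring() hunk below); the reworked adaptive ITR code at the end of this series only trusts the accumulated byte/packet counters when that deadline has not yet passed, using the wrap-safe jiffies comparison. A sketch (hypothetical helper):

static bool ixgbe_itr_sample_stale(const struct ixgbe_ring_container *rc)
{
        /* true when more than a jiffy or two passed since the last update */
        return time_after(jiffies, rc->next_update);
}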
@@ -655,6 +663,7 @@ struct ixgbe_adapter {
        u64 rsc_total_count;
        u64 rsc_total_flush;
        u64 non_eop_descs;
+       u32 alloc_rx_page;
        u32 alloc_rx_page_failed;
        u32 alloc_rx_buff_failed;
 
index 523f9d0..8a32eb7 100644
@@ -175,31 +175,9 @@ static s32 ixgbe_init_phy_ops_82598(struct ixgbe_hw *hw)
  **/
 static s32 ixgbe_start_hw_82598(struct ixgbe_hw *hw)
 {
-#ifndef CONFIG_SPARC
-       u32 regval;
-       u32 i;
-#endif
        s32 ret_val;
 
        ret_val = ixgbe_start_hw_generic(hw);
-
-#ifndef CONFIG_SPARC
-       /* Disable relaxed ordering */
-       for (i = 0; ((i < hw->mac.max_tx_queues) &&
-            (i < IXGBE_DCA_MAX_QUEUES_82598)); i++) {
-               regval = IXGBE_READ_REG(hw, IXGBE_DCA_TXCTRL(i));
-               regval &= ~IXGBE_DCA_TXCTRL_DESC_WRO_EN;
-               IXGBE_WRITE_REG(hw, IXGBE_DCA_TXCTRL(i), regval);
-       }
-
-       for (i = 0; ((i < hw->mac.max_rx_queues) &&
-            (i < IXGBE_DCA_MAX_QUEUES_82598)); i++) {
-               regval = IXGBE_READ_REG(hw, IXGBE_DCA_RXCTRL(i));
-               regval &= ~(IXGBE_DCA_RXCTRL_DATA_WRO_EN |
-                           IXGBE_DCA_RXCTRL_HEAD_WRO_EN);
-               IXGBE_WRITE_REG(hw, IXGBE_DCA_RXCTRL(i), regval);
-       }
-#endif
        if (ret_val)
                return ret_val;
 
index 2c19070..9bef255 100644
@@ -366,25 +366,6 @@ s32 ixgbe_start_hw_gen2(struct ixgbe_hw *hw)
        }
        IXGBE_WRITE_FLUSH(hw);
 
-#ifndef CONFIG_ARCH_WANT_RELAX_ORDER
-       /* Disable relaxed ordering */
-       for (i = 0; i < hw->mac.max_tx_queues; i++) {
-               u32 regval;
-
-               regval = IXGBE_READ_REG(hw, IXGBE_DCA_TXCTRL_82599(i));
-               regval &= ~IXGBE_DCA_TXCTRL_DESC_WRO_EN;
-               IXGBE_WRITE_REG(hw, IXGBE_DCA_TXCTRL_82599(i), regval);
-       }
-
-       for (i = 0; i < hw->mac.max_rx_queues; i++) {
-               u32 regval;
-
-               regval = IXGBE_READ_REG(hw, IXGBE_DCA_RXCTRL(i));
-               regval &= ~(IXGBE_DCA_RXCTRL_DATA_WRO_EN |
-                           IXGBE_DCA_RXCTRL_HEAD_WRO_EN);
-               IXGBE_WRITE_REG(hw, IXGBE_DCA_RXCTRL(i), regval);
-       }
-#endif
        return 0;
 }
 
@@ -3800,10 +3781,10 @@ s32 ixgbe_set_fw_drv_ver_generic(struct ixgbe_hw *hw, u8 maj, u8 min,
        fw_cmd.ver_build = build;
        fw_cmd.ver_sub = sub;
        fw_cmd.hdr.checksum = 0;
-       fw_cmd.hdr.checksum = ixgbe_calculate_checksum((u8 *)&fw_cmd,
-                               (FW_CEM_HDR_LEN + fw_cmd.hdr.buf_len));
        fw_cmd.pad = 0;
        fw_cmd.pad2 = 0;
+       fw_cmd.hdr.checksum = ixgbe_calculate_checksum((u8 *)&fw_cmd,
+                               (FW_CEM_HDR_LEN + fw_cmd.hdr.buf_len));
 
        for (i = 0; i <= FW_CEM_MAX_RETRIES; i++) {
                ret_val = ixgbe_host_interface_command(hw, &fw_cmd,
@@ -4100,8 +4081,8 @@ bool ixgbe_mng_present(struct ixgbe_hw *hw)
                return false;
 
        fwsm = IXGBE_READ_REG(hw, IXGBE_FWSM(hw));
-       fwsm &= IXGBE_FWSM_MODE_MASK;
-       return fwsm == IXGBE_FWSM_FW_MODE_PT;
+
+       return !!(fwsm & IXGBE_FWSM_FW_MODE_PT);
 }
 
 /**
index 72c5657..0aad1c2 100644
@@ -104,6 +104,7 @@ static const struct ixgbe_stats ixgbe_gstrings_stats[] = {
        {"tx_flow_control_xoff", IXGBE_STAT(stats.lxofftxc)},
        {"rx_flow_control_xoff", IXGBE_STAT(stats.lxoffrxc)},
        {"rx_csum_offload_errors", IXGBE_STAT(hw_csum_rx_error)},
+       {"alloc_rx_page", IXGBE_STAT(alloc_rx_page)},
        {"alloc_rx_page_failed", IXGBE_STAT(alloc_rx_page_failed)},
        {"alloc_rx_buff_failed", IXGBE_STAT(alloc_rx_buff_failed)},
        {"rx_no_dma_resources", IXGBE_STAT(hw_rx_no_dma_resources)},
@@ -1048,7 +1049,7 @@ static int ixgbe_set_ringparam(struct net_device *netdev,
 {
        struct ixgbe_adapter *adapter = netdev_priv(netdev);
        struct ixgbe_ring *temp_ring;
-       int i, err = 0;
+       int i, j, err = 0;
        u32 new_rx_count, new_tx_count;
 
        if ((ring->rx_mini_pending) || (ring->rx_jumbo_pending))
@@ -1085,8 +1086,8 @@ static int ixgbe_set_ringparam(struct net_device *netdev,
        }
 
        /* allocate temporary buffer to store rings in */
-       i = max_t(int, adapter->num_tx_queues, adapter->num_rx_queues);
-       i = max_t(int, i, adapter->num_xdp_queues);
+       i = max_t(int, adapter->num_tx_queues + adapter->num_xdp_queues,
+                 adapter->num_rx_queues);
        temp_ring = vmalloc(i * sizeof(struct ixgbe_ring));
 
        if (!temp_ring) {
@@ -1118,8 +1119,8 @@ static int ixgbe_set_ringparam(struct net_device *netdev,
                        }
                }
 
-               for (i = 0; i < adapter->num_xdp_queues; i++) {
-                       memcpy(&temp_ring[i], adapter->xdp_ring[i],
+               for (j = 0; j < adapter->num_xdp_queues; j++, i++) {
+                       memcpy(&temp_ring[i], adapter->xdp_ring[j],
                               sizeof(struct ixgbe_ring));
 
                        temp_ring[i].count = new_tx_count;
@@ -1139,10 +1140,10 @@ static int ixgbe_set_ringparam(struct net_device *netdev,
                        memcpy(adapter->tx_ring[i], &temp_ring[i],
                               sizeof(struct ixgbe_ring));
                }
-               for (i = 0; i < adapter->num_xdp_queues; i++) {
-                       ixgbe_free_tx_resources(adapter->xdp_ring[i]);
+               for (j = 0; j < adapter->num_xdp_queues; j++, i++) {
+                       ixgbe_free_tx_resources(adapter->xdp_ring[j]);
 
-                       memcpy(adapter->xdp_ring[i], &temp_ring[i],
+                       memcpy(adapter->xdp_ring[j], &temp_ring[i],
                               sizeof(struct ixgbe_ring));
                }
 
@@ -1916,8 +1917,6 @@ static u16 ixgbe_clean_test_rings(struct ixgbe_ring *rx_ring,
                                  unsigned int size)
 {
        union ixgbe_adv_rx_desc *rx_desc;
-       struct ixgbe_rx_buffer *rx_buffer;
-       struct ixgbe_tx_buffer *tx_buffer;
        u16 rx_ntc, tx_ntc, count = 0;
 
        /* initialize next to clean and descriptor values */
@@ -1925,7 +1924,38 @@ static u16 ixgbe_clean_test_rings(struct ixgbe_ring *rx_ring,
        tx_ntc = tx_ring->next_to_clean;
        rx_desc = IXGBE_RX_DESC(rx_ring, rx_ntc);
 
+       while (tx_ntc != tx_ring->next_to_use) {
+               union ixgbe_adv_tx_desc *tx_desc;
+               struct ixgbe_tx_buffer *tx_buffer;
+
+               tx_desc = IXGBE_TX_DESC(tx_ring, tx_ntc);
+
+               /* if DD is not set transmit has not completed */
+               if (!(tx_desc->wb.status & cpu_to_le32(IXGBE_TXD_STAT_DD)))
+                       return count;
+
+               /* unmap buffer on Tx side */
+               tx_buffer = &tx_ring->tx_buffer_info[tx_ntc];
+
+               /* Free all the Tx ring sk_buffs */
+               dev_kfree_skb_any(tx_buffer->skb);
+
+               /* unmap skb header data */
+               dma_unmap_single(tx_ring->dev,
+                                dma_unmap_addr(tx_buffer, dma),
+                                dma_unmap_len(tx_buffer, len),
+                                DMA_TO_DEVICE);
+               dma_unmap_len_set(tx_buffer, len, 0);
+
+               /* increment Tx next to clean counter */
+               tx_ntc++;
+               if (tx_ntc == tx_ring->count)
+                       tx_ntc = 0;
+       }
+
        while (rx_desc->wb.upper.length) {
+               struct ixgbe_rx_buffer *rx_buffer;
+
                /* check Rx buffer */
                rx_buffer = &rx_ring->rx_buffer_info[rx_ntc];
 
@@ -1938,6 +1968,8 @@ static u16 ixgbe_clean_test_rings(struct ixgbe_ring *rx_ring,
                /* verify contents of skb */
                if (ixgbe_check_lbtest_frame(rx_buffer, size))
                        count++;
+               else
+                       break;
 
                /* sync Rx buffer for device write */
                dma_sync_single_for_device(rx_ring->dev,
@@ -1945,26 +1977,10 @@ static u16 ixgbe_clean_test_rings(struct ixgbe_ring *rx_ring,
                                           ixgbe_rx_bufsz(rx_ring),
                                           DMA_FROM_DEVICE);
 
-               /* unmap buffer on Tx side */
-               tx_buffer = &tx_ring->tx_buffer_info[tx_ntc];
-
-               /* Free all the Tx ring sk_buffs */
-               dev_kfree_skb_any(tx_buffer->skb);
-
-               /* unmap skb header data */
-               dma_unmap_single(tx_ring->dev,
-                                dma_unmap_addr(tx_buffer, dma),
-                                dma_unmap_len(tx_buffer, len),
-                                DMA_TO_DEVICE);
-               dma_unmap_len_set(tx_buffer, len, 0);
-
-               /* increment Rx/Tx next to clean counters */
+               /* increment Rx next to clean counter */
                rx_ntc++;
                if (rx_ntc == rx_ring->count)
                        rx_ntc = 0;
-               tx_ntc++;
-               if (tx_ntc == tx_ring->count)
-                       tx_ntc = 0;
 
                /* fetch next descriptor */
                rx_desc = IXGBE_RX_DESC(rx_ring, rx_ntc);
index f1bfae0..8e2a957 100644
@@ -806,6 +806,7 @@ static void ixgbe_add_ring(struct ixgbe_ring *ring,
        ring->next = head->ring;
        head->ring = ring;
        head->count++;
+       head->next_update = jiffies + 1;
 }
 
 /**
@@ -879,8 +880,11 @@ static int ixgbe_alloc_q_vector(struct ixgbe_adapter *adapter,
        /* initialize work limits */
        q_vector->tx.work_limit = adapter->tx_work_limit;
 
-       /* initialize pointer to rings */
-       ring = q_vector->ring;
+       /* Initialize setting for adaptive ITR */
+       q_vector->tx.itr = IXGBE_ITR_ADAPTIVE_MAX_USECS |
+                          IXGBE_ITR_ADAPTIVE_LATENCY;
+       q_vector->rx.itr = IXGBE_ITR_ADAPTIVE_MAX_USECS |
+                          IXGBE_ITR_ADAPTIVE_LATENCY;
 
        /* initialize ITR */
        if (txr_count && !rxr_count) {
@@ -897,6 +901,9 @@ static int ixgbe_alloc_q_vector(struct ixgbe_adapter *adapter,
                        q_vector->itr = adapter->rx_itr_setting;
        }
 
+       /* initialize pointer to rings */
+       ring = q_vector->ring;
+
        while (txr_count) {
                /* assign generic ring traits */
                ring->dev = &adapter->pdev->dev;
index 3942c62..7683c14 100644 (file)
@@ -1620,6 +1620,7 @@ static bool ixgbe_alloc_mapped_page(struct ixgbe_ring *rx_ring,
        bi->page = page;
        bi->page_offset = ixgbe_rx_offset(rx_ring);
        bi->pagecnt_bias = 1;
+       rx_ring->rx_stats.alloc_rx_page++;
 
        return true;
 }
@@ -2539,50 +2540,174 @@ enum latency_range {
 static void ixgbe_update_itr(struct ixgbe_q_vector *q_vector,
                             struct ixgbe_ring_container *ring_container)
 {
-       int bytes = ring_container->total_bytes;
-       int packets = ring_container->total_packets;
-       u32 timepassed_us;
-       u64 bytes_perint;
-       u8 itr_setting = ring_container->itr;
+       unsigned int itr = IXGBE_ITR_ADAPTIVE_MIN_USECS |
+                          IXGBE_ITR_ADAPTIVE_LATENCY;
+       unsigned int avg_wire_size, packets, bytes;
+       unsigned long next_update = jiffies;
 
-       if (packets == 0)
+       /* If we don't have any rings just leave ourselves set for maximum
+        * possible latency so we take ourselves out of the equation.
+        */
+       if (!ring_container->ring)
                return;
 
-       /* simple throttlerate management
-        *   0-10MB/s   lowest (100000 ints/s)
-        *  10-20MB/s   low    (20000 ints/s)
-        *  20-1249MB/s bulk   (12000 ints/s)
+       /* If we didn't update within up to 1 - 2 jiffies we can assume
+        * that either packets are coming in so slowly that there hasn't been
+        * any work, or that there is so much work that NAPI is dealing
+        * with interrupt moderation and we don't need to do anything.
         */
-       /* what was last interrupt timeslice? */
-       timepassed_us = q_vector->itr >> 2;
-       if (timepassed_us == 0)
-               return;
+       if (time_after(next_update, ring_container->next_update))
+               goto clear_counts;
 
-       bytes_perint = bytes / timepassed_us; /* bytes/usec */
+       packets = ring_container->total_packets;
 
-       switch (itr_setting) {
-       case lowest_latency:
-               if (bytes_perint > 10)
-                       itr_setting = low_latency;
-               break;
-       case low_latency:
-               if (bytes_perint > 20)
-                       itr_setting = bulk_latency;
-               else if (bytes_perint <= 10)
-                       itr_setting = lowest_latency;
+       /* We have no packets to actually measure against. This means
+        * either one of the other queues on this vector is active or
+        * we are a Tx queue doing TSO with too high of an interrupt rate.
+        *
+        * When this occurs just tick up our delay by the minimum value
+        * and hope that this extra delay will prevent us from being called
+        * without any work on our queue.
+        */
+       if (!packets) {
+               itr = (q_vector->itr >> 2) + IXGBE_ITR_ADAPTIVE_MIN_INC;
+               if (itr > IXGBE_ITR_ADAPTIVE_MAX_USECS)
+                       itr = IXGBE_ITR_ADAPTIVE_MAX_USECS;
+               itr += ring_container->itr & IXGBE_ITR_ADAPTIVE_LATENCY;
+               goto clear_counts;
+       }
+
+       bytes = ring_container->total_bytes;
+
+       /* If packets are less than 4 and bytes are less than 9000, assume
+        * insufficient data to use bulk rate limiting approach. We are
+        * likely latency driven.
+        */
+       if (packets < 4 && bytes < 9000) {
+               itr = IXGBE_ITR_ADAPTIVE_LATENCY;
+               goto adjust_by_size;
+       }
+
+       /* Between 4 and 48 we can assume that our current interrupt delay
+        * is only slightly too low. As such we should increase it by a small
+        * fixed amount.
+        */
+       if (packets < 48) {
+               itr = (q_vector->itr >> 2) + IXGBE_ITR_ADAPTIVE_MIN_INC;
+               if (itr > IXGBE_ITR_ADAPTIVE_MAX_USECS)
+                       itr = IXGBE_ITR_ADAPTIVE_MAX_USECS;
+               goto clear_counts;
+       }
+
+       /* Between 48 and 96 is our "goldilocks" zone where we are working
+        * out "just right". Just report that our current ITR is good for us.
+        */
+       if (packets < 96) {
+               itr = q_vector->itr >> 2;
+               goto clear_counts;
+       }
+
+       /* If packet count is 96 or greater we are likely looking at a slight
+        * overrun of the delay we want. Try halving our delay to see if that
+        * will cut the number of packets in half per interrupt.
+        */
+       if (packets < 256) {
+               itr = q_vector->itr >> 3;
+               if (itr < IXGBE_ITR_ADAPTIVE_MIN_USECS)
+                       itr = IXGBE_ITR_ADAPTIVE_MIN_USECS;
+               goto clear_counts;
+       }
+
+       /* The paths below assume we are dealing with a bulk ITR since number
+        * of packets is 256 or greater. We are just going to have to compute
+        * a value and try to bring the count under control, though for smaller
+        * packet sizes there isn't much we can do as NAPI polling will likely
+        * be kicking in sooner rather than later.
+        */
+       itr = IXGBE_ITR_ADAPTIVE_BULK;
+
+adjust_by_size:
+       /* If packet counts are 256 or greater we can assume we have a gross
+        * overestimation of what the rate should be. Instead of trying to fine
+        * tune it, just use the formula below to try and dial in an exact value
+        * given the current packet size of the frame.
+        */
+       avg_wire_size = bytes / packets;
+
+       /* The following is a crude approximation of:
+        *  wmem_default / (size + overhead) = desired_pkts_per_int
+        *  rate / bits_per_byte / (size + ethernet overhead) = pkt_rate
+        *  (desired_pkt_rate / pkt_rate) * usecs_per_sec = ITR value
+        *
+        * Assuming wmem_default is 212992 and overhead is 640 bytes per
+        * packet, (256 skb, 64 headroom, 320 shared info), we can reduce the
+        * formula down to
+        *
+        *  (170 * (size + 24)) / (size + 640) = ITR
+        *
+        * We first do some math on the packet size and then finally bitshift
+        * by 8 after rounding up. We also have to account for PCIe link speed
+        * difference as ITR scales based on this.
+        */
+       if (avg_wire_size <= 60) {
+               /* Start at 50k ints/sec */
+               avg_wire_size = 5120;
+       } else if (avg_wire_size <= 316) {
+               /* 50K ints/sec to 16K ints/sec */
+               avg_wire_size *= 40;
+               avg_wire_size += 2720;
+       } else if (avg_wire_size <= 1084) {
+               /* 16K ints/sec to 9.2K ints/sec */
+               avg_wire_size *= 15;
+               avg_wire_size += 11452;
+       } else if (avg_wire_size <= 1980) {
+               /* 9.2K ints/sec to 8K ints/sec */
+               avg_wire_size *= 5;
+               avg_wire_size += 22420;
+       } else {
+               /* plateau at a limit of 8K ints/sec */
+               avg_wire_size = 32256;
+       }
+
+       /* If we are in low latency mode, halve our delay, which doubles the rate
+        * to somewhere between 100K and 16K ints/sec
+        */
+       if (itr & IXGBE_ITR_ADAPTIVE_LATENCY)
+               avg_wire_size >>= 1;
+
+       /* Resultant value is 256 times larger than it needs to be. This
+        * gives us room to adjust the value as needed to either increase
+        * or decrease the value based on link speeds of 10G, 2.5G, 1G, etc.
+        *
+        * Use addition as we have already recorded the new latency flag
+        * for the ITR value.
+        */
+       switch (q_vector->adapter->link_speed) {
+       case IXGBE_LINK_SPEED_10GB_FULL:
+       case IXGBE_LINK_SPEED_100_FULL:
+       default:
+               itr += DIV_ROUND_UP(avg_wire_size,
+                                   IXGBE_ITR_ADAPTIVE_MIN_INC * 256) *
+                      IXGBE_ITR_ADAPTIVE_MIN_INC;
                break;
-       case bulk_latency:
-               if (bytes_perint <= 20)
-                       itr_setting = low_latency;
+       case IXGBE_LINK_SPEED_2_5GB_FULL:
+       case IXGBE_LINK_SPEED_1GB_FULL:
+       case IXGBE_LINK_SPEED_10_FULL:
+               itr += DIV_ROUND_UP(avg_wire_size,
+                                   IXGBE_ITR_ADAPTIVE_MIN_INC * 64) *
+                      IXGBE_ITR_ADAPTIVE_MIN_INC;
                break;
        }
 
-       /* clear work counters since we have the values we need */
+clear_counts:
+       /* write back value */
+       ring_container->itr = itr;
+
+       /* next update should occur within next jiffy */
+       ring_container->next_update = next_update + 1;
+
        ring_container->total_bytes = 0;
        ring_container->total_packets = 0;
-
-       /* write updated itr to ring container */
-       ring_container->itr = itr_setting;
 }
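
Putting the piecewise table above into numbers: the branches approximate 256 times the (170 * (size + 24)) / (size + 640) formula from the comment. Below is a hedged, stand-alone sketch of that approximation, scaled the way the 10G/100M branch above does (divide by 256, rounded up to a multiple of the minimum increment, assumed here to be 2, with no latency halving). The helper name and constants are illustrative, not the driver's API.

#include <stdio.h>

/* Hypothetical helper mirroring the piecewise approximation above;
 * the constants are copied from the table, the final scaling assumes
 * the 10G branch (result is 256x too large, minimum increment of 2).
 */
static unsigned int approx_itr_usecs(unsigned int avg_wire_size)
{
	unsigned int v;

	if (avg_wire_size <= 60)
		v = 5120;				/* ~50K ints/sec */
	else if (avg_wire_size <= 316)
		v = avg_wire_size * 40 + 2720;		/* 50K..16K ints/sec */
	else if (avg_wire_size <= 1084)
		v = avg_wire_size * 15 + 11452;		/* 16K..9.2K ints/sec */
	else if (avg_wire_size <= 1980)
		v = avg_wire_size * 5 + 22420;		/* 9.2K..8K ints/sec */
	else
		v = 32256;				/* plateau at 8K ints/sec */

	/* DIV_ROUND_UP(v, 2 * 256) * 2, written out for userspace */
	return ((v + 2 * 256 - 1) / (2 * 256)) * 2;
}

int main(void)
{
	/* 1500-byte frames land at ~118 usecs, i.e. roughly 8.5K ints/sec */
	printf("%u usecs\n", approx_itr_usecs(1500));
	return 0;
}
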
 
 /**
@@ -2624,34 +2749,19 @@ void ixgbe_write_eitr(struct ixgbe_q_vector *q_vector)
 
 static void ixgbe_set_itr(struct ixgbe_q_vector *q_vector)
 {
-       u32 new_itr = q_vector->itr;
-       u8 current_itr;
+       u32 new_itr;
 
        ixgbe_update_itr(q_vector, &q_vector->tx);
        ixgbe_update_itr(q_vector, &q_vector->rx);
 
-       current_itr = max(q_vector->rx.itr, q_vector->tx.itr);
+       /* use the smallest value of new ITR delay calculations */
+       new_itr = min(q_vector->rx.itr, q_vector->tx.itr);
 
-       switch (current_itr) {
-       /* counts and packets in update_itr are dependent on these numbers */
-       case lowest_latency:
-               new_itr = IXGBE_100K_ITR;
-               break;
-       case low_latency:
-               new_itr = IXGBE_20K_ITR;
-               break;
-       case bulk_latency:
-               new_itr = IXGBE_12K_ITR;
-               break;
-       default:
-               break;
-       }
+       /* Clear latency flag if set, shift into correct position */
+       new_itr &= ~IXGBE_ITR_ADAPTIVE_LATENCY;
+       new_itr <<= 2;
 
        if (new_itr != q_vector->itr) {
-               /* do an exponential smoothing */
-               new_itr = (10 * new_itr * q_vector->itr) /
-                         ((9 * new_itr) + q_vector->itr);
-
                /* save the algorithm value here */
                q_vector->itr = new_itr;
 
@@ -4904,7 +5014,7 @@ static void ixgbe_clear_udp_tunnel_port(struct ixgbe_adapter *adapter, u32 mask)
                                IXGBE_FLAG_GENEVE_OFFLOAD_CAPABLE)))
                return;
 
-       vxlanctrl = IXGBE_READ_REG(hw, IXGBE_VXLANCTRL) && ~mask;
+       vxlanctrl = IXGBE_READ_REG(hw, IXGBE_VXLANCTRL) & ~mask;
        IXGBE_WRITE_REG(hw, IXGBE_VXLANCTRL, vxlanctrl);
 
        if (mask & IXGBE_VXLANCTRL_VXLAN_UDPPORT_MASK)
@@ -6794,6 +6904,7 @@ void ixgbe_update_stats(struct ixgbe_adapter *adapter)
        u32 i, missed_rx = 0, mpc, bprc, lxon, lxoff, xon_off_tot;
        u64 non_eop_descs = 0, restart_queue = 0, tx_busy = 0;
        u64 alloc_rx_page_failed = 0, alloc_rx_buff_failed = 0;
+       u64 alloc_rx_page = 0;
        u64 bytes = 0, packets = 0, hw_csum_rx_error = 0;
 
        if (test_bit(__IXGBE_DOWN, &adapter->state) ||
@@ -6814,6 +6925,7 @@ void ixgbe_update_stats(struct ixgbe_adapter *adapter)
        for (i = 0; i < adapter->num_rx_queues; i++) {
                struct ixgbe_ring *rx_ring = adapter->rx_ring[i];
                non_eop_descs += rx_ring->rx_stats.non_eop_descs;
+               alloc_rx_page += rx_ring->rx_stats.alloc_rx_page;
                alloc_rx_page_failed += rx_ring->rx_stats.alloc_rx_page_failed;
                alloc_rx_buff_failed += rx_ring->rx_stats.alloc_rx_buff_failed;
                hw_csum_rx_error += rx_ring->rx_stats.csum_err;
@@ -6821,6 +6933,7 @@ void ixgbe_update_stats(struct ixgbe_adapter *adapter)
                packets += rx_ring->stats.packets;
        }
        adapter->non_eop_descs = non_eop_descs;
+       adapter->alloc_rx_page = alloc_rx_page;
        adapter->alloc_rx_page_failed = alloc_rx_page_failed;
        adapter->alloc_rx_buff_failed = alloc_rx_buff_failed;
        adapter->hw_csum_rx_error = hw_csum_rx_error;
@@ -8552,6 +8665,10 @@ static int ixgbe_ioctl(struct net_device *netdev, struct ifreq *req, int cmd)
                return ixgbe_ptp_set_ts_config(adapter, req);
        case SIOCGHWTSTAMP:
                return ixgbe_ptp_get_ts_config(adapter, req);
+       case SIOCGMIIPHY:
+               if (!adapter->hw.phy.ops.read_reg)
+                       return -EOPNOTSUPP;
+               /* fall through */
        default:
                return mdio_mii_ioctl(&adapter->hw.phy.mdio, if_mii(req), cmd);
        }
@@ -9758,6 +9875,17 @@ static void ixgbe_fwd_del(struct net_device *pdev, void *priv)
        limit = find_last_bit(&adapter->fwd_bitmask, 32);
        adapter->ring_feature[RING_F_VMDQ].limit = limit + 1;
        ixgbe_fwd_ring_down(fwd_adapter->netdev, fwd_adapter);
+
+       /* go back to full RSS if we're done with our VMQs */
+       if (adapter->ring_feature[RING_F_VMDQ].limit == 1) {
+               int rss = min_t(int, ixgbe_max_rss_indices(adapter),
+                               num_online_cpus());
+
+               adapter->flags &= ~IXGBE_FLAG_VMDQ_ENABLED;
+               adapter->flags &= ~IXGBE_FLAG_SRIOV_ENABLED;
+               adapter->ring_feature[RING_F_RSS].limit = rss;
+       }
+
        ixgbe_setup_tc(pdev, netdev_get_num_tc(pdev));
        netdev_dbg(pdev, "pool %i:%i queues %i:%i VSI bitmask %lx\n",
                   fwd_adapter->pool, adapter->num_rx_pools,
@@ -10737,6 +10865,9 @@ skip_bad_vf_detection:
        if (!test_bit(__IXGBE_SERVICE_INITED, &adapter->state))
                return PCI_ERS_RESULT_DISCONNECT;
 
+       if (!netif_device_present(netdev))
+               return PCI_ERS_RESULT_DISCONNECT;
+
        rtnl_lock();
        netif_device_detach(netdev);
 
index 6ea0d6a..b8c5fd2 100644 (file)
@@ -619,12 +619,6 @@ s32 ixgbe_acquire_swfw_sync_X540(struct ixgbe_hw *hw, u32 mask)
                usleep_range(5000, 10000);
        }
 
-       /* Failed to get SW only semaphore */
-       if (swmask == IXGBE_GSSR_SW_MNG_SM) {
-               hw_dbg(hw, "Failed to get SW only semaphore\n");
-               return IXGBE_ERR_SWFW_SYNC;
-       }
-
        /* If the resource is not released by the FW/HW the SW can assume that
         * the FW/HW malfunctions. In that case the SW should set the SW bit(s)
         * of the requested resource(s) while ignoring the corresponding FW/HW
@@ -647,7 +641,8 @@ s32 ixgbe_acquire_swfw_sync_X540(struct ixgbe_hw *hw, u32 mask)
         */
        if (swfw_sync & swmask) {
                u32 rmask = IXGBE_GSSR_EEP_SM | IXGBE_GSSR_PHY0_SM |
-                           IXGBE_GSSR_PHY1_SM | IXGBE_GSSR_MAC_CSR_SM;
+                           IXGBE_GSSR_PHY1_SM | IXGBE_GSSR_MAC_CSR_SM |
+                           IXGBE_GSSR_SW_MNG_SM;
 
                if (swi2c_mask)
                        rmask |= IXGBE_GSSR_I2C_MASK;
@@ -763,6 +758,8 @@ static void ixgbe_release_swfw_sync_semaphore(struct ixgbe_hw *hw)
  **/
 void ixgbe_init_swfw_sync_X540(struct ixgbe_hw *hw)
 {
+       u32 rmask;
+
        /* First try to grab the semaphore but we don't need to bother
         * looking to see whether we got the lock or not since we do
         * the same thing regardless of whether we got the lock or not.
@@ -771,6 +768,14 @@ void ixgbe_init_swfw_sync_X540(struct ixgbe_hw *hw)
         */
        ixgbe_get_swfw_sync_semaphore(hw);
        ixgbe_release_swfw_sync_semaphore(hw);
+
+       /* Acquire and release all software resources. */
+       rmask = IXGBE_GSSR_EEP_SM | IXGBE_GSSR_PHY0_SM |
+               IXGBE_GSSR_PHY1_SM | IXGBE_GSSR_MAC_CSR_SM |
+               IXGBE_GSSR_SW_MNG_SM | IXGBE_GSSR_I2C_MASK;
+
+       ixgbe_acquire_swfw_sync_X540(hw, rmask);
+       ixgbe_release_swfw_sync_X540(hw, rmask);
 }
 
 /**
index 19fbb2f..cb7da5f 100644 (file)
@@ -900,6 +900,8 @@ static s32 ixgbe_read_ee_hostif_buffer_X550(struct ixgbe_hw *hw,
                /* convert offset from words to bytes */
                buffer.address = cpu_to_be32((offset + current_word) * 2);
                buffer.length = cpu_to_be16(words_to_read * 2);
+               buffer.pad2 = 0;
+               buffer.pad3 = 0;
 
                status = ixgbe_hic_unlocked(hw, (u32 *)&buffer, sizeof(buffer),
                                            IXGBE_HI_COMMAND_TIMEOUT);
@@ -3192,6 +3194,9 @@ static s32 ixgbe_init_phy_ops_X550em(struct ixgbe_hw *hw)
 
        /* Identify the PHY or SFP module */
        ret_val = phy->ops.identify(hw);
+       if (ret_val == IXGBE_ERR_SFP_NOT_SUPPORTED ||
+           ret_val == IXGBE_ERR_PHY_ADDR_INVALID)
+               return ret_val;
 
        /* Setup function pointers based on detected hardware */
        ixgbe_init_mac_link_ops_X550em(hw);
@@ -3394,9 +3399,10 @@ static s32 ixgbe_reset_hw_X550em(struct ixgbe_hw *hw)
        ixgbe_clear_tx_pending(hw);
 
        /* PHY ops must be identified and initialized prior to reset */
-
-       /* Identify PHY and related function pointers */
        status = hw->phy.ops.init(hw);
+       if (status == IXGBE_ERR_SFP_NOT_SUPPORTED ||
+           status == IXGBE_ERR_PHY_ADDR_INVALID)
+               return status;
 
        /* start the external PHY */
        if (hw->phy.type == ixgbe_phy_x550em_ext_t) {
@@ -3884,7 +3890,7 @@ static const struct ixgbe_mac_operations mac_ops_X550EM_x_fw = {
        .write_iosf_sb_reg      = ixgbe_write_iosf_sb_reg_x550,
 };
 
-static struct ixgbe_mac_operations mac_ops_x550em_a = {
+static const struct ixgbe_mac_operations mac_ops_x550em_a = {
        X550_COMMON_MAC
        .led_on                 = ixgbe_led_on_t_x550em,
        .led_off                = ixgbe_led_off_t_x550em,
@@ -3905,7 +3911,7 @@ static struct ixgbe_mac_operations mac_ops_x550em_a = {
        .write_iosf_sb_reg      = ixgbe_write_iosf_sb_reg_x550a,
 };
 
-static struct ixgbe_mac_operations mac_ops_x550em_a_fw = {
+static const struct ixgbe_mac_operations mac_ops_x550em_a_fw = {
        X550_COMMON_MAC
        .led_on                 = ixgbe_led_on_generic,
        .led_off                = ixgbe_led_off_generic,
index 3d4e4a5..bf1f041 100644 (file)
@@ -1742,13 +1742,18 @@ static int mlx4_en_set_rxnfc(struct net_device *dev, struct ethtool_rxnfc *cmd)
        return err;
 }
 
+static int mlx4_en_get_max_num_rx_rings(struct net_device *dev)
+{
+       return min_t(int, num_online_cpus(), MAX_RX_RINGS);
+}
+
 static void mlx4_en_get_channels(struct net_device *dev,
                                 struct ethtool_channels *channel)
 {
        struct mlx4_en_priv *priv = netdev_priv(dev);
 
-       channel->max_rx = MAX_RX_RINGS;
-       channel->max_tx = MLX4_EN_MAX_TX_RING_P_UP;
+       channel->max_rx = mlx4_en_get_max_num_rx_rings(dev);
+       channel->max_tx = priv->mdev->profile.max_num_tx_rings_p_up;
 
        channel->rx_count = priv->rx_ring_num;
        channel->tx_count = priv->tx_ring_num[TX] /
@@ -1777,7 +1782,7 @@ static int mlx4_en_set_channels(struct net_device *dev,
        mutex_lock(&mdev->state_lock);
        xdp_count = priv->tx_ring_num[TX_XDP] ? channel->rx_count : 0;
        if (channel->tx_count * priv->prof->num_up + xdp_count >
-           MAX_TX_RINGS) {
+           priv->mdev->profile.max_num_tx_rings_p_up * priv->prof->num_up) {
                err = -EINVAL;
                en_err(priv,
                       "Total number of TX and XDP rings (%d) exceeds the maximum supported (%d)\n",
index 686e18d..2c29654 100644 (file)
@@ -153,7 +153,7 @@ static void mlx4_en_get_profile(struct mlx4_en_dev *mdev)
        int i;
 
        params->udp_rss = udp_rss;
-       params->num_tx_rings_p_up = mlx4_low_memory_profile() ?
+       params->max_num_tx_rings_p_up = mlx4_low_memory_profile() ?
                MLX4_EN_MIN_TX_RING_P_UP :
                min_t(int, num_online_cpus(), MLX4_EN_MAX_TX_RING_P_UP);
 
@@ -170,8 +170,8 @@ static void mlx4_en_get_profile(struct mlx4_en_dev *mdev)
                params->prof[i].tx_ring_size = MLX4_EN_DEF_TX_RING_SIZE;
                params->prof[i].rx_ring_size = MLX4_EN_DEF_RX_RING_SIZE;
                params->prof[i].num_up = MLX4_EN_NUM_UP_LOW;
-               params->prof[i].num_tx_rings_p_up = params->num_tx_rings_p_up;
-               params->prof[i].tx_ring_num[TX] = params->num_tx_rings_p_up *
+               params->prof[i].num_tx_rings_p_up = params->max_num_tx_rings_p_up;
+               params->prof[i].tx_ring_num[TX] = params->max_num_tx_rings_p_up *
                        params->prof[i].num_up;
                params->prof[i].rss_rings = 0;
                params->prof[i].inline_thold = inline_thold;
index 9c218f1..e4c7a80 100644 (file)
@@ -3305,7 +3305,7 @@ int mlx4_en_init_netdev(struct mlx4_en_dev *mdev, int port,
        priv->pflags = MLX4_EN_PRIV_FLAGS_BLUEFLAME;
        priv->ctrl_flags = cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE |
                        MLX4_WQE_CTRL_SOLICITED);
-       priv->num_tx_rings_p_up = mdev->profile.num_tx_rings_p_up;
+       priv->num_tx_rings_p_up = mdev->profile.max_num_tx_rings_p_up;
        priv->tx_work_limit = MLX4_EN_DEFAULT_TX_WORK;
        netdev_rss_key_fill(priv->rss_key, sizeof(priv->rss_key));
 
index 5a47f96..6883ac7 100644 (file)
@@ -53,7 +53,7 @@ void mlx4_en_fill_qp_context(struct mlx4_en_priv *priv, int size, int stride,
        if (is_tx) {
                context->sq_size_stride = ilog2(size) << 3 | (ilog2(stride) - 4);
                if (mdev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_PORT_REMAP)
-                       context->params2 |= MLX4_QP_BIT_FPP;
+                       context->params2 |= cpu_to_be32(MLX4_QP_BIT_FPP);
 
        } else {
                context->sq_size_stride = ilog2(TXBB_SIZE) - 4;
index 8f9cb8a..a786695 100644 (file)
@@ -254,8 +254,7 @@ void mlx4_en_set_num_rx_rings(struct mlx4_en_dev *mdev)
                                         DEF_RX_RINGS));
 
                num_rx_rings = mlx4_low_memory_profile() ? MIN_RX_RINGS :
-                       min_t(int, num_of_eqs,
-                             netif_get_num_default_rss_queues());
+                       min_t(int, num_of_eqs, num_online_cpus());
                mdev->profile.prof[i].rx_ring_num =
                        rounddown_pow_of_two(num_rx_rings);
        }
index 8a32a8f..2cc82dc 100644 (file)
@@ -718,7 +718,7 @@ void mlx4_en_xmit_doorbell(struct mlx4_en_tx_ring *ring)
 #else
        iowrite32be(
 #endif
-                 ring->doorbell_qpn,
+                 (__force u32)ring->doorbell_qpn,
                  ring->bf.uar->map + MLX4_SEND_DOORBELL);
 }
 
index 16c0994..634f603 100644 (file)
@@ -57,12 +57,12 @@ MODULE_PARM_DESC(enable_qos, "Enable Enhanced QoS support (default: off)");
 #define MLX4_GET(dest, source, offset)                               \
        do {                                                          \
                void *__p = (char *) (source) + (offset);             \
-               u64 val;                                              \
-               switch (sizeof(dest)) {                       \
+               __be64 val;                                           \
+               switch (sizeof(dest)) {                               \
                case 1: (dest) = *(u8 *) __p;       break;            \
                case 2: (dest) = be16_to_cpup(__p); break;            \
                case 4: (dest) = be32_to_cpup(__p); break;            \
-               case 8: val = get_unaligned((u64 *)__p);              \
+               case 8: val = get_unaligned((__be64 *)__p);           \
                        (dest) = be64_to_cpu(val);  break;            \
                default: __buggy_use_of_MLX4_GET();                   \
                }                                                     \
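
The MLX4_GET change above is essentially a sparse-annotation fix: the raw 8-byte field read with get_unaligned() is big-endian, so the temporary should be typed __be64 before be64_to_cpu() converts it. A hedged userspace analogy follows, where be64toh() stands in for be64_to_cpu() and the buffer contents are made up; it only illustrates reading an unaligned big-endian 64-bit field and converting it to host order.

#include <endian.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
	/* A big-endian 64-bit value stored at an odd offset in a buffer. */
	unsigned char buf[9] = { 0,
		0x00, 0x00, 0x00, 0x00, 0x12, 0x34, 0x56, 0x78 };
	uint64_t raw;

	memcpy(&raw, buf + 1, sizeof(raw));	/* unaligned, still big-endian */
	printf("0x%llx\n", (unsigned long long)be64toh(raw));	/* 0x12345678 */
	return 0;
}
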
index fdb3ad0..245e9ea 100644 (file)
@@ -399,7 +399,7 @@ struct mlx4_en_profile {
        u32 active_ports;
        u32 small_pkt_int;
        u8 no_reset;
-       u8 num_tx_rings_p_up;
+       u8 max_num_tx_rings_p_up;
        struct mlx4_en_port_profile prof[MLX4_MAX_PORTS + 1];
 };
 
index 728a2fb..2033209 100644 (file)
@@ -925,7 +925,7 @@ int mlx4_qp_to_ready(struct mlx4_dev *dev, struct mlx4_mtt *mtt,
                context->flags &= cpu_to_be32(~(0xf << 28));
                context->flags |= cpu_to_be32(states[i + 1] << 28);
                if (states[i + 1] != MLX4_QP_STATE_RTR)
-                       context->params2 &= ~MLX4_QP_BIT_FPP;
+                       context->params2 &= ~cpu_to_be32(MLX4_QP_BIT_FPP);
                err = mlx4_qp_modify(dev, mtt, states[i], states[i + 1],
                                     context, 0, 0, qp);
                if (err) {
index fabb533..04304dd 100644 (file)
@@ -3185,7 +3185,7 @@ static int verify_qp_parameters(struct mlx4_dev *dev,
        optpar  = be32_to_cpu(*(__be32 *) inbox->buf);
 
        if (slave != mlx4_master_func_num(dev)) {
-               qp_ctx->params2 &= ~MLX4_QP_BIT_FPP;
+               qp_ctx->params2 &= ~cpu_to_be32(MLX4_QP_BIT_FPP);
                /* setting QP rate-limit is disallowed for VFs */
                if (qp_ctx->rate_limit_params)
                        return -EPERM;
index 5a7bea6..7a136ae 100644 (file)
@@ -145,10 +145,10 @@ static struct init_tree_node {
        }
 };
 
-enum fs_i_mutex_lock_class {
-       FS_MUTEX_GRANDPARENT,
-       FS_MUTEX_PARENT,
-       FS_MUTEX_CHILD
+enum fs_i_lock_class {
+       FS_LOCK_GRANDPARENT,
+       FS_LOCK_PARENT,
+       FS_LOCK_CHILD
 };
 
 static const struct rhashtable_params rhash_fte = {
@@ -168,10 +168,16 @@ static const struct rhashtable_params rhash_fg = {
 
 };
 
-static void del_rule(struct fs_node *node);
-static void del_flow_table(struct fs_node *node);
-static void del_flow_group(struct fs_node *node);
-static void del_fte(struct fs_node *node);
+static void del_hw_flow_table(struct fs_node *node);
+static void del_hw_flow_group(struct fs_node *node);
+static void del_hw_fte(struct fs_node *node);
+static void del_sw_flow_table(struct fs_node *node);
+static void del_sw_flow_group(struct fs_node *node);
+static void del_sw_fte(struct fs_node *node);
+/* Deleting a rule (destination) is a special case that
+ * requires locking the FTE for the entire deletion process.
+ */
+static void del_sw_hw_rule(struct fs_node *node);
 static bool mlx5_flow_dests_cmp(struct mlx5_flow_destination *d1,
                                struct mlx5_flow_destination *d2);
 static struct mlx5_flow_rule *
@@ -179,14 +185,16 @@ find_flow_rule(struct fs_fte *fte,
               struct mlx5_flow_destination *dest);
 
 static void tree_init_node(struct fs_node *node,
-                          unsigned int refcount,
-                          void (*remove_func)(struct fs_node *))
+                          void (*del_hw_func)(struct fs_node *),
+                          void (*del_sw_func)(struct fs_node *))
 {
-       atomic_set(&node->refcount, refcount);
+       atomic_set(&node->refcount, 1);
        INIT_LIST_HEAD(&node->list);
        INIT_LIST_HEAD(&node->children);
-       mutex_init(&node->lock);
-       node->remove_func = remove_func;
+       init_rwsem(&node->lock);
+       node->del_hw_func = del_hw_func;
+       node->del_sw_func = del_sw_func;
+       node->active = false;
 }
 
 static void tree_add_node(struct fs_node *node, struct fs_node *parent)
@@ -202,50 +210,70 @@ static void tree_add_node(struct fs_node *node, struct fs_node *parent)
                node->root = parent->root;
 }
 
-static void tree_get_node(struct fs_node *node)
+static int tree_get_node(struct fs_node *node)
 {
-       atomic_inc(&node->refcount);
+       return atomic_add_unless(&node->refcount, 1, 0);
 }
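
tree_get_node() now only succeeds while the node still holds a reference, in the spirit of kref_get_unless_zero(): once the count has reached zero the node is being torn down and a lookup must not resurrect it. A userspace C11 sketch of that compare-and-swap loop follows; it is an analogy of atomic_add_unless(&refcount, 1, 0), not the kernel implementation.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

/* Take a reference only if the count is not already zero,
 * i.e. only if the object has not started dying.
 */
static bool get_unless_zero(atomic_int *ref)
{
	int old = atomic_load(ref);

	while (old != 0) {
		if (atomic_compare_exchange_weak(ref, &old, old + 1))
			return true;
		/* old was reloaded by the failed CAS; loop and retry */
	}
	return false;
}

int main(void)
{
	atomic_int ref = 1;

	printf("%d\n", get_unless_zero(&ref));	/* 1: object still live */
	atomic_store(&ref, 0);
	printf("%d\n", get_unless_zero(&ref));	/* 0: already dying */
	return 0;
}
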
 
-static void nested_lock_ref_node(struct fs_node *node,
-                                enum fs_i_mutex_lock_class class)
+static void nested_down_read_ref_node(struct fs_node *node,
+                                     enum fs_i_lock_class class)
 {
        if (node) {
-               mutex_lock_nested(&node->lock, class);
+               down_read_nested(&node->lock, class);
                atomic_inc(&node->refcount);
        }
 }
 
-static void lock_ref_node(struct fs_node *node)
+static void nested_down_write_ref_node(struct fs_node *node,
+                                      enum fs_i_lock_class class)
 {
        if (node) {
-               mutex_lock(&node->lock);
+               down_write_nested(&node->lock, class);
                atomic_inc(&node->refcount);
        }
 }
 
-static void unlock_ref_node(struct fs_node *node)
+static void down_write_ref_node(struct fs_node *node)
 {
        if (node) {
-               atomic_dec(&node->refcount);
-               mutex_unlock(&node->lock);
+               down_write(&node->lock);
+               atomic_inc(&node->refcount);
        }
 }
 
+static void up_read_ref_node(struct fs_node *node)
+{
+       atomic_dec(&node->refcount);
+       up_read(&node->lock);
+}
+
+static void up_write_ref_node(struct fs_node *node)
+{
+       atomic_dec(&node->refcount);
+       up_write(&node->lock);
+}
+
 static void tree_put_node(struct fs_node *node)
 {
        struct fs_node *parent_node = node->parent;
 
-       lock_ref_node(parent_node);
        if (atomic_dec_and_test(&node->refcount)) {
-               if (parent_node)
+               if (node->del_hw_func)
+                       node->del_hw_func(node);
+               if (parent_node) {
+                       /* Only the root namespace has no parent; for it we
+                        * just need to free the node (see the else branch).
+                        */
+                       down_write_ref_node(parent_node);
                        list_del_init(&node->list);
-               if (node->remove_func)
-                       node->remove_func(node);
-               kfree(node);
+                       if (node->del_sw_func)
+                               node->del_sw_func(node);
+                       up_write_ref_node(parent_node);
+               } else {
+                       kfree(node);
+               }
                node = NULL;
        }
-       unlock_ref_node(parent_node);
        if (!node && parent_node)
                tree_put_node(parent_node);
 }
@@ -362,6 +390,15 @@ static struct mlx5_flow_root_namespace *find_root(struct fs_node *node)
        return container_of(ns, struct mlx5_flow_root_namespace, ns);
 }
 
+static inline struct mlx5_flow_steering *get_steering(struct fs_node *node)
+{
+       struct mlx5_flow_root_namespace *root = find_root(node);
+
+       if (root)
+               return root->dev->priv.steering;
+       return NULL;
+}
+
 static inline struct mlx5_core_dev *get_dev(struct fs_node *node)
 {
        struct mlx5_flow_root_namespace *root = find_root(node);
@@ -371,26 +408,36 @@ static inline struct mlx5_core_dev *get_dev(struct fs_node *node)
        return NULL;
 }
 
-static void del_flow_table(struct fs_node *node)
+static void del_hw_flow_table(struct fs_node *node)
 {
        struct mlx5_flow_table *ft;
        struct mlx5_core_dev *dev;
-       struct fs_prio *prio;
        int err;
 
        fs_get_obj(ft, node);
        dev = get_dev(&ft->node);
 
-       err = mlx5_cmd_destroy_flow_table(dev, ft);
-       if (err)
-               mlx5_core_warn(dev, "flow steering can't destroy ft\n");
-       ida_destroy(&ft->fte_allocator);
+       if (node->active) {
+               err = mlx5_cmd_destroy_flow_table(dev, ft);
+               if (err)
+                       mlx5_core_warn(dev, "flow steering can't destroy ft\n");
+       }
+}
+
+static void del_sw_flow_table(struct fs_node *node)
+{
+       struct mlx5_flow_table *ft;
+       struct fs_prio *prio;
+
+       fs_get_obj(ft, node);
+
        rhltable_destroy(&ft->fgs_hash);
        fs_get_obj(prio, ft->node.parent);
        prio->num_ft--;
+       kfree(ft);
 }
 
-static void del_rule(struct fs_node *node)
+static void del_sw_hw_rule(struct fs_node *node)
 {
        struct mlx5_flow_rule *rule;
        struct mlx5_flow_table *ft;
@@ -406,7 +453,6 @@ static void del_rule(struct fs_node *node)
        fs_get_obj(fg, fte->node.parent);
        fs_get_obj(ft, fg->node.parent);
        trace_mlx5_fs_del_rule(rule);
-       list_del(&rule->node.list);
        if (rule->sw_action == MLX5_FLOW_CONTEXT_ACTION_FWD_NEXT_PRIO) {
                mutex_lock(&rule->dest_attr.ft->lock);
                list_del(&rule->next_ft);
@@ -434,117 +480,203 @@ out:
                                       "%s can't del rule fg id=%d fte_index=%d\n",
                                       __func__, fg->id, fte->index);
        }
+       kfree(rule);
 }
 
-static void destroy_fte(struct fs_fte *fte, struct mlx5_flow_group *fg)
+static void del_hw_fte(struct fs_node *node)
 {
        struct mlx5_flow_table *ft;
-       int ret;
+       struct mlx5_flow_group *fg;
+       struct mlx5_core_dev *dev;
+       struct fs_fte *fte;
+       int err;
 
-       ret = rhashtable_remove_fast(&fg->ftes_hash, &fte->hash, rhash_fte);
-       WARN_ON(ret);
-       fte->status = 0;
+       fs_get_obj(fte, node);
+       fs_get_obj(fg, fte->node.parent);
        fs_get_obj(ft, fg->node.parent);
-       ida_simple_remove(&ft->fte_allocator, fte->index);
+
+       trace_mlx5_fs_del_fte(fte);
+       dev = get_dev(&ft->node);
+       if (node->active) {
+               err = mlx5_cmd_delete_fte(dev, ft,
+                                         fte->index);
+               if (err)
+                       mlx5_core_warn(dev,
+                                      "flow steering can't delete fte in index %d of flow group id %d\n",
+                                      fte->index, fg->id);
+       }
 }
 
-static void del_fte(struct fs_node *node)
+static void del_sw_fte(struct fs_node *node)
 {
-       struct mlx5_flow_table *ft;
+       struct mlx5_flow_steering *steering = get_steering(node);
        struct mlx5_flow_group *fg;
-       struct mlx5_core_dev *dev;
        struct fs_fte *fte;
        int err;
 
        fs_get_obj(fte, node);
        fs_get_obj(fg, fte->node.parent);
-       fs_get_obj(ft, fg->node.parent);
-       trace_mlx5_fs_del_fte(fte);
-
-       dev = get_dev(&ft->node);
-       err = mlx5_cmd_delete_fte(dev, ft,
-                                 fte->index);
-       if (err)
-               mlx5_core_warn(dev,
-                              "flow steering can't delete fte in index %d of flow group id %d\n",
-                              fte->index, fg->id);
 
-       destroy_fte(fte, fg);
+       err = rhashtable_remove_fast(&fg->ftes_hash,
+                                    &fte->hash,
+                                    rhash_fte);
+       WARN_ON(err);
+       ida_simple_remove(&fg->fte_allocator, fte->index - fg->start_index);
+       kmem_cache_free(steering->ftes_cache, fte);
 }
 
-static void del_flow_group(struct fs_node *node)
+static void del_hw_flow_group(struct fs_node *node)
 {
        struct mlx5_flow_group *fg;
        struct mlx5_flow_table *ft;
        struct mlx5_core_dev *dev;
-       int err;
 
        fs_get_obj(fg, node);
        fs_get_obj(ft, fg->node.parent);
        dev = get_dev(&ft->node);
        trace_mlx5_fs_del_fg(fg);
 
-       if (ft->autogroup.active)
-               ft->autogroup.num_groups--;
+       if (fg->node.active && mlx5_cmd_destroy_flow_group(dev, ft, fg->id))
+               mlx5_core_warn(dev, "flow steering can't destroy fg %d of ft %d\n",
+                              fg->id, ft->id);
+}
+
+static void del_sw_flow_group(struct fs_node *node)
+{
+       struct mlx5_flow_steering *steering = get_steering(node);
+       struct mlx5_flow_group *fg;
+       struct mlx5_flow_table *ft;
+       int err;
+
+       fs_get_obj(fg, node);
+       fs_get_obj(ft, fg->node.parent);
 
        rhashtable_destroy(&fg->ftes_hash);
+       ida_destroy(&fg->fte_allocator);
+       if (ft->autogroup.active)
+               ft->autogroup.num_groups--;
        err = rhltable_remove(&ft->fgs_hash,
                              &fg->hash,
                              rhash_fg);
        WARN_ON(err);
-       if (mlx5_cmd_destroy_flow_group(dev, ft, fg->id))
-               mlx5_core_warn(dev, "flow steering can't destroy fg %d of ft %d\n",
-                              fg->id, ft->id);
+       kmem_cache_free(steering->fgs_cache, fg);
+}
+
+static int insert_fte(struct mlx5_flow_group *fg, struct fs_fte *fte)
+{
+       int index;
+       int ret;
+
+       index = ida_simple_get(&fg->fte_allocator, 0, fg->max_ftes, GFP_KERNEL);
+       if (index < 0)
+               return index;
+
+       fte->index = index + fg->start_index;
+       ret = rhashtable_insert_fast(&fg->ftes_hash,
+                                    &fte->hash,
+                                    rhash_fte);
+       if (ret)
+               goto err_ida_remove;
+
+       tree_add_node(&fte->node, &fg->node);
+       list_add_tail(&fte->node.list, &fg->node.children);
+       return 0;
+
+err_ida_remove:
+       ida_simple_remove(&fg->fte_allocator, index);
+       return ret;
 }
 
-static struct fs_fte *alloc_fte(struct mlx5_flow_act *flow_act,
+static struct fs_fte *alloc_fte(struct mlx5_flow_table *ft,
                                u32 *match_value,
-                               unsigned int index)
+                               struct mlx5_flow_act *flow_act)
 {
+       struct mlx5_flow_steering *steering = get_steering(&ft->node);
        struct fs_fte *fte;
 
-       fte = kzalloc(sizeof(*fte), GFP_KERNEL);
+       fte = kmem_cache_zalloc(steering->ftes_cache, GFP_KERNEL);
        if (!fte)
                return ERR_PTR(-ENOMEM);
 
        memcpy(fte->val, match_value, sizeof(fte->val));
        fte->node.type =  FS_TYPE_FLOW_ENTRY;
        fte->flow_tag = flow_act->flow_tag;
-       fte->index = index;
        fte->action = flow_act->action;
        fte->encap_id = flow_act->encap_id;
        fte->modify_id = flow_act->modify_id;
 
+       tree_init_node(&fte->node, del_hw_fte, del_sw_fte);
+
        return fte;
 }
 
-static struct mlx5_flow_group *alloc_flow_group(u32 *create_fg_in)
+static void dealloc_flow_group(struct mlx5_flow_steering *steering,
+                              struct mlx5_flow_group *fg)
+{
+       rhashtable_destroy(&fg->ftes_hash);
+       kmem_cache_free(steering->fgs_cache, fg);
+}
+
+static struct mlx5_flow_group *alloc_flow_group(struct mlx5_flow_steering *steering,
+                                               u8 match_criteria_enable,
+                                               void *match_criteria,
+                                               int start_index,
+                                               int end_index)
 {
        struct mlx5_flow_group *fg;
-       void *match_criteria = MLX5_ADDR_OF(create_flow_group_in,
-                                           create_fg_in, match_criteria);
-       u8 match_criteria_enable = MLX5_GET(create_flow_group_in,
-                                           create_fg_in,
-                                           match_criteria_enable);
        int ret;
 
-       fg = kzalloc(sizeof(*fg), GFP_KERNEL);
+       fg = kmem_cache_zalloc(steering->fgs_cache, GFP_KERNEL);
        if (!fg)
                return ERR_PTR(-ENOMEM);
 
        ret = rhashtable_init(&fg->ftes_hash, &rhash_fte);
        if (ret) {
-               kfree(fg);
+               kmem_cache_free(steering->fgs_cache, fg);
                return ERR_PTR(ret);
-       }
+}
+       ida_init(&fg->fte_allocator);
        fg->mask.match_criteria_enable = match_criteria_enable;
        memcpy(&fg->mask.match_criteria, match_criteria,
               sizeof(fg->mask.match_criteria));
        fg->node.type =  FS_TYPE_FLOW_GROUP;
-       fg->start_index = MLX5_GET(create_flow_group_in, create_fg_in,
-                                  start_flow_index);
-       fg->max_ftes = MLX5_GET(create_flow_group_in, create_fg_in,
-                               end_flow_index) - fg->start_index + 1;
+       fg->start_index = start_index;
+       fg->max_ftes = end_index - start_index + 1;
+
+       return fg;
+}
+
+static struct mlx5_flow_group *alloc_insert_flow_group(struct mlx5_flow_table *ft,
+                                                      u8 match_criteria_enable,
+                                                      void *match_criteria,
+                                                      int start_index,
+                                                      int end_index,
+                                                      struct list_head *prev)
+{
+       struct mlx5_flow_steering *steering = get_steering(&ft->node);
+       struct mlx5_flow_group *fg;
+       int ret;
+
+       fg = alloc_flow_group(steering, match_criteria_enable, match_criteria,
+                             start_index, end_index);
+       if (IS_ERR(fg))
+               return fg;
+
+       /* initialize refcnt, add to parent list */
+       ret = rhltable_insert(&ft->fgs_hash,
+                             &fg->hash,
+                             rhash_fg);
+       if (ret) {
+               dealloc_flow_group(steering, fg);
+               return ERR_PTR(ret);
+       }
+
+       tree_init_node(&fg->node, del_hw_flow_group, del_sw_flow_group);
+       tree_add_node(&fg->node, &ft->node);
+       /* Add node to group list */
+       list_add(&fg->node.list, prev);
+       atomic_inc(&ft->node.version);
+
        return fg;
 }
 
@@ -575,7 +707,6 @@ static struct mlx5_flow_table *alloc_flow_table(int level, u16 vport, int max_ft
        ft->flags = flags;
        INIT_LIST_HEAD(&ft->fwd_rules);
        mutex_init(&ft->lock);
-       ida_init(&ft->fte_allocator);
 
        return ft;
 }
@@ -724,7 +855,7 @@ static int _mlx5_modify_rule_destination(struct mlx5_flow_rule *rule,
        fs_get_obj(fte, rule->node.parent);
        if (!(fte->action & MLX5_FLOW_CONTEXT_ACTION_FWD_DEST))
                return -EINVAL;
-       lock_ref_node(&fte->node);
+       down_write_ref_node(&fte->node);
        fs_get_obj(fg, fte->node.parent);
        fs_get_obj(ft, fg->node.parent);
 
@@ -733,7 +864,7 @@ static int _mlx5_modify_rule_destination(struct mlx5_flow_rule *rule,
                                  ft, fg->id,
                                  modify_mask,
                                  fte);
-       unlock_ref_node(&fte->node);
+       up_write_ref_node(&fte->node);
 
        return err;
 }
@@ -870,7 +1001,7 @@ static struct mlx5_flow_table *__mlx5_create_flow_table(struct mlx5_flow_namespa
                goto unlock_root;
        }
 
-       tree_init_node(&ft->node, 1, del_flow_table);
+       tree_init_node(&ft->node, del_hw_flow_table, del_sw_flow_table);
        log_table_sz = ft->max_fte ? ilog2(ft->max_fte) : 0;
        next_ft = find_next_chained_ft(fs_prio);
        err = mlx5_cmd_create_flow_table(root->dev, ft->vport, ft->op_mod, ft->type,
@@ -882,17 +1013,17 @@ static struct mlx5_flow_table *__mlx5_create_flow_table(struct mlx5_flow_namespa
        err = connect_flow_table(root->dev, ft, fs_prio);
        if (err)
                goto destroy_ft;
-       lock_ref_node(&fs_prio->node);
+       ft->node.active = true;
+       down_write_ref_node(&fs_prio->node);
        tree_add_node(&ft->node, &fs_prio->node);
        list_add_flow_table(ft, fs_prio);
        fs_prio->num_ft++;
-       unlock_ref_node(&fs_prio->node);
+       up_write_ref_node(&fs_prio->node);
        mutex_unlock(&root->chain_lock);
        return ft;
 destroy_ft:
        mlx5_cmd_destroy_flow_table(root->dev, ft);
 free_ft:
-       ida_destroy(&ft->fte_allocator);
        kfree(ft);
 unlock_root:
        mutex_unlock(&root->chain_lock);
@@ -960,54 +1091,6 @@ mlx5_create_auto_grouped_flow_table(struct mlx5_flow_namespace *ns,
 }
 EXPORT_SYMBOL(mlx5_create_auto_grouped_flow_table);
 
-/* Flow table should be locked */
-static struct mlx5_flow_group *create_flow_group_common(struct mlx5_flow_table *ft,
-                                                       u32 *fg_in,
-                                                       struct list_head
-                                                       *prev_fg,
-                                                       bool is_auto_fg)
-{
-       struct mlx5_flow_group *fg;
-       struct mlx5_core_dev *dev = get_dev(&ft->node);
-       int err;
-
-       if (!dev)
-               return ERR_PTR(-ENODEV);
-
-       fg = alloc_flow_group(fg_in);
-       if (IS_ERR(fg))
-               return fg;
-
-       err = rhltable_insert(&ft->fgs_hash, &fg->hash, rhash_fg);
-       if (err)
-               goto err_free_fg;
-
-       err = mlx5_cmd_create_flow_group(dev, ft, fg_in, &fg->id);
-       if (err)
-               goto err_remove_fg;
-
-       if (ft->autogroup.active)
-               ft->autogroup.num_groups++;
-       /* Add node to tree */
-       tree_init_node(&fg->node, !is_auto_fg, del_flow_group);
-       tree_add_node(&fg->node, &ft->node);
-       /* Add node to group list */
-       list_add(&fg->node.list, prev_fg);
-
-       trace_mlx5_fs_add_fg(fg);
-       return fg;
-
-err_remove_fg:
-       WARN_ON(rhltable_remove(&ft->fgs_hash,
-                               &fg->hash,
-                               rhash_fg));
-err_free_fg:
-       rhashtable_destroy(&fg->ftes_hash);
-       kfree(fg);
-
-       return ERR_PTR(err);
-}
-
 struct mlx5_flow_group *mlx5_create_flow_group(struct mlx5_flow_table *ft,
                                               u32 *fg_in)
 {
@@ -1016,7 +1099,13 @@ struct mlx5_flow_group *mlx5_create_flow_group(struct mlx5_flow_table *ft,
        u8 match_criteria_enable = MLX5_GET(create_flow_group_in,
                                            fg_in,
                                            match_criteria_enable);
+       int start_index = MLX5_GET(create_flow_group_in, fg_in,
+                                  start_flow_index);
+       int end_index = MLX5_GET(create_flow_group_in, fg_in,
+                                end_flow_index);
+       struct mlx5_core_dev *dev = get_dev(&ft->node);
        struct mlx5_flow_group *fg;
+       int err;
 
        if (!check_valid_mask(match_criteria_enable, match_criteria))
                return ERR_PTR(-EINVAL);
@@ -1024,9 +1113,21 @@ struct mlx5_flow_group *mlx5_create_flow_group(struct mlx5_flow_table *ft,
        if (ft->autogroup.active)
                return ERR_PTR(-EPERM);
 
-       lock_ref_node(&ft->node);
-       fg = create_flow_group_common(ft, fg_in, ft->node.children.prev, false);
-       unlock_ref_node(&ft->node);
+       down_write_ref_node(&ft->node);
+       fg = alloc_insert_flow_group(ft, match_criteria_enable, match_criteria,
+                                    start_index, end_index,
+                                    ft->node.children.prev);
+       up_write_ref_node(&ft->node);
+       if (IS_ERR(fg))
+               return fg;
+
+       err = mlx5_cmd_create_flow_group(dev, ft, fg_in, &fg->id);
+       if (err) {
+               tree_put_node(&fg->node);
+               return ERR_PTR(err);
+       }
+       trace_mlx5_fs_add_fg(fg);
+       fg->node.active = true;
 
        return fg;
 }
@@ -1111,7 +1212,7 @@ create_flow_handle(struct fs_fte *fte,
                /* Add dest to dests list- we need flow tables to be in the
                 * end of the list for forward to next prio rules.
                 */
-               tree_init_node(&rule->node, 1, del_rule);
+               tree_init_node(&rule->node, NULL, del_sw_hw_rule);
                if (dest &&
                    dest[i].type != MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE)
                        list_add(&rule->node.list, &fte->node.children);
@@ -1167,7 +1268,9 @@ add_rule_fte(struct fs_fte *fte,
        if (err)
                goto free_handle;
 
+       fte->node.active = true;
        fte->status |= FS_FTE_STATUS_EXISTING;
+       atomic_inc(&fte->node.version);
 
 out:
        return handle;
@@ -1177,59 +1280,17 @@ free_handle:
        return ERR_PTR(err);
 }
 
-static struct fs_fte *create_fte(struct mlx5_flow_group *fg,
-                                u32 *match_value,
-                                struct mlx5_flow_act *flow_act)
-{
-       struct mlx5_flow_table *ft;
-       struct fs_fte *fte;
-       int index;
-       int ret;
-
-       fs_get_obj(ft, fg->node.parent);
-       index = ida_simple_get(&ft->fte_allocator, fg->start_index,
-                              fg->start_index + fg->max_ftes,
-                              GFP_KERNEL);
-       if (index < 0)
-               return ERR_PTR(index);
-
-       fte = alloc_fte(flow_act, match_value, index);
-       if (IS_ERR(fte)) {
-               ret = PTR_ERR(fte);
-               goto err_alloc;
-       }
-       ret = rhashtable_insert_fast(&fg->ftes_hash, &fte->hash, rhash_fte);
-       if (ret)
-               goto err_hash;
-
-       return fte;
-
-err_hash:
-       kfree(fte);
-err_alloc:
-       ida_simple_remove(&ft->fte_allocator, index);
-       return ERR_PTR(ret);
-}
-
-static struct mlx5_flow_group *create_autogroup(struct mlx5_flow_table *ft,
-                                               u8 match_criteria_enable,
-                                               u32 *match_criteria)
+static struct mlx5_flow_group *alloc_auto_flow_group(struct mlx5_flow_table  *ft,
+                                                    struct mlx5_flow_spec *spec)
 {
-       int inlen = MLX5_ST_SZ_BYTES(create_flow_group_in);
        struct list_head *prev = &ft->node.children;
-       unsigned int candidate_index = 0;
        struct mlx5_flow_group *fg;
-       void *match_criteria_addr;
+       unsigned int candidate_index = 0;
        unsigned int group_size = 0;
-       u32 *in;
 
        if (!ft->autogroup.active)
                return ERR_PTR(-ENOENT);
 
-       in = kvzalloc(inlen, GFP_KERNEL);
-       if (!in)
-               return ERR_PTR(-ENOMEM);
-
        if (ft->autogroup.num_groups < ft->autogroup.required_groups)
                /* We save place for flow groups in addition to max types */
                group_size = ft->max_fte / (ft->autogroup.required_groups + 1);
@@ -1247,25 +1308,55 @@ static struct mlx5_flow_group *create_autogroup(struct mlx5_flow_table *ft,
                prev = &fg->node.list;
        }
 
-       if (candidate_index + group_size > ft->max_fte) {
-               fg = ERR_PTR(-ENOSPC);
+       if (candidate_index + group_size > ft->max_fte)
+               return ERR_PTR(-ENOSPC);
+
+       fg = alloc_insert_flow_group(ft,
+                                    spec->match_criteria_enable,
+                                    spec->match_criteria,
+                                    candidate_index,
+                                    candidate_index + group_size - 1,
+                                    prev);
+       if (IS_ERR(fg))
                goto out;
-       }
+
+       ft->autogroup.num_groups++;
+
+out:
+       return fg;
+}
+
+static int create_auto_flow_group(struct mlx5_flow_table *ft,
+                                 struct mlx5_flow_group *fg)
+{
+       struct mlx5_core_dev *dev = get_dev(&ft->node);
+       int inlen = MLX5_ST_SZ_BYTES(create_flow_group_in);
+       void *match_criteria_addr;
+       int err;
+       u32 *in;
+
+       in = kvzalloc(inlen, GFP_KERNEL);
+       if (!in)
+               return -ENOMEM;
 
        MLX5_SET(create_flow_group_in, in, match_criteria_enable,
-                match_criteria_enable);
-       MLX5_SET(create_flow_group_in, in, start_flow_index, candidate_index);
-       MLX5_SET(create_flow_group_in, in, end_flow_index,   candidate_index +
-                group_size - 1);
+                fg->mask.match_criteria_enable);
+       MLX5_SET(create_flow_group_in, in, start_flow_index, fg->start_index);
+       MLX5_SET(create_flow_group_in, in, end_flow_index,   fg->start_index +
+                fg->max_ftes - 1);
        match_criteria_addr = MLX5_ADDR_OF(create_flow_group_in,
                                           in, match_criteria);
-       memcpy(match_criteria_addr, match_criteria,
-              MLX5_ST_SZ_BYTES(fte_match_param));
+       memcpy(match_criteria_addr, fg->mask.match_criteria,
+              sizeof(fg->mask.match_criteria));
+
+       err = mlx5_cmd_create_flow_group(dev, ft, in, &fg->id);
+       if (!err) {
+               fg->node.active = true;
+               trace_mlx5_fs_add_fg(fg);
+       }
 
-       fg = create_flow_group_common(ft, in, prev, true);
-out:
        kvfree(in);
-       return fg;
+       return err;
 }
 
 static bool mlx5_flow_dests_cmp(struct mlx5_flow_destination *d1,
@@ -1340,60 +1431,30 @@ static struct mlx5_flow_handle *add_rule_fg(struct mlx5_flow_group *fg,
                                            struct fs_fte *fte)
 {
        struct mlx5_flow_handle *handle;
-       struct mlx5_flow_table *ft;
+       int old_action;
        int i;
+       int ret;
 
-       if (fte) {
-               int old_action;
-               int ret;
-
-               nested_lock_ref_node(&fte->node, FS_MUTEX_CHILD);
-               ret = check_conflicting_ftes(fte, flow_act);
-               if (ret) {
-                       handle = ERR_PTR(ret);
-                       goto unlock_fte;
-               }
-
-               old_action = fte->action;
-               fte->action |= flow_act->action;
-               handle = add_rule_fte(fte, fg, dest, dest_num,
-                                     old_action != flow_act->action);
-               if (IS_ERR(handle)) {
-                       fte->action = old_action;
-                       goto unlock_fte;
-               } else {
-                       trace_mlx5_fs_set_fte(fte, false);
-                       goto add_rules;
-               }
-       }
-       fs_get_obj(ft, fg->node.parent);
+       ret = check_conflicting_ftes(fte, flow_act);
+       if (ret)
+               return ERR_PTR(ret);
 
-       fte = create_fte(fg, match_value, flow_act);
-       if (IS_ERR(fte))
-               return (void *)fte;
-       tree_init_node(&fte->node, 0, del_fte);
-       nested_lock_ref_node(&fte->node, FS_MUTEX_CHILD);
-       handle = add_rule_fte(fte, fg, dest, dest_num, false);
+       old_action = fte->action;
+       fte->action |= flow_act->action;
+       handle = add_rule_fte(fte, fg, dest, dest_num,
+                             old_action != flow_act->action);
        if (IS_ERR(handle)) {
-               unlock_ref_node(&fte->node);
-               destroy_fte(fte, fg);
-               kfree(fte);
+               fte->action = old_action;
                return handle;
        }
+       trace_mlx5_fs_set_fte(fte, false);
 
-       tree_add_node(&fte->node, &fg->node);
-       /* fte list isn't sorted */
-       list_add_tail(&fte->node.list, &fg->node.children);
-       trace_mlx5_fs_set_fte(fte, true);
-add_rules:
        for (i = 0; i < handle->num_rules; i++) {
                if (atomic_read(&handle->rule[i]->node.refcount) == 1) {
                        tree_add_node(&handle->rule[i]->node, &fte->node);
                        trace_mlx5_fs_add_rule(handle->rule[i]);
                }
        }
-unlock_fte:
-       unlock_ref_node(&fte->node);
        return handle;
 }
 
@@ -1441,93 +1502,197 @@ static bool dest_is_valid(struct mlx5_flow_destination *dest,
        return true;
 }
 
-static struct mlx5_flow_handle *
-try_add_to_existing_fg(struct mlx5_flow_table *ft,
-                      struct mlx5_flow_spec *spec,
-                      struct mlx5_flow_act *flow_act,
-                      struct mlx5_flow_destination *dest,
-                      int dest_num)
-{
+struct match_list {
+       struct list_head        list;
        struct mlx5_flow_group *g;
-       struct mlx5_flow_handle *rule = ERR_PTR(-ENOENT);
+};
+
+struct match_list_head {
+       struct list_head  list;
+       struct match_list first;
+};
+
+static void free_match_list(struct match_list_head *head)
+{
+       if (!list_empty(&head->list)) {
+               struct match_list *iter, *match_tmp;
+
+               list_del(&head->first.list);
+               tree_put_node(&head->first.g->node);
+               list_for_each_entry_safe(iter, match_tmp, &head->list,
+                                        list) {
+                       tree_put_node(&iter->g->node);
+                       list_del(&iter->list);
+                       kfree(iter);
+               }
+       }
+}
+
+static int build_match_list(struct match_list_head *match_head,
+                           struct mlx5_flow_table *ft,
+                           struct mlx5_flow_spec *spec)
+{
        struct rhlist_head *tmp, *list;
-       struct match_list {
-               struct list_head        list;
-               struct mlx5_flow_group *g;
-       } match_list, *iter;
-       LIST_HEAD(match_head);
+       struct mlx5_flow_group *g;
+       int err = 0;
 
        rcu_read_lock();
+       INIT_LIST_HEAD(&match_head->list);
        /* Collect all fgs that have a matching match_criteria */
        list = rhltable_lookup(&ft->fgs_hash, spec, rhash_fg);
+       /* The RCU read side is atomic, so we can't execute FW commands here */
        rhl_for_each_entry_rcu(g, tmp, list, hash) {
                struct match_list *curr_match;
 
-               if (likely(list_empty(&match_head))) {
-                       match_list.g = g;
-                       list_add_tail(&match_list.list, &match_head);
+               if (likely(list_empty(&match_head->list))) {
+                       if (!tree_get_node(&g->node))
+                               continue;
+                       match_head->first.g = g;
+                       list_add_tail(&match_head->first.list,
+                                     &match_head->list);
                        continue;
                }
-               curr_match = kmalloc(sizeof(*curr_match), GFP_ATOMIC);
 
+               curr_match = kmalloc(sizeof(*curr_match), GFP_ATOMIC);
                if (!curr_match) {
-                       rcu_read_unlock();
-                       rule = ERR_PTR(-ENOMEM);
-                       goto free_list;
+                       free_match_list(match_head);
+                       err = -ENOMEM;
+                       goto out;
+               }
+               if (!tree_get_node(&g->node)) {
+                       kfree(curr_match);
+                       continue;
                }
                curr_match->g = g;
-               list_add_tail(&curr_match->list, &match_head);
+               list_add_tail(&curr_match->list, &match_head->list);
        }
+out:
        rcu_read_unlock();
+       return err;
+}
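+
+The two helpers above are meant to be used as a pair: build_match_list() runs the rhltable lookup under RCU and takes a tree reference on every matching group, and free_match_list() drops those references once the caller is done. Roughly (a minimal sketch; the real caller with its full error handling appears further below):
+
+    struct match_list_head head;
+    int err;
+
+    err = build_match_list(&head, ft, spec);
+    if (err)
+            return ERR_PTR(err);
+    /* ... try to add the rule to one of the groups on head.list ... */
+    free_match_list(&head);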
+
+static u64 matched_fgs_get_version(struct list_head *match_head)
+{
+       struct match_list *iter;
+       u64 version = 0;
+
+       list_for_each_entry(iter, match_head, list)
+               version += (u64)atomic_read(&iter->g->node.version);
+       return version;
+}
+
+static struct mlx5_flow_handle *
+try_add_to_existing_fg(struct mlx5_flow_table *ft,
+                      struct list_head *match_head,
+                      struct mlx5_flow_spec *spec,
+                      struct mlx5_flow_act *flow_act,
+                      struct mlx5_flow_destination *dest,
+                      int dest_num,
+                      int ft_version)
+{
+       struct mlx5_flow_steering *steering = get_steering(&ft->node);
+       struct mlx5_flow_group *g;
+       struct mlx5_flow_handle *rule;
+       struct match_list *iter;
+       bool take_write = false;
+       struct fs_fte *fte;
+       u64  version;
+       int err;
+
+       fte = alloc_fte(ft, spec->match_value, flow_act);
+       if (IS_ERR(fte))
+               return ERR_PTR(-ENOMEM);
 
+       list_for_each_entry(iter, match_head, list) {
+               nested_down_read_ref_node(&iter->g->node, FS_LOCK_PARENT);
+               ida_pre_get(&iter->g->fte_allocator, GFP_KERNEL);
+       }
+
+search_again_locked:
+       version = matched_fgs_get_version(match_head);
        /* Try to find a fg that already contains a matching fte */
-       list_for_each_entry(iter, &match_head, list) {
-               struct fs_fte *fte;
+       list_for_each_entry(iter, match_head, list) {
+               struct fs_fte *fte_tmp;
 
                g = iter->g;
-               nested_lock_ref_node(&g->node, FS_MUTEX_PARENT);
-               fte = rhashtable_lookup_fast(&g->ftes_hash, spec->match_value,
-                                            rhash_fte);
-               if (fte) {
-                       rule = add_rule_fg(g, spec->match_value,
-                                          flow_act, dest, dest_num, fte);
-                       unlock_ref_node(&g->node);
-                       goto free_list;
+               fte_tmp = rhashtable_lookup_fast(&g->ftes_hash, spec->match_value,
+                                                rhash_fte);
+               if (!fte_tmp || !tree_get_node(&fte_tmp->node))
+                       continue;
+
+               nested_down_write_ref_node(&fte_tmp->node, FS_LOCK_CHILD);
+               if (!take_write) {
+                       list_for_each_entry(iter, match_head, list)
+                               up_read_ref_node(&iter->g->node);
+               } else {
+                       list_for_each_entry(iter, match_head, list)
+                               up_write_ref_node(&iter->g->node);
                }
-               unlock_ref_node(&g->node);
+
+               rule = add_rule_fg(g, spec->match_value,
+                                  flow_act, dest, dest_num, fte_tmp);
+               up_write_ref_node(&fte_tmp->node);
+               tree_put_node(&fte_tmp->node);
+               kmem_cache_free(steering->ftes_cache, fte);
+               return rule;
        }
 
        /* No group with matching fte found. Try to add a new fte to any
         * matching fg.
         */
-       list_for_each_entry(iter, &match_head, list) {
-               g = iter->g;
 
-               nested_lock_ref_node(&g->node, FS_MUTEX_PARENT);
-               rule = add_rule_fg(g, spec->match_value,
-                                  flow_act, dest, dest_num, NULL);
-               if (!IS_ERR(rule) || PTR_ERR(rule) != -ENOSPC) {
-                       unlock_ref_node(&g->node);
-                       goto free_list;
-               }
-               unlock_ref_node(&g->node);
+       if (!take_write) {
+               list_for_each_entry(iter, match_head, list)
+                       up_read_ref_node(&iter->g->node);
+               list_for_each_entry(iter, match_head, list)
+                       nested_down_write_ref_node(&iter->g->node,
+                                                  FS_LOCK_PARENT);
+               take_write = true;
        }
 
-free_list:
-       if (!list_empty(&match_head)) {
-               struct match_list *match_tmp;
+       /* Check the ft version, in case a new flow group
+        * was added while the fgs weren't locked
+        */
+       if (atomic_read(&ft->node.version) != ft_version) {
+               rule = ERR_PTR(-EAGAIN);
+               goto out;
+       }
 
-               /* The most common case is having one FG. Since we want to
-                * optimize this case, we save the first on the stack.
-                * Therefore, no need to free it.
-                */
-               list_del(&list_first_entry(&match_head, typeof(*iter), list)->list);
-               list_for_each_entry_safe(iter, match_tmp, &match_head, list) {
-                       list_del(&iter->list);
-                       kfree(iter);
+       /* Check the fgs version, in case an FTE with the
+        * same match value was added while the fgs weren't locked
+        */
+       if (version != matched_fgs_get_version(match_head))
+               goto search_again_locked;
+
+       list_for_each_entry(iter, match_head, list) {
+               g = iter->g;
+
+               if (!g->node.active)
+                       continue;
+               err = insert_fte(g, fte);
+               if (err) {
+                       if (err == -ENOSPC)
+                               continue;
+                       list_for_each_entry(iter, match_head, list)
+                               up_write_ref_node(&iter->g->node);
+                       kmem_cache_free(steering->ftes_cache, fte);
+                       return ERR_PTR(err);
                }
-       }
 
+               nested_down_write_ref_node(&fte->node, FS_LOCK_CHILD);
+               list_for_each_entry(iter, match_head, list)
+                       up_write_ref_node(&iter->g->node);
+               rule = add_rule_fg(g, spec->match_value,
+                                  flow_act, dest, dest_num, fte);
+               up_write_ref_node(&fte->node);
+               tree_put_node(&fte->node);
+               return rule;
+       }
+       rule = ERR_PTR(-ENOENT);
+out:
+       list_for_each_entry(iter, match_head, list)
+               up_write_ref_node(&iter->g->node);
+       kmem_cache_free(steering->ftes_cache, fte);
        return rule;
 }
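
The function above is the heart of the new locking scheme: the matching groups are scanned under shared (read) locks, the per-group versions are summed, and only if no existing FTE is found are the locks upgraded to write mode, with the table and group versions re-checked to detect a concurrent insertion. The same optimistic read-then-upgrade idea, stripped of the mlx5 specifics, looks like this in plain userspace C (illustrative sketch only; names and types are hypothetical):

    #include <pthread.h>
    #include <stdatomic.h>
    #include <stdbool.h>

    struct table {
            pthread_rwlock_t lock;
            atomic_long version;            /* bumped on every insertion */
    };

    /* Returns true if the entry already existed, false if we inserted it. */
    static bool lookup_or_insert(struct table *t,
                                 bool (*find)(void), void (*insert)(void))
    {
            for (;;) {
                    pthread_rwlock_rdlock(&t->lock);
                    long seen = atomic_load(&t->version);
                    bool hit = find();
                    pthread_rwlock_unlock(&t->lock);
                    if (hit)
                            return true;

                    pthread_rwlock_wrlock(&t->lock);
                    if (atomic_load(&t->version) != seen) {
                            /* Someone inserted while we were unlocked; rescan. */
                            pthread_rwlock_unlock(&t->lock);
                            continue;
                    }
                    insert();
                    atomic_fetch_add(&t->version, 1);
                    pthread_rwlock_unlock(&t->lock);
                    return false;
            }
    }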
 
@@ -1539,8 +1704,14 @@ _mlx5_add_flow_rules(struct mlx5_flow_table *ft,
                     int dest_num)
 
 {
+       struct mlx5_flow_steering *steering = get_steering(&ft->node);
        struct mlx5_flow_group *g;
        struct mlx5_flow_handle *rule;
+       struct match_list_head match_head;
+       bool take_write = false;
+       struct fs_fte *fte;
+       int version;
+       int err;
        int i;
 
        if (!check_valid_spec(spec))
@@ -1550,33 +1721,73 @@ _mlx5_add_flow_rules(struct mlx5_flow_table *ft,
                if (!dest_is_valid(&dest[i], flow_act->action, ft))
                        return ERR_PTR(-EINVAL);
        }
+       nested_down_read_ref_node(&ft->node, FS_LOCK_GRANDPARENT);
+search_again_locked:
+       version = atomic_read(&ft->node.version);
+
+       /* Collect all fgs that have a matching match_criteria */
+       err = build_match_list(&match_head, ft, spec);
+       if (err)
+               return ERR_PTR(err);
+
+       if (!take_write)
+               up_read_ref_node(&ft->node);
+
+       rule = try_add_to_existing_fg(ft, &match_head.list, spec, flow_act, dest,
+                                     dest_num, version);
+       free_match_list(&match_head);
+       if (!IS_ERR(rule) ||
+           (PTR_ERR(rule) != -ENOENT && PTR_ERR(rule) != -EAGAIN))
+               return rule;
+
+       if (!take_write) {
+               nested_down_write_ref_node(&ft->node, FS_LOCK_GRANDPARENT);
+               take_write = true;
+       }
 
-       nested_lock_ref_node(&ft->node, FS_MUTEX_GRANDPARENT);
-       rule = try_add_to_existing_fg(ft, spec, flow_act, dest, dest_num);
-       if (!IS_ERR(rule))
-               goto unlock;
+       if (PTR_ERR(rule) == -EAGAIN ||
+           version != atomic_read(&ft->node.version))
+               goto search_again_locked;
 
-       g = create_autogroup(ft, spec->match_criteria_enable,
-                            spec->match_criteria);
+       g = alloc_auto_flow_group(ft, spec);
        if (IS_ERR(g)) {
                rule = (void *)g;
-               goto unlock;
+               up_write_ref_node(&ft->node);
+               return rule;
        }
 
-       rule = add_rule_fg(g, spec->match_value, flow_act, dest,
-                          dest_num, NULL);
-       if (IS_ERR(rule)) {
-               /* Remove assumes refcount > 0 and autogroup creates a group
-                * with a refcount = 0.
-                */
-               unlock_ref_node(&ft->node);
-               tree_get_node(&g->node);
-               tree_remove_node(&g->node);
-               return rule;
+       nested_down_write_ref_node(&g->node, FS_LOCK_PARENT);
+       up_write_ref_node(&ft->node);
+
+       err = create_auto_flow_group(ft, g);
+       if (err)
+               goto err_release_fg;
+
+       fte = alloc_fte(ft, spec->match_value, flow_act);
+       if (IS_ERR(fte)) {
+               err = PTR_ERR(fte);
+               goto err_release_fg;
        }
-unlock:
-       unlock_ref_node(&ft->node);
+
+       err = insert_fte(g, fte);
+       if (err) {
+               kmem_cache_free(steering->ftes_cache, fte);
+               goto err_release_fg;
+       }
+
+       nested_down_write_ref_node(&fte->node, FS_LOCK_CHILD);
+       up_write_ref_node(&g->node);
+       rule = add_rule_fg(g, spec->match_value, flow_act, dest,
+                          dest_num, fte);
+       up_write_ref_node(&fte->node);
+       tree_put_node(&fte->node);
+       tree_put_node(&g->node);
        return rule;
+
+err_release_fg:
+       up_write_ref_node(&g->node);
+       tree_put_node(&g->node);
+       return ERR_PTR(err);
 }
 
 static bool fwd_next_prio_supported(struct mlx5_flow_table *ft)
@@ -1817,7 +2028,7 @@ static struct fs_prio *fs_create_prio(struct mlx5_flow_namespace *ns,
                return ERR_PTR(-ENOMEM);
 
        fs_prio->node.type = FS_TYPE_PRIO;
-       tree_init_node(&fs_prio->node, 1, NULL);
+       tree_init_node(&fs_prio->node, NULL, NULL);
        tree_add_node(&fs_prio->node, &ns->node);
        fs_prio->num_levels = num_levels;
        fs_prio->prio = prio;
@@ -1843,7 +2054,7 @@ static struct mlx5_flow_namespace *fs_create_namespace(struct fs_prio *prio)
                return ERR_PTR(-ENOMEM);
 
        fs_init_namespace(ns);
-       tree_init_node(&ns->node, 1, NULL);
+       tree_init_node(&ns->node, NULL, NULL);
        tree_add_node(&ns->node, &prio->node);
        list_add_tail(&ns->node.list, &prio->node.children);
 
@@ -1968,7 +2179,7 @@ static struct mlx5_flow_root_namespace *create_root_ns(struct mlx5_flow_steering
        ns = &root_ns->ns;
        fs_init_namespace(ns);
        mutex_init(&root_ns->chain_lock);
-       tree_init_node(&ns->node, 1, NULL);
+       tree_init_node(&ns->node, NULL, NULL);
        tree_add_node(&ns->node, NULL);
 
        return root_ns;
@@ -2066,8 +2277,10 @@ static void clean_tree(struct fs_node *node)
                struct fs_node *iter;
                struct fs_node *temp;
 
+               tree_get_node(node);
                list_for_each_entry_safe(iter, temp, &node->children, list)
                        clean_tree(iter);
+               tree_put_node(node);
                tree_remove_node(node);
        }
 }
@@ -2091,6 +2304,8 @@ void mlx5_cleanup_fs(struct mlx5_core_dev *dev)
        cleanup_root_ns(steering->sniffer_rx_root_ns);
        cleanup_root_ns(steering->sniffer_tx_root_ns);
        mlx5_cleanup_fc_stats(dev);
+       kmem_cache_destroy(steering->ftes_cache);
+       kmem_cache_destroy(steering->fgs_cache);
        kfree(steering);
 }
 
@@ -2196,6 +2411,16 @@ int mlx5_init_fs(struct mlx5_core_dev *dev)
        steering->dev = dev;
        dev->priv.steering = steering;
 
+       steering->fgs_cache = kmem_cache_create("mlx5_fs_fgs",
+                                               sizeof(struct mlx5_flow_group), 0,
+                                               0, NULL);
+       steering->ftes_cache = kmem_cache_create("mlx5_fs_ftes", sizeof(struct fs_fte), 0,
+                                                0, NULL);
+       if (!steering->ftes_cache || !steering->fgs_cache) {
+               err = -ENOMEM;
+               goto err;
+       }
+
        if ((((MLX5_CAP_GEN(dev, port_type) == MLX5_CAP_PORT_TYPE_ETH) &&
              (MLX5_CAP_GEN(dev, nic_flow_table))) ||
             ((MLX5_CAP_GEN(dev, port_type) == MLX5_CAP_PORT_TYPE_IB) &&
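
One detail worth noting about the cache setup above: kmem_cache_destroy() ignores a NULL cache, so a single combined check after both kmem_cache_create() calls is safe and the cleanup path can destroy both caches unconditionally. A hypothetical helper showing the same pairing (not part of the patch):

    static int example_caches_init(struct mlx5_flow_steering *steering)
    {
            steering->fgs_cache = kmem_cache_create("mlx5_fs_fgs",
                                                    sizeof(struct mlx5_flow_group),
                                                    0, 0, NULL);
            steering->ftes_cache = kmem_cache_create("mlx5_fs_ftes",
                                                     sizeof(struct fs_fte),
                                                     0, 0, NULL);
            if (!steering->fgs_cache || !steering->ftes_cache) {
                    /* kmem_cache_destroy(NULL) is a no-op */
                    kmem_cache_destroy(steering->ftes_cache);
                    kmem_cache_destroy(steering->fgs_cache);
                    return -ENOMEM;
            }
            return 0;
    }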
index 48dd789..7a01277 100644
@@ -66,6 +66,8 @@ enum fs_fte_status {
 
 struct mlx5_flow_steering {
        struct mlx5_core_dev *dev;
+       struct kmem_cache               *fgs_cache;
+       struct kmem_cache               *ftes_cache;
        struct mlx5_flow_root_namespace *root_ns;
        struct mlx5_flow_root_namespace *fdb_root_ns;
        struct mlx5_flow_root_namespace *esw_egress_root_ns;
@@ -81,9 +83,12 @@ struct fs_node {
        struct fs_node          *parent;
        struct fs_node          *root;
        /* lock the node for writing and traversing */
-       struct mutex            lock;
+       struct rw_semaphore     lock;
        atomic_t                refcount;
-       void                    (*remove_func)(struct fs_node *);
+       bool                    active;
+       void                    (*del_hw_func)(struct fs_node *);
+       void                    (*del_sw_func)(struct fs_node *);
+       atomic_t                version;
 };
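
Three things change in struct fs_node: the per-node mutex becomes an rw_semaphore so lookups can proceed concurrently, remove_func is split into del_hw_func (issuing the FW/HW removal) and del_sw_func (freeing the software state), and a version counter is added to back the optimistic retry logic in fs_core.c. The read/write reference-taking helpers used throughout the patch presumably wrap the semaphore together with the refcount, roughly like this (hypothetical reconstruction, not copied from the source):

    static inline void nested_down_write_ref_node(struct fs_node *node,
                                                  unsigned int lock_class)
    {
            if (node) {
                    down_write_nested(&node->lock, lock_class);
                    atomic_inc(&node->refcount);
            }
    }

    static inline void up_write_ref_node(struct fs_node *node)
    {
            up_write(&node->lock);
            tree_put_node(node);
    }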
 
 struct mlx5_flow_rule {
@@ -120,7 +125,6 @@ struct mlx5_flow_table {
        /* FWD rules that point on this flow table */
        struct list_head                fwd_rules;
        u32                             flags;
-       struct ida                      fte_allocator;
        struct rhltable                 fgs_hash;
 };
 
@@ -200,6 +204,7 @@ struct mlx5_flow_group {
        struct mlx5_flow_group_mask     mask;
        u32                             start_index;
        u32                             max_ftes;
+       struct ida                      fte_allocator;
        u32                             id;
        struct rhashtable               ftes_hash;
        struct rhlist_head              hash;
index 5cd4df0..321988a 100644
@@ -53,6 +53,7 @@
 #include <linux/notifier.h>
 #include <linux/dcbnl.h>
 #include <linux/inetdevice.h>
+#include <linux/netlink.h>
 #include <net/switchdev.h>
 #include <net/pkt_cls.h>
 #include <net/tc_act/tc_mirred.h>
@@ -4298,7 +4299,8 @@ static int mlxsw_sp_netdevice_port_upper_event(struct net_device *lower_dev,
                        if (info->linking)
                                err = mlxsw_sp_port_bridge_join(mlxsw_sp_port,
                                                                lower_dev,
-                                                               upper_dev);
+                                                               upper_dev,
+                                                               extack);
                        else
                                mlxsw_sp_port_bridge_leave(mlxsw_sp_port,
                                                           lower_dev,
@@ -4389,18 +4391,25 @@ static int mlxsw_sp_netdevice_port_vlan_event(struct net_device *vlan_dev,
 {
        struct mlxsw_sp_port *mlxsw_sp_port = netdev_priv(dev);
        struct netdev_notifier_changeupper_info *info = ptr;
+       struct netlink_ext_ack *extack;
        struct net_device *upper_dev;
        int err = 0;
 
+       extack = netdev_notifier_info_to_extack(&info->info);
+
        switch (event) {
        case NETDEV_PRECHANGEUPPER:
                upper_dev = info->upper_dev;
-               if (!netif_is_bridge_master(upper_dev))
+               if (!netif_is_bridge_master(upper_dev)) {
+                       NL_SET_ERR_MSG(extack, "spectrum: VLAN devices only support bridge and VRF uppers");
                        return -EINVAL;
+               }
                if (!info->linking)
                        break;
-               if (netdev_has_any_upper_dev(upper_dev))
+               if (netdev_has_any_upper_dev(upper_dev)) {
+                       NL_SET_ERR_MSG(extack, "spectrum: Enslaving a port to a device that already has an upper device is not supported");
                        return -EINVAL;
+               }
                break;
        case NETDEV_CHANGEUPPER:
                upper_dev = info->upper_dev;
@@ -4408,7 +4417,8 @@ static int mlxsw_sp_netdevice_port_vlan_event(struct net_device *vlan_dev,
                        if (info->linking)
                                err = mlxsw_sp_port_bridge_join(mlxsw_sp_port,
                                                                vlan_dev,
-                                                               upper_dev);
+                                                               upper_dev,
+                                                               extack);
                        else
                                mlxsw_sp_port_bridge_leave(mlxsw_sp_port,
                                                           vlan_dev,
index ae67e60..8e45183 100644
@@ -326,7 +326,8 @@ void
 mlxsw_sp_port_vlan_bridge_leave(struct mlxsw_sp_port_vlan *mlxsw_sp_port_vlan);
 int mlxsw_sp_port_bridge_join(struct mlxsw_sp_port *mlxsw_sp_port,
                              struct net_device *brport_dev,
-                             struct net_device *br_dev);
+                             struct net_device *br_dev,
+                             struct netlink_ext_ack *extack);
 void mlxsw_sp_port_bridge_leave(struct mlxsw_sp_port *mlxsw_sp_port,
                                struct net_device *brport_dev,
                                struct net_device *br_dev);
index e0f8ea4..6a356f4 100644
@@ -3640,20 +3640,6 @@ static int mlxsw_sp_fib_lpm_tree_link(struct mlxsw_sp *mlxsw_sp,
 static void mlxsw_sp_fib_lpm_tree_unlink(struct mlxsw_sp *mlxsw_sp,
                                         struct mlxsw_sp_fib *fib)
 {
-       struct mlxsw_sp_prefix_usage req_prefix_usage = {{ 0 } };
-       struct mlxsw_sp_lpm_tree *lpm_tree;
-
-       /* Aggregate prefix lengths across all virtual routers to make
-        * sure we only have used prefix lengths in the LPM tree.
-        */
-       mlxsw_sp_vrs_prefixes(mlxsw_sp, fib->proto, &req_prefix_usage);
-       lpm_tree = mlxsw_sp_lpm_tree_get(mlxsw_sp, &req_prefix_usage,
-                                        fib->proto);
-       if (IS_ERR(lpm_tree))
-               goto err_tree_get;
-       mlxsw_sp_vrs_lpm_tree_replace(mlxsw_sp, fib, lpm_tree);
-
-err_tree_get:
        if (!mlxsw_sp_prefix_usage_none(&fib->prefix_usage))
                return;
        mlxsw_sp_vr_lpm_tree_unbind(mlxsw_sp, fib);
@@ -5957,7 +5943,7 @@ static int mlxsw_sp_rif_vlan_fid_op(struct mlxsw_sp_rif *rif,
        return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(ritr), ritr_pl);
 }
 
-static u8 mlxsw_sp_router_port(const struct mlxsw_sp *mlxsw_sp)
+u8 mlxsw_sp_router_port(const struct mlxsw_sp *mlxsw_sp)
 {
        return mlxsw_core_max_ports(mlxsw_sp->core) + 1;
 }
index 3d44918..3f2d840 100644
@@ -70,6 +70,7 @@ u16 mlxsw_sp_rif_index(const struct mlxsw_sp_rif *rif);
 u16 mlxsw_sp_ipip_lb_rif_index(const struct mlxsw_sp_rif_ipip_lb *rif);
 u16 mlxsw_sp_ipip_lb_ul_vr_id(const struct mlxsw_sp_rif_ipip_lb *rif);
 int mlxsw_sp_rif_dev_ifindex(const struct mlxsw_sp_rif *rif);
+u8 mlxsw_sp_router_port(const struct mlxsw_sp *mlxsw_sp);
 const struct net_device *mlxsw_sp_rif_dev(const struct mlxsw_sp_rif *rif);
 int mlxsw_sp_rif_counter_value_get(struct mlxsw_sp *mlxsw_sp,
                                   struct mlxsw_sp_rif *rif,
index 0f9eac5..7b8548e 100644
 #include <linux/workqueue.h>
 #include <linux/jiffies.h>
 #include <linux/rtnetlink.h>
+#include <linux/netlink.h>
 #include <net/switchdev.h>
 
+#include "spectrum_router.h"
 #include "spectrum.h"
 #include "core.h"
 #include "reg.h"
@@ -78,7 +80,8 @@ struct mlxsw_sp_bridge_device {
        struct list_head ports_list;
        struct list_head mids_list;
        u8 vlan_enabled:1,
-          multicast_enabled:1;
+          multicast_enabled:1,
+          mrouter:1;
        const struct mlxsw_sp_bridge_ops *ops;
 };
 
@@ -107,7 +110,8 @@ struct mlxsw_sp_bridge_vlan {
 struct mlxsw_sp_bridge_ops {
        int (*port_join)(struct mlxsw_sp_bridge_device *bridge_device,
                         struct mlxsw_sp_bridge_port *bridge_port,
-                        struct mlxsw_sp_port *mlxsw_sp_port);
+                        struct mlxsw_sp_port *mlxsw_sp_port,
+                        struct netlink_ext_ack *extack);
        void (*port_leave)(struct mlxsw_sp_bridge_device *bridge_device,
                           struct mlxsw_sp_bridge_port *bridge_port,
                           struct mlxsw_sp_port *mlxsw_sp_port);
@@ -168,6 +172,7 @@ mlxsw_sp_bridge_device_create(struct mlxsw_sp_bridge *bridge,
        bridge_device->dev = br_dev;
        bridge_device->vlan_enabled = vlan_enabled;
        bridge_device->multicast_enabled = br_multicast_enabled(br_dev);
+       bridge_device->mrouter = br_multicast_router(br_dev);
        INIT_LIST_HEAD(&bridge_device->ports_list);
        if (vlan_enabled) {
                bridge->vlan_enabled_exists = true;
@@ -810,6 +815,60 @@ static int mlxsw_sp_port_mc_disabled_set(struct mlxsw_sp_port *mlxsw_sp_port,
        return 0;
 }
 
+static int mlxsw_sp_smid_router_port_set(struct mlxsw_sp *mlxsw_sp,
+                                        u16 mid_idx, bool add)
+{
+       char *smid_pl;
+       int err;
+
+       smid_pl = kmalloc(MLXSW_REG_SMID_LEN, GFP_KERNEL);
+       if (!smid_pl)
+               return -ENOMEM;
+
+       mlxsw_reg_smid_pack(smid_pl, mid_idx,
+                           mlxsw_sp_router_port(mlxsw_sp), add);
+       err = mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(smid), smid_pl);
+       kfree(smid_pl);
+       return err;
+}
+
+static void
+mlxsw_sp_bridge_mrouter_update_mdb(struct mlxsw_sp *mlxsw_sp,
+                                  struct mlxsw_sp_bridge_device *bridge_device,
+                                  bool add)
+{
+       struct mlxsw_sp_mid *mid;
+
+       list_for_each_entry(mid, &bridge_device->mids_list, list)
+               mlxsw_sp_smid_router_port_set(mlxsw_sp, mid->mid, add);
+}
+
+static int
+mlxsw_sp_port_attr_br_mrouter_set(struct mlxsw_sp_port *mlxsw_sp_port,
+                                 struct switchdev_trans *trans,
+                                 struct net_device *orig_dev,
+                                 bool is_mrouter)
+{
+       struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp;
+       struct mlxsw_sp_bridge_device *bridge_device;
+
+       if (switchdev_trans_ph_prepare(trans))
+               return 0;
+
+       /* It's possible we failed to enslave the port, yet this
+        * operation is still executed because it was deferred.
+        */
+       bridge_device = mlxsw_sp_bridge_device_find(mlxsw_sp->bridge, orig_dev);
+       if (!bridge_device)
+               return 0;
+
+       if (bridge_device->mrouter != is_mrouter)
+               mlxsw_sp_bridge_mrouter_update_mdb(mlxsw_sp, bridge_device,
+                                                  is_mrouter);
+       bridge_device->mrouter = is_mrouter;
+       return 0;
+}
+
 static int mlxsw_sp_port_attr_set(struct net_device *dev,
                                  const struct switchdev_attr *attr,
                                  struct switchdev_trans *trans)
@@ -847,6 +906,11 @@ static int mlxsw_sp_port_attr_set(struct net_device *dev,
                                                    attr->orig_dev,
                                                    attr->u.mc_disabled);
                break;
+       case SWITCHDEV_ATTR_ID_BRIDGE_MROUTER:
+               err = mlxsw_sp_port_attr_br_mrouter_set(mlxsw_sp_port, trans,
+                                                       attr->orig_dev,
+                                                       attr->u.mrouter);
+               break;
        default:
                err = -EOPNOTSUPP;
                break;
@@ -1241,7 +1305,8 @@ static int mlxsw_sp_port_mdb_op(struct mlxsw_sp *mlxsw_sp, const char *addr,
 }
 
 static int mlxsw_sp_port_smid_full_entry(struct mlxsw_sp *mlxsw_sp, u16 mid_idx,
-                                        long *ports_bitmap)
+                                        long *ports_bitmap,
+                                        bool set_router_port)
 {
        char *smid_pl;
        int err, i;
@@ -1256,9 +1321,15 @@ static int mlxsw_sp_port_smid_full_entry(struct mlxsw_sp *mlxsw_sp, u16 mid_idx,
                        mlxsw_reg_smid_port_mask_set(smid_pl, i, 1);
        }
 
+       mlxsw_reg_smid_port_mask_set(smid_pl,
+                                    mlxsw_sp_router_port(mlxsw_sp), 1);
+
        for_each_set_bit(i, ports_bitmap, mlxsw_core_max_ports(mlxsw_sp->core))
                mlxsw_reg_smid_port_set(smid_pl, i, 1);
 
+       mlxsw_reg_smid_port_set(smid_pl, mlxsw_sp_router_port(mlxsw_sp),
+                               set_router_port);
+
        err = mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(smid), smid_pl);
        kfree(smid_pl);
        return err;
@@ -1362,7 +1433,8 @@ mlxsw_sp_mc_write_mdb_entry(struct mlxsw_sp *mlxsw_sp,
        mlxsw_sp_mc_get_mrouters_bitmap(flood_bitmap, bridge_device, mlxsw_sp);
 
        mid->mid = mid_idx;
-       err = mlxsw_sp_port_smid_full_entry(mlxsw_sp, mid_idx, flood_bitmap);
+       err = mlxsw_sp_port_smid_full_entry(mlxsw_sp, mid_idx, flood_bitmap,
+                                           bridge_device->mrouter);
        kfree(flood_bitmap);
        if (err)
                return false;
@@ -1735,12 +1807,15 @@ static const struct switchdev_ops mlxsw_sp_port_switchdev_ops = {
 static int
 mlxsw_sp_bridge_8021q_port_join(struct mlxsw_sp_bridge_device *bridge_device,
                                struct mlxsw_sp_bridge_port *bridge_port,
-                               struct mlxsw_sp_port *mlxsw_sp_port)
+                               struct mlxsw_sp_port *mlxsw_sp_port,
+                               struct netlink_ext_ack *extack)
 {
        struct mlxsw_sp_port_vlan *mlxsw_sp_port_vlan;
 
-       if (is_vlan_dev(bridge_port->dev))
+       if (is_vlan_dev(bridge_port->dev)) {
+               NL_SET_ERR_MSG(extack, "spectrum: Can not enslave a VLAN device to a VLAN-aware bridge");
                return -EINVAL;
+       }
 
        mlxsw_sp_port_vlan = mlxsw_sp_port_vlan_find_by_vid(mlxsw_sp_port, 1);
        if (WARN_ON(!mlxsw_sp_port_vlan))
@@ -1797,13 +1872,16 @@ mlxsw_sp_port_is_br_member(const struct mlxsw_sp_port *mlxsw_sp_port,
 static int
 mlxsw_sp_bridge_8021d_port_join(struct mlxsw_sp_bridge_device *bridge_device,
                                struct mlxsw_sp_bridge_port *bridge_port,
-                               struct mlxsw_sp_port *mlxsw_sp_port)
+                               struct mlxsw_sp_port *mlxsw_sp_port,
+                               struct netlink_ext_ack *extack)
 {
        struct mlxsw_sp_port_vlan *mlxsw_sp_port_vlan;
        u16 vid;
 
-       if (!is_vlan_dev(bridge_port->dev))
+       if (!is_vlan_dev(bridge_port->dev)) {
+               NL_SET_ERR_MSG(extack, "spectrum: Only VLAN devices can be enslaved to a VLAN-unaware bridge");
                return -EINVAL;
+       }
        vid = vlan_dev_vlan_id(bridge_port->dev);
 
        mlxsw_sp_port_vlan = mlxsw_sp_port_vlan_find_by_vid(mlxsw_sp_port, vid);
@@ -1811,7 +1889,7 @@ mlxsw_sp_bridge_8021d_port_join(struct mlxsw_sp_bridge_device *bridge_device,
                return -EINVAL;
 
        if (mlxsw_sp_port_is_br_member(mlxsw_sp_port, bridge_device->dev)) {
-               netdev_err(mlxsw_sp_port->dev, "Can't bridge VLAN uppers of the same port\n");
+               NL_SET_ERR_MSG(extack, "spectrum: Can not bridge VLAN uppers of the same port");
                return -EINVAL;
        }
 
@@ -1854,7 +1932,8 @@ static const struct mlxsw_sp_bridge_ops mlxsw_sp_bridge_8021d_ops = {
 
 int mlxsw_sp_port_bridge_join(struct mlxsw_sp_port *mlxsw_sp_port,
                              struct net_device *brport_dev,
-                             struct net_device *br_dev)
+                             struct net_device *br_dev,
+                             struct netlink_ext_ack *extack)
 {
        struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp;
        struct mlxsw_sp_bridge_device *bridge_device;
@@ -1867,7 +1946,7 @@ int mlxsw_sp_port_bridge_join(struct mlxsw_sp_port *mlxsw_sp_port,
        bridge_device = bridge_port->bridge_device;
 
        err = bridge_device->ops->port_join(bridge_device, bridge_port,
-                                           mlxsw_sp_port);
+                                           mlxsw_sp_port, extack);
        if (err)
                goto err_port_join;
 
index becaacf..bd3b2bd 100644
@@ -14,6 +14,7 @@ nfp-objs := \
            nfpcore/nfp_resource.o \
            nfpcore/nfp_rtsym.o \
            nfpcore/nfp_target.o \
+           nfp_asm.o \
            nfp_app.o \
            nfp_app_nic.o \
            nfp_devlink.o \
index 239dfbe..13148f3 100644
@@ -110,150 +110,7 @@ nfp_prog_offset_to_index(struct nfp_prog *nfp_prog, unsigned int offset)
        return offset - nfp_prog->start_off;
 }
 
-/* --- SW reg --- */
-struct nfp_insn_ur_regs {
-       enum alu_dst_ab dst_ab;
-       u16 dst;
-       u16 areg, breg;
-       bool swap;
-       bool wr_both;
-};
-
-struct nfp_insn_re_regs {
-       enum alu_dst_ab dst_ab;
-       u8 dst;
-       u8 areg, breg;
-       bool swap;
-       bool wr_both;
-       bool i8;
-};
-
-static u16 nfp_swreg_to_unreg(u32 swreg, bool is_dst)
-{
-       u16 val = FIELD_GET(NN_REG_VAL, swreg);
-
-       switch (FIELD_GET(NN_REG_TYPE, swreg)) {
-       case NN_REG_GPR_A:
-       case NN_REG_GPR_B:
-       case NN_REG_GPR_BOTH:
-               return val;
-       case NN_REG_NNR:
-               return UR_REG_NN | val;
-       case NN_REG_XFER:
-               return UR_REG_XFR | val;
-       case NN_REG_IMM:
-               if (val & ~0xff) {
-                       pr_err("immediate too large\n");
-                       return 0;
-               }
-               return UR_REG_IMM_encode(val);
-       case NN_REG_NONE:
-               return is_dst ? UR_REG_NO_DST : REG_NONE;
-       default:
-               pr_err("unrecognized reg encoding %08x\n", swreg);
-               return 0;
-       }
-}
-
-static int
-swreg_to_unrestricted(u32 dst, u32 lreg, u32 rreg, struct nfp_insn_ur_regs *reg)
-{
-       memset(reg, 0, sizeof(*reg));
-
-       /* Decode destination */
-       if (FIELD_GET(NN_REG_TYPE, dst) == NN_REG_IMM)
-               return -EFAULT;
-
-       if (FIELD_GET(NN_REG_TYPE, dst) == NN_REG_GPR_B)
-               reg->dst_ab = ALU_DST_B;
-       if (FIELD_GET(NN_REG_TYPE, dst) == NN_REG_GPR_BOTH)
-               reg->wr_both = true;
-       reg->dst = nfp_swreg_to_unreg(dst, true);
-
-       /* Decode source operands */
-       if (FIELD_GET(NN_REG_TYPE, lreg) == FIELD_GET(NN_REG_TYPE, rreg))
-               return -EFAULT;
-
-       if (FIELD_GET(NN_REG_TYPE, lreg) == NN_REG_GPR_B ||
-           FIELD_GET(NN_REG_TYPE, rreg) == NN_REG_GPR_A) {
-               reg->areg = nfp_swreg_to_unreg(rreg, false);
-               reg->breg = nfp_swreg_to_unreg(lreg, false);
-               reg->swap = true;
-       } else {
-               reg->areg = nfp_swreg_to_unreg(lreg, false);
-               reg->breg = nfp_swreg_to_unreg(rreg, false);
-       }
-
-       return 0;
-}
-
-static u16 nfp_swreg_to_rereg(u32 swreg, bool is_dst, bool has_imm8, bool *i8)
-{
-       u16 val = FIELD_GET(NN_REG_VAL, swreg);
-
-       switch (FIELD_GET(NN_REG_TYPE, swreg)) {
-       case NN_REG_GPR_A:
-       case NN_REG_GPR_B:
-       case NN_REG_GPR_BOTH:
-               return val;
-       case NN_REG_XFER:
-               return RE_REG_XFR | val;
-       case NN_REG_IMM:
-               if (val & ~(0x7f | has_imm8 << 7)) {
-                       pr_err("immediate too large\n");
-                       return 0;
-               }
-               *i8 = val & 0x80;
-               return RE_REG_IMM_encode(val & 0x7f);
-       case NN_REG_NONE:
-               return is_dst ? RE_REG_NO_DST : REG_NONE;
-       default:
-               pr_err("unrecognized reg encoding\n");
-               return 0;
-       }
-}
-
-static int
-swreg_to_restricted(u32 dst, u32 lreg, u32 rreg, struct nfp_insn_re_regs *reg,
-                   bool has_imm8)
-{
-       memset(reg, 0, sizeof(*reg));
-
-       /* Decode destination */
-       if (FIELD_GET(NN_REG_TYPE, dst) == NN_REG_IMM)
-               return -EFAULT;
-
-       if (FIELD_GET(NN_REG_TYPE, dst) == NN_REG_GPR_B)
-               reg->dst_ab = ALU_DST_B;
-       if (FIELD_GET(NN_REG_TYPE, dst) == NN_REG_GPR_BOTH)
-               reg->wr_both = true;
-       reg->dst = nfp_swreg_to_rereg(dst, true, false, NULL);
-
-       /* Decode source operands */
-       if (FIELD_GET(NN_REG_TYPE, lreg) == FIELD_GET(NN_REG_TYPE, rreg))
-               return -EFAULT;
-
-       if (FIELD_GET(NN_REG_TYPE, lreg) == NN_REG_GPR_B ||
-           FIELD_GET(NN_REG_TYPE, rreg) == NN_REG_GPR_A) {
-               reg->areg = nfp_swreg_to_rereg(rreg, false, has_imm8, &reg->i8);
-               reg->breg = nfp_swreg_to_rereg(lreg, false, has_imm8, &reg->i8);
-               reg->swap = true;
-       } else {
-               reg->areg = nfp_swreg_to_rereg(lreg, false, has_imm8, &reg->i8);
-               reg->breg = nfp_swreg_to_rereg(rreg, false, has_imm8, &reg->i8);
-       }
-
-       return 0;
-}
-
 /* --- Emitters --- */
-static const struct cmd_tgt_act cmd_tgt_act[__CMD_TGT_MAP_SIZE] = {
-       [CMD_TGT_WRITE8] =              { 0x00, 0x42 },
-       [CMD_TGT_READ8] =               { 0x01, 0x43 },
-       [CMD_TGT_READ_LE] =             { 0x01, 0x40 },
-       [CMD_TGT_READ_SWAP_LE] =        { 0x03, 0x40 },
-};
-
 static void
 __emit_cmd(struct nfp_prog *nfp_prog, enum cmd_tgt_map op,
           u8 mode, u8 xfer, u8 areg, u8 breg, u8 size, bool sync)
@@ -281,7 +138,7 @@ __emit_cmd(struct nfp_prog *nfp_prog, enum cmd_tgt_map op,
 
 static void
 emit_cmd(struct nfp_prog *nfp_prog, enum cmd_tgt_map op,
-        u8 mode, u8 xfer, u32 lreg, u32 rreg, u8 size, bool sync)
+        u8 mode, u8 xfer, swreg lreg, swreg rreg, u8 size, bool sync)
 {
        struct nfp_insn_re_regs reg;
        int err;
@@ -296,6 +153,11 @@ emit_cmd(struct nfp_prog *nfp_prog, enum cmd_tgt_map op,
                nfp_prog->error = -EFAULT;
                return;
        }
+       if (reg.dst_lmextn || reg.src_lmextn) {
+               pr_err("cmd can't use LMextn\n");
+               nfp_prog->error = -EFAULT;
+               return;
+       }
 
        __emit_cmd(nfp_prog, op, mode, xfer, reg.areg, reg.breg, size, sync);
 }
@@ -341,7 +203,7 @@ emit_br(struct nfp_prog *nfp_prog, enum br_mask mask, u16 addr, u8 defer)
 
 static void
 __emit_br_byte(struct nfp_prog *nfp_prog, u8 areg, u8 breg, bool imm8,
-              u8 byte, bool equal, u16 addr, u8 defer)
+              u8 byte, bool equal, u16 addr, u8 defer, bool src_lmextn)
 {
        u16 addr_lo, addr_hi;
        u64 insn;
@@ -357,32 +219,34 @@ __emit_br_byte(struct nfp_prog *nfp_prog, u8 areg, u8 breg, bool imm8,
                FIELD_PREP(OP_BB_EQ, equal) |
                FIELD_PREP(OP_BB_DEFBR, defer) |
                FIELD_PREP(OP_BB_ADDR_LO, addr_lo) |
-               FIELD_PREP(OP_BB_ADDR_HI, addr_hi);
+               FIELD_PREP(OP_BB_ADDR_HI, addr_hi) |
+               FIELD_PREP(OP_BB_SRC_LMEXTN, src_lmextn);
 
        nfp_prog_push(nfp_prog, insn);
 }
 
 static void
 emit_br_byte_neq(struct nfp_prog *nfp_prog,
-                u32 dst, u8 imm, u8 byte, u16 addr, u8 defer)
+                swreg src, u8 imm, u8 byte, u16 addr, u8 defer)
 {
        struct nfp_insn_re_regs reg;
        int err;
 
-       err = swreg_to_restricted(reg_none(), dst, reg_imm(imm), &reg, true);
+       err = swreg_to_restricted(reg_none(), src, reg_imm(imm), &reg, true);
        if (err) {
                nfp_prog->error = err;
                return;
        }
 
        __emit_br_byte(nfp_prog, reg.areg, reg.breg, reg.i8, byte, false, addr,
-                      defer);
+                      defer, reg.src_lmextn);
 }
 
 static void
 __emit_immed(struct nfp_prog *nfp_prog, u16 areg, u16 breg, u16 imm_hi,
             enum immed_width width, bool invert,
-            enum immed_shift shift, bool wr_both)
+            enum immed_shift shift, bool wr_both,
+            bool dst_lmextn, bool src_lmextn)
 {
        u64 insn;
 
@@ -393,19 +257,21 @@ __emit_immed(struct nfp_prog *nfp_prog, u16 areg, u16 breg, u16 imm_hi,
                FIELD_PREP(OP_IMMED_WIDTH, width) |
                FIELD_PREP(OP_IMMED_INV, invert) |
                FIELD_PREP(OP_IMMED_SHIFT, shift) |
-               FIELD_PREP(OP_IMMED_WR_AB, wr_both);
+               FIELD_PREP(OP_IMMED_WR_AB, wr_both) |
+               FIELD_PREP(OP_IMMED_SRC_LMEXTN, src_lmextn) |
+               FIELD_PREP(OP_IMMED_DST_LMEXTN, dst_lmextn);
 
        nfp_prog_push(nfp_prog, insn);
 }
 
 static void
-emit_immed(struct nfp_prog *nfp_prog, u32 dst, u16 imm,
+emit_immed(struct nfp_prog *nfp_prog, swreg dst, u16 imm,
           enum immed_width width, bool invert, enum immed_shift shift)
 {
        struct nfp_insn_ur_regs reg;
        int err;
 
-       if (FIELD_GET(NN_REG_TYPE, dst) == NN_REG_IMM) {
+       if (swreg_type(dst) == NN_REG_IMM) {
                nfp_prog->error = -EFAULT;
                return;
        }
@@ -417,13 +283,15 @@ emit_immed(struct nfp_prog *nfp_prog, u32 dst, u16 imm,
        }
 
        __emit_immed(nfp_prog, reg.areg, reg.breg, imm >> 8, width,
-                    invert, shift, reg.wr_both);
+                    invert, shift, reg.wr_both,
+                    reg.dst_lmextn, reg.src_lmextn);
 }
 
 static void
 __emit_shf(struct nfp_prog *nfp_prog, u16 dst, enum alu_dst_ab dst_ab,
           enum shf_sc sc, u8 shift,
-          u16 areg, enum shf_op op, u16 breg, bool i8, bool sw, bool wr_both)
+          u16 areg, enum shf_op op, u16 breg, bool i8, bool sw, bool wr_both,
+          bool dst_lmextn, bool src_lmextn)
 {
        u64 insn;
 
@@ -445,14 +313,16 @@ __emit_shf(struct nfp_prog *nfp_prog, u16 dst, enum alu_dst_ab dst_ab,
                FIELD_PREP(OP_SHF_SHIFT, shift) |
                FIELD_PREP(OP_SHF_OP, op) |
                FIELD_PREP(OP_SHF_DST_AB, dst_ab) |
-               FIELD_PREP(OP_SHF_WR_AB, wr_both);
+               FIELD_PREP(OP_SHF_WR_AB, wr_both) |
+               FIELD_PREP(OP_SHF_SRC_LMEXTN, src_lmextn) |
+               FIELD_PREP(OP_SHF_DST_LMEXTN, dst_lmextn);
 
        nfp_prog_push(nfp_prog, insn);
 }
 
 static void
-emit_shf(struct nfp_prog *nfp_prog, u32 dst, u32 lreg, enum shf_op op, u32 rreg,
-        enum shf_sc sc, u8 shift)
+emit_shf(struct nfp_prog *nfp_prog, swreg dst,
+        swreg lreg, enum shf_op op, swreg rreg, enum shf_sc sc, u8 shift)
 {
        struct nfp_insn_re_regs reg;
        int err;
@@ -464,12 +334,14 @@ emit_shf(struct nfp_prog *nfp_prog, u32 dst, u32 lreg, enum shf_op op, u32 rreg,
        }
 
        __emit_shf(nfp_prog, reg.dst, reg.dst_ab, sc, shift,
-                  reg.areg, op, reg.breg, reg.i8, reg.swap, reg.wr_both);
+                  reg.areg, op, reg.breg, reg.i8, reg.swap, reg.wr_both,
+                  reg.dst_lmextn, reg.src_lmextn);
 }
 
 static void
 __emit_alu(struct nfp_prog *nfp_prog, u16 dst, enum alu_dst_ab dst_ab,
-          u16 areg, enum alu_op op, u16 breg, bool swap, bool wr_both)
+          u16 areg, enum alu_op op, u16 breg, bool swap, bool wr_both,
+          bool dst_lmextn, bool src_lmextn)
 {
        u64 insn;
 
@@ -480,13 +352,16 @@ __emit_alu(struct nfp_prog *nfp_prog, u16 dst, enum alu_dst_ab dst_ab,
                FIELD_PREP(OP_ALU_SW, swap) |
                FIELD_PREP(OP_ALU_OP, op) |
                FIELD_PREP(OP_ALU_DST_AB, dst_ab) |
-               FIELD_PREP(OP_ALU_WR_AB, wr_both);
+               FIELD_PREP(OP_ALU_WR_AB, wr_both) |
+               FIELD_PREP(OP_ALU_SRC_LMEXTN, src_lmextn) |
+               FIELD_PREP(OP_ALU_DST_LMEXTN, dst_lmextn);
 
        nfp_prog_push(nfp_prog, insn);
 }
 
 static void
-emit_alu(struct nfp_prog *nfp_prog, u32 dst, u32 lreg, enum alu_op op, u32 rreg)
+emit_alu(struct nfp_prog *nfp_prog, swreg dst,
+        swreg lreg, enum alu_op op, swreg rreg)
 {
        struct nfp_insn_ur_regs reg;
        int err;
@@ -498,13 +373,15 @@ emit_alu(struct nfp_prog *nfp_prog, u32 dst, u32 lreg, enum alu_op op, u32 rreg)
        }
 
        __emit_alu(nfp_prog, reg.dst, reg.dst_ab,
-                  reg.areg, op, reg.breg, reg.swap, reg.wr_both);
+                  reg.areg, op, reg.breg, reg.swap, reg.wr_both,
+                  reg.dst_lmextn, reg.src_lmextn);
 }
 
 static void
 __emit_ld_field(struct nfp_prog *nfp_prog, enum shf_sc sc,
                u8 areg, u8 bmask, u8 breg, u8 shift, bool imm8,
-               bool zero, bool swap, bool wr_both)
+               bool zero, bool swap, bool wr_both,
+               bool dst_lmextn, bool src_lmextn)
 {
        u64 insn;
 
@@ -517,35 +394,44 @@ __emit_ld_field(struct nfp_prog *nfp_prog, enum shf_sc sc,
                FIELD_PREP(OP_LDF_ZF, zero) |
                FIELD_PREP(OP_LDF_BMASK, bmask) |
                FIELD_PREP(OP_LDF_SHF, shift) |
-               FIELD_PREP(OP_LDF_WR_AB, wr_both);
+               FIELD_PREP(OP_LDF_WR_AB, wr_both) |
+               FIELD_PREP(OP_LDF_SRC_LMEXTN, src_lmextn) |
+               FIELD_PREP(OP_LDF_DST_LMEXTN, dst_lmextn);
 
        nfp_prog_push(nfp_prog, insn);
 }
 
 static void
 emit_ld_field_any(struct nfp_prog *nfp_prog, enum shf_sc sc, u8 shift,
-                 u32 dst, u8 bmask, u32 src, bool zero)
+                 swreg dst, u8 bmask, swreg src, bool zero)
 {
        struct nfp_insn_re_regs reg;
        int err;
 
-       err = swreg_to_restricted(reg_none(), dst, src, &reg, true);
+       /* Note: ld_field is special as it uses one of the src regs as dst */
+       err = swreg_to_restricted(dst, dst, src, &reg, true);
        if (err) {
                nfp_prog->error = err;
                return;
        }
 
        __emit_ld_field(nfp_prog, sc, reg.areg, bmask, reg.breg, shift,
-                       reg.i8, zero, reg.swap, reg.wr_both);
+                       reg.i8, zero, reg.swap, reg.wr_both,
+                       reg.dst_lmextn, reg.src_lmextn);
 }
 
 static void
-emit_ld_field(struct nfp_prog *nfp_prog, u32 dst, u8 bmask, u32 src,
+emit_ld_field(struct nfp_prog *nfp_prog, swreg dst, u8 bmask, swreg src,
              enum shf_sc sc, u8 shift)
 {
        emit_ld_field_any(nfp_prog, sc, shift, dst, bmask, src, false);
 }
 
+static void emit_nop(struct nfp_prog *nfp_prog)
+{
+       __emit_immed(nfp_prog, UR_REG_IMM, UR_REG_IMM, 0, 0, 0, 0, 0, 0, 0);
+}
+
 /* --- Wrappers --- */
 static bool pack_immed(u32 imm, u16 *val, enum immed_shift *shift)
 {
@@ -565,7 +451,7 @@ static bool pack_immed(u32 imm, u16 *val, enum immed_shift *shift)
        return true;
 }
 
-static void wrp_immed(struct nfp_prog *nfp_prog, u32 dst, u32 imm)
+static void wrp_immed(struct nfp_prog *nfp_prog, swreg dst, u32 imm)
 {
        enum immed_shift shift;
        u16 val;
@@ -586,7 +472,7 @@ static void wrp_immed(struct nfp_prog *nfp_prog, u32 dst, u32 imm)
  * If the @imm is small enough encode it directly in operand and return
  * otherwise load @imm to a spare register and return its encoding.
  */
-static u32 ur_load_imm_any(struct nfp_prog *nfp_prog, u32 imm, u32 tmp_reg)
+static swreg ur_load_imm_any(struct nfp_prog *nfp_prog, u32 imm, swreg tmp_reg)
 {
        if (FIELD_FIT(UR_REG_IMM_MAX, imm))
                return reg_imm(imm);
@@ -599,7 +485,7 @@ static u32 ur_load_imm_any(struct nfp_prog *nfp_prog, u32 imm, u32 tmp_reg)
  * If the @imm is small enough encode it directly in operand and return
  * otherwise load @imm to a spare register and return its encoding.
  */
-static u32 re_load_imm_any(struct nfp_prog *nfp_prog, u32 imm, u32 tmp_reg)
+static swreg re_load_imm_any(struct nfp_prog *nfp_prog, u32 imm, swreg tmp_reg)
 {
        if (FIELD_FIT(RE_REG_IMM_MAX, imm))
                return reg_imm(imm);
@@ -629,7 +515,7 @@ construct_data_ind_ld(struct nfp_prog *nfp_prog, u16 offset,
 {
        unsigned int i;
        u16 shift, sz;
-       u32 tmp_reg;
+       swreg tmp_reg;
 
        /* We load the value from the address indicated in @offset and then
         * shift out the data we don't need.  Note: this is big endian!
@@ -646,22 +532,22 @@ construct_data_ind_ld(struct nfp_prog *nfp_prog, u16 offset,
                emit_alu(nfp_prog, imm_a(nfp_prog),
                         imm_a(nfp_prog), ALU_OP_ADD, reg_imm(size));
                emit_alu(nfp_prog, reg_none(),
-                        NFP_BPF_ABI_LEN, ALU_OP_SUB, imm_a(nfp_prog));
+                        plen_reg(nfp_prog), ALU_OP_SUB, imm_a(nfp_prog));
                wrp_br_special(nfp_prog, BR_BLO, OP_BR_GO_ABORT);
                /* Load data */
                emit_cmd(nfp_prog, CMD_TGT_READ8, CMD_MODE_32b, 0,
-                        pkt_reg(nfp_prog), imm_b(nfp_prog), sz - 1, true);
+                        pptr_reg(nfp_prog), imm_b(nfp_prog), sz - 1, true);
        } else {
                /* Check packet length */
                tmp_reg = ur_load_imm_any(nfp_prog, offset + size,
                                          imm_a(nfp_prog));
                emit_alu(nfp_prog, reg_none(),
-                        NFP_BPF_ABI_LEN, ALU_OP_SUB, tmp_reg);
+                        plen_reg(nfp_prog), ALU_OP_SUB, tmp_reg);
                wrp_br_special(nfp_prog, BR_BLO, OP_BR_GO_ABORT);
                /* Load data */
                tmp_reg = re_load_imm_any(nfp_prog, offset, imm_b(nfp_prog));
                emit_cmd(nfp_prog, CMD_TGT_READ8, CMD_MODE_32b, 0,
-                        pkt_reg(nfp_prog), tmp_reg, sz - 1, true);
+                        pptr_reg(nfp_prog), tmp_reg, sz - 1, true);
        }
 
        i = 0;
@@ -684,20 +570,10 @@ static int construct_data_ld(struct nfp_prog *nfp_prog, u16 offset, u8 size)
        return construct_data_ind_ld(nfp_prog, offset, 0, false, size);
 }
 
-static int wrp_set_mark(struct nfp_prog *nfp_prog, u8 src)
-{
-       emit_alu(nfp_prog, NFP_BPF_ABI_MARK,
-                reg_none(), ALU_OP_NONE, reg_b(src));
-       emit_alu(nfp_prog, NFP_BPF_ABI_FLAGS,
-                NFP_BPF_ABI_FLAGS, ALU_OP_OR, reg_imm(NFP_BPF_ABI_FLAG_MARK));
-
-       return 0;
-}
-
 static void
 wrp_alu_imm(struct nfp_prog *nfp_prog, u8 dst, enum alu_op alu_op, u32 imm)
 {
-       u32 tmp_reg;
+       swreg tmp_reg;
 
        if (alu_op == ALU_OP_AND) {
                if (!imm)
@@ -815,7 +691,7 @@ wrp_cmp_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
        const struct bpf_insn *insn = &meta->insn;
        u64 imm = insn->imm; /* sign extend */
        u8 reg = insn->dst_reg * 2;
-       u32 tmp_reg;
+       swreg tmp_reg;
 
        if (insn->off < 0) /* TODO */
                return -EOPNOTSUPP;
@@ -967,12 +843,24 @@ static int sub_imm64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
 static int shl_imm64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
 {
        const struct bpf_insn *insn = &meta->insn;
-
-       if (insn->imm != 32)
-               return 1; /* TODO */
-
-       wrp_reg_mov(nfp_prog, insn->dst_reg * 2 + 1, insn->dst_reg * 2);
-       wrp_immed(nfp_prog, reg_both(insn->dst_reg * 2), 0);
+       u8 dst = insn->dst_reg * 2;
+
+       if (insn->imm < 32) {
+               emit_shf(nfp_prog, reg_both(dst + 1),
+                        reg_a(dst + 1), SHF_OP_NONE, reg_b(dst),
+                        SHF_SC_R_DSHF, 32 - insn->imm);
+               emit_shf(nfp_prog, reg_both(dst),
+                        reg_none(), SHF_OP_NONE, reg_b(dst),
+                        SHF_SC_L_SHF, insn->imm);
+       } else if (insn->imm == 32) {
+               wrp_reg_mov(nfp_prog, dst + 1, dst);
+               wrp_immed(nfp_prog, reg_both(dst), 0);
+       } else if (insn->imm > 32) {
+               emit_shf(nfp_prog, reg_both(dst + 1),
+                        reg_none(), SHF_OP_NONE, reg_b(dst),
+                        SHF_SC_L_SHF, insn->imm - 32);
+               wrp_immed(nfp_prog, reg_both(dst), 0);
+       }
 
        return 0;
 }
@@ -980,12 +868,24 @@ static int shl_imm64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
 static int shr_imm64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
 {
        const struct bpf_insn *insn = &meta->insn;
-
-       if (insn->imm != 32)
-               return 1; /* TODO */
-
-       wrp_reg_mov(nfp_prog, insn->dst_reg * 2, insn->dst_reg * 2 + 1);
-       wrp_immed(nfp_prog, reg_both(insn->dst_reg * 2 + 1), 0);
+       u8 dst = insn->dst_reg * 2;
+
+       if (insn->imm < 32) {
+               emit_shf(nfp_prog, reg_both(dst),
+                        reg_a(dst + 1), SHF_OP_NONE, reg_b(dst),
+                        SHF_SC_R_DSHF, insn->imm);
+               emit_shf(nfp_prog, reg_both(dst + 1),
+                        reg_none(), SHF_OP_NONE, reg_b(dst + 1),
+                        SHF_SC_R_SHF, insn->imm);
+       } else if (insn->imm == 32) {
+               wrp_reg_mov(nfp_prog, dst, dst + 1);
+               wrp_immed(nfp_prog, reg_both(dst + 1), 0);
+       } else if (insn->imm > 32) {
+               emit_shf(nfp_prog, reg_both(dst),
+                        reg_none(), SHF_OP_NONE, reg_b(dst + 1),
+                        SHF_SC_R_SHF, insn->imm - 32);
+               wrp_immed(nfp_prog, reg_both(dst + 1), 0);
+       }
 
        return 0;
 }
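
Both shift handlers above split the 64-bit BPF register into a low/high pair of 32-bit NFP registers and use the double-shift (R_DSHF) source class to move bits across the word boundary. For 0 < imm < 32 the emitted sequences compute the equivalent of the following C (illustrative only):

    #include <stdint.h>

    static inline void shl64(uint32_t *lo, uint32_t *hi, unsigned int imm)
    {
            /* double shift: the high word takes the bits shifted out of the low word */
            *hi = (*hi << imm) | (*lo >> (32 - imm));
            *lo <<= imm;
    }

    static inline void shr64(uint32_t *lo, uint32_t *hi, unsigned int imm)
    {
            /* double shift: the low word takes the bits shifted out of the high word */
            *lo = (*lo >> imm) | (*hi << (32 - imm));
            *hi >>= imm;
    }

For imm == 32 the words are simply moved, and for imm > 32 a single shift by imm - 32 lands in the destination word while the other word is zeroed.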
@@ -1130,7 +1030,7 @@ static int mem_ldx4_skb(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
 {
        if (meta->insn.off == offsetof(struct sk_buff, len))
                emit_alu(nfp_prog, reg_both(meta->insn.dst_reg * 2),
-                        reg_none(), ALU_OP_NONE, NFP_BPF_ABI_LEN);
+                        reg_none(), ALU_OP_NONE, plen_reg(nfp_prog));
        else
                return -EOPNOTSUPP;
 
@@ -1139,18 +1039,18 @@ static int mem_ldx4_skb(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
 
 static int mem_ldx4_xdp(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
 {
-       u32 dst = reg_both(meta->insn.dst_reg * 2);
+       swreg dst = reg_both(meta->insn.dst_reg * 2);
 
        if (meta->insn.off != offsetof(struct xdp_md, data) &&
            meta->insn.off != offsetof(struct xdp_md, data_end))
                return -EOPNOTSUPP;
 
-       emit_alu(nfp_prog, dst, reg_none(), ALU_OP_NONE, NFP_BPF_ABI_PKT);
+       emit_alu(nfp_prog, dst, reg_none(), ALU_OP_NONE, pptr_reg(nfp_prog));
 
        if (meta->insn.off == offsetof(struct xdp_md, data))
                return 0;
 
-       emit_alu(nfp_prog, dst, dst, ALU_OP_ADD, NFP_BPF_ABI_LEN);
+       emit_alu(nfp_prog, dst, dst, ALU_OP_ADD, plen_reg(nfp_prog));
 
        return 0;
 }
@@ -1171,9 +1071,6 @@ static int mem_ldx4(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
 
 static int mem_stx4_skb(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
 {
-       if (meta->insn.off == offsetof(struct sk_buff, mark))
-               return wrp_set_mark(nfp_prog, meta->insn.src_reg * 2);
-
        return -EOPNOTSUPP;
 }
 
@@ -1202,8 +1099,10 @@ static int jeq_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
 {
        const struct bpf_insn *insn = &meta->insn;
        u64 imm = insn->imm; /* sign extend */
-       u32 or1 = reg_a(insn->dst_reg * 2), or2 = reg_b(insn->dst_reg * 2 + 1);
-       u32 tmp_reg;
+       swreg or1, or2, tmp_reg;
+
+       or1 = reg_a(insn->dst_reg * 2);
+       or2 = reg_b(insn->dst_reg * 2 + 1);
 
        if (insn->off < 0) /* TODO */
                return -EOPNOTSUPP;
@@ -1252,7 +1151,7 @@ static int jset_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
 {
        const struct bpf_insn *insn = &meta->insn;
        u64 imm = insn->imm; /* sign extend */
-       u32 tmp_reg;
+       swreg tmp_reg;
 
        if (insn->off < 0) /* TODO */
                return -EOPNOTSUPP;
@@ -1283,7 +1182,7 @@ static int jne_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
 {
        const struct bpf_insn *insn = &meta->insn;
        u64 imm = insn->imm; /* sign extend */
-       u32 tmp_reg;
+       swreg tmp_reg;
 
        if (insn->off < 0) /* TODO */
                return -EOPNOTSUPP;
@@ -1510,8 +1409,9 @@ static int nfp_fixup_branches(struct nfp_prog *nfp_prog)
 
 static void nfp_intro(struct nfp_prog *nfp_prog)
 {
-       emit_alu(nfp_prog, pkt_reg(nfp_prog),
-                reg_none(), ALU_OP_NONE, NFP_BPF_ABI_PKT);
+       wrp_immed(nfp_prog, plen_reg(nfp_prog), GENMASK(13, 0));
+       emit_alu(nfp_prog, plen_reg(nfp_prog),
+                plen_reg(nfp_prog), ALU_OP_AND, pv_len(nfp_prog));
 }
 
 static void nfp_outro_tc_legacy(struct nfp_prog *nfp_prog)
@@ -1656,7 +1556,7 @@ static void nfp_outro(struct nfp_prog *nfp_prog)
 static int nfp_translate(struct nfp_prog *nfp_prog)
 {
        struct nfp_insn_meta *meta;
-       int err;
+       int i, err;
 
        nfp_intro(nfp_prog);
        if (nfp_prog->error)
@@ -1688,6 +1588,11 @@ static int nfp_translate(struct nfp_prog *nfp_prog)
        if (nfp_prog->error)
                return nfp_prog->error;
 
+       for (i = 0; i < NFP_USTORE_PREFETCH_WINDOW; i++)
+               emit_nop(nfp_prog);
+       if (nfp_prog->error)
+               return nfp_prog->error;
+
        return nfp_fixup_branches(nfp_prog);
 }
 
@@ -1737,38 +1642,6 @@ static void nfp_bpf_opt_reg_init(struct nfp_prog *nfp_prog)
        }
 }
 
-/* Try to rename registers so that program uses only low ones */
-static int nfp_bpf_opt_reg_rename(struct nfp_prog *nfp_prog)
-{
-       bool reg_used[MAX_BPF_REG] = {};
-       u8 tgt_reg[MAX_BPF_REG] = {};
-       struct nfp_insn_meta *meta;
-       unsigned int i, j;
-
-       list_for_each_entry(meta, &nfp_prog->insns, l) {
-               if (meta->skip)
-                       continue;
-
-               reg_used[meta->insn.src_reg] = true;
-               reg_used[meta->insn.dst_reg] = true;
-       }
-
-       for (i = 0, j = 0; i < ARRAY_SIZE(tgt_reg); i++) {
-               if (!reg_used[i])
-                       continue;
-
-               tgt_reg[i] = j++;
-       }
-       nfp_prog->num_regs = j;
-
-       list_for_each_entry(meta, &nfp_prog->insns, l) {
-               meta->insn.src_reg = tgt_reg[meta->insn.src_reg];
-               meta->insn.dst_reg = tgt_reg[meta->insn.dst_reg];
-       }
-
-       return 0;
-}
-
 /* Remove masking after load since our load guarantees this is not needed */
 static void nfp_bpf_opt_ld_mask(struct nfp_prog *nfp_prog)
 {
@@ -1845,20 +1718,33 @@ static void nfp_bpf_opt_ld_shift(struct nfp_prog *nfp_prog)
 
 static int nfp_bpf_optimize(struct nfp_prog *nfp_prog)
 {
-       int ret;
-
        nfp_bpf_opt_reg_init(nfp_prog);
 
-       ret = nfp_bpf_opt_reg_rename(nfp_prog);
-       if (ret)
-               return ret;
-
        nfp_bpf_opt_ld_mask(nfp_prog);
        nfp_bpf_opt_ld_shift(nfp_prog);
 
        return 0;
 }
 
+static int nfp_bpf_ustore_calc(struct nfp_prog *nfp_prog, __le64 *ustore)
+{
+       int i;
+
+       for (i = 0; i < nfp_prog->prog_len; i++) {
+               int err;
+
+               err = nfp_ustore_check_valid_no_ecc(nfp_prog->prog[i]);
+               if (err)
+                       return err;
+
+               nfp_prog->prog[i] = nfp_ustore_calc_ecc_insn(nfp_prog->prog[i]);
+
+               ustore[i] = cpu_to_le64(nfp_prog->prog[i]);
+       }
+
+       return 0;
+}
+
 /**
  * nfp_bpf_jit() - translate BPF code into NFP assembly
  * @filter:    kernel BPF filter struct
@@ -1899,10 +1785,8 @@ nfp_bpf_jit(struct bpf_prog *filter, void *prog_mem,
        if (ret)
                goto out;
 
-       if (nfp_prog->num_regs <= 7)
-               nfp_prog->regs_per_thread = 16;
-       else
-               nfp_prog->regs_per_thread = 32;
+       nfp_prog->num_regs = MAX_BPF_REG;
+       nfp_prog->regs_per_thread = 32;
 
        nfp_prog->prog = prog_mem;
        nfp_prog->__prog_alloc_len = prog_sz;
@@ -1912,10 +1796,13 @@ nfp_bpf_jit(struct bpf_prog *filter, void *prog_mem,
                pr_err("Translation failed with error %d (translated: %u)\n",
                       ret, nfp_prog->n_translated);
                ret = -EINVAL;
+               goto out;
        }
 
+       ret = nfp_bpf_ustore_calc(nfp_prog, (__force __le64 *)prog_mem);
+
        res->n_instr = nfp_prog->prog_len;
-       res->dense_mode = nfp_prog->num_regs <= 7;
+       res->dense_mode = false;
 out:
        nfp_prog_free(nfp_prog);
 
index be2cf10..0747269 100644 (file)
@@ -89,14 +89,6 @@ nfp_bpf_vnic_alloc(struct nfp_app *app, struct nfp_net *nn, unsigned int id)
        struct nfp_net_bpf_priv *priv;
        int ret;
 
-       /* Limit to single port, otherwise it's just a NIC */
-       if (id > 0) {
-               nfp_warn(app->cpp,
-                        "BPF NIC doesn't support more than one port right now\n");
-               nn->port = nfp_port_alloc(app, NFP_PORT_INVALID, nn->dp.netdev);
-               return PTR_ERR_OR_ZERO(nn->port);
-       }
-
        priv = kmalloc(sizeof(*priv), GFP_KERNEL);
        if (!priv)
                return -ENOMEM;
index 4051e94..b7a112a 100644 (file)
@@ -39,6 +39,7 @@
 #include <linux/list.h>
 #include <linux/types.h>
 
+#include "../nfp_asm.h"
 #include "../nfp_net.h"
 
 /* For branch fixup logic use up-most byte of branch instruction as scratch
@@ -53,9 +54,13 @@ enum br_special {
 };
 
 enum static_regs {
-       STATIC_REG_PKT          = 1,
-#define REG_PKT_BANK   ALU_DST_A
-       STATIC_REG_IMM          = 2, /* Bank AB */
+       STATIC_REG_IMM          = 21, /* Bank AB */
+       STATIC_REG_PKT_LEN      = 22, /* Bank B */
+};
+
+enum pkt_vec {
+       PKT_VEC_PKT_LEN         = 0,
+       PKT_VEC_PKT_PTR         = 2,
 };
 
 enum nfp_bpf_action_type {
@@ -65,39 +70,17 @@ enum nfp_bpf_action_type {
        NN_ACT_XDP,
 };
 
-/* Software register representation, hardware encoding in asm.h */
-#define NN_REG_TYPE    GENMASK(31, 24)
-#define NN_REG_VAL     GENMASK(7, 0)
-
-enum nfp_bpf_reg_type {
-       NN_REG_GPR_A =  BIT(0),
-       NN_REG_GPR_B =  BIT(1),
-       NN_REG_NNR =    BIT(2),
-       NN_REG_XFER =   BIT(3),
-       NN_REG_IMM =    BIT(4),
-       NN_REG_NONE =   BIT(5),
-};
-
-#define NN_REG_GPR_BOTH        (NN_REG_GPR_A | NN_REG_GPR_B)
-
-#define reg_both(x)    ((x) | FIELD_PREP(NN_REG_TYPE, NN_REG_GPR_BOTH))
-#define reg_a(x)       ((x) | FIELD_PREP(NN_REG_TYPE, NN_REG_GPR_A))
-#define reg_b(x)       ((x) | FIELD_PREP(NN_REG_TYPE, NN_REG_GPR_B))
-#define reg_nnr(x)     ((x) | FIELD_PREP(NN_REG_TYPE, NN_REG_NNR))
-#define reg_xfer(x)    ((x) | FIELD_PREP(NN_REG_TYPE, NN_REG_XFER))
-#define reg_imm(x)     ((x) | FIELD_PREP(NN_REG_TYPE, NN_REG_IMM))
-#define reg_none()     (FIELD_PREP(NN_REG_TYPE, NN_REG_NONE))
+#define pv_len(np)     reg_lm(1, PKT_VEC_PKT_LEN)
+#define pv_ctm_ptr(np) reg_lm(1, PKT_VEC_PKT_PTR)
 
-#define pkt_reg(np)    reg_a((np)->regs_per_thread - STATIC_REG_PKT)
-#define imm_a(np)      reg_a((np)->regs_per_thread - STATIC_REG_IMM)
-#define imm_b(np)      reg_b((np)->regs_per_thread - STATIC_REG_IMM)
-#define imm_both(np)   reg_both((np)->regs_per_thread - STATIC_REG_IMM)
+#define plen_reg(np)   reg_b(STATIC_REG_PKT_LEN)
+#define pptr_reg(np)   pv_ctm_ptr(np)
+#define imm_a(np)      reg_a(STATIC_REG_IMM)
+#define imm_b(np)      reg_b(STATIC_REG_IMM)
+#define imm_both(np)   reg_both(STATIC_REG_IMM)
 
-#define NFP_BPF_ABI_FLAGS      reg_nnr(0)
+#define NFP_BPF_ABI_FLAGS      reg_imm(0)
 #define   NFP_BPF_ABI_FLAG_MARK        1
-#define NFP_BPF_ABI_MARK       reg_nnr(1)
-#define NFP_BPF_ABI_PKT                reg_nnr(2)
-#define NFP_BPF_ABI_LEN                reg_nnr(3)
 
 struct nfp_prog;
 struct nfp_insn_meta;
index 38f3835..1194c47 100644 (file)
@@ -36,6 +36,7 @@
 #include <net/switchdev.h>
 #include <net/tc_act/tc_gact.h>
 #include <net/tc_act/tc_mirred.h>
+#include <net/tc_act/tc_pedit.h>
 #include <net/tc_act/tc_vlan.h>
 #include <net/tc_act/tc_tunnel_key.h>
 
@@ -223,6 +224,247 @@ nfp_fl_set_vxlan(struct nfp_fl_set_vxlan *set_vxlan,
        return 0;
 }
 
+static void nfp_fl_set_helper32(u32 value, u32 mask, u8 *p_exact, u8 *p_mask)
+{
+       u32 oldvalue = get_unaligned((u32 *)p_exact);
+       u32 oldmask = get_unaligned((u32 *)p_mask);
+
+       value &= mask;
+       value |= oldvalue & ~mask;
+
+       put_unaligned(oldmask | mask, (u32 *)p_mask);
+       put_unaligned(value, (u32 *)p_exact);
+}
+
+static int
+nfp_fl_set_eth(const struct tc_action *action, int idx, u32 off,
+              struct nfp_fl_set_eth *set_eth)
+{
+       u16 tmp_set_eth_op;
+       u32 exact, mask;
+
+       if (off + 4 > ETH_ALEN * 2)
+               return -EOPNOTSUPP;
+
+       mask = ~tcf_pedit_mask(action, idx);
+       exact = tcf_pedit_val(action, idx);
+
+       if (exact & ~mask)
+               return -EOPNOTSUPP;
+
+       nfp_fl_set_helper32(exact, mask, &set_eth->eth_addr_val[off],
+                           &set_eth->eth_addr_mask[off]);
+
+       set_eth->reserved = cpu_to_be16(0);
+       tmp_set_eth_op = FIELD_PREP(NFP_FL_ACT_LEN_LW,
+                                   sizeof(*set_eth) >> NFP_FL_LW_SIZ) |
+                        FIELD_PREP(NFP_FL_ACT_JMP_ID,
+                                   NFP_FL_ACTION_OPCODE_SET_ETHERNET);
+       set_eth->a_op = cpu_to_be16(tmp_set_eth_op);
+
+       return 0;
+}
+
+static int
+nfp_fl_set_ip4(const struct tc_action *action, int idx, u32 off,
+              struct nfp_fl_set_ip4_addrs *set_ip_addr)
+{
+       u16 tmp_set_ipv4_op;
+       __be32 exact, mask;
+
+       /* We are expecting tcf_pedit to return a big endian value */
+       mask = (__force __be32)~tcf_pedit_mask(action, idx);
+       exact = (__force __be32)tcf_pedit_val(action, idx);
+
+       if (exact & ~mask)
+               return -EOPNOTSUPP;
+
+       switch (off) {
+       case offsetof(struct iphdr, daddr):
+               set_ip_addr->ipv4_dst_mask = mask;
+               set_ip_addr->ipv4_dst = exact;
+               break;
+       case offsetof(struct iphdr, saddr):
+               set_ip_addr->ipv4_src_mask = mask;
+               set_ip_addr->ipv4_src = exact;
+               break;
+       default:
+               return -EOPNOTSUPP;
+       }
+
+       set_ip_addr->reserved = cpu_to_be16(0);
+       tmp_set_ipv4_op = FIELD_PREP(NFP_FL_ACT_LEN_LW,
+                                    sizeof(*set_ip_addr) >> NFP_FL_LW_SIZ) |
+                         FIELD_PREP(NFP_FL_ACT_JMP_ID,
+                                    NFP_FL_ACTION_OPCODE_SET_IPV4_ADDRS);
+       set_ip_addr->a_op = cpu_to_be16(tmp_set_ipv4_op);
+
+       return 0;
+}
+
+static void
+nfp_fl_set_ip6_helper(int opcode_tag, int idx, __be32 exact, __be32 mask,
+                     struct nfp_fl_set_ipv6_addr *ip6)
+{
+       u16 tmp_set_op;
+
+       ip6->ipv6[idx % 4].mask = mask;
+       ip6->ipv6[idx % 4].exact = exact;
+
+       ip6->reserved = cpu_to_be16(0);
+       tmp_set_op = FIELD_PREP(NFP_FL_ACT_LEN_LW, sizeof(*ip6) >>
+                               NFP_FL_LW_SIZ) |
+                    FIELD_PREP(NFP_FL_ACT_JMP_ID, opcode_tag);
+       ip6->a_op = cpu_to_be16(tmp_set_op);
+}
+
+static int
+nfp_fl_set_ip6(const struct tc_action *action, int idx, u32 off,
+              struct nfp_fl_set_ipv6_addr *ip_dst,
+              struct nfp_fl_set_ipv6_addr *ip_src)
+{
+       __be32 exact, mask;
+
+       /* We are expecting tcf_pedit to return a big endian value */
+       mask = (__force __be32)~tcf_pedit_mask(action, idx);
+       exact = (__force __be32)tcf_pedit_val(action, idx);
+
+       if (exact & ~mask)
+               return -EOPNOTSUPP;
+
+       if (off < offsetof(struct ipv6hdr, saddr))
+               return -EOPNOTSUPP;
+       else if (off < offsetof(struct ipv6hdr, daddr))
+               nfp_fl_set_ip6_helper(NFP_FL_ACTION_OPCODE_SET_IPV6_SRC, idx,
+                                     exact, mask, ip_src);
+       else if (off < offsetof(struct ipv6hdr, daddr) +
+                      sizeof(struct in6_addr))
+               nfp_fl_set_ip6_helper(NFP_FL_ACTION_OPCODE_SET_IPV6_DST, idx,
+                                     exact, mask, ip_dst);
+       else
+               return -EOPNOTSUPP;
+
+       return 0;
+}
+
+static int
+nfp_fl_set_tport(const struct tc_action *action, int idx, u32 off,
+                struct nfp_fl_set_tport *set_tport, int opcode)
+{
+       u32 exact, mask;
+       u16 tmp_set_op;
+
+       if (off)
+               return -EOPNOTSUPP;
+
+       mask = ~tcf_pedit_mask(action, idx);
+       exact = tcf_pedit_val(action, idx);
+
+       if (exact & ~mask)
+               return -EOPNOTSUPP;
+
+       nfp_fl_set_helper32(exact, mask, set_tport->tp_port_val,
+                           set_tport->tp_port_mask);
+
+       set_tport->reserved = cpu_to_be16(0);
+       tmp_set_op = FIELD_PREP(NFP_FL_ACT_LEN_LW,
+                               sizeof(*set_tport) >> NFP_FL_LW_SIZ);
+       tmp_set_op |= FIELD_PREP(NFP_FL_ACT_JMP_ID, opcode);
+       set_tport->a_op = cpu_to_be16(tmp_set_op);
+
+       return 0;
+}
+
+static int
+nfp_fl_pedit(const struct tc_action *action, char *nfp_action, int *a_len)
+{
+       struct nfp_fl_set_ipv6_addr set_ip6_dst, set_ip6_src;
+       struct nfp_fl_set_ip4_addrs set_ip_addr;
+       struct nfp_fl_set_tport set_tport;
+       struct nfp_fl_set_eth set_eth;
+       enum pedit_header_type htype;
+       int idx, nkeys, err;
+       size_t act_size;
+       u32 offset, cmd;
+
+       memset(&set_ip6_dst, 0, sizeof(set_ip6_dst));
+       memset(&set_ip6_src, 0, sizeof(set_ip6_src));
+       memset(&set_ip_addr, 0, sizeof(set_ip_addr));
+       memset(&set_tport, 0, sizeof(set_tport));
+       memset(&set_eth, 0, sizeof(set_eth));
+       nkeys = tcf_pedit_nkeys(action);
+
+       for (idx = 0; idx < nkeys; idx++) {
+               cmd = tcf_pedit_cmd(action, idx);
+               htype = tcf_pedit_htype(action, idx);
+               offset = tcf_pedit_offset(action, idx);
+
+               if (cmd != TCA_PEDIT_KEY_EX_CMD_SET)
+                       return -EOPNOTSUPP;
+
+               switch (htype) {
+               case TCA_PEDIT_KEY_EX_HDR_TYPE_ETH:
+                       err = nfp_fl_set_eth(action, idx, offset, &set_eth);
+                       break;
+               case TCA_PEDIT_KEY_EX_HDR_TYPE_IP4:
+                       err = nfp_fl_set_ip4(action, idx, offset, &set_ip_addr);
+                       break;
+               case TCA_PEDIT_KEY_EX_HDR_TYPE_IP6:
+                       err = nfp_fl_set_ip6(action, idx, offset, &set_ip6_dst,
+                                            &set_ip6_src);
+                       break;
+               case TCA_PEDIT_KEY_EX_HDR_TYPE_TCP:
+                       err = nfp_fl_set_tport(action, idx, offset, &set_tport,
+                                              NFP_FL_ACTION_OPCODE_SET_TCP);
+                       break;
+               case TCA_PEDIT_KEY_EX_HDR_TYPE_UDP:
+                       err = nfp_fl_set_tport(action, idx, offset, &set_tport,
+                                              NFP_FL_ACTION_OPCODE_SET_UDP);
+                       break;
+               default:
+                       return -EOPNOTSUPP;
+               }
+               if (err)
+                       return err;
+       }
+
+       if (set_eth.a_op) {
+               act_size = sizeof(set_eth);
+               memcpy(nfp_action, &set_eth, act_size);
+               *a_len += act_size;
+       } else if (set_ip_addr.a_op) {
+               act_size = sizeof(set_ip_addr);
+               memcpy(nfp_action, &set_ip_addr, act_size);
+               *a_len += act_size;
+       } else if (set_ip6_dst.a_op && set_ip6_src.a_op) {
+               /* TC compiles set src and dst IPv6 address as a single action,
+                * the hardware requires this to be 2 separate actions.
+                */
+               act_size = sizeof(set_ip6_src);
+               memcpy(nfp_action, &set_ip6_src, act_size);
+               *a_len += act_size;
+
+               act_size = sizeof(set_ip6_dst);
+               memcpy(&nfp_action[sizeof(set_ip6_src)], &set_ip6_dst,
+                      act_size);
+               *a_len += act_size;
+       } else if (set_ip6_dst.a_op) {
+               act_size = sizeof(set_ip6_dst);
+               memcpy(nfp_action, &set_ip6_dst, act_size);
+               *a_len += act_size;
+       } else if (set_ip6_src.a_op) {
+               act_size = sizeof(set_ip6_src);
+               memcpy(nfp_action, &set_ip6_src, act_size);
+               *a_len += act_size;
+       } else if (set_tport.a_op) {
+               act_size = sizeof(set_tport);
+               memcpy(nfp_action, &set_tport, act_size);
+               *a_len += act_size;
+       }
+
+       return 0;
+}
+
 static int
 nfp_flower_loop_action(const struct tc_action *a,
                       struct nfp_fl_payload *nfp_fl, int *a_len,
@@ -301,6 +543,9 @@ nfp_flower_loop_action(const struct tc_action *a,
        } else if (is_tcf_tunnel_release(a)) {
                /* Tunnel decap is handled by default so accept action. */
                return 0;
+       } else if (is_tcf_pedit(a)) {
+               if (nfp_fl_pedit(a, &nfp_fl->action_data[*a_len], a_len))
+                       return -EOPNOTSUPP;
        } else {
                /* Currently we do not handle any other actions. */
                return -EOPNOTSUPP;
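
For illustration, nfp_fl_set_helper32() above merges successive 32-bit pedit keys into accumulated exact/mask byte arrays. A minimal userspace sketch of that merge (hypothetical key values; memcpy stands in for the kernel's get_unaligned()/put_unaligned()):

/* Standalone sketch: new bits overwrite only where the key's mask is set,
 * previously set bits are preserved, and the stored mask grows.
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static void set_helper32(uint32_t value, uint32_t mask,
                         uint8_t *p_exact, uint8_t *p_mask)
{
        uint32_t oldvalue, oldmask;

        memcpy(&oldvalue, p_exact, sizeof(oldvalue));
        memcpy(&oldmask, p_mask, sizeof(oldmask));

        value &= mask;                  /* keep only bits this key sets */
        value |= oldvalue & ~mask;      /* preserve earlier keys' bits */

        oldmask |= mask;
        memcpy(p_mask, &oldmask, sizeof(oldmask));
        memcpy(p_exact, &value, sizeof(value));
}

int main(void)
{
        uint8_t exact[4] = { 0 }, mask[4] = { 0 };
        uint32_t out;

        set_helper32(0x11220000, 0xffff0000, exact, mask); /* first key */
        set_helper32(0x00003344, 0x0000ffff, exact, mask); /* second key */

        memcpy(&out, exact, sizeof(out));
        printf("merged exact value: 0x%08x\n", out);       /* 0x11223344 */
        return 0;
}
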
index 504ddaa..f7b7242 100644 (file)
 #define NFP_FLOWER_MASK_VLAN_CFI       BIT(12)
 #define NFP_FLOWER_MASK_VLAN_VID       GENMASK(11, 0)
 
+#define NFP_FLOWER_MASK_MPLS_LB                GENMASK(31, 12)
+#define NFP_FLOWER_MASK_MPLS_TC                GENMASK(11, 9)
+#define NFP_FLOWER_MASK_MPLS_BOS       BIT(8)
+#define NFP_FLOWER_MASK_MPLS_Q         BIT(0)
+
 #define NFP_FL_SC_ACT_DROP             0x80000000
 #define NFP_FL_SC_ACT_USER             0x7D000000
 #define NFP_FL_SC_ACT_POPV             0x6A000000
 #define NFP_FL_ACTION_OPCODE_PUSH_VLAN         1
 #define NFP_FL_ACTION_OPCODE_POP_VLAN          2
 #define NFP_FL_ACTION_OPCODE_SET_IPV4_TUNNEL   6
+#define NFP_FL_ACTION_OPCODE_SET_ETHERNET      7
+#define NFP_FL_ACTION_OPCODE_SET_IPV4_ADDRS    9
+#define NFP_FL_ACTION_OPCODE_SET_IPV6_SRC      11
+#define NFP_FL_ACTION_OPCODE_SET_IPV6_DST      12
+#define NFP_FL_ACTION_OPCODE_SET_UDP           14
+#define NFP_FL_ACTION_OPCODE_SET_TCP           15
 #define NFP_FL_ACTION_OPCODE_PRE_TUNNEL                17
 #define NFP_FL_ACTION_OPCODE_NUM               32
 
@@ -102,6 +113,38 @@ enum nfp_flower_tun_type {
        NFP_FL_TUNNEL_VXLAN =   2,
 };
 
+struct nfp_fl_set_eth {
+       __be16 a_op;
+       __be16 reserved;
+       u8 eth_addr_mask[ETH_ALEN * 2];
+       u8 eth_addr_val[ETH_ALEN * 2];
+};
+
+struct nfp_fl_set_ip4_addrs {
+       __be16 a_op;
+       __be16 reserved;
+       __be32 ipv4_src_mask;
+       __be32 ipv4_src;
+       __be32 ipv4_dst_mask;
+       __be32 ipv4_dst;
+};
+
+struct nfp_fl_set_ipv6_addr {
+       __be16 a_op;
+       __be16 reserved;
+       struct {
+               __be32 mask;
+               __be32 exact;
+       } ipv6[4];
+};
+
+struct nfp_fl_set_tport {
+       __be16 a_op;
+       __be16 reserved;
+       u8 tp_port_mask[4];
+       u8 tp_port_val[4];
+};
+
 struct nfp_fl_output {
        __be16 a_op;
        __be16 flags;
index 865a815..60614d4 100644 (file)
@@ -111,8 +111,21 @@ nfp_flower_compile_mac(struct nfp_flower_mac_mpls *frame,
                ether_addr_copy(frame->mac_src, &addr->src[0]);
        }
 
-       if (mask_version)
-               frame->mpls_lse = cpu_to_be32(~0);
+       if (dissector_uses_key(flow->dissector, FLOW_DISSECTOR_KEY_MPLS)) {
+               struct flow_dissector_key_mpls *mpls;
+               u32 t_mpls;
+
+               mpls = skb_flow_dissector_target(flow->dissector,
+                                                FLOW_DISSECTOR_KEY_MPLS,
+                                                target);
+
+               t_mpls = FIELD_PREP(NFP_FLOWER_MASK_MPLS_LB, mpls->mpls_label) |
+                        FIELD_PREP(NFP_FLOWER_MASK_MPLS_TC, mpls->mpls_tc) |
+                        FIELD_PREP(NFP_FLOWER_MASK_MPLS_BOS, mpls->mpls_bos) |
+                        NFP_FLOWER_MASK_MPLS_Q;
+
+               frame->mpls_lse = cpu_to_be32(t_mpls);
+       }
 }
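
The MPLS match above packs the dissected label, TC and BoS fields into one 32-bit LSE word using the NFP_FLOWER_MASK_MPLS_* masks plus the Q bit. A standalone arithmetic check (hypothetical field values, plain C):

/* Label in bits 31:12, TC in 11:9, BoS in bit 8, plus the MPLS_Q bit. */
#include <assert.h>
#include <stdint.h>

int main(void)
{
        uint32_t label = 0x12345, tc = 5, bos = 1;
        uint32_t lse;

        lse = (label << 12) | (tc << 9) | (bos << 8) | 0x1 /* MPLS_Q */;

        assert(lse == 0x12345b01);
        return 0;
}
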
 
 static void
@@ -143,7 +156,6 @@ nfp_flower_compile_ipv4(struct nfp_flower_ipv4 *frame,
        struct flow_dissector_key_ipv4_addrs *addr;
        struct flow_dissector_key_basic *basic;
 
-       /* Wildcard TOS/TTL for now. */
        memset(frame, 0, sizeof(struct nfp_flower_ipv4));
 
        if (dissector_uses_key(flow->dissector,
@@ -161,6 +173,16 @@ nfp_flower_compile_ipv4(struct nfp_flower_ipv4 *frame,
                                                  target);
                frame->proto = basic->ip_proto;
        }
+
+       if (dissector_uses_key(flow->dissector, FLOW_DISSECTOR_KEY_IP)) {
+               struct flow_dissector_key_ip *flow_ip;
+
+               flow_ip = skb_flow_dissector_target(flow->dissector,
+                                                   FLOW_DISSECTOR_KEY_IP,
+                                                   target);
+               frame->tos = flow_ip->tos;
+               frame->ttl = flow_ip->ttl;
+       }
 }
 
 static void
@@ -172,7 +194,6 @@ nfp_flower_compile_ipv6(struct nfp_flower_ipv6 *frame,
        struct flow_dissector_key_ipv6_addrs *addr;
        struct flow_dissector_key_basic *basic;
 
-       /* Wildcard LABEL/TOS/TTL for now. */
        memset(frame, 0, sizeof(struct nfp_flower_ipv6));
 
        if (dissector_uses_key(flow->dissector,
@@ -190,6 +211,16 @@ nfp_flower_compile_ipv6(struct nfp_flower_ipv6 *frame,
                                                  target);
                frame->proto = basic->ip_proto;
        }
+
+       if (dissector_uses_key(flow->dissector, FLOW_DISSECTOR_KEY_IP)) {
+               struct flow_dissector_key_ip *flow_ip;
+
+               flow_ip = skb_flow_dissector_target(flow->dissector,
+                                                   FLOW_DISSECTOR_KEY_IP,
+                                                   target);
+               frame->tos = flow_ip->tos;
+               frame->ttl = flow_ip->ttl;
+       }
 }
 
 static void
index 3d9537e..6f239c2 100644 (file)
@@ -57,6 +57,7 @@
         BIT(FLOW_DISSECTOR_KEY_ENC_IPV6_ADDRS) | \
         BIT(FLOW_DISSECTOR_KEY_ENC_CONTROL) | \
         BIT(FLOW_DISSECTOR_KEY_ENC_PORTS) | \
+        BIT(FLOW_DISSECTOR_KEY_MPLS) | \
         BIT(FLOW_DISSECTOR_KEY_IP))
 
 #define NFP_FLOWER_WHITELIST_TUN_DISSECTOR \
@@ -134,7 +135,6 @@ nfp_flower_calculate_key_layers(struct nfp_fl_key_ls *ret_key_ls,
 {
        struct flow_dissector_key_basic *mask_basic = NULL;
        struct flow_dissector_key_basic *key_basic = NULL;
-       struct flow_dissector_key_ip *mask_ip = NULL;
        u32 key_layer_two;
        u8 key_layer;
        int key_size;
@@ -206,28 +206,15 @@ nfp_flower_calculate_key_layers(struct nfp_fl_key_ls *ret_key_ls,
                                                      flow->key);
        }
 
-       if (dissector_uses_key(flow->dissector, FLOW_DISSECTOR_KEY_IP))
-               mask_ip = skb_flow_dissector_target(flow->dissector,
-                                                   FLOW_DISSECTOR_KEY_IP,
-                                                   flow->mask);
-
        if (mask_basic && mask_basic->n_proto) {
                /* Ethernet type is present in the key. */
                switch (key_basic->n_proto) {
                case cpu_to_be16(ETH_P_IP):
-                       if (mask_ip && mask_ip->tos)
-                               return -EOPNOTSUPP;
-                       if (mask_ip && mask_ip->ttl)
-                               return -EOPNOTSUPP;
                        key_layer |= NFP_FLOWER_LAYER_IPV4;
                        key_size += sizeof(struct nfp_flower_ipv4);
                        break;
 
                case cpu_to_be16(ETH_P_IPV6):
-                       if (mask_ip && mask_ip->tos)
-                               return -EOPNOTSUPP;
-                       if (mask_ip && mask_ip->ttl)
-                               return -EOPNOTSUPP;
                        key_layer |= NFP_FLOWER_LAYER_IPV6;
                        key_size += sizeof(struct nfp_flower_ipv6);
                        break;
@@ -238,11 +225,6 @@ nfp_flower_calculate_key_layers(struct nfp_fl_key_ls *ret_key_ls,
                case cpu_to_be16(ETH_P_ARP):
                        return -EOPNOTSUPP;
 
-               /* Currently we do not offload MPLS. */
-               case cpu_to_be16(ETH_P_MPLS_UC):
-               case cpu_to_be16(ETH_P_MPLS_MC):
-                       return -EOPNOTSUPP;
-
                /* Will be included in layer 2. */
                case cpu_to_be16(ETH_P_8021Q):
                        break;
index af640b5..857bb33 100644 (file)
@@ -36,6 +36,8 @@
 
 #include <net/devlink.h>
 
+#include <trace/events/devlink.h>
+
 #include "nfp_net_repr.h"
 
 struct bpf_prog;
@@ -271,11 +273,17 @@ static inline int nfp_app_xdp_offload(struct nfp_app *app, struct nfp_net *nn,
 
 static inline bool nfp_app_ctrl_tx(struct nfp_app *app, struct sk_buff *skb)
 {
+       trace_devlink_hwmsg(priv_to_devlink(app->pf), false, 0,
+                           skb->data, skb->len);
+
        return nfp_ctrl_tx(app->ctrl, skb);
 }
 
 static inline void nfp_app_ctrl_rx(struct nfp_app *app, struct sk_buff *skb)
 {
+       trace_devlink_hwmsg(priv_to_devlink(app->pf), true, 0,
+                           skb->data, skb->len);
+
        app->type->ctrl_msg_rx(app, skb);
 }
 
diff --git a/drivers/net/ethernet/netronome/nfp/nfp_asm.c b/drivers/net/ethernet/netronome/nfp/nfp_asm.c
new file mode 100644 (file)
index 0000000..de76e74
--- /dev/null
@@ -0,0 +1,254 @@
+/*
+ * Copyright (C) 2016-2017 Netronome Systems, Inc.
+ *
+ * This software is dual licensed under the GNU General Public License Version 2,
+ * June 1991 as shown in the file COPYING in the top-level directory of this
+ * source tree or the BSD 2-Clause License provided below.  You have the
+ * option to license this software under the complete terms of either license.
+ *
+ * The BSD 2-Clause License:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      1. Redistributions of source code must retain the above
+ *         copyright notice, this list of conditions and the following
+ *         disclaimer.
+ *
+ *      2. Redistributions in binary form must reproduce the above
+ *         copyright notice, this list of conditions and the following
+ *         disclaimer in the documentation and/or other materials
+ *         provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/bitops.h>
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/types.h>
+
+#include "nfp_asm.h"
+
+const struct cmd_tgt_act cmd_tgt_act[__CMD_TGT_MAP_SIZE] = {
+       [CMD_TGT_WRITE8] =              { 0x00, 0x42 },
+       [CMD_TGT_READ8] =               { 0x01, 0x43 },
+       [CMD_TGT_READ_LE] =             { 0x01, 0x40 },
+       [CMD_TGT_READ_SWAP_LE] =        { 0x03, 0x40 },
+};
+
+static u16 nfp_swreg_to_unreg(swreg reg, bool is_dst)
+{
+       bool lm_id, lm_dec = false;
+       u16 val = swreg_value(reg);
+
+       switch (swreg_type(reg)) {
+       case NN_REG_GPR_A:
+       case NN_REG_GPR_B:
+       case NN_REG_GPR_BOTH:
+               return val;
+       case NN_REG_NNR:
+               return UR_REG_NN | val;
+       case NN_REG_XFER:
+               return UR_REG_XFR | val;
+       case NN_REG_LMEM:
+               lm_id = swreg_lm_idx(reg);
+
+               switch (swreg_lm_mode(reg)) {
+               case NN_LM_MOD_NONE:
+                       if (val & ~UR_REG_LM_IDX_MAX) {
+                               pr_err("LM offset too large\n");
+                               return 0;
+                       }
+                       return UR_REG_LM | FIELD_PREP(UR_REG_LM_IDX, lm_id) |
+                               val;
+               case NN_LM_MOD_DEC:
+                       lm_dec = true;
+                       /* fall through */
+               case NN_LM_MOD_INC:
+                       if (val) {
+                               pr_err("LM offset in inc/dec mode\n");
+                               return 0;
+                       }
+                       return UR_REG_LM | UR_REG_LM_POST_MOD |
+                               FIELD_PREP(UR_REG_LM_IDX, lm_id) |
+                               FIELD_PREP(UR_REG_LM_POST_MOD_DEC, lm_dec);
+               default:
+                       pr_err("bad LM mode for unrestricted operands %d\n",
+                              swreg_lm_mode(reg));
+                       return 0;
+               }
+       case NN_REG_IMM:
+               if (val & ~0xff) {
+                       pr_err("immediate too large\n");
+                       return 0;
+               }
+               return UR_REG_IMM_encode(val);
+       case NN_REG_NONE:
+               return is_dst ? UR_REG_NO_DST : REG_NONE;
+       }
+
+       pr_err("unrecognized reg encoding %08x\n", reg);
+       return 0;
+}
+
+int swreg_to_unrestricted(swreg dst, swreg lreg, swreg rreg,
+                         struct nfp_insn_ur_regs *reg)
+{
+       memset(reg, 0, sizeof(*reg));
+
+       /* Decode destination */
+       if (swreg_type(dst) == NN_REG_IMM)
+               return -EFAULT;
+
+       if (swreg_type(dst) == NN_REG_GPR_B)
+               reg->dst_ab = ALU_DST_B;
+       if (swreg_type(dst) == NN_REG_GPR_BOTH)
+               reg->wr_both = true;
+       reg->dst = nfp_swreg_to_unreg(dst, true);
+
+       /* Decode source operands */
+       if (swreg_type(lreg) == swreg_type(rreg))
+               return -EFAULT;
+
+       if (swreg_type(lreg) == NN_REG_GPR_B ||
+           swreg_type(rreg) == NN_REG_GPR_A) {
+               reg->areg = nfp_swreg_to_unreg(rreg, false);
+               reg->breg = nfp_swreg_to_unreg(lreg, false);
+               reg->swap = true;
+       } else {
+               reg->areg = nfp_swreg_to_unreg(lreg, false);
+               reg->breg = nfp_swreg_to_unreg(rreg, false);
+       }
+
+       reg->dst_lmextn = swreg_lmextn(dst);
+       reg->src_lmextn = swreg_lmextn(lreg) | swreg_lmextn(rreg);
+
+       return 0;
+}
+
+static u16 nfp_swreg_to_rereg(swreg reg, bool is_dst, bool has_imm8, bool *i8)
+{
+       u16 val = swreg_value(reg);
+       bool lm_id;
+
+       switch (swreg_type(reg)) {
+       case NN_REG_GPR_A:
+       case NN_REG_GPR_B:
+       case NN_REG_GPR_BOTH:
+               return val;
+       case NN_REG_XFER:
+               return RE_REG_XFR | val;
+       case NN_REG_LMEM:
+               lm_id = swreg_lm_idx(reg);
+
+               if (swreg_lm_mode(reg) != NN_LM_MOD_NONE) {
+                       pr_err("bad LM mode for restricted operands %d\n",
+                              swreg_lm_mode(reg));
+                       return 0;
+               }
+
+               if (val & ~RE_REG_LM_IDX_MAX) {
+                       pr_err("LM offset too large\n");
+                       return 0;
+               }
+
+               return RE_REG_LM | FIELD_PREP(RE_REG_LM_IDX, lm_id) | val;
+       case NN_REG_IMM:
+               if (val & ~(0x7f | has_imm8 << 7)) {
+                       pr_err("immediate too large\n");
+                       return 0;
+               }
+               *i8 = val & 0x80;
+               return RE_REG_IMM_encode(val & 0x7f);
+       case NN_REG_NONE:
+               return is_dst ? RE_REG_NO_DST : REG_NONE;
+       case NN_REG_NNR:
+               pr_err("NNRs used with restricted encoding\n");
+               return 0;
+       }
+
+       pr_err("unrecognized reg encoding\n");
+       return 0;
+}
+
+int swreg_to_restricted(swreg dst, swreg lreg, swreg rreg,
+                       struct nfp_insn_re_regs *reg, bool has_imm8)
+{
+       memset(reg, 0, sizeof(*reg));
+
+       /* Decode destination */
+       if (swreg_type(dst) == NN_REG_IMM)
+               return -EFAULT;
+
+       if (swreg_type(dst) == NN_REG_GPR_B)
+               reg->dst_ab = ALU_DST_B;
+       if (swreg_type(dst) == NN_REG_GPR_BOTH)
+               reg->wr_both = true;
+       reg->dst = nfp_swreg_to_rereg(dst, true, false, NULL);
+
+       /* Decode source operands */
+       if (swreg_type(lreg) == swreg_type(rreg))
+               return -EFAULT;
+
+       if (swreg_type(lreg) == NN_REG_GPR_B ||
+           swreg_type(rreg) == NN_REG_GPR_A) {
+               reg->areg = nfp_swreg_to_rereg(rreg, false, has_imm8, &reg->i8);
+               reg->breg = nfp_swreg_to_rereg(lreg, false, has_imm8, &reg->i8);
+               reg->swap = true;
+       } else {
+               reg->areg = nfp_swreg_to_rereg(lreg, false, has_imm8, &reg->i8);
+               reg->breg = nfp_swreg_to_rereg(rreg, false, has_imm8, &reg->i8);
+       }
+
+       reg->dst_lmextn = swreg_lmextn(dst);
+       reg->src_lmextn = swreg_lmextn(lreg) | swreg_lmextn(rreg);
+
+       return 0;
+}
+
+#define NFP_USTORE_ECC_POLY_WORDS              7
+#define NFP_USTORE_OP_BITS                     45
+
+static const u64 nfp_ustore_ecc_polynomials[NFP_USTORE_ECC_POLY_WORDS] = {
+       0x0ff800007fffULL,
+       0x11f801ff801fULL,
+       0x1e387e0781e1ULL,
+       0x17cb8e388e22ULL,
+       0x1af5b2c93244ULL,
+       0x1f56d5525488ULL,
+       0x0daf69a46910ULL,
+};
+
+static bool parity(u64 value)
+{
+       return hweight64(value) & 1;
+}
+
+int nfp_ustore_check_valid_no_ecc(u64 insn)
+{
+       if (insn & ~GENMASK_ULL(NFP_USTORE_OP_BITS, 0))
+               return -EINVAL;
+
+       return 0;
+}
+
+u64 nfp_ustore_calc_ecc_insn(u64 insn)
+{
+       u8 ecc = 0;
+       int i;
+
+       for (i = 0; i < NFP_USTORE_ECC_POLY_WORDS; i++)
+               ecc |= parity(nfp_ustore_ecc_polynomials[i] & insn) << i;
+
+       return insn | (u64)ecc << NFP_USTORE_OP_BITS;
+}
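
A userspace sketch of the ustore ECC scheme implemented above, with the polynomial table copied from the code and __builtin_popcountll() standing in for hweight64(); one parity bit per polynomial is ORed into bits 45..51 above the 45-bit opcode:

#include <stdint.h>
#include <stdio.h>

#define OP_BITS 45

static const uint64_t poly[7] = {
        0x0ff800007fffULL, 0x11f801ff801fULL, 0x1e387e0781e1ULL,
        0x17cb8e388e22ULL, 0x1af5b2c93244ULL, 0x1f56d5525488ULL,
        0x0daf69a46910ULL,
};

static uint64_t add_ecc(uint64_t insn)
{
        uint64_t ecc = 0;
        int i;

        for (i = 0; i < 7; i++)
                ecc |= (uint64_t)(__builtin_popcountll(poly[i] & insn) & 1) << i;

        return insn | ecc << OP_BITS;
}

int main(void)
{
        /* hypothetical opcode, just to show where the ECC bits land */
        printf("%016llx\n", (unsigned long long)add_ecc(0x0f000000000ULL));
        return 0;
}
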
index d2b5357..c4c18dd 100644 (file)
@@ -34,6 +34,7 @@
 #ifndef __NFP_ASM_H__
 #define __NFP_ASM_H__ 1
 
+#include <linux/bitfield.h>
 #include <linux/types.h>
 
 #define REG_NONE       0
 #define RE_REG_IMM_encode(x)                                   \
        (RE_REG_IMM | ((x) & 0x1f) | (((x) & 0x60) << 1))
 #define RE_REG_IMM_MAX  0x07fULL
+#define RE_REG_LM      0x050
+#define RE_REG_LM_IDX  0x008
+#define RE_REG_LM_IDX_MAX      0x7
 #define RE_REG_XFR     0x080
 
 #define UR_REG_XFR     0x180
+#define UR_REG_LM      0x200
+#define UR_REG_LM_IDX  0x020
+#define UR_REG_LM_POST_MOD     0x010
+#define UR_REG_LM_POST_MOD_DEC 0x001
+#define UR_REG_LM_IDX_MAX      0xf
 #define UR_REG_NN      0x280
 #define UR_REG_NO_DST  0x300
 #define UR_REG_IMM     UR_REG_NO_DST
 #define UR_REG_IMM_encode(x) (UR_REG_IMM | (x))
 #define UR_REG_IMM_MAX  0x0ffULL
 
-#define OP_BR_BASE     0x0d800000020ULL
-#define OP_BR_BASE_MASK        0x0f8000c3ce0ULL
-#define OP_BR_MASK     0x0000000001fULL
-#define OP_BR_EV_PIP   0x00000000300ULL
-#define OP_BR_CSS      0x0000003c000ULL
-#define OP_BR_DEFBR    0x00000300000ULL
-#define OP_BR_ADDR_LO  0x007ffc00000ULL
-#define OP_BR_ADDR_HI  0x10000000000ULL
+#define OP_BR_BASE             0x0d800000020ULL
+#define OP_BR_BASE_MASK                0x0f8000c3ce0ULL
+#define OP_BR_MASK             0x0000000001fULL
+#define OP_BR_EV_PIP           0x00000000300ULL
+#define OP_BR_CSS              0x0000003c000ULL
+#define OP_BR_DEFBR            0x00000300000ULL
+#define OP_BR_ADDR_LO          0x007ffc00000ULL
+#define OP_BR_ADDR_HI          0x10000000000ULL
 
 #define nfp_is_br(_insn)                               \
        (((_insn) & OP_BR_BASE_MASK) == OP_BR_BASE)
@@ -82,30 +91,33 @@ enum br_ctx_signal_state {
        BR_CSS_NONE = 2,
 };
 
-#define OP_BBYTE_BASE  0x0c800000000ULL
-#define OP_BB_A_SRC    0x000000000ffULL
-#define OP_BB_BYTE     0x00000000300ULL
-#define OP_BB_B_SRC    0x0000003fc00ULL
-#define OP_BB_I8       0x00000040000ULL
-#define OP_BB_EQ       0x00000080000ULL
-#define OP_BB_DEFBR    0x00000300000ULL
-#define OP_BB_ADDR_LO  0x007ffc00000ULL
-#define OP_BB_ADDR_HI  0x10000000000ULL
-
-#define OP_BALU_BASE   0x0e800000000ULL
-#define OP_BA_A_SRC    0x000000003ffULL
-#define OP_BA_B_SRC    0x000000ffc00ULL
-#define OP_BA_DEFBR    0x00000300000ULL
-#define OP_BA_ADDR_HI  0x0007fc00000ULL
-
-#define OP_IMMED_A_SRC 0x000000003ffULL
-#define OP_IMMED_B_SRC 0x000000ffc00ULL
-#define OP_IMMED_IMM   0x0000ff00000ULL
-#define OP_IMMED_WIDTH 0x00060000000ULL
-#define OP_IMMED_INV   0x00080000000ULL
-#define OP_IMMED_SHIFT 0x00600000000ULL
-#define OP_IMMED_BASE  0x0f000000000ULL
-#define OP_IMMED_WR_AB 0x20000000000ULL
+#define OP_BBYTE_BASE          0x0c800000000ULL
+#define OP_BB_A_SRC            0x000000000ffULL
+#define OP_BB_BYTE             0x00000000300ULL
+#define OP_BB_B_SRC            0x0000003fc00ULL
+#define OP_BB_I8               0x00000040000ULL
+#define OP_BB_EQ               0x00000080000ULL
+#define OP_BB_DEFBR            0x00000300000ULL
+#define OP_BB_ADDR_LO          0x007ffc00000ULL
+#define OP_BB_ADDR_HI          0x10000000000ULL
+#define OP_BB_SRC_LMEXTN       0x40000000000ULL
+
+#define OP_BALU_BASE           0x0e800000000ULL
+#define OP_BA_A_SRC            0x000000003ffULL
+#define OP_BA_B_SRC            0x000000ffc00ULL
+#define OP_BA_DEFBR            0x00000300000ULL
+#define OP_BA_ADDR_HI          0x0007fc00000ULL
+
+#define OP_IMMED_A_SRC         0x000000003ffULL
+#define OP_IMMED_B_SRC         0x000000ffc00ULL
+#define OP_IMMED_IMM           0x0000ff00000ULL
+#define OP_IMMED_WIDTH         0x00060000000ULL
+#define OP_IMMED_INV           0x00080000000ULL
+#define OP_IMMED_SHIFT         0x00600000000ULL
+#define OP_IMMED_BASE          0x0f000000000ULL
+#define OP_IMMED_WR_AB         0x20000000000ULL
+#define OP_IMMED_SRC_LMEXTN    0x40000000000ULL
+#define OP_IMMED_DST_LMEXTN    0x80000000000ULL
 
 enum immed_width {
        IMMED_WIDTH_ALL = 0,
@@ -119,17 +131,19 @@ enum immed_shift {
        IMMED_SHIFT_2B = 2,
 };
 
-#define OP_SHF_BASE    0x08000000000ULL
-#define OP_SHF_A_SRC   0x000000000ffULL
-#define OP_SHF_SC      0x00000000300ULL
-#define OP_SHF_B_SRC   0x0000003fc00ULL
-#define OP_SHF_I8      0x00000040000ULL
-#define OP_SHF_SW      0x00000080000ULL
-#define OP_SHF_DST     0x0000ff00000ULL
-#define OP_SHF_SHIFT   0x001f0000000ULL
-#define OP_SHF_OP      0x00e00000000ULL
-#define OP_SHF_DST_AB  0x01000000000ULL
-#define OP_SHF_WR_AB   0x20000000000ULL
+#define OP_SHF_BASE            0x08000000000ULL
+#define OP_SHF_A_SRC           0x000000000ffULL
+#define OP_SHF_SC              0x00000000300ULL
+#define OP_SHF_B_SRC           0x0000003fc00ULL
+#define OP_SHF_I8              0x00000040000ULL
+#define OP_SHF_SW              0x00000080000ULL
+#define OP_SHF_DST             0x0000ff00000ULL
+#define OP_SHF_SHIFT           0x001f0000000ULL
+#define OP_SHF_OP              0x00e00000000ULL
+#define OP_SHF_DST_AB          0x01000000000ULL
+#define OP_SHF_WR_AB           0x20000000000ULL
+#define OP_SHF_SRC_LMEXTN      0x40000000000ULL
+#define OP_SHF_DST_LMEXTN      0x80000000000ULL
 
 enum shf_op {
        SHF_OP_NONE = 0,
@@ -144,14 +158,16 @@ enum shf_sc {
        SHF_SC_R_DSHF = 3,
 };
 
-#define OP_ALU_A_SRC   0x000000003ffULL
-#define OP_ALU_B_SRC   0x000000ffc00ULL
-#define OP_ALU_DST     0x0003ff00000ULL
-#define OP_ALU_SW      0x00040000000ULL
-#define OP_ALU_OP      0x00f80000000ULL
-#define OP_ALU_DST_AB  0x01000000000ULL
-#define OP_ALU_BASE    0x0a000000000ULL
-#define OP_ALU_WR_AB   0x20000000000ULL
+#define OP_ALU_A_SRC           0x000000003ffULL
+#define OP_ALU_B_SRC           0x000000ffc00ULL
+#define OP_ALU_DST             0x0003ff00000ULL
+#define OP_ALU_SW              0x00040000000ULL
+#define OP_ALU_OP              0x00f80000000ULL
+#define OP_ALU_DST_AB          0x01000000000ULL
+#define OP_ALU_BASE            0x0a000000000ULL
+#define OP_ALU_WR_AB           0x20000000000ULL
+#define OP_ALU_SRC_LMEXTN      0x40000000000ULL
+#define OP_ALU_DST_LMEXTN      0x80000000000ULL
 
 enum alu_op {
        ALU_OP_NONE     = 0x00,
@@ -170,26 +186,28 @@ enum alu_dst_ab {
        ALU_DST_B = 1,
 };
 
-#define OP_LDF_BASE    0x0c000000000ULL
-#define OP_LDF_A_SRC   0x000000000ffULL
-#define OP_LDF_SC      0x00000000300ULL
-#define OP_LDF_B_SRC   0x0000003fc00ULL
-#define OP_LDF_I8      0x00000040000ULL
-#define OP_LDF_SW      0x00000080000ULL
-#define OP_LDF_ZF      0x00000100000ULL
-#define OP_LDF_BMASK   0x0000f000000ULL
-#define OP_LDF_SHF     0x001f0000000ULL
-#define OP_LDF_WR_AB   0x20000000000ULL
-
-#define OP_CMD_A_SRC    0x000000000ffULL
-#define OP_CMD_CTX      0x00000000300ULL
-#define OP_CMD_B_SRC    0x0000003fc00ULL
-#define OP_CMD_TOKEN    0x000000c0000ULL
-#define OP_CMD_XFER     0x00001f00000ULL
-#define OP_CMD_CNT      0x0000e000000ULL
-#define OP_CMD_SIG      0x000f0000000ULL
-#define OP_CMD_TGT_CMD  0x07f00000000ULL
-#define OP_CMD_MODE    0x1c0000000000ULL
+#define OP_LDF_BASE            0x0c000000000ULL
+#define OP_LDF_A_SRC           0x000000000ffULL
+#define OP_LDF_SC              0x00000000300ULL
+#define OP_LDF_B_SRC           0x0000003fc00ULL
+#define OP_LDF_I8              0x00000040000ULL
+#define OP_LDF_SW              0x00000080000ULL
+#define OP_LDF_ZF              0x00000100000ULL
+#define OP_LDF_BMASK           0x0000f000000ULL
+#define OP_LDF_SHF             0x001f0000000ULL
+#define OP_LDF_WR_AB           0x20000000000ULL
+#define OP_LDF_SRC_LMEXTN      0x40000000000ULL
+#define OP_LDF_DST_LMEXTN      0x80000000000ULL
+
+#define OP_CMD_A_SRC           0x000000000ffULL
+#define OP_CMD_CTX             0x00000000300ULL
+#define OP_CMD_B_SRC           0x0000003fc00ULL
+#define OP_CMD_TOKEN           0x000000c0000ULL
+#define OP_CMD_XFER            0x00001f00000ULL
+#define OP_CMD_CNT             0x0000e000000ULL
+#define OP_CMD_SIG             0x000f0000000ULL
+#define OP_CMD_TGT_CMD         0x07f00000000ULL
+#define OP_CMD_MODE           0x1c0000000000ULL
 
 struct cmd_tgt_act {
        u8 token;
@@ -204,6 +222,8 @@ enum cmd_tgt_map {
        __CMD_TGT_MAP_SIZE,
 };
 
+extern const struct cmd_tgt_act cmd_tgt_act[__CMD_TGT_MAP_SIZE];
+
 enum cmd_mode {
        CMD_MODE_40b_AB = 0,
        CMD_MODE_40b_BA = 1,
@@ -215,11 +235,13 @@ enum cmd_ctx_swap {
        CMD_CTX_NO_SWAP = 3,
 };
 
-#define OP_LCSR_BASE   0x0fc00000000ULL
-#define OP_LCSR_A_SRC  0x000000003ffULL
-#define OP_LCSR_B_SRC  0x000000ffc00ULL
-#define OP_LCSR_WRITE  0x00000200000ULL
-#define OP_LCSR_ADDR   0x001ffc00000ULL
+#define OP_LCSR_BASE           0x0fc00000000ULL
+#define OP_LCSR_A_SRC          0x000000003ffULL
+#define OP_LCSR_B_SRC          0x000000ffc00ULL
+#define OP_LCSR_WRITE          0x00000200000ULL
+#define OP_LCSR_ADDR           0x001ffc00000ULL
+#define OP_LCSR_SRC_LMEXTN     0x40000000000ULL
+#define OP_LCSR_DST_LMEXTN     0x80000000000ULL
 
 enum lcsr_wr_src {
        LCSR_WR_AREG,
@@ -227,7 +249,122 @@ enum lcsr_wr_src {
        LCSR_WR_IMM,
 };
 
-#define OP_CARB_BASE   0x0e000000000ULL
-#define OP_CARB_OR     0x00000010000ULL
+#define OP_CARB_BASE           0x0e000000000ULL
+#define OP_CARB_OR             0x00000010000ULL
+
+/* Software register representation, independent of operand type */
+#define NN_REG_TYPE    GENMASK(31, 24)
+#define NN_REG_LM_IDX  GENMASK(23, 22)
+#define NN_REG_LM_IDX_HI       BIT(23)
+#define NN_REG_LM_IDX_LO       BIT(22)
+#define NN_REG_LM_MOD  GENMASK(21, 20)
+#define NN_REG_VAL     GENMASK(7, 0)
+
+enum nfp_bpf_reg_type {
+       NN_REG_GPR_A =  BIT(0),
+       NN_REG_GPR_B =  BIT(1),
+       NN_REG_GPR_BOTH = NN_REG_GPR_A | NN_REG_GPR_B,
+       NN_REG_NNR =    BIT(2),
+       NN_REG_XFER =   BIT(3),
+       NN_REG_IMM =    BIT(4),
+       NN_REG_NONE =   BIT(5),
+       NN_REG_LMEM =   BIT(6),
+};
+
+enum nfp_bpf_lm_mode {
+       NN_LM_MOD_NONE = 0,
+       NN_LM_MOD_INC,
+       NN_LM_MOD_DEC,
+};
+
+#define reg_both(x)    __enc_swreg((x), NN_REG_GPR_BOTH)
+#define reg_a(x)       __enc_swreg((x), NN_REG_GPR_A)
+#define reg_b(x)       __enc_swreg((x), NN_REG_GPR_B)
+#define reg_nnr(x)     __enc_swreg((x), NN_REG_NNR)
+#define reg_xfer(x)    __enc_swreg((x), NN_REG_XFER)
+#define reg_imm(x)     __enc_swreg((x), NN_REG_IMM)
+#define reg_none()     __enc_swreg(0, NN_REG_NONE)
+#define reg_lm(x, off) __enc_swreg_lm((x), NN_LM_MOD_NONE, (off))
+#define reg_lm_inc(x)  __enc_swreg_lm((x), NN_LM_MOD_INC, 0)
+#define reg_lm_dec(x)  __enc_swreg_lm((x), NN_LM_MOD_DEC, 0)
+#define __reg_lm(x, mod, off)  __enc_swreg_lm((x), (mod), (off))
+
+typedef __u32 __bitwise swreg;
+
+static inline swreg __enc_swreg(u16 id, u8 type)
+{
+       return (__force swreg)(id | FIELD_PREP(NN_REG_TYPE, type));
+}
+
+static inline swreg __enc_swreg_lm(u8 id, enum nfp_bpf_lm_mode mode, u8 off)
+{
+       WARN_ON(id > 3 || (off && mode != NN_LM_MOD_NONE));
+
+       return (__force swreg)(FIELD_PREP(NN_REG_TYPE, NN_REG_LMEM) |
+                              FIELD_PREP(NN_REG_LM_IDX, id) |
+                              FIELD_PREP(NN_REG_LM_MOD, mode) |
+                              off);
+}
+
+static inline u32 swreg_raw(swreg reg)
+{
+       return (__force u32)reg;
+}
+
+static inline enum nfp_bpf_reg_type swreg_type(swreg reg)
+{
+       return FIELD_GET(NN_REG_TYPE, swreg_raw(reg));
+}
+
+static inline u16 swreg_value(swreg reg)
+{
+       return FIELD_GET(NN_REG_VAL, swreg_raw(reg));
+}
+
+static inline bool swreg_lm_idx(swreg reg)
+{
+       return FIELD_GET(NN_REG_LM_IDX_LO, swreg_raw(reg));
+}
+
+static inline bool swreg_lmextn(swreg reg)
+{
+       return FIELD_GET(NN_REG_LM_IDX_HI, swreg_raw(reg));
+}
+
+static inline enum nfp_bpf_lm_mode swreg_lm_mode(swreg reg)
+{
+       return FIELD_GET(NN_REG_LM_MOD, swreg_raw(reg));
+}
+
+struct nfp_insn_ur_regs {
+       enum alu_dst_ab dst_ab;
+       u16 dst;
+       u16 areg, breg;
+       bool swap;
+       bool wr_both;
+       bool dst_lmextn;
+       bool src_lmextn;
+};
+
+struct nfp_insn_re_regs {
+       enum alu_dst_ab dst_ab;
+       u8 dst;
+       u8 areg, breg;
+       bool swap;
+       bool wr_both;
+       bool i8;
+       bool dst_lmextn;
+       bool src_lmextn;
+};
+
+int swreg_to_unrestricted(swreg dst, swreg lreg, swreg rreg,
+                         struct nfp_insn_ur_regs *reg);
+int swreg_to_restricted(swreg dst, swreg lreg, swreg rreg,
+                       struct nfp_insn_re_regs *reg, bool has_imm8);
+
+#define NFP_USTORE_PREFETCH_WINDOW     8
+
+int nfp_ustore_check_valid_no_ecc(u64 insn);
+u64 nfp_ustore_calc_ecc_insn(u64 insn);
 
 #endif
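
The swreg helpers above pack the register type, local-memory index/mode and value into a single 32-bit software-register word. A standalone sketch of the same packing with the shifts written out by hand (constants taken from the defines above; not driver code):

#include <assert.h>
#include <stdint.h>

#define TYPE_SHIFT      24      /* NN_REG_TYPE    GENMASK(31, 24) */
#define LM_IDX_SHIFT    22      /* NN_REG_LM_IDX  GENMASK(23, 22) */
#define LM_MOD_SHIFT    20      /* NN_REG_LM_MOD  GENMASK(21, 20) */

#define TYPE_LMEM       0x40    /* NN_REG_LMEM = BIT(6) */
#define LM_MOD_NONE     0

static uint32_t enc_swreg_lm(uint32_t idx, uint32_t mode, uint32_t off)
{
        return ((uint32_t)TYPE_LMEM << TYPE_SHIFT) |
               ((idx & 3) << LM_IDX_SHIFT) |
               ((mode & 3) << LM_MOD_SHIFT) |
               (off & 0xff);
}

int main(void)
{
        /* reg_lm(1, 0), as used by the BPF JIT's pv_len() macro */
        assert(enc_swreg_lm(1, LM_MOD_NONE, 0) == 0x40400000);
        return 0;
}
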
index b0a452b..782d452 100644 (file)
  * @NFP_NET_CFG_BPF_ADDR:      DMA address of the buffer with JITed BPF code
  */
 #define NFP_NET_CFG_BPF_ABI            0x0080
-#define   NFP_NET_BPF_ABI              1
+#define   NFP_NET_BPF_ABI              2
 #define NFP_NET_CFG_BPF_CAP            0x0081
 #define   NFP_NET_BPF_CAP_RELO         (1 << 0) /* seamless reload */
 #define NFP_NET_CFG_BPF_MAX_LEN                0x0082
index 8f6ccc0..6e15d3c 100644 (file)
@@ -2308,7 +2308,7 @@ static int qed_dcbnl_ieee_setapp(struct qed_dev *cdev, struct dcb_app *app)
 
        DP_VERBOSE(hwfn, QED_MSG_DCB, "selector = %d protocol = %d pri = %d\n",
                   app->selector, app->protocol, app->priority);
-       if (app->priority < 0 || app->priority >= QED_MAX_PFC_PRIORITIES) {
+       if (app->priority >= QED_MAX_PFC_PRIORITIES) {
                DP_INFO(hwfn, "Invalid priority %d\n", app->priority);
                return -EINVAL;
        }
index 8fc9c81..b2b1f87 100644 (file)
@@ -1415,7 +1415,12 @@ int qed_iwarp_alloc(struct qed_hwfn *p_hwfn)
 
 void qed_iwarp_resc_free(struct qed_hwfn *p_hwfn)
 {
+       struct qed_iwarp_info *iwarp_info = &p_hwfn->p_rdma_info->iwarp;
+
        qed_rdma_bmap_free(p_hwfn, &p_hwfn->p_rdma_info->tcp_cid_map, 1);
+       kfree(iwarp_info->mpa_bufs);
+       kfree(iwarp_info->partial_fpdus);
+       kfree(iwarp_info->mpa_intermediate_buf);
 }
 
 int qed_iwarp_accept(void *rdma_cxt, struct qed_iwarp_accept_in *iparams)
@@ -1713,6 +1718,569 @@ qed_iwarp_parse_rx_pkt(struct qed_hwfn *p_hwfn,
        return 0;
 }
 
+static struct qed_iwarp_fpdu *qed_iwarp_get_curr_fpdu(struct qed_hwfn *p_hwfn,
+                                                     u16 cid)
+{
+       struct qed_iwarp_info *iwarp_info = &p_hwfn->p_rdma_info->iwarp;
+       struct qed_iwarp_fpdu *partial_fpdu;
+       u32 idx;
+
+       idx = cid - qed_cxt_get_proto_cid_start(p_hwfn, PROTOCOLID_IWARP);
+       if (idx >= iwarp_info->max_num_partial_fpdus) {
+               DP_ERR(p_hwfn, "Invalid cid %x max_num_partial_fpdus=%x\n", cid,
+                      iwarp_info->max_num_partial_fpdus);
+               return NULL;
+       }
+
+       partial_fpdu = &iwarp_info->partial_fpdus[idx];
+
+       return partial_fpdu;
+}
+
+enum qed_iwarp_mpa_pkt_type {
+       QED_IWARP_MPA_PKT_PACKED,
+       QED_IWARP_MPA_PKT_PARTIAL,
+       QED_IWARP_MPA_PKT_UNALIGNED
+};
+
+#define QED_IWARP_INVALID_FPDU_LENGTH 0xffff
+#define QED_IWARP_MPA_FPDU_LENGTH_SIZE (2)
+#define QED_IWARP_MPA_CRC32_DIGEST_SIZE (4)
+
+/* Pad to multiple of 4 */
+#define QED_IWARP_PDU_DATA_LEN_WITH_PAD(data_len) ALIGN(data_len, 4)
+#define QED_IWARP_FPDU_LEN_WITH_PAD(_mpa_len)                             \
+       (QED_IWARP_PDU_DATA_LEN_WITH_PAD((_mpa_len) +                      \
+                                        QED_IWARP_MPA_FPDU_LENGTH_SIZE) + \
+                                        QED_IWARP_MPA_CRC32_DIGEST_SIZE)
+
+/* fpdu can be fragmented over maximum 3 bds: header, partial mpa, unaligned */
+#define QED_IWARP_MAX_BDS_PER_FPDU 3
+
+static const char * const pkt_type_str[] = {
+       "QED_IWARP_MPA_PKT_PACKED",
+       "QED_IWARP_MPA_PKT_PARTIAL",
+       "QED_IWARP_MPA_PKT_UNALIGNED"
+};
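
A quick standalone check of the FPDU padding arithmetic defined above: the 2-byte MPA length field is added, the result is rounded up to a multiple of 4, and the 4-byte CRC32 digest is appended (plain C, assert-only):

#include <assert.h>

#define ALIGN4(x)               (((x) + 3) & ~3)
#define FPDU_LEN_WITH_PAD(l)    (ALIGN4((l) + 2) + 4)   /* 2B length, 4B CRC */

int main(void)
{
        assert(FPDU_LEN_WITH_PAD(21) == 28);
        assert(FPDU_LEN_WITH_PAD(22) == 28);
        assert(FPDU_LEN_WITH_PAD(23) == 32);
        return 0;
}
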
+
+static int
+qed_iwarp_recycle_pkt(struct qed_hwfn *p_hwfn,
+                     struct qed_iwarp_fpdu *fpdu,
+                     struct qed_iwarp_ll2_buff *buf);
+
+static enum qed_iwarp_mpa_pkt_type
+qed_iwarp_mpa_classify(struct qed_hwfn *p_hwfn,
+                      struct qed_iwarp_fpdu *fpdu,
+                      u16 tcp_payload_len, u8 *mpa_data)
+{
+       enum qed_iwarp_mpa_pkt_type pkt_type;
+       u16 mpa_len;
+
+       if (fpdu->incomplete_bytes) {
+               pkt_type = QED_IWARP_MPA_PKT_UNALIGNED;
+               goto out;
+       }
+
+       /* Special case of one byte remaining...
+        * the lower byte will be read in the next packet.
+        */
+       if (tcp_payload_len == 1) {
+               fpdu->fpdu_length = *mpa_data << BITS_PER_BYTE;
+               pkt_type = QED_IWARP_MPA_PKT_PARTIAL;
+               goto out;
+       }
+
+       mpa_len = ntohs(*((u16 *)(mpa_data)));
+       fpdu->fpdu_length = QED_IWARP_FPDU_LEN_WITH_PAD(mpa_len);
+
+       if (fpdu->fpdu_length <= tcp_payload_len)
+               pkt_type = QED_IWARP_MPA_PKT_PACKED;
+       else
+               pkt_type = QED_IWARP_MPA_PKT_PARTIAL;
+
+out:
+       DP_VERBOSE(p_hwfn, QED_MSG_RDMA,
+                  "MPA_ALIGN: %s: fpdu_length=0x%x tcp_payload_len:0x%x\n",
+                  pkt_type_str[pkt_type], fpdu->fpdu_length, tcp_payload_len);
+
+       return pkt_type;
+}
+
+static void
+qed_iwarp_init_fpdu(struct qed_iwarp_ll2_buff *buf,
+                   struct qed_iwarp_fpdu *fpdu,
+                   struct unaligned_opaque_data *pkt_data,
+                   u16 tcp_payload_size, u8 placement_offset)
+{
+       fpdu->mpa_buf = buf;
+       fpdu->pkt_hdr = buf->data_phys_addr + placement_offset;
+       fpdu->pkt_hdr_size = pkt_data->tcp_payload_offset;
+       fpdu->mpa_frag = buf->data_phys_addr + pkt_data->first_mpa_offset;
+       fpdu->mpa_frag_virt = (u8 *)(buf->data) + pkt_data->first_mpa_offset;
+
+       if (tcp_payload_size == 1)
+               fpdu->incomplete_bytes = QED_IWARP_INVALID_FPDU_LENGTH;
+       else if (tcp_payload_size < fpdu->fpdu_length)
+               fpdu->incomplete_bytes = fpdu->fpdu_length - tcp_payload_size;
+       else
+               fpdu->incomplete_bytes = 0;     /* complete fpdu */
+
+       fpdu->mpa_frag_len = fpdu->fpdu_length - fpdu->incomplete_bytes;
+}
+
+static int
+qed_iwarp_cp_pkt(struct qed_hwfn *p_hwfn,
+                struct qed_iwarp_fpdu *fpdu,
+                struct unaligned_opaque_data *pkt_data,
+                struct qed_iwarp_ll2_buff *buf, u16 tcp_payload_size)
+{
+       u8 *tmp_buf = p_hwfn->p_rdma_info->iwarp.mpa_intermediate_buf;
+       int rc;
+
+       /* The data from the partial packet stored in the fpdu needs to be
+        * copied to the new buf; for this the data currently placed on the
+        * buf also needs to be moved. The buffer is assumed to be big enough
+        * since fpdu_length <= mss, and an intermediate buffer is used since
+        * the new data may need to be copied to an overlapping location.
+        */
+       if ((fpdu->mpa_frag_len + tcp_payload_size) > (u16)buf->buff_size) {
+               DP_ERR(p_hwfn,
+                      "MPA ALIGN: Unexpected: buffer is not large enough for split fpdu buff_size = %d mpa_frag_len = %d, tcp_payload_size = %d, incomplete_bytes = %d\n",
+                      buf->buff_size, fpdu->mpa_frag_len,
+                      tcp_payload_size, fpdu->incomplete_bytes);
+               return -EINVAL;
+       }
+
+       DP_VERBOSE(p_hwfn, QED_MSG_RDMA,
+                  "MPA ALIGN Copying fpdu: [%p, %d] [%p, %d]\n",
+                  fpdu->mpa_frag_virt, fpdu->mpa_frag_len,
+                  (u8 *)(buf->data) + pkt_data->first_mpa_offset,
+                  tcp_payload_size);
+
+       memcpy(tmp_buf, fpdu->mpa_frag_virt, fpdu->mpa_frag_len);
+       memcpy(tmp_buf + fpdu->mpa_frag_len,
+              (u8 *)(buf->data) + pkt_data->first_mpa_offset,
+              tcp_payload_size);
+
+       rc = qed_iwarp_recycle_pkt(p_hwfn, fpdu, fpdu->mpa_buf);
+       if (rc)
+               return rc;
+
+       /* If we managed to post the buffer, copy the data to the new buffer;
+        * otherwise this will occur in the next round...
+        */
+       memcpy((u8 *)(buf->data), tmp_buf,
+              fpdu->mpa_frag_len + tcp_payload_size);
+
+       fpdu->mpa_buf = buf;
+       /* fpdu->pkt_hdr remains as is */
+       /* fpdu->mpa_frag is overridden with new buf */
+       fpdu->mpa_frag = buf->data_phys_addr;
+       fpdu->mpa_frag_virt = buf->data;
+       fpdu->mpa_frag_len += tcp_payload_size;
+
+       fpdu->incomplete_bytes -= tcp_payload_size;
+
+       DP_VERBOSE(p_hwfn,
+                  QED_MSG_RDMA,
+                  "MPA ALIGN: split fpdu buff_size = %d mpa_frag_len = %d, tcp_payload_size = %d, incomplete_bytes = %d\n",
+                  buf->buff_size, fpdu->mpa_frag_len, tcp_payload_size,
+                  fpdu->incomplete_bytes);
+
+       return 0;
+}
+
+static void
+qed_iwarp_update_fpdu_length(struct qed_hwfn *p_hwfn,
+                            struct qed_iwarp_fpdu *fpdu, u8 *mpa_data)
+{
+       u16 mpa_len;
+
+       /* Update incomplete packets if needed */
+       if (fpdu->incomplete_bytes == QED_IWARP_INVALID_FPDU_LENGTH) {
+               /* Missing lower byte is now available */
+               mpa_len = fpdu->fpdu_length | *mpa_data;
+               fpdu->fpdu_length = QED_IWARP_FPDU_LEN_WITH_PAD(mpa_len);
+               fpdu->mpa_frag_len = fpdu->fpdu_length;
+               /* one byte of hdr */
+               fpdu->incomplete_bytes = fpdu->fpdu_length - 1;
+               DP_VERBOSE(p_hwfn,
+                          QED_MSG_RDMA,
+                          "MPA_ALIGN: Partial header mpa_len=%x fpdu_length=%x incomplete_bytes=%x\n",
+                          mpa_len, fpdu->fpdu_length, fpdu->incomplete_bytes);
+       }
+}
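
The split two-byte MPA length case handled by qed_iwarp_mpa_classify() and qed_iwarp_update_fpdu_length() above can be illustrated with a tiny standalone sketch (hypothetical byte values): the high byte is parked shifted left when only one byte of the header arrives, and the low byte is ORed in from the next segment.

#include <assert.h>
#include <stdint.h>

int main(void)
{
        uint8_t first_seg[1] = { 0x01 };   /* only the high length byte */
        uint8_t next_seg[1]  = { 0x2c };   /* low byte arrives later */
        uint16_t fpdu_length;

        fpdu_length = (uint16_t)first_seg[0] << 8;   /* partial header */
        fpdu_length |= next_seg[0];                  /* completed next packet */

        assert(fpdu_length == 0x012c);
        return 0;
}
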
+
+#define QED_IWARP_IS_RIGHT_EDGE(_curr_pkt) \
+       (GET_FIELD((_curr_pkt)->flags,     \
+                  UNALIGNED_OPAQUE_DATA_PKT_REACHED_WIN_RIGHT_EDGE))
+
+/* This function recycles a buffer using the ll2 drop option. The mechanism
+ * ensures that all buffers posted to tx before this one have completed. The
+ * buffer sent here is passed as a cookie in the tx completion function and
+ * can then be reposted to the rx chain when done. The flow that requires
+ * this is the one where an FPDU splits over more than 3 tcp segments: the
+ * driver then needs to re-post an rx buffer instead of the one received,
+ * but it can't simply repost the buffer it copied from, since that buffer
+ * may originally have been a packed FPDU which is partially posted to FW.
+ * The driver needs to ensure FW is done with it.
+ */
+static int
+qed_iwarp_recycle_pkt(struct qed_hwfn *p_hwfn,
+                     struct qed_iwarp_fpdu *fpdu,
+                     struct qed_iwarp_ll2_buff *buf)
+{
+       struct qed_ll2_tx_pkt_info tx_pkt;
+       u8 ll2_handle;
+       int rc;
+
+       memset(&tx_pkt, 0, sizeof(tx_pkt));
+       tx_pkt.num_of_bds = 1;
+       tx_pkt.tx_dest = QED_LL2_TX_DEST_DROP;
+       tx_pkt.l4_hdr_offset_w = fpdu->pkt_hdr_size >> 2;
+       tx_pkt.first_frag = fpdu->pkt_hdr;
+       tx_pkt.first_frag_len = fpdu->pkt_hdr_size;
+       buf->piggy_buf = NULL;
+       tx_pkt.cookie = buf;
+
+       ll2_handle = p_hwfn->p_rdma_info->iwarp.ll2_mpa_handle;
+
+       rc = qed_ll2_prepare_tx_packet(p_hwfn, ll2_handle, &tx_pkt, true);
+       if (rc)
+               DP_VERBOSE(p_hwfn, QED_MSG_RDMA,
+                          "Can't drop packet rc=%d\n", rc);
+
+       DP_VERBOSE(p_hwfn,
+                  QED_MSG_RDMA,
+                  "MPA_ALIGN: send drop tx packet [%lx, 0x%x], buf=%p, rc=%d\n",
+                  (unsigned long int)tx_pkt.first_frag,
+                  tx_pkt.first_frag_len, buf, rc);
+
+       return rc;
+}
+
+static int
+qed_iwarp_win_right_edge(struct qed_hwfn *p_hwfn, struct qed_iwarp_fpdu *fpdu)
+{
+       struct qed_ll2_tx_pkt_info tx_pkt;
+       u8 ll2_handle;
+       int rc;
+
+       memset(&tx_pkt, 0, sizeof(tx_pkt));
+       tx_pkt.num_of_bds = 1;
+       tx_pkt.tx_dest = QED_LL2_TX_DEST_LB;
+       tx_pkt.l4_hdr_offset_w = fpdu->pkt_hdr_size >> 2;
+
+       tx_pkt.first_frag = fpdu->pkt_hdr;
+       tx_pkt.first_frag_len = fpdu->pkt_hdr_size;
+       tx_pkt.enable_ip_cksum = true;
+       tx_pkt.enable_l4_cksum = true;
+       tx_pkt.calc_ip_len = true;
+       /* vlan overload with enum iwarp_ll2_tx_queues */
+       tx_pkt.vlan = IWARP_LL2_ALIGNED_RIGHT_TRIMMED_TX_QUEUE;
+
+       ll2_handle = p_hwfn->p_rdma_info->iwarp.ll2_mpa_handle;
+
+       rc = qed_ll2_prepare_tx_packet(p_hwfn, ll2_handle, &tx_pkt, true);
+       if (rc)
+               DP_VERBOSE(p_hwfn, QED_MSG_RDMA,
+                          "Can't send right edge rc=%d\n", rc);
+       DP_VERBOSE(p_hwfn,
+                  QED_MSG_RDMA,
+                  "MPA_ALIGN: Sent right edge FPDU num_bds=%d [%lx, 0x%x], rc=%d\n",
+                  tx_pkt.num_of_bds,
+                  (unsigned long int)tx_pkt.first_frag,
+                  tx_pkt.first_frag_len, rc);
+
+       return rc;
+}
+
+static int
+qed_iwarp_send_fpdu(struct qed_hwfn *p_hwfn,
+                   struct qed_iwarp_fpdu *fpdu,
+                   struct unaligned_opaque_data *curr_pkt,
+                   struct qed_iwarp_ll2_buff *buf,
+                   u16 tcp_payload_size, enum qed_iwarp_mpa_pkt_type pkt_type)
+{
+       struct qed_ll2_tx_pkt_info tx_pkt;
+       u8 ll2_handle;
+       int rc;
+
+       memset(&tx_pkt, 0, sizeof(tx_pkt));
+
+       /* An unaligned packet means it's split over two tcp segments. So the
+        * complete packet requires 3 bds: one for the header, one for the
+        * part of the fpdu in the first tcp segment, and the last fragment
+        * points to the remainder of the fpdu. A packed pdu requires only
+        * two bds, one for the header and one for the data.
+        */
+       tx_pkt.num_of_bds = (pkt_type == QED_IWARP_MPA_PKT_UNALIGNED) ? 3 : 2;
+       tx_pkt.tx_dest = QED_LL2_TX_DEST_LB;
+       tx_pkt.l4_hdr_offset_w = fpdu->pkt_hdr_size >> 2; /* offset in words */
+
+       /* Send the mpa_buf only with the last fpdu (in case of packed) */
+       if (pkt_type == QED_IWARP_MPA_PKT_UNALIGNED ||
+           tcp_payload_size <= fpdu->fpdu_length)
+               tx_pkt.cookie = fpdu->mpa_buf;
+
+       tx_pkt.first_frag = fpdu->pkt_hdr;
+       tx_pkt.first_frag_len = fpdu->pkt_hdr_size;
+       tx_pkt.enable_ip_cksum = true;
+       tx_pkt.enable_l4_cksum = true;
+       tx_pkt.calc_ip_len = true;
+       /* vlan overload with enum iwarp_ll2_tx_queues */
+       tx_pkt.vlan = IWARP_LL2_ALIGNED_TX_QUEUE;
+
+       /* Special case of an unaligned packet that is not packed: we need to
+        * send both buffers as a cookie so both get released.
+        */
+       if (tcp_payload_size == fpdu->incomplete_bytes)
+               fpdu->mpa_buf->piggy_buf = buf;
+
+       ll2_handle = p_hwfn->p_rdma_info->iwarp.ll2_mpa_handle;
+
+       /* Set first fragment to header */
+       rc = qed_ll2_prepare_tx_packet(p_hwfn, ll2_handle, &tx_pkt, true);
+       if (rc)
+               goto out;
+
+       /* Set second fragment to first part of packet */
+       rc = qed_ll2_set_fragment_of_tx_packet(p_hwfn, ll2_handle,
+                                              fpdu->mpa_frag,
+                                              fpdu->mpa_frag_len);
+       if (rc)
+               goto out;
+
+       if (!fpdu->incomplete_bytes)
+               goto out;
+
+       /* Set third fragment to second part of the packet */
+       rc = qed_ll2_set_fragment_of_tx_packet(p_hwfn,
+                                              ll2_handle,
+                                              buf->data_phys_addr +
+                                              curr_pkt->first_mpa_offset,
+                                              fpdu->incomplete_bytes);
+out:
+       DP_VERBOSE(p_hwfn,
+                  QED_MSG_RDMA,
+                  "MPA_ALIGN: Sent FPDU num_bds=%d first_frag_len=%x, mpa_frag_len=0x%x, incomplete_bytes:0x%x rc=%d\n",
+                  tx_pkt.num_of_bds,
+                  tx_pkt.first_frag_len,
+                  fpdu->mpa_frag_len,
+                  fpdu->incomplete_bytes, rc);
+
+       return rc;
+}
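
To make the 2-versus-3 BD layout in the comment above concrete, here is a small standalone sketch of how the fragment lengths relate. The struct and function are purely illustrative; only the split (header BD, first-part BD, optional remainder BD) follows the code above.

#include <stdio.h>

/* Illustrative fragment plan for one FPDU, mirroring the comment above:
 * a header BD, a first-part BD, and (only if unaligned) a remainder BD.
 */
struct frag_plan {
	unsigned int num_bds;
	unsigned int hdr_len;		/* rebuilt packet header */
	unsigned int first_part_len;	/* FPDU bytes in the first tcp segment */
	unsigned int remainder_len;	/* FPDU bytes in the next tcp segment */
};

static struct frag_plan plan_fpdu(unsigned int hdr_len, unsigned int fpdu_len,
				  unsigned int bytes_in_first_segment)
{
	struct frag_plan p = { .hdr_len = hdr_len };

	if (bytes_in_first_segment >= fpdu_len) {
		/* "Packed": the whole FPDU sits in one segment, 2 BDs. */
		p.num_bds = 2;
		p.first_part_len = fpdu_len;
	} else {
		/* "Unaligned": the FPDU spans two segments, 3 BDs. */
		p.num_bds = 3;
		p.first_part_len = bytes_in_first_segment;
		p.remainder_len = fpdu_len - bytes_in_first_segment;
	}
	return p;
}

int main(void)
{
	struct frag_plan p = plan_fpdu(54, 1500, 900);

	printf("bds=%u hdr=%u first=%u rest=%u\n",
	       p.num_bds, p.hdr_len, p.first_part_len, p.remainder_len);
	return 0;
}
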
+
+static void
+qed_iwarp_mpa_get_data(struct qed_hwfn *p_hwfn,
+                      struct unaligned_opaque_data *curr_pkt,
+                      u32 opaque_data0, u32 opaque_data1)
+{
+       u64 opaque_data;
+
+       opaque_data = HILO_64(opaque_data1, opaque_data0);
+       *curr_pkt = *((struct unaligned_opaque_data *)&opaque_data);
+
+       curr_pkt->first_mpa_offset = curr_pkt->tcp_payload_offset +
+                                    le16_to_cpu(curr_pkt->first_mpa_offset);
+       curr_pkt->cid = le32_to_cpu(curr_pkt->cid);
+}
+
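
The helper above reinterprets two 32-bit completion words as a packed structure. A minimal standalone illustration of that pattern follows; the struct layout is invented for the example and is not the firmware's unaligned_opaque_data, and the real driver additionally applies le16/le32 conversions after unpacking.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Invented example layout; NOT the firmware's unaligned_opaque_data. */
struct example_opaque {
	uint32_t cid;
	uint16_t first_mpa_offset;
	uint8_t  tcp_payload_offset;
	uint8_t  flags;
};

int main(void)
{
	uint32_t data0 = 0x00000123;			/* low 32 bits */
	uint32_t data1 = 0x08040042;			/* high 32 bits */
	uint64_t packed = ((uint64_t)data1 << 32) | data0; /* HILO-style merge */
	struct example_opaque pkt;

	/* Copy instead of pointer-casting to stay strict-aliasing safe.
	 * The result is host-endian; the driver's version also performs
	 * le16/le32 conversions on the unpacked fields.
	 */
	memcpy(&pkt, &packed, sizeof(pkt));

	printf("cid=0x%x mpa_off=%u tcp_off=%u flags=0x%x\n",
	       (unsigned int)pkt.cid, (unsigned int)pkt.first_mpa_offset,
	       (unsigned int)pkt.tcp_payload_offset, (unsigned int)pkt.flags);
	return 0;
}
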
+/* This function is called when an unaligned or incomplete MPA packet arrives.
+ * The driver needs to align the packet, perhaps using previous data, and send
+ * it down to FW once it is aligned.
+ */
+static int
+qed_iwarp_process_mpa_pkt(struct qed_hwfn *p_hwfn,
+                         struct qed_iwarp_ll2_mpa_buf *mpa_buf)
+{
+       struct unaligned_opaque_data *curr_pkt = &mpa_buf->data;
+       struct qed_iwarp_ll2_buff *buf = mpa_buf->ll2_buf;
+       enum qed_iwarp_mpa_pkt_type pkt_type;
+       struct qed_iwarp_fpdu *fpdu;
+       int rc = -EINVAL;
+       u8 *mpa_data;
+
+       fpdu = qed_iwarp_get_curr_fpdu(p_hwfn, curr_pkt->cid & 0xffff);
+       if (!fpdu) { /* something corrupt with cid, post rx back */
+               DP_ERR(p_hwfn, "Invalid cid, drop and post back to rx cid=%x\n",
+                      curr_pkt->cid);
+               goto err;
+       }
+
+       do {
+               mpa_data = ((u8 *)(buf->data) + curr_pkt->first_mpa_offset);
+
+               pkt_type = qed_iwarp_mpa_classify(p_hwfn, fpdu,
+                                                 mpa_buf->tcp_payload_len,
+                                                 mpa_data);
+
+               switch (pkt_type) {
+               case QED_IWARP_MPA_PKT_PARTIAL:
+                       qed_iwarp_init_fpdu(buf, fpdu,
+                                           curr_pkt,
+                                           mpa_buf->tcp_payload_len,
+                                           mpa_buf->placement_offset);
+
+                       if (!QED_IWARP_IS_RIGHT_EDGE(curr_pkt)) {
+                               mpa_buf->tcp_payload_len = 0;
+                               break;
+                       }
+
+                       rc = qed_iwarp_win_right_edge(p_hwfn, fpdu);
+
+                       if (rc) {
+                               DP_VERBOSE(p_hwfn, QED_MSG_RDMA,
+                                          "Can't send FPDU:reset rc=%d\n", rc);
+                               memset(fpdu, 0, sizeof(*fpdu));
+                               break;
+                       }
+
+                       mpa_buf->tcp_payload_len = 0;
+                       break;
+               case QED_IWARP_MPA_PKT_PACKED:
+                       qed_iwarp_init_fpdu(buf, fpdu,
+                                           curr_pkt,
+                                           mpa_buf->tcp_payload_len,
+                                           mpa_buf->placement_offset);
+
+                       rc = qed_iwarp_send_fpdu(p_hwfn, fpdu, curr_pkt, buf,
+                                                mpa_buf->tcp_payload_len,
+                                                pkt_type);
+                       if (rc) {
+                               DP_VERBOSE(p_hwfn, QED_MSG_RDMA,
+                                          "Can't send FPDU:reset rc=%d\n", rc);
+                               memset(fpdu, 0, sizeof(*fpdu));
+                               break;
+                       }
+
+                       mpa_buf->tcp_payload_len -= fpdu->fpdu_length;
+                       curr_pkt->first_mpa_offset += fpdu->fpdu_length;
+                       break;
+               case QED_IWARP_MPA_PKT_UNALIGNED:
+                       qed_iwarp_update_fpdu_length(p_hwfn, fpdu, mpa_data);
+                       if (mpa_buf->tcp_payload_len < fpdu->incomplete_bytes) {
+                               /* special handling of fpdu split over more
+                                * than 2 segments
+                                */
+                               if (QED_IWARP_IS_RIGHT_EDGE(curr_pkt)) {
+                                       rc = qed_iwarp_win_right_edge(p_hwfn,
+                                                                     fpdu);
+                                       /* packet will be re-processed later */
+                                       if (rc)
+                                               return rc;
+                               }
+
+                               rc = qed_iwarp_cp_pkt(p_hwfn, fpdu, curr_pkt,
+                                                     buf,
+                                                     mpa_buf->tcp_payload_len);
+                               if (rc) /* packet will be re-processed later */
+                                       return rc;
+
+                               mpa_buf->tcp_payload_len = 0;
+                               break;
+                       }
+
+                       rc = qed_iwarp_send_fpdu(p_hwfn, fpdu, curr_pkt, buf,
+                                                mpa_buf->tcp_payload_len,
+                                                pkt_type);
+                       if (rc) {
+                               DP_VERBOSE(p_hwfn, QED_MSG_RDMA,
+                                          "Can't send FPDU:delay rc=%d\n", rc);
+                               /* don't reset fpdu -> we need it for next
+                                * classify
+                                */
+                               break;
+                       }
+
+                       mpa_buf->tcp_payload_len -= fpdu->incomplete_bytes;
+                       curr_pkt->first_mpa_offset += fpdu->incomplete_bytes;
+                       /* The framed PDU was sent - no more incomplete bytes */
+                       fpdu->incomplete_bytes = 0;
+                       break;
+               }
+       } while (mpa_buf->tcp_payload_len && !rc);
+
+       return rc;
+
+err:
+       qed_iwarp_ll2_post_rx(p_hwfn,
+                             buf,
+                             p_hwfn->p_rdma_info->iwarp.ll2_mpa_handle);
+       return rc;
+}
+
+static void qed_iwarp_process_pending_pkts(struct qed_hwfn *p_hwfn)
+{
+       struct qed_iwarp_info *iwarp_info = &p_hwfn->p_rdma_info->iwarp;
+       struct qed_iwarp_ll2_mpa_buf *mpa_buf = NULL;
+       int rc;
+
+       while (!list_empty(&iwarp_info->mpa_buf_pending_list)) {
+               mpa_buf = list_first_entry(&iwarp_info->mpa_buf_pending_list,
+                                          struct qed_iwarp_ll2_mpa_buf,
+                                          list_entry);
+
+               rc = qed_iwarp_process_mpa_pkt(p_hwfn, mpa_buf);
+
+               /* busy means break and continue processing later, don't
+                * remove the buf from the pending list.
+                */
+               if (rc == -EBUSY)
+                       break;
+
+               list_del(&mpa_buf->list_entry);
+               list_add_tail(&mpa_buf->list_entry, &iwarp_info->mpa_buf_list);
+
+               if (rc) {       /* different error, don't continue */
+                       DP_NOTICE(p_hwfn, "process pkts failed rc=%d\n", rc);
+                       break;
+               }
+       }
+}
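
The -EBUSY handling above (leave the buffer at the head of the pending list and retry on a later tx completion) is a generic pattern. A rough standalone sketch, with an invented processing function and a plain array standing in for the kernel list, might look like this:

#include <errno.h>
#include <stdio.h>

#define N_PENDING 4

/* Stand-in for the per-buffer processing: fails with -EBUSY while the
 * (imaginary) tx ring is full, succeeds otherwise. Purely illustrative.
 */
static int process_one(int id, int *tx_ring_space)
{
	if (*tx_ring_space == 0)
		return -EBUSY;
	(*tx_ring_space)--;
	printf("processed buffer %d\n", id);
	return 0;
}

static void process_pending(int *pending, int *count, int *tx_ring_space)
{
	while (*count) {
		int rc = process_one(pending[0], tx_ring_space);

		/* Busy: stop here and keep the buffer queued for a later try. */
		if (rc == -EBUSY)
			break;

		/* Completed (or hard error): pop the head of the queue. */
		for (int i = 1; i < *count; i++)
			pending[i - 1] = pending[i];
		(*count)--;

		if (rc) {
			printf("hard error %d, stop\n", rc);
			break;
		}
	}
}

int main(void)
{
	int pending[N_PENDING] = { 1, 2, 3, 4 };
	int count = N_PENDING;
	int tx_space = 2;

	process_pending(pending, &count, &tx_space);	/* handles 1 and 2 */
	tx_space = 2;					/* tx completions arrive */
	process_pending(pending, &count, &tx_space);	/* handles 3 and 4 */
	return 0;
}
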
+
+static void
+qed_iwarp_ll2_comp_mpa_pkt(void *cxt, struct qed_ll2_comp_rx_data *data)
+{
+       struct qed_iwarp_ll2_mpa_buf *mpa_buf;
+       struct qed_iwarp_info *iwarp_info;
+       struct qed_hwfn *p_hwfn = cxt;
+
+       iwarp_info = &p_hwfn->p_rdma_info->iwarp;
+       mpa_buf = list_first_entry(&iwarp_info->mpa_buf_list,
+                                  struct qed_iwarp_ll2_mpa_buf, list_entry);
+       if (!mpa_buf) {
+               DP_ERR(p_hwfn, "No free mpa buf\n");
+               goto err;
+       }
+
+       list_del(&mpa_buf->list_entry);
+       qed_iwarp_mpa_get_data(p_hwfn, &mpa_buf->data,
+                              data->opaque_data_0, data->opaque_data_1);
+
+       DP_VERBOSE(p_hwfn,
+                  QED_MSG_RDMA,
+                  "LL2 MPA CompRx payload_len:0x%x\tfirst_mpa_offset:0x%x\ttcp_payload_offset:0x%x\tflags:0x%x\tcid:0x%x\n",
+                  data->length.packet_length, mpa_buf->data.first_mpa_offset,
+                  mpa_buf->data.tcp_payload_offset, mpa_buf->data.flags,
+                  mpa_buf->data.cid);
+
+       mpa_buf->ll2_buf = data->cookie;
+       mpa_buf->tcp_payload_len = data->length.packet_length -
+                                  mpa_buf->data.first_mpa_offset;
+       mpa_buf->data.first_mpa_offset += data->u.placement_offset;
+       mpa_buf->placement_offset = data->u.placement_offset;
+
+       list_add_tail(&mpa_buf->list_entry, &iwarp_info->mpa_buf_pending_list);
+
+       qed_iwarp_process_pending_pkts(p_hwfn);
+       return;
+err:
+       qed_iwarp_ll2_post_rx(p_hwfn, data->cookie,
+                             iwarp_info->ll2_mpa_handle);
+}
+
 static void
 qed_iwarp_ll2_comp_syn_pkt(void *cxt, struct qed_ll2_comp_rx_data *data)
 {
@@ -1855,10 +2423,25 @@ static void qed_iwarp_ll2_comp_tx_pkt(void *cxt, u8 connection_handle,
                                      bool b_last_fragment, bool b_last_packet)
 {
        struct qed_iwarp_ll2_buff *buffer = cookie;
+       struct qed_iwarp_ll2_buff *piggy;
        struct qed_hwfn *p_hwfn = cxt;
 
+       if (!buffer)            /* can happen in packed mpa unaligned... */
+               return;
+
        /* this was originally an rx packet, post it back */
+       piggy = buffer->piggy_buf;
+       if (piggy) {
+               buffer->piggy_buf = NULL;
+               qed_iwarp_ll2_post_rx(p_hwfn, piggy, connection_handle);
+       }
+
        qed_iwarp_ll2_post_rx(p_hwfn, buffer, connection_handle);
+
+       if (connection_handle == p_hwfn->p_rdma_info->iwarp.ll2_mpa_handle)
+               qed_iwarp_process_pending_pkts(p_hwfn);
+
+       return;
 }
 
 static void qed_iwarp_ll2_rel_tx_pkt(void *cxt, u8 connection_handle,
@@ -1871,12 +2454,44 @@ static void qed_iwarp_ll2_rel_tx_pkt(void *cxt, u8 connection_handle,
        if (!buffer)
                return;
 
+       if (buffer->piggy_buf) {
+               dma_free_coherent(&p_hwfn->cdev->pdev->dev,
+                                 buffer->piggy_buf->buff_size,
+                                 buffer->piggy_buf->data,
+                                 buffer->piggy_buf->data_phys_addr);
+
+               kfree(buffer->piggy_buf);
+       }
+
        dma_free_coherent(&p_hwfn->cdev->pdev->dev, buffer->buff_size,
                          buffer->data, buffer->data_phys_addr);
 
        kfree(buffer);
 }
 
+/* The only slowpath for iwarp ll2 is the unaligned flush. When this
+ * completion is received, the driver needs to reset the FPDU.
+ */
+void
+qed_iwarp_ll2_slowpath(void *cxt,
+                      u8 connection_handle,
+                      u32 opaque_data_0, u32 opaque_data_1)
+{
+       struct unaligned_opaque_data unalign_data;
+       struct qed_hwfn *p_hwfn = cxt;
+       struct qed_iwarp_fpdu *fpdu;
+
+       qed_iwarp_mpa_get_data(p_hwfn, &unalign_data,
+                              opaque_data_0, opaque_data_1);
+
+       DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "(0x%x) Flush fpdu\n",
+                  unalign_data.cid);
+
+       fpdu = qed_iwarp_get_curr_fpdu(p_hwfn, (u16)unalign_data.cid);
+       if (fpdu)
+               memset(fpdu, 0, sizeof(*fpdu));
+}
+
 static int qed_iwarp_ll2_stop(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt)
 {
        struct qed_iwarp_info *iwarp_info = &p_hwfn->p_rdma_info->iwarp;
@@ -1902,6 +2517,16 @@ static int qed_iwarp_ll2_stop(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt)
                iwarp_info->ll2_ooo_handle = QED_IWARP_HANDLE_INVAL;
        }
 
+       if (iwarp_info->ll2_mpa_handle != QED_IWARP_HANDLE_INVAL) {
+               rc = qed_ll2_terminate_connection(p_hwfn,
+                                                 iwarp_info->ll2_mpa_handle);
+               if (rc)
+                       DP_INFO(p_hwfn, "Failed to terminate mpa connection\n");
+
+               qed_ll2_release_connection(p_hwfn, iwarp_info->ll2_mpa_handle);
+               iwarp_info->ll2_mpa_handle = QED_IWARP_HANDLE_INVAL;
+       }
+
        qed_llh_remove_mac_filter(p_hwfn,
                                  p_ptt, p_hwfn->p_rdma_info->iwarp.mac_addr);
        return rc;
@@ -1953,12 +2578,15 @@ qed_iwarp_ll2_start(struct qed_hwfn *p_hwfn,
        struct qed_iwarp_info *iwarp_info;
        struct qed_ll2_acquire_data data;
        struct qed_ll2_cbs cbs;
+       u32 mpa_buff_size;
        u16 n_ooo_bufs;
        int rc = 0;
+       int i;
 
        iwarp_info = &p_hwfn->p_rdma_info->iwarp;
        iwarp_info->ll2_syn_handle = QED_IWARP_HANDLE_INVAL;
        iwarp_info->ll2_ooo_handle = QED_IWARP_HANDLE_INVAL;
+       iwarp_info->ll2_mpa_handle = QED_IWARP_HANDLE_INVAL;
 
        iwarp_info->max_mtu = params->max_mtu;
 
@@ -2029,6 +2657,68 @@ qed_iwarp_ll2_start(struct qed_hwfn *p_hwfn,
        if (rc)
                goto err;
 
+       /* Start Unaligned MPA connection */
+       cbs.rx_comp_cb = qed_iwarp_ll2_comp_mpa_pkt;
+       cbs.slowpath_cb = qed_iwarp_ll2_slowpath;
+
+       memset(&data, 0, sizeof(data));
+       data.input.conn_type = QED_LL2_TYPE_IWARP;
+       data.input.mtu = params->max_mtu;
+       /* FW requires that once a packet arrives OOO, there must be at
+        * least 2 rx buffers available on the unaligned connection
+        * to handle the case that it is a partial fpdu.
+        */
+       data.input.rx_num_desc = n_ooo_bufs * 2;
+       data.input.tx_num_desc = data.input.rx_num_desc;
+       data.input.tx_max_bds_per_packet = QED_IWARP_MAX_BDS_PER_FPDU;
+       data.p_connection_handle = &iwarp_info->ll2_mpa_handle;
+       data.input.secondary_queue = true;
+       data.cbs = &cbs;
+
+       rc = qed_ll2_acquire_connection(p_hwfn, &data);
+       if (rc)
+               goto err;
+
+       rc = qed_ll2_establish_connection(p_hwfn, iwarp_info->ll2_mpa_handle);
+       if (rc)
+               goto err;
+
+       mpa_buff_size = QED_IWARP_MAX_BUF_SIZE(params->max_mtu);
+       rc = qed_iwarp_ll2_alloc_buffers(p_hwfn,
+                                        data.input.rx_num_desc,
+                                        mpa_buff_size,
+                                        iwarp_info->ll2_mpa_handle);
+       if (rc)
+               goto err;
+
+       iwarp_info->partial_fpdus = kcalloc((u16)p_hwfn->p_rdma_info->num_qps,
+                                           sizeof(*iwarp_info->partial_fpdus),
+                                           GFP_KERNEL);
+       if (!iwarp_info->partial_fpdus)
+               goto err;
+
+       iwarp_info->max_num_partial_fpdus = (u16)p_hwfn->p_rdma_info->num_qps;
+
+       iwarp_info->mpa_intermediate_buf = kzalloc(mpa_buff_size, GFP_KERNEL);
+       if (!iwarp_info->mpa_intermediate_buf)
+               goto err;
+
+       /* The mpa_bufs array serves for pending RX packets received on the
+        * mpa ll2 that don't have room on the tx ring and require later
+        * processing. We can't fail on allocation of such a struct, therefore
+        * we allocate enough to take care of all rx packets.
+        */
+       iwarp_info->mpa_bufs = kcalloc(data.input.rx_num_desc,
+                                      sizeof(*iwarp_info->mpa_bufs),
+                                      GFP_KERNEL);
+       if (!iwarp_info->mpa_bufs)
+               goto err;
+
+       INIT_LIST_HEAD(&iwarp_info->mpa_buf_pending_list);
+       INIT_LIST_HEAD(&iwarp_info->mpa_buf_list);
+       for (i = 0; i < data.input.rx_num_desc; i++)
+               list_add_tail(&iwarp_info->mpa_bufs[i].list_entry,
+                             &iwarp_info->mpa_buf_list);
        return rc;
 err:
        qed_iwarp_ll2_stop(p_hwfn, p_ptt);
index 9e2bfde..c1ecd74 100644 (file)
@@ -55,15 +55,43 @@ enum qed_iwarp_qp_state qed_roce2iwarp_state(enum qed_roce_qp_state state);
 #define QED_IWARP_HANDLE_INVAL         (0xff)
 
 struct qed_iwarp_ll2_buff {
+       struct qed_iwarp_ll2_buff *piggy_buf;
        void *data;
        dma_addr_t data_phys_addr;
        u32 buff_size;
 };
 
+struct qed_iwarp_ll2_mpa_buf {
+       struct list_head list_entry;
+       struct qed_iwarp_ll2_buff *ll2_buf;
+       struct unaligned_opaque_data data;
+       u16 tcp_payload_len;
+       u8 placement_offset;
+};
+
+/* In some cases an fpdu will arrive with only one byte of the header. In this
+ * case the fpdu_length will be partial (contain only the higher byte) and
+ * incomplete_bytes will contain the invalid value below.
+ */
+#define QED_IWARP_INVALID_INCOMPLETE_BYTES 0xffff
+
+struct qed_iwarp_fpdu {
+       struct qed_iwarp_ll2_buff *mpa_buf;
+       void *mpa_frag_virt;
+       dma_addr_t mpa_frag;
+       dma_addr_t pkt_hdr;
+       u16 mpa_frag_len;
+       u16 fpdu_length;
+       u16 incomplete_bytes;
+       u8 pkt_hdr_size;
+};
+
 struct qed_iwarp_info {
        struct list_head listen_list;   /* qed_iwarp_listener */
        struct list_head ep_list;       /* qed_iwarp_ep */
        struct list_head ep_free_list;  /* pre-allocated ep's */
+       struct list_head mpa_buf_list;  /* list of mpa_bufs */
+       struct list_head mpa_buf_pending_list;
        spinlock_t iw_lock;     /* for iwarp resources */
        spinlock_t qp_lock;     /* for teardown races */
        u32 rcv_wnd_scale;
@@ -73,9 +101,14 @@ struct qed_iwarp_info {
        u8 tcp_flags;
        u8 ll2_syn_handle;
        u8 ll2_ooo_handle;
+       u8 ll2_mpa_handle;
        u8 peer2peer;
        enum mpa_negotiation_mode mpa_rev;
        enum mpa_rtr_type rtr_type;
+       struct qed_iwarp_fpdu *partial_fpdus;
+       struct qed_iwarp_ll2_mpa_buf *mpa_bufs;
+       u8 *mpa_intermediate_buf;
+       u16 max_num_partial_fpdus;
 };
 
 enum qed_iwarp_ep_state {
index 250afa5..047f556 100644 (file)
@@ -423,6 +423,41 @@ static void qed_ll2_rxq_parse_reg(struct qed_hwfn *p_hwfn,
 }
 
 static int
+qed_ll2_handle_slowpath(struct qed_hwfn *p_hwfn,
+                       struct qed_ll2_info *p_ll2_conn,
+                       union core_rx_cqe_union *p_cqe,
+                       unsigned long *p_lock_flags)
+{
+       struct qed_ll2_rx_queue *p_rx = &p_ll2_conn->rx_queue;
+       struct core_rx_slow_path_cqe *sp_cqe;
+
+       sp_cqe = &p_cqe->rx_cqe_sp;
+       if (sp_cqe->ramrod_cmd_id != CORE_RAMROD_RX_QUEUE_FLUSH) {
+               DP_NOTICE(p_hwfn,
+                         "LL2 - unexpected Rx CQE slowpath ramrod_cmd_id:%d\n",
+                         sp_cqe->ramrod_cmd_id);
+               return -EINVAL;
+       }
+
+       if (!p_ll2_conn->cbs.slowpath_cb) {
+               DP_NOTICE(p_hwfn,
+                         "LL2 - received RX_QUEUE_FLUSH but no callback was provided\n");
+               return -EINVAL;
+       }
+
+       spin_unlock_irqrestore(&p_rx->lock, *p_lock_flags);
+
+       p_ll2_conn->cbs.slowpath_cb(p_ll2_conn->cbs.cookie,
+                                   p_ll2_conn->my_id,
+                                   le32_to_cpu(sp_cqe->opaque_data.data[0]),
+                                   le32_to_cpu(sp_cqe->opaque_data.data[1]));
+
+       spin_lock_irqsave(&p_rx->lock, *p_lock_flags);
+
+       return 0;
+}
+
+static int
 qed_ll2_rxq_handle_completion(struct qed_hwfn *p_hwfn,
                              struct qed_ll2_info *p_ll2_conn,
                              union core_rx_cqe_union *p_cqe,
@@ -495,8 +530,8 @@ static int qed_ll2_rxq_completion(struct qed_hwfn *p_hwfn, void *cookie)
 
                switch (cqe->rx_cqe_sp.type) {
                case CORE_RX_CQE_TYPE_SLOW_PATH:
-                       DP_NOTICE(p_hwfn, "LL2 - unexpected Rx CQE slowpath\n");
-                       rc = -EINVAL;
+                       rc = qed_ll2_handle_slowpath(p_hwfn, p_ll2_conn,
+                                                    cqe, &flags);
                        break;
                case CORE_RX_CQE_TYPE_GSI_OFFLOAD:
                case CORE_RX_CQE_TYPE_REGULAR:
@@ -894,7 +929,7 @@ static int qed_sp_ll2_rx_queue_start(struct qed_hwfn *p_hwfn,
        p_ramrod->drop_ttl0_flg = p_ll2_conn->input.rx_drop_ttl0_flg;
        p_ramrod->inner_vlan_removal_en = p_ll2_conn->input.rx_vlan_removal_en;
        p_ramrod->queue_id = p_ll2_conn->queue_id;
-       p_ramrod->main_func_queue = (conn_type == QED_LL2_TYPE_OOO) ? 0 : 1;
+       p_ramrod->main_func_queue = p_ll2_conn->main_func_queue ? 1 : 0;
 
        if ((IS_MF_DEFAULT(p_hwfn) || IS_MF_SI(p_hwfn)) &&
            p_ramrod->main_func_queue && (conn_type != QED_LL2_TYPE_ROCE) &&
@@ -1105,6 +1140,7 @@ static int qed_ll2_acquire_connection_tx(struct qed_hwfn *p_hwfn,
                                         struct qed_ll2_info *p_ll2_info)
 {
        struct qed_ll2_tx_packet *p_descq;
+       u32 desc_size;
        u32 capacity;
        int rc = 0;
 
@@ -1122,13 +1158,17 @@ static int qed_ll2_acquire_connection_tx(struct qed_hwfn *p_hwfn,
                goto out;
 
        capacity = qed_chain_get_capacity(&p_ll2_info->tx_queue.txq_chain);
-       p_descq = kcalloc(capacity, sizeof(struct qed_ll2_tx_packet),
-                         GFP_KERNEL);
+       /* First element is part of the packet, rest are flexibly added */
+       desc_size = (sizeof(*p_descq) +
+                    (p_ll2_info->input.tx_max_bds_per_packet - 1) *
+                    sizeof(p_descq->bds_set));
+
+       p_descq = kcalloc(capacity, desc_size, GFP_KERNEL);
        if (!p_descq) {
                rc = -ENOMEM;
                goto out;
        }
-       p_ll2_info->tx_queue.descq_array = p_descq;
+       p_ll2_info->tx_queue.descq_mem = p_descq;
 
        DP_VERBOSE(p_hwfn, QED_MSG_LL2,
                   "Allocated LL2 Txq [Type %08x] with 0x%08x buffers\n",
@@ -1209,6 +1249,7 @@ qed_ll2_set_cbs(struct qed_ll2_info *p_ll2_info, const struct qed_ll2_cbs *cbs)
        p_ll2_info->cbs.rx_release_cb = cbs->rx_release_cb;
        p_ll2_info->cbs.tx_comp_cb = cbs->tx_comp_cb;
        p_ll2_info->cbs.tx_release_cb = cbs->tx_release_cb;
+       p_ll2_info->cbs.slowpath_cb = cbs->slowpath_cb;
        p_ll2_info->cbs.cookie = cbs->cookie;
 
        return 0;
@@ -1260,6 +1301,11 @@ int qed_ll2_acquire_connection(void *cxt, struct qed_ll2_acquire_data *data)
 
        p_ll2_info->tx_dest = (data->input.tx_dest == QED_LL2_TX_DEST_NW) ?
                              CORE_TX_DEST_NW : CORE_TX_DEST_LB;
+       if (data->input.conn_type == QED_LL2_TYPE_OOO ||
+           data->input.secondary_queue)
+               p_ll2_info->main_func_queue = false;
+       else
+               p_ll2_info->main_func_queue = true;
 
        /* Correct maximum number of Tx BDs */
        p_tx_max = &p_ll2_info->input.tx_max_bds_per_packet;
@@ -1359,11 +1405,13 @@ int qed_ll2_establish_connection(void *cxt, u8 connection_handle)
 {
        struct qed_hwfn *p_hwfn = cxt;
        struct qed_ll2_info *p_ll2_conn;
+       struct qed_ll2_tx_packet *p_pkt;
        struct qed_ll2_rx_queue *p_rx;
        struct qed_ll2_tx_queue *p_tx;
        struct qed_ptt *p_ptt;
        int rc = -EINVAL;
        u32 i, capacity;
+       u32 desc_size;
        u8 qid;
 
        p_ptt = qed_ptt_acquire(p_hwfn);
@@ -1397,9 +1445,15 @@ int qed_ll2_establish_connection(void *cxt, u8 connection_handle)
        INIT_LIST_HEAD(&p_tx->sending_descq);
        spin_lock_init(&p_tx->lock);
        capacity = qed_chain_get_capacity(&p_tx->txq_chain);
-       for (i = 0; i < capacity; i++)
-               list_add_tail(&p_tx->descq_array[i].list_entry,
-                             &p_tx->free_descq);
+       /* First element is part of the packet, rest are flexibly added */
+       desc_size = (sizeof(*p_pkt) +
+                    (p_ll2_conn->input.tx_max_bds_per_packet - 1) *
+                    sizeof(p_pkt->bds_set));
+
+       for (i = 0; i < capacity; i++) {
+               p_pkt = p_tx->descq_mem + desc_size * i;
+               list_add_tail(&p_pkt->list_entry, &p_tx->free_descq);
+       }
        p_tx->cur_completing_bd_idx = 0;
        p_tx->bds_idx = 0;
        p_tx->b_completing_packet = false;
@@ -1579,11 +1633,28 @@ qed_ll2_prepare_tx_packet_set_bd(struct qed_hwfn *p_hwfn,
        roce_flavor = (pkt->qed_roce_flavor == QED_LL2_ROCE) ? CORE_ROCE
                                                             : CORE_RROCE;
 
-       tx_dest = (pkt->tx_dest == QED_LL2_TX_DEST_NW) ? CORE_TX_DEST_NW
-                                                      : CORE_TX_DEST_LB;
+       switch (pkt->tx_dest) {
+       case QED_LL2_TX_DEST_NW:
+               tx_dest = CORE_TX_DEST_NW;
+               break;
+       case QED_LL2_TX_DEST_LB:
+               tx_dest = CORE_TX_DEST_LB;
+               break;
+       case QED_LL2_TX_DEST_DROP:
+               tx_dest = CORE_TX_DEST_DROP;
+               break;
+       default:
+               tx_dest = CORE_TX_DEST_LB;
+               break;
+       }
 
        start_bd = (struct core_tx_bd *)qed_chain_produce(p_tx_chain);
-       start_bd->nw_vlan_or_lb_echo = cpu_to_le16(pkt->vlan);
+       if (QED_IS_IWARP_PERSONALITY(p_hwfn) &&
+           p_ll2->input.conn_type == QED_LL2_TYPE_OOO)
+               start_bd->nw_vlan_or_lb_echo =
+                   cpu_to_le16(IWARP_LL2_IN_ORDER_TX_QUEUE);
+       else
+               start_bd->nw_vlan_or_lb_echo = cpu_to_le16(pkt->vlan);
        SET_FIELD(start_bd->bitfield1, CORE_TX_BD_L4_HDR_OFFSET_W,
                  cpu_to_le16(pkt->l4_hdr_offset_w));
        SET_FIELD(start_bd->bitfield1, CORE_TX_BD_TX_DST, tx_dest);
@@ -1591,6 +1662,9 @@ qed_ll2_prepare_tx_packet_set_bd(struct qed_hwfn *p_hwfn,
        SET_FIELD(bd_data, CORE_TX_BD_DATA_START_BD, 0x1);
        SET_FIELD(bd_data, CORE_TX_BD_DATA_NBDS, pkt->num_of_bds);
        SET_FIELD(bd_data, CORE_TX_BD_DATA_ROCE_FLAV, roce_flavor);
+       SET_FIELD(bd_data, CORE_TX_BD_DATA_IP_CSUM, !!(pkt->enable_ip_cksum));
+       SET_FIELD(bd_data, CORE_TX_BD_DATA_L4_CSUM, !!(pkt->enable_l4_cksum));
+       SET_FIELD(bd_data, CORE_TX_BD_DATA_IP_LEN, !!(pkt->calc_ip_len));
        start_bd->bd_data.as_bitfield = cpu_to_le16(bd_data);
        DMA_REGPAIR_LE(start_bd->addr, pkt->first_frag);
        start_bd->nbytes = cpu_to_le16(pkt->first_frag_len);
@@ -1698,7 +1772,7 @@ int qed_ll2_prepare_tx_packet(void *cxt,
        p_tx = &p_ll2_conn->tx_queue;
        p_tx_chain = &p_tx->txq_chain;
 
-       if (pkt->num_of_bds > CORE_LL2_TX_MAX_BDS_PER_PACKET)
+       if (pkt->num_of_bds > p_ll2_conn->input.tx_max_bds_per_packet)
                return -EIO;
 
        spin_lock_irqsave(&p_tx->lock, flags);
@@ -1858,7 +1932,7 @@ void qed_ll2_release_connection(void *cxt, u8 connection_handle)
                qed_int_unregister_cb(p_hwfn, p_ll2_conn->tx_queue.tx_sb_index);
        }
 
-       kfree(p_ll2_conn->tx_queue.descq_array);
+       kfree(p_ll2_conn->tx_queue.descq_mem);
        qed_chain_free(p_hwfn->cdev, &p_ll2_conn->tx_queue.txq_chain);
 
        kfree(p_ll2_conn->rx_queue.descq_array);
index a822528..f658170 100644 (file)
@@ -63,17 +63,14 @@ struct qed_ll2_rx_packet {
 struct qed_ll2_tx_packet {
        struct list_head list_entry;
        u16 bd_used;
-       u16 vlan;
-       u16 l4_hdr_offset_w;
-       u8 bd_flags;
        bool notify_fw;
        void *cookie;
-
+       /* Flexible Array of bds_set determined by max_bds_per_packet */
        struct {
                struct core_tx_bd *txq_bd;
                dma_addr_t tx_frag;
                u16 frag_len;
-       } bds_set[ETH_TX_MAX_BDS_PER_NON_LSO_PACKET];
+       } bds_set[1];
 };
 
 struct qed_ll2_rx_queue {
@@ -101,7 +98,7 @@ struct qed_ll2_tx_queue {
        struct list_head active_descq;
        struct list_head free_descq;
        struct list_head sending_descq;
-       struct qed_ll2_tx_packet *descq_array;
+       void *descq_mem; /* memory for variable sized qed_ll2_tx_packet */
        struct qed_ll2_tx_packet *cur_send_packet;
        struct qed_ll2_tx_packet cur_completing_packet;
        u16 cur_completing_bd_idx;
@@ -124,6 +121,7 @@ struct qed_ll2_info {
        bool b_active;
        enum core_tx_dest tx_dest;
        u8 tx_stats_en;
+       bool main_func_queue;
        struct qed_ll2_rx_queue rx_queue;
        struct qed_ll2_tx_queue tx_queue;
        struct qed_ll2_cbs cbs;
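
The descq_array to descq_mem change in this file replaces a fixed-size bds_set[] with per-connection sizing: the element size is computed at runtime and elements are addressed by byte offset. A standalone sketch of that indexing trick is below; the struct is a simplified stand-in, not the real qed_ll2_tx_packet.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/* Simplified stand-in for qed_ll2_tx_packet: one bds_set element lives in
 * the struct, the rest are appended based on the connection's max BD count.
 */
struct tx_pkt {
	int bd_used;
	struct {
		uint64_t frag;
		uint16_t frag_len;
	} bds_set[1];
};

int main(void)
{
	unsigned int capacity = 8, max_bds = 3;
	/* First bds_set element is part of the struct, add the remainder. */
	size_t desc_size = sizeof(struct tx_pkt) +
			   (max_bds - 1) * sizeof(((struct tx_pkt *)0)->bds_set[0]);
	unsigned char *descq_mem = calloc(capacity, desc_size);

	if (!descq_mem)
		return 1;

	for (unsigned int i = 0; i < capacity; i++) {
		/* Element i lives at a byte offset, not at descq[i]. */
		struct tx_pkt *p = (struct tx_pkt *)(descq_mem + i * desc_size);

		p->bd_used = 0;
	}

	printf("desc_size=%zu total=%zu\n", desc_size, capacity * desc_size);
	free(descq_mem);
	return 0;
}
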
index 866444b..2c6d7c6 100644 (file)
 #define NSS_COMMON_CLK_SRC_CTRL_RGMII(x)       1
 #define NSS_COMMON_CLK_SRC_CTRL_SGMII(x)       ((x >= 2) ? 1 : 0)
 
-#define NSS_COMMON_MACSEC_CTL                  0x28
-#define NSS_COMMON_MACSEC_CTL_EXT_BYPASS_EN(x) (1 << x)
-
 #define NSS_COMMON_GMAC_CTL(x)                 (0x30 + (x * 4))
 #define NSS_COMMON_GMAC_CTL_CSYS_REQ           BIT(19)
 #define NSS_COMMON_GMAC_CTL_PHY_IFACE_SEL      BIT(16)
 #define NSS_COMMON_GMAC_CTL_IFG_LIMIT_OFFSET   8
 #define NSS_COMMON_GMAC_CTL_IFG_OFFSET         0
-#define NSS_COMMON_GMAC_CTL_IFG_MASK           0x3f
 
 #define NSS_COMMON_CLK_DIV_RGMII_1000          1
 #define NSS_COMMON_CLK_DIV_RGMII_100           9
@@ -68,9 +64,6 @@
 #define NSS_COMMON_CLK_DIV_SGMII_100           4
 #define NSS_COMMON_CLK_DIV_SGMII_10            49
 
-#define QSGMII_PCS_MODE_CTL                    0x68
-#define QSGMII_PCS_MODE_CTL_AUTONEG_EN(x)      BIT((x * 8) + 7)
-
 #define QSGMII_PCS_CAL_LCKDT_CTL               0x120
 #define QSGMII_PCS_CAL_LCKDT_CTL_RST           BIT(19)
 
 #define QSGMII_PHY_TX_DRIVER_EN                        BIT(3)
 #define QSGMII_PHY_QSGMII_EN                   BIT(7)
 #define QSGMII_PHY_PHASE_LOOP_GAIN_OFFSET      12
-#define QSGMII_PHY_PHASE_LOOP_GAIN_MASK                0x7
 #define QSGMII_PHY_RX_DC_BIAS_OFFSET           18
-#define QSGMII_PHY_RX_DC_BIAS_MASK             0x3
 #define QSGMII_PHY_RX_INPUT_EQU_OFFSET         20
-#define QSGMII_PHY_RX_INPUT_EQU_MASK           0x3
 #define QSGMII_PHY_CDR_PI_SLEW_OFFSET          22
-#define QSGMII_PHY_CDR_PI_SLEW_MASK            0x3
 #define QSGMII_PHY_TX_DRV_AMP_OFFSET           28
-#define QSGMII_PHY_TX_DRV_AMP_MASK             0xf
 
 struct ipq806x_gmac {
        struct platform_device *pdev;
@@ -217,7 +205,7 @@ static int ipq806x_gmac_of_parse(struct ipq806x_gmac *gmac)
         * code and keep it consistent with the Linux convention, we'll number
         * them from 0 to 3 here.
         */
-       if (gmac->id < 0 || gmac->id > 3) {
+       if (gmac->id > 3) {
                dev_err(dev, "invalid gmac id\n");
                return -EINVAL;
        }
index 6f550e1..a81335e 100644 (file)
@@ -704,6 +704,14 @@ struct netvsc_reconfig {
        u32 event;
 };
 
+/* L4 hash bits for different protocols */
+#define HV_TCP4_L4HASH 1
+#define HV_TCP6_L4HASH 2
+#define HV_UDP4_L4HASH 4
+#define HV_UDP6_L4HASH 8
+#define HV_DEFAULT_L4HASH (HV_TCP4_L4HASH | HV_TCP6_L4HASH | HV_UDP4_L4HASH | \
+                          HV_UDP6_L4HASH)
+
 /* The context of the netvsc device  */
 struct net_device_context {
        /* point back to our device context */
@@ -726,10 +734,9 @@ struct net_device_context {
        u32 tx_send_table[VRSS_SEND_TAB_SIZE];
 
        /* Ethtool settings */
-       bool udp4_l4_hash;
-       bool udp6_l4_hash;
        u8 duplex;
        u32 speed;
+       u32 l4_hash; /* L4 hash settings */
        struct netvsc_ethtool_stats eth_stats;
 
        /* State to manage the associated VF interface. */
index dfb9864..44746de 100644 (file)
@@ -203,7 +203,7 @@ static inline u32 netvsc_get_hash(
        const struct net_device_context *ndc)
 {
        struct flow_keys flow;
-       u32 hash;
+       u32 hash, pkt_proto = 0;
        static u32 hashrnd __read_mostly;
 
        net_get_random_once(&hashrnd, sizeof(hashrnd));
@@ -211,11 +211,25 @@ static inline u32 netvsc_get_hash(
        if (!skb_flow_dissect_flow_keys(skb, &flow, 0))
                return 0;
 
-       if (flow.basic.ip_proto == IPPROTO_TCP ||
-           (flow.basic.ip_proto == IPPROTO_UDP &&
-            ((flow.basic.n_proto == htons(ETH_P_IP) && ndc->udp4_l4_hash) ||
-             (flow.basic.n_proto == htons(ETH_P_IPV6) &&
-              ndc->udp6_l4_hash)))) {
+       switch (flow.basic.ip_proto) {
+       case IPPROTO_TCP:
+               if (flow.basic.n_proto == htons(ETH_P_IP))
+                       pkt_proto = HV_TCP4_L4HASH;
+               else if (flow.basic.n_proto == htons(ETH_P_IPV6))
+                       pkt_proto = HV_TCP6_L4HASH;
+
+               break;
+
+       case IPPROTO_UDP:
+               if (flow.basic.n_proto == htons(ETH_P_IP))
+                       pkt_proto = HV_UDP4_L4HASH;
+               else if (flow.basic.n_proto == htons(ETH_P_IPV6))
+                       pkt_proto = HV_UDP6_L4HASH;
+
+               break;
+       }
+
+       if (pkt_proto & ndc->l4_hash) {
                return skb_get_hash(skb);
        } else {
                if (flow.basic.n_proto == htons(ETH_P_IP))
@@ -898,8 +912,7 @@ static void netvsc_init_settings(struct net_device *dev)
 {
        struct net_device_context *ndc = netdev_priv(dev);
 
-       ndc->udp4_l4_hash = true;
-       ndc->udp6_l4_hash = true;
+       ndc->l4_hash = HV_DEFAULT_L4HASH;
 
        ndc->speed = SPEED_UNKNOWN;
        ndc->duplex = DUPLEX_FULL;
@@ -1245,23 +1258,32 @@ static int
 netvsc_get_rss_hash_opts(struct net_device_context *ndc,
                         struct ethtool_rxnfc *info)
 {
+       const u32 l4_flag = RXH_L4_B_0_1 | RXH_L4_B_2_3;
+
        info->data = RXH_IP_SRC | RXH_IP_DST;
 
        switch (info->flow_type) {
        case TCP_V4_FLOW:
+               if (ndc->l4_hash & HV_TCP4_L4HASH)
+                       info->data |= l4_flag;
+
+               break;
+
        case TCP_V6_FLOW:
-               info->data |= RXH_L4_B_0_1 | RXH_L4_B_2_3;
+               if (ndc->l4_hash & HV_TCP6_L4HASH)
+                       info->data |= l4_flag;
+
                break;
 
        case UDP_V4_FLOW:
-               if (ndc->udp4_l4_hash)
-                       info->data |= RXH_L4_B_0_1 | RXH_L4_B_2_3;
+               if (ndc->l4_hash & HV_UDP4_L4HASH)
+                       info->data |= l4_flag;
 
                break;
 
        case UDP_V6_FLOW:
-               if (ndc->udp6_l4_hash)
-                       info->data |= RXH_L4_B_0_1 | RXH_L4_B_2_3;
+               if (ndc->l4_hash & HV_UDP6_L4HASH)
+                       info->data |= l4_flag;
 
                break;
 
@@ -1302,23 +1324,51 @@ static int netvsc_set_rss_hash_opts(struct net_device_context *ndc,
 {
        if (info->data == (RXH_IP_SRC | RXH_IP_DST |
                           RXH_L4_B_0_1 | RXH_L4_B_2_3)) {
-               if (info->flow_type == UDP_V4_FLOW)
-                       ndc->udp4_l4_hash = true;
-               else if (info->flow_type == UDP_V6_FLOW)
-                       ndc->udp6_l4_hash = true;
-               else
+               switch (info->flow_type) {
+               case TCP_V4_FLOW:
+                       ndc->l4_hash |= HV_TCP4_L4HASH;
+                       break;
+
+               case TCP_V6_FLOW:
+                       ndc->l4_hash |= HV_TCP6_L4HASH;
+                       break;
+
+               case UDP_V4_FLOW:
+                       ndc->l4_hash |= HV_UDP4_L4HASH;
+                       break;
+
+               case UDP_V6_FLOW:
+                       ndc->l4_hash |= HV_UDP6_L4HASH;
+                       break;
+
+               default:
                        return -EOPNOTSUPP;
+               }
 
                return 0;
        }
 
        if (info->data == (RXH_IP_SRC | RXH_IP_DST)) {
-               if (info->flow_type == UDP_V4_FLOW)
-                       ndc->udp4_l4_hash = false;
-               else if (info->flow_type == UDP_V6_FLOW)
-                       ndc->udp6_l4_hash = false;
-               else
+               switch (info->flow_type) {
+               case TCP_V4_FLOW:
+                       ndc->l4_hash &= ~HV_TCP4_L4HASH;
+                       break;
+
+               case TCP_V6_FLOW:
+                       ndc->l4_hash &= ~HV_TCP6_L4HASH;
+                       break;
+
+               case UDP_V4_FLOW:
+                       ndc->l4_hash &= ~HV_UDP4_L4HASH;
+                       break;
+
+               case UDP_V6_FLOW:
+                       ndc->l4_hash &= ~HV_UDP6_L4HASH;
+                       break;
+
+               default:
                        return -EOPNOTSUPP;
+               }
 
                return 0;
        }
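
The netvsc change collapses the per-protocol booleans into one l4_hash bitmask, so the receive path can test a single bit and the ethtool handlers can set or clear it. A compact standalone sketch of that pattern follows; the HV_* values are copied from the header hunk above, everything else is invented for illustration.

#include <stdbool.h>
#include <stdio.h>

/* Bits as defined in the header hunk above. */
#define HV_TCP4_L4HASH 1
#define HV_TCP6_L4HASH 2
#define HV_UDP4_L4HASH 4
#define HV_UDP6_L4HASH 8
#define HV_DEFAULT_L4HASH (HV_TCP4_L4HASH | HV_TCP6_L4HASH | \
			   HV_UDP4_L4HASH | HV_UDP6_L4HASH)

enum proto { TCP4, TCP6, UDP4, UDP6 };

static unsigned int proto_bit(enum proto p)
{
	switch (p) {
	case TCP4: return HV_TCP4_L4HASH;
	case TCP6: return HV_TCP6_L4HASH;
	case UDP4: return HV_UDP4_L4HASH;
	case UDP6: return HV_UDP6_L4HASH;
	}
	return 0;
}

int main(void)
{
	unsigned int l4_hash = HV_DEFAULT_L4HASH;

	/* ethtool-style disable of UDP4 L4 hashing ... */
	l4_hash &= ~proto_bit(UDP4);

	/* ... and the per-packet check used on the receive path. */
	bool use_l4_hash = proto_bit(UDP4) & l4_hash;

	printf("udp4 l4 hash enabled: %d\n", use_l4_hash);
	return 0;
}
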
index cd931cf..e2cf8ff 100644 (file)
@@ -366,6 +366,11 @@ config REALTEK_PHY
        ---help---
          Supports the Realtek 821x PHY.
 
+config RENESAS_PHY
+       tristate "Driver for Renesas PHYs"
+       ---help---
+         Supports the Renesas PHYs uPD60620 and uPD60620A.
+
 config ROCKCHIP_PHY
         tristate "Driver for Rockchip Ethernet PHYs"
         ---help---
index 416df92..1404ad3 100644 (file)
@@ -72,6 +72,7 @@ obj-$(CONFIG_MICROSEMI_PHY)   += mscc.o
 obj-$(CONFIG_NATIONAL_PHY)     += national.o
 obj-$(CONFIG_QSEMI_PHY)                += qsemi.o
 obj-$(CONFIG_REALTEK_PHY)      += realtek.o
+obj-$(CONFIG_RENESAS_PHY)      += uPD60620.o
 obj-$(CONFIG_ROCKCHIP_PHY)     += rockchip.o
 obj-$(CONFIG_SMSC_PHY)         += smsc.o
 obj-$(CONFIG_STE10XP)          += ste10Xp.o
diff --git a/drivers/net/phy/uPD60620.c b/drivers/net/phy/uPD60620.c
new file mode 100644 (file)
index 0000000..96b3347
--- /dev/null
@@ -0,0 +1,109 @@
+/*
+ * Driver for the Renesas PHY uPD60620.
+ *
+ * Copyright (C) 2015 Softing Industrial Automation GmbH
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/phy.h>
+
+#define UPD60620_PHY_ID    0xb8242824
+
+/* Extended Registers and values */
+/* PHY Special Control/Status    */
+#define PHY_PHYSCR         0x1F      /* PHY.31 */
+#define PHY_PHYSCR_10MB    0x0004    /* PHY speed = 10mb */
+#define PHY_PHYSCR_100MB   0x0008    /* PHY speed = 100mb */
+#define PHY_PHYSCR_DUPLEX  0x0010    /* PHY Duplex */
+
+/* PHY Special Modes */
+#define PHY_SPM            0x12      /* PHY.18 */
+
+/* Init PHY */
+
+static int upd60620_config_init(struct phy_device *phydev)
+{
+       /* Enable support for passive HUBs (could be a strap option) */
+       /* PHYMODE: All speeds, HD in parallel detect */
+       return phy_write(phydev, PHY_SPM, 0x0180 | phydev->mdio.addr);
+}
+
+/* Get PHY status from common registers */
+
+static int upd60620_read_status(struct phy_device *phydev)
+{
+       int phy_state;
+
+       /* Read negotiated state */
+       phy_state = phy_read(phydev, MII_BMSR);
+       if (phy_state < 0)
+               return phy_state;
+
+       phydev->link = 0;
+       phydev->lp_advertising = 0;
+       phydev->pause = 0;
+       phydev->asym_pause = 0;
+
+       if (phy_state & (BMSR_ANEGCOMPLETE | BMSR_LSTATUS)) {
+               phy_state = phy_read(phydev, PHY_PHYSCR);
+               if (phy_state < 0)
+                       return phy_state;
+
+               if (phy_state & (PHY_PHYSCR_10MB | PHY_PHYSCR_100MB)) {
+                       phydev->link = 1;
+                       phydev->speed = SPEED_10;
+                       phydev->duplex = DUPLEX_HALF;
+
+                       if (phy_state & PHY_PHYSCR_100MB)
+                               phydev->speed = SPEED_100;
+                       if (phy_state & PHY_PHYSCR_DUPLEX)
+                               phydev->duplex = DUPLEX_FULL;
+
+                       phy_state = phy_read(phydev, MII_LPA);
+                       if (phy_state < 0)
+                               return phy_state;
+
+                       phydev->lp_advertising
+                               = mii_lpa_to_ethtool_lpa_t(phy_state);
+
+                       if (phydev->duplex == DUPLEX_FULL) {
+                               if (phy_state & LPA_PAUSE_CAP)
+                                       phydev->pause = 1;
+                               if (phy_state & LPA_PAUSE_ASYM)
+                                       phydev->asym_pause = 1;
+                       }
+               }
+       }
+       return 0;
+}
+
+MODULE_DESCRIPTION("Renesas uPD60620 PHY driver");
+MODULE_AUTHOR("Bernd Edlinger <bernd.edlinger@hotmail.de>");
+MODULE_LICENSE("GPL");
+
+static struct phy_driver upd60620_driver[1] = { {
+       .phy_id         = UPD60620_PHY_ID,
+       .phy_id_mask    = 0xfffffffe,
+       .name           = "Renesas uPD60620",
+       .features       = PHY_BASIC_FEATURES,
+       .flags          = 0,
+       .config_init    = upd60620_config_init,
+       .config_aneg    = genphy_config_aneg,
+       .read_status    = upd60620_read_status,
+} };
+
+module_phy_driver(upd60620_driver);
+
+static struct mdio_device_id __maybe_unused upd60620_tbl[] = {
+       { UPD60620_PHY_ID, 0xfffffffe },
+       { }
+};
+
+MODULE_DEVICE_TABLE(mdio, upd60620_tbl);
index c3f77e3..e365866 100644 (file)
@@ -1339,7 +1339,17 @@ ppp_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats64)
 
 static int ppp_dev_init(struct net_device *dev)
 {
+       struct ppp *ppp;
+
        netdev_lockdep_set_classes(dev);
+
+       ppp = netdev_priv(dev);
+       /* Let the netdevice take a reference on the ppp file. This ensures
+        * that ppp_destroy_interface() won't run before the device gets
+        * unregistered.
+        */
+       atomic_inc(&ppp->file.refcnt);
+
        return 0;
 }
 
@@ -1362,6 +1372,15 @@ static void ppp_dev_uninit(struct net_device *dev)
        wake_up_interruptible(&ppp->file.rwait);
 }
 
+static void ppp_dev_priv_destructor(struct net_device *dev)
+{
+       struct ppp *ppp;
+
+       ppp = netdev_priv(dev);
+       if (atomic_dec_and_test(&ppp->file.refcnt))
+               ppp_destroy_interface(ppp);
+}
+
 static const struct net_device_ops ppp_netdev_ops = {
        .ndo_init        = ppp_dev_init,
        .ndo_uninit      = ppp_dev_uninit,
@@ -1387,6 +1406,7 @@ static void ppp_setup(struct net_device *dev)
        dev->tx_queue_len = 3;
        dev->type = ARPHRD_PPP;
        dev->flags = IFF_POINTOPOINT | IFF_NOARP | IFF_MULTICAST;
+       dev->priv_destructor = ppp_dev_priv_destructor;
        netif_keep_dst(dev);
 }
 
index 29c7e2e..52ea80b 100644 (file)
@@ -560,6 +560,7 @@ static const struct driver_info wwan_info = {
 #define NVIDIA_VENDOR_ID       0x0955
 #define HP_VENDOR_ID           0x03f0
 #define MICROSOFT_VENDOR_ID    0x045e
+#define UBLOX_VENDOR_ID                0x1546
 
 static const struct usb_device_id      products[] = {
 /* BLACKLIST !!
@@ -869,6 +870,18 @@ static const struct usb_device_id  products[] = {
                                      USB_CDC_PROTO_NONE),
        .driver_info = (unsigned long)&zte_cdc_info,
 }, {
+       /* U-blox TOBY-L2 */
+       USB_DEVICE_AND_INTERFACE_INFO(UBLOX_VENDOR_ID, 0x1143, USB_CLASS_COMM,
+                                     USB_CDC_SUBCLASS_ETHERNET,
+                                     USB_CDC_PROTO_NONE),
+       .driver_info = (unsigned long)&wwan_info,
+}, {
+       /* U-blox SARA-U2 */
+       USB_DEVICE_AND_INTERFACE_INFO(UBLOX_VENDOR_ID, 0x1104, USB_CLASS_COMM,
+                                     USB_CDC_SUBCLASS_ETHERNET,
+                                     USB_CDC_PROTO_NONE),
+       .driver_info = (unsigned long)&wwan_info,
+}, {
        USB_INTERFACE_INFO(USB_CLASS_COMM, USB_CDC_SUBCLASS_ETHERNET,
                        USB_CDC_PROTO_NONE),
        .driver_info = (unsigned long) &cdc_info,
index bb2aad0..5a14cc7 100644 (file)
@@ -2136,7 +2136,7 @@ static umode_t nvme_ns_attrs_are_visible(struct kobject *kobj,
        struct nvme_ns *ns = nvme_get_ns_from_dev(dev);
 
        if (a == &dev_attr_uuid.attr) {
-               if (uuid_is_null(&ns->uuid) ||
+               if (uuid_is_null(&ns->uuid) &&
                    !memchr_inv(ns->nguid, 0, sizeof(ns->nguid)))
                        return 0;
        }
index cb73bc8..3f5a04c 100644 (file)
@@ -94,7 +94,7 @@ struct nvme_dev {
        struct mutex shutdown_lock;
        bool subsystem;
        void __iomem *cmb;
-       dma_addr_t cmb_dma_addr;
+       pci_bus_addr_t cmb_bus_addr;
        u64 cmb_size;
        u32 cmbsz;
        u32 cmbloc;
@@ -1226,7 +1226,7 @@ static int nvme_alloc_sq_cmds(struct nvme_dev *dev, struct nvme_queue *nvmeq,
        if (qid && dev->cmb && use_cmb_sqes && NVME_CMB_SQS(dev->cmbsz)) {
                unsigned offset = (qid - 1) * roundup(SQ_SIZE(depth),
                                                      dev->ctrl.page_size);
-               nvmeq->sq_dma_addr = dev->cmb_dma_addr + offset;
+               nvmeq->sq_dma_addr = dev->cmb_bus_addr + offset;
                nvmeq->sq_cmds_io = dev->cmb + offset;
        } else {
                nvmeq->sq_cmds = dma_alloc_coherent(dev->dev, SQ_SIZE(depth),
@@ -1527,7 +1527,7 @@ static void __iomem *nvme_map_cmb(struct nvme_dev *dev)
        resource_size_t bar_size;
        struct pci_dev *pdev = to_pci_dev(dev->dev);
        void __iomem *cmb;
-       dma_addr_t dma_addr;
+       int bar;
 
        dev->cmbsz = readl(dev->bar + NVME_REG_CMBSZ);
        if (!(NVME_CMB_SZ(dev->cmbsz)))
@@ -1540,7 +1540,8 @@ static void __iomem *nvme_map_cmb(struct nvme_dev *dev)
        szu = (u64)1 << (12 + 4 * NVME_CMB_SZU(dev->cmbsz));
        size = szu * NVME_CMB_SZ(dev->cmbsz);
        offset = szu * NVME_CMB_OFST(dev->cmbloc);
-       bar_size = pci_resource_len(pdev, NVME_CMB_BIR(dev->cmbloc));
+       bar = NVME_CMB_BIR(dev->cmbloc);
+       bar_size = pci_resource_len(pdev, bar);
 
        if (offset > bar_size)
                return NULL;
@@ -1553,12 +1554,11 @@ static void __iomem *nvme_map_cmb(struct nvme_dev *dev)
        if (size > bar_size - offset)
                size = bar_size - offset;
 
-       dma_addr = pci_resource_start(pdev, NVME_CMB_BIR(dev->cmbloc)) + offset;
-       cmb = ioremap_wc(dma_addr, size);
+       cmb = ioremap_wc(pci_resource_start(pdev, bar) + offset, size);
        if (!cmb)
                return NULL;
 
-       dev->cmb_dma_addr = dma_addr;
+       dev->cmb_bus_addr = pci_bus_address(pdev, bar) + offset;
        dev->cmb_size = size;
        return cmb;
 }
index 785fb42..2799a6b 100644 (file)
@@ -3767,7 +3767,7 @@ static int ibmvscsis_write_pending(struct se_cmd *se_cmd)
         */
        if ((vscsi->flags & (CLIENT_FAILED | RESPONSE_Q_DOWN))) {
                pr_err("write_pending failed since: %d\n", vscsi->flags);
-               return 0;
+               return -EIO;
        }
 
        rc = srp_transfer_data(cmd, &vio_iu(iue)->srp.cmd, ibmvscsis_rdma,
index bd4605a..c62e8d1 100644 (file)
@@ -2851,9 +2851,6 @@ EXPORT_SYMBOL_GPL(iscsi_session_setup);
 /**
  * iscsi_session_teardown - destroy session, host, and cls_session
  * @cls_session: iscsi session
- *
- * The driver must have called iscsi_remove_session before
- * calling this.
  */
 void iscsi_session_teardown(struct iscsi_cls_session *cls_session)
 {
@@ -2863,6 +2860,8 @@ void iscsi_session_teardown(struct iscsi_cls_session *cls_session)
 
        iscsi_pool_free(&session->cmdpool);
 
+       iscsi_remove_session(cls_session);
+
        kfree(session->password);
        kfree(session->password_in);
        kfree(session->username);
@@ -2877,7 +2876,8 @@ void iscsi_session_teardown(struct iscsi_cls_session *cls_session)
        kfree(session->portal_type);
        kfree(session->discovery_parent_type);
 
-       iscsi_destroy_session(cls_session);
+       iscsi_free_session(cls_session);
+
        iscsi_host_dec_session_cnt(shost);
        module_put(owner);
 }
index e7818af..15590a0 100644 (file)
@@ -956,6 +956,9 @@ static int scsi_add_lun(struct scsi_device *sdev, unsigned char *inq_result,
        if (*bflags & BLIST_NO_DIF)
                sdev->no_dif = 1;
 
+       if (*bflags & BLIST_UNMAP_LIMIT_WS)
+               sdev->unmap_limit_for_ws = 1;
+
        sdev->eh_timeout = SCSI_DEFAULT_EH_TIMEOUT;
 
        if (*bflags & BLIST_TRY_VPD_PAGES)
index 0190aef..7404d26 100644 (file)
@@ -2211,22 +2211,6 @@ void iscsi_free_session(struct iscsi_cls_session *session)
 EXPORT_SYMBOL_GPL(iscsi_free_session);
 
 /**
- * iscsi_destroy_session - destroy iscsi session
- * @session: iscsi_session
- *
- * Can be called by a LLD or iscsi_transport. There must not be
- * any running connections.
- */
-int iscsi_destroy_session(struct iscsi_cls_session *session)
-{
-       iscsi_remove_session(session);
-       ISCSI_DBG_TRANS_SESSION(session, "Completing session destruction\n");
-       iscsi_free_session(session);
-       return 0;
-}
-EXPORT_SYMBOL_GPL(iscsi_destroy_session);
-
-/**
  * iscsi_create_conn - create iscsi class connection
  * @session: iscsi cls session
  * @dd_size: private driver data size
index fb9f8b5..d175c5c 100644 (file)
@@ -715,13 +715,21 @@ static void sd_config_discard(struct scsi_disk *sdkp, unsigned int mode)
                break;
 
        case SD_LBP_WS16:
-               max_blocks = min_not_zero(sdkp->max_ws_blocks,
-                                         (u32)SD_MAX_WS16_BLOCKS);
+               if (sdkp->device->unmap_limit_for_ws)
+                       max_blocks = sdkp->max_unmap_blocks;
+               else
+                       max_blocks = sdkp->max_ws_blocks;
+
+               max_blocks = min_not_zero(max_blocks, (u32)SD_MAX_WS16_BLOCKS);
                break;
 
        case SD_LBP_WS10:
-               max_blocks = min_not_zero(sdkp->max_ws_blocks,
-                                         (u32)SD_MAX_WS10_BLOCKS);
+               if (sdkp->device->unmap_limit_for_ws)
+                       max_blocks = sdkp->max_unmap_blocks;
+               else
+                       max_blocks = sdkp->max_ws_blocks;
+
+               max_blocks = min_not_zero(max_blocks, (u32)SD_MAX_WS10_BLOCKS);
                break;
 
        case SD_LBP_ZERO:
@@ -3099,8 +3107,6 @@ static int sd_revalidate_disk(struct gendisk *disk)
                sd_read_security(sdkp, buffer);
        }
 
-       sdkp->first_scan = 0;
-
        /*
         * We now have all cache related info, determine how we deal
         * with flush requests.
@@ -3115,7 +3121,7 @@ static int sd_revalidate_disk(struct gendisk *disk)
        q->limits.max_dev_sectors = logical_to_sectors(sdp, dev_max);
 
        /*
-        * Use the device's preferred I/O size for reads and writes
+        * Determine the device's preferred I/O size for reads and writes
         * unless the reported value is unreasonably small, large, or
         * garbage.
         */
@@ -3129,8 +3135,19 @@ static int sd_revalidate_disk(struct gendisk *disk)
                rw_max = min_not_zero(logical_to_sectors(sdp, dev_max),
                                      (sector_t)BLK_DEF_MAX_SECTORS);
 
-       /* Combine with controller limits */
-       q->limits.max_sectors = min(rw_max, queue_max_hw_sectors(q));
+       /* Do not exceed controller limit */
+       rw_max = min(rw_max, queue_max_hw_sectors(q));
+
+       /*
+        * Only update max_sectors if previously unset or if the current value
+        * exceeds the capabilities of the hardware.
+        */
+       if (sdkp->first_scan ||
+           q->limits.max_sectors > q->limits.max_dev_sectors ||
+           q->limits.max_sectors > q->limits.max_hw_sectors)
+               q->limits.max_sectors = rw_max;
+
+       sdkp->first_scan = 0;
 
        set_capacity(disk, logical_to_sectors(sdp, sdkp->capacity));
        sd_config_write_same(sdkp);
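
The sd_config_discard() change picks the unmap limit or the write-same limit and then clamps it with min_not_zero(), which treats zero as "no limit". A userspace equivalent of that clamping, with illustrative values, is:

#include <stdint.h>
#include <stdio.h>

/* Userspace stand-in for the kernel's min_not_zero(): zero means "no limit". */
static uint32_t min_not_zero_u32(uint32_t a, uint32_t b)
{
	if (a == 0)
		return b;
	if (b == 0)
		return a;
	return a < b ? a : b;
}

int main(void)
{
	uint32_t max_unmap_blocks = 0;		/* device reported no UNMAP limit */
	uint32_t max_ws_blocks = 0xfffff;	/* illustrative WRITE SAME limit */
	uint32_t hard_cap = 0x7fffff;		/* illustrative command ceiling */
	int unmap_limit_for_ws = 1;

	/* Prefer the unmap limit when asked to, else the write-same limit,
	 * then clamp against the hard cap, mirroring the hunk above.
	 */
	uint32_t max_blocks = unmap_limit_for_ws ? max_unmap_blocks
						 : max_ws_blocks;

	max_blocks = min_not_zero_u32(max_blocks, hard_cap);
	printf("max_blocks=%u\n", (unsigned int)max_blocks);
	return 0;
}
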
index 0e79eeb..419a7a9 100644 (file)
@@ -1144,5 +1144,5 @@ static void __exit nhi_unload(void)
        tb_domain_exit();
 }
 
-module_init(nhi_init);
+fs_initcall(nhi_init);
 module_exit(nhi_unload);
index f2d06f6..1380275 100644 (file)
@@ -1487,6 +1487,9 @@ int tb_register_property_dir(const char *key, struct tb_property_dir *dir)
 {
        int ret;
 
+       if (WARN_ON(!xdomain_property_dir))
+               return -EAGAIN;
+
        if (!key || strlen(key) > 8)
                return -EINVAL;
 
index 58585ec..68677d9 100644 (file)
@@ -436,8 +436,8 @@ static bool vhost_exceeds_maxpend(struct vhost_net *net)
        struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX];
        struct vhost_virtqueue *vq = &nvq->vq;
 
-       return (nvq->upend_idx + vq->num - VHOST_MAX_PEND) % UIO_MAXIOV
-               == nvq->done_idx;
+       return (nvq->upend_idx + UIO_MAXIOV - nvq->done_idx) % UIO_MAXIOV >
+              min_t(unsigned int, VHOST_MAX_PEND, vq->num >> 2);
 }
 
 /* Expects to be always run from workqueue - which acts as
@@ -480,11 +480,6 @@ static void handle_tx(struct vhost_net *net)
                if (zcopy)
                        vhost_zerocopy_signal_used(net, vq);
 
-               /* If more outstanding DMAs, queue the work.
-                * Handle upend_idx wrap around
-                */
-               if (unlikely(vhost_exceeds_maxpend(net)))
-                       break;
 
                head = vhost_net_tx_get_vq_desc(net, vq, vq->iov,
                                                ARRAY_SIZE(vq->iov),
@@ -519,8 +514,7 @@ static void handle_tx(struct vhost_net *net)
                len = msg_data_left(&msg);
 
                zcopy_used = zcopy && len >= VHOST_GOODCOPY_LEN
-                                  && (nvq->upend_idx + 1) % UIO_MAXIOV !=
-                                     nvq->done_idx
+                                  && !vhost_exceeds_maxpend(net)
                                   && vhost_net_tx_select_zcopy(net);
 
                /* use msg_control to pass vhost zerocopy ubuf info to skb */
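
The reworked vhost_exceeds_maxpend() counts outstanding zerocopy buffers in a circular index space and compares the count against a cap of min(VHOST_MAX_PEND, vq->num / 4). A self-contained version of the arithmetic, with invented sizes, is below.

#include <stdio.h>

#define RING_SIZE 64	/* stands in for UIO_MAXIOV */

/* Entries outstanding between done_idx (tail) and upend_idx (head) when
 * both indices wrap modulo RING_SIZE.
 */
static unsigned int outstanding(unsigned int upend_idx, unsigned int done_idx)
{
	return (upend_idx + RING_SIZE - done_idx) % RING_SIZE;
}

int main(void)
{
	unsigned int vq_num = 32, max_pend = 8;
	/* Cap is the smaller of a fixed limit and a quarter of the vq size. */
	unsigned int cap = max_pend < (vq_num >> 2) ? max_pend : (vq_num >> 2);
	unsigned int pend = outstanding(5, 60);	/* head wrapped past the tail */

	printf("pend=%u cap=%u exceeds=%d\n", pend, cap, pend > cap);
	return 0;
}
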
index 899ddae..8fc6903 100644 (file)
@@ -722,7 +722,7 @@ struct btrfs_delayed_root;
  * Indicate that a whole-filesystem exclusive operation is running
  * (device replace, resize, device add/delete, balance)
  */
-#define BTRFS_FS_EXCL_OP                       14
+#define BTRFS_FS_EXCL_OP                       16
 
 struct btrfs_fs_info {
        u8 fsid[BTRFS_FSID_SIZE];
index 12ab19a..970190c 100644 (file)
@@ -2801,7 +2801,7 @@ static int submit_extent_page(unsigned int opf, struct extent_io_tree *tree,
                }
        }
 
-       bio = btrfs_bio_alloc(bdev, sector << 9);
+       bio = btrfs_bio_alloc(bdev, (u64)sector << 9);
        bio_add_page(bio, page, page_size, offset);
        bio->bi_end_io = end_io_func;
        bio->bi_private = tree;
index 84edfc6..f23c820 100644 (file)
@@ -734,12 +734,13 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
                        inode = req->r_inode;
                        ihold(inode);
                } else {
-                       /* req->r_dentry is non-null for LSSNAP request.
-                        * fall-thru */
-                       WARN_ON_ONCE(!req->r_dentry);
+                       /* req->r_dentry is non-null for LSSNAP request */
+                       rcu_read_lock();
+                       inode = get_nonsnap_parent(req->r_dentry);
+                       rcu_read_unlock();
+                       dout("__choose_mds using snapdir's parent %p\n", inode);
                }
-       }
-       if (!inode && req->r_dentry) {
+       } else if (req->r_dentry) {
                /* ignore race with rename; old or new d_parent is okay */
                struct dentry *parent;
                struct inode *dir;
index 1ffc8b4..7fc0b85 100644 (file)
@@ -374,12 +374,10 @@ static int build_snap_context(struct ceph_snap_realm *realm,
             realm->ino, realm, snapc, snapc->seq,
             (unsigned int) snapc->num_snaps);
 
-       if (realm->cached_context) {
-               ceph_put_snap_context(realm->cached_context);
-               /* queue realm for cap_snap creation */
-               list_add_tail(&realm->dirty_item, dirty_realms);
-       }
+       ceph_put_snap_context(realm->cached_context);
        realm->cached_context = snapc;
+       /* queue realm for cap_snap creation */
+       list_add_tail(&realm->dirty_item, dirty_realms);
        return 0;
 
 fail:
index 54059b1..3b601f1 100644 (file)
@@ -468,7 +468,9 @@ static inline int may_write_real(struct file *file)
 
        /* File refers to upper, writable layer? */
        upperdentry = d_real(dentry, NULL, 0, D_REAL_UPPER);
-       if (upperdentry && file_inode(file) == d_inode(upperdentry))
+       if (upperdentry &&
+           (file_inode(file) == d_inode(upperdentry) ||
+            file_inode(file) == d_inode(dentry)))
                return 0;
 
        /* Lower layer: can't write to real file, sorry... */
index efebe6c..22880ef 100644 (file)
@@ -218,7 +218,6 @@ static void nfs_cb_idr_remove_locked(struct nfs_client *clp)
 static void pnfs_init_server(struct nfs_server *server)
 {
        rpc_init_wait_queue(&server->roc_rpcwaitq, "pNFS ROC");
-       rpc_init_wait_queue(&server->uoc_rpcwaitq, "NFS UOC");
 }
 
 #else
@@ -888,6 +887,7 @@ struct nfs_server *nfs_alloc_server(void)
        ida_init(&server->openowner_id);
        ida_init(&server->lockowner_id);
        pnfs_init_server(server);
+       rpc_init_wait_queue(&server->uoc_rpcwaitq, "NFS UOC");
 
        return server;
 }
index 44c638b..508126e 100644 (file)
@@ -745,7 +745,8 @@ filelayout_free_lseg(struct pnfs_layout_segment *lseg)
        struct nfs4_filelayout_segment *fl = FILELAYOUT_LSEG(lseg);
 
        dprintk("--> %s\n", __func__);
-       nfs4_fl_put_deviceid(fl->dsaddr);
+       if (fl->dsaddr != NULL)
+               nfs4_fl_put_deviceid(fl->dsaddr);
        /* This assumes a single RW lseg */
        if (lseg->pls_range.iomode == IOMODE_RW) {
                struct nfs4_filelayout *flo;
index dd5d27d..30426c1 100644 (file)
@@ -274,7 +274,7 @@ static struct key *nfs_idmap_request_key(const char *name, size_t namelen,
        ssize_t ret;
 
        ret = nfs_idmap_get_desc(name, namelen, type, strlen(type), &desc);
-       if (ret <= 0)
+       if (ret < 0)
                return ERR_PTR(ret);
 
        rkey = request_key(&key_type_id_resolver, desc, "");
index 6c61e2b..f90090e 100644 (file)
@@ -8399,8 +8399,7 @@ nfs4_layoutget_handle_exception(struct rpc_task *task,
                lo = NFS_I(inode)->layout;
                /* If the open stateid was bad, then recover it. */
                if (!lo || test_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags) ||
-                   nfs4_stateid_match_other(&lgp->args.stateid,
-                                       &lgp->args.ctx->state->stateid)) {
+                   !nfs4_stateid_match_other(&lgp->args.stateid, &lo->plh_stateid)) {
                        spin_unlock(&inode->i_lock);
                        exception->state = lgp->args.ctx->state;
                        exception->stateid = &lgp->args.stateid;
index 37c8af0..14ed979 100644 (file)
@@ -1842,8 +1842,8 @@ static void encode_create_session(struct xdr_stream *xdr,
         * Assumes OPEN is the biggest non-idempotent compound.
         * 2 is the verifier.
         */
-       max_resp_sz_cached = (NFS4_dec_open_sz + RPC_REPHDRSIZE +
-                             RPC_MAX_AUTH_SIZE + 2) * XDR_UNIT;
+       max_resp_sz_cached = (NFS4_dec_open_sz + RPC_REPHDRSIZE + 2)
+                               * XDR_UNIT + RPC_MAX_AUTH_SIZE;
 
        encode_op_hdr(xdr, OP_CREATE_SESSION, decode_create_session_maxsz, hdr);
        p = reserve_space(xdr, 16 + 2*28 + 20 + clnt->cl_nodelen + 12);
index aad97b3..c441f93 100644 (file)
@@ -561,10 +561,8 @@ static int ovl_do_copy_up(struct ovl_copy_up_ctx *c)
                c->tmpfile = true;
                err = ovl_copy_up_locked(c);
        } else {
-               err = -EIO;
-               if (lock_rename(c->workdir, c->destdir) != NULL) {
-                       pr_err("overlayfs: failed to lock workdir+upperdir\n");
-               } else {
+               err = ovl_lock_rename_workdir(c->workdir, c->destdir);
+               if (!err) {
                        err = ovl_copy_up_locked(c);
                        unlock_rename(c->workdir, c->destdir);
                }
index 3309b19..cc961a3 100644 (file)
@@ -216,26 +216,6 @@ out_unlock:
        return err;
 }
 
-static int ovl_lock_rename_workdir(struct dentry *workdir,
-                                  struct dentry *upperdir)
-{
-       /* Workdir should not be the same as upperdir */
-       if (workdir == upperdir)
-               goto err;
-
-       /* Workdir should not be subdir of upperdir and vice versa */
-       if (lock_rename(workdir, upperdir) != NULL)
-               goto err_unlock;
-
-       return 0;
-
-err_unlock:
-       unlock_rename(workdir, upperdir);
-err:
-       pr_err("overlayfs: failed to lock workdir+upperdir\n");
-       return -EIO;
-}
-
 static struct dentry *ovl_clear_empty(struct dentry *dentry,
                                      struct list_head *list)
 {
index c3addd1..654bea1 100644 (file)
@@ -506,6 +506,7 @@ static struct dentry *ovl_lookup_index(struct dentry *dentry,
 
        index = lookup_one_len_unlocked(name.name, ofs->indexdir, name.len);
        if (IS_ERR(index)) {
+               err = PTR_ERR(index);
                pr_warn_ratelimited("overlayfs: failed inode index lookup (ino=%lu, key=%*s, err=%i);\n"
                                    "overlayfs: mount with '-o index=off' to disable inodes index.\n",
                                    d_inode(origin)->i_ino, name.len, name.name,
index d4e8c1a..c706a6f 100644 (file)
@@ -235,6 +235,7 @@ bool ovl_inuse_trylock(struct dentry *dentry);
 void ovl_inuse_unlock(struct dentry *dentry);
 int ovl_nlink_start(struct dentry *dentry, bool *locked);
 void ovl_nlink_end(struct dentry *dentry, bool locked);
+int ovl_lock_rename_workdir(struct dentry *workdir, struct dentry *upperdir);
 
 static inline bool ovl_is_impuredir(struct dentry *dentry)
 {
index 878a750..25d9b5a 100644 (file)
@@ -37,6 +37,9 @@ struct ovl_fs {
        bool noxattr;
        /* sb common to all layers */
        struct super_block *same_sb;
+       /* Did we take the inuse lock? */
+       bool upperdir_locked;
+       bool workdir_locked;
 };
 
 /* private information held for every overlayfs dentry */
index 62e9b22..0f85ee9 100644 (file)
@@ -988,6 +988,7 @@ int ovl_indexdir_cleanup(struct dentry *dentry, struct vfsmount *mnt,
                         struct path *lowerstack, unsigned int numlower)
 {
        int err;
+       struct dentry *index = NULL;
        struct inode *dir = dentry->d_inode;
        struct path path = { .mnt = mnt, .dentry = dentry };
        LIST_HEAD(list);
@@ -1007,8 +1008,6 @@ int ovl_indexdir_cleanup(struct dentry *dentry, struct vfsmount *mnt,
 
        inode_lock_nested(dir, I_MUTEX_PARENT);
        list_for_each_entry(p, &list, l_node) {
-               struct dentry *index;
-
                if (p->name[0] == '.') {
                        if (p->len == 1)
                                continue;
@@ -1018,6 +1017,7 @@ int ovl_indexdir_cleanup(struct dentry *dentry, struct vfsmount *mnt,
                index = lookup_one_len(p->name, dentry, p->len);
                if (IS_ERR(index)) {
                        err = PTR_ERR(index);
+                       index = NULL;
                        break;
                }
                err = ovl_verify_index(index, lowerstack, numlower);
@@ -1029,7 +1029,9 @@ int ovl_indexdir_cleanup(struct dentry *dentry, struct vfsmount *mnt,
                                break;
                }
                dput(index);
+               index = NULL;
        }
+       dput(index);
        inode_unlock(dir);
 out:
        ovl_cache_free(&list);
index fd5ea4f..092d150 100644 (file)
@@ -211,9 +211,10 @@ static void ovl_put_super(struct super_block *sb)
 
        dput(ufs->indexdir);
        dput(ufs->workdir);
-       ovl_inuse_unlock(ufs->workbasedir);
+       if (ufs->workdir_locked)
+               ovl_inuse_unlock(ufs->workbasedir);
        dput(ufs->workbasedir);
-       if (ufs->upper_mnt)
+       if (ufs->upper_mnt && ufs->upperdir_locked)
                ovl_inuse_unlock(ufs->upper_mnt->mnt_root);
        mntput(ufs->upper_mnt);
        for (i = 0; i < ufs->numlower; i++)
@@ -881,9 +882,13 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent)
                        goto out_put_upperpath;
 
                err = -EBUSY;
-               if (!ovl_inuse_trylock(upperpath.dentry)) {
-                       pr_err("overlayfs: upperdir is in-use by another mount\n");
+               if (ovl_inuse_trylock(upperpath.dentry)) {
+                       ufs->upperdir_locked = true;
+               } else if (ufs->config.index) {
+                       pr_err("overlayfs: upperdir is in-use by another mount, mount with '-o index=off' to override exclusive upperdir protection.\n");
                        goto out_put_upperpath;
+               } else {
+                       pr_warn("overlayfs: upperdir is in-use by another mount, accessing files from both mounts will result in undefined behavior.\n");
                }
 
                err = ovl_mount_dir(ufs->config.workdir, &workpath);
@@ -901,9 +906,13 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent)
                }
 
                err = -EBUSY;
-               if (!ovl_inuse_trylock(workpath.dentry)) {
-                       pr_err("overlayfs: workdir is in-use by another mount\n");
+               if (ovl_inuse_trylock(workpath.dentry)) {
+                       ufs->workdir_locked = true;
+               } else if (ufs->config.index) {
+                       pr_err("overlayfs: workdir is in-use by another mount, mount with '-o index=off' to override exclusive workdir protection.\n");
                        goto out_put_workpath;
+               } else {
+                       pr_warn("overlayfs: workdir is in-use by another mount, accessing files from both mounts will result in undefined behavior.\n");
                }
 
                ufs->workbasedir = workpath.dentry;
@@ -1156,11 +1165,13 @@ out_put_lowerpath:
 out_free_lowertmp:
        kfree(lowertmp);
 out_unlock_workdentry:
-       ovl_inuse_unlock(workpath.dentry);
+       if (ufs->workdir_locked)
+               ovl_inuse_unlock(workpath.dentry);
 out_put_workpath:
        path_put(&workpath);
 out_unlock_upperdentry:
-       ovl_inuse_unlock(upperpath.dentry);
+       if (ufs->upperdir_locked)
+               ovl_inuse_unlock(upperpath.dentry);
 out_put_upperpath:
        path_put(&upperpath);
 out_free_config:
index 1177945..b9b239f 100644 (file)
@@ -430,7 +430,7 @@ void ovl_inuse_unlock(struct dentry *dentry)
        }
 }
 
-/* Called must hold OVL_I(inode)->oi_lock */
+/* Caller must hold OVL_I(inode)->lock */
 static void ovl_cleanup_index(struct dentry *dentry)
 {
        struct inode *dir = ovl_indexdir(dentry->d_sb)->d_inode;
@@ -469,6 +469,9 @@ static void ovl_cleanup_index(struct dentry *dentry)
        err = PTR_ERR(index);
        if (!IS_ERR(index))
                err = ovl_cleanup(dir, index);
+       else
+               index = NULL;
+
        inode_unlock(dir);
        if (err)
                goto fail;
@@ -557,3 +560,22 @@ void ovl_nlink_end(struct dentry *dentry, bool locked)
                mutex_unlock(&OVL_I(d_inode(dentry))->lock);
        }
 }
+
+int ovl_lock_rename_workdir(struct dentry *workdir, struct dentry *upperdir)
+{
+       /* Workdir should not be the same as upperdir */
+       if (workdir == upperdir)
+               goto err;
+
+       /* Workdir should not be subdir of upperdir and vice versa */
+       if (lock_rename(workdir, upperdir) != NULL)
+               goto err_unlock;
+
+       return 0;
+
+err_unlock:
+       unlock_rename(workdir, upperdir);
+err:
+       pr_err("overlayfs: failed to lock workdir+upperdir\n");
+       return -EIO;
+}
index bc6c6e1..e9db7fc 100644 (file)
@@ -2122,11 +2122,31 @@ xfs_swap_extents(
                ip->i_d.di_flags2 |= tip->i_d.di_flags2 & XFS_DIFLAG2_REFLINK;
                tip->i_d.di_flags2 &= ~XFS_DIFLAG2_REFLINK;
                tip->i_d.di_flags2 |= f & XFS_DIFLAG2_REFLINK;
+       }
+
+       /* Swap the cow forks. */
+       if (xfs_sb_version_hasreflink(&mp->m_sb)) {
+               xfs_extnum_t    extnum;
+
+               ASSERT(ip->i_cformat == XFS_DINODE_FMT_EXTENTS);
+               ASSERT(tip->i_cformat == XFS_DINODE_FMT_EXTENTS);
+
+               extnum = ip->i_cnextents;
+               ip->i_cnextents = tip->i_cnextents;
+               tip->i_cnextents = extnum;
+
                cowfp = ip->i_cowfp;
                ip->i_cowfp = tip->i_cowfp;
                tip->i_cowfp = cowfp;
-               xfs_inode_set_cowblocks_tag(ip);
-               xfs_inode_set_cowblocks_tag(tip);
+
+               if (ip->i_cowfp && ip->i_cnextents)
+                       xfs_inode_set_cowblocks_tag(ip);
+               else
+                       xfs_inode_clear_cowblocks_tag(ip);
+               if (tip->i_cowfp && tip->i_cnextents)
+                       xfs_inode_set_cowblocks_tag(tip);
+               else
+                       xfs_inode_clear_cowblocks_tag(tip);
        }
 
        xfs_trans_log_inode(tp, ip,  src_log_flags);
index 3246815..37e603b 100644 (file)
@@ -736,7 +736,13 @@ xfs_reflink_end_cow(
        /* If there is a hole at end_fsb - 1 go to the previous extent */
        if (!xfs_iext_lookup_extent(ip, ifp, end_fsb - 1, &idx, &got) ||
            got.br_startoff > end_fsb) {
-               ASSERT(idx > 0);
+               /*
+                * In case of racing, overlapping AIO writes, no COW extents
+                * might be left by the time I/O completes for the loser of
+                * the race.  In that case we are done.
+                */
+               if (idx <= 0)
+                       goto out_cancel;
                xfs_iext_get_extent(ifp, --idx, &got);
        }
 
@@ -809,6 +815,7 @@ next_extent:
 
 out_defer:
        xfs_defer_cancel(&dfops);
+out_cancel:
        xfs_trans_cancel(tp);
        xfs_iunlock(ip, XFS_ILOCK_EXCL);
 out:
index a67daea..4373125 100644 (file)
@@ -56,7 +56,7 @@ struct bpf_map {
        struct work_struct work;
        atomic_t usercnt;
        struct bpf_map *inner_map_meta;
-       u8 name[BPF_OBJ_NAME_LEN];
+       char name[BPF_OBJ_NAME_LEN];
 };
 
 /* function argument constraints */
@@ -189,7 +189,7 @@ struct bpf_prog_aux {
        struct bpf_prog *prog;
        struct user_struct *user;
        u64 load_time; /* ns since boottime */
-       u8 name[BPF_OBJ_NAME_LEN];
+       char name[BPF_OBJ_NAME_LEN];
        union {
                struct work_struct work;
                struct rcu_head rcu;
@@ -407,6 +407,11 @@ static inline void __bpf_prog_uncharge(struct user_struct *user, u32 pages)
 {
 }
 
+static inline int bpf_obj_get_user(const char __user *pathname)
+{
+       return -EOPNOTSUPP;
+}
+
 static inline struct net_device  *__dev_map_lookup_elem(struct bpf_map *map,
                                                       u32 key)
 {
index b8d200f..f00ef75 100644 (file)
@@ -115,6 +115,21 @@ struct bpf_insn_aux_data {
 
 #define MAX_USED_MAPS 64 /* max number of maps accessed by one eBPF program */
 
+#define BPF_VERIFIER_TMP_LOG_SIZE      1024
+
+struct bpf_verifer_log {
+       u32 level;
+       char kbuf[BPF_VERIFIER_TMP_LOG_SIZE];
+       char __user *ubuf;
+       u32 len_used;
+       u32 len_total;
+};
+
+static inline bool bpf_verifier_log_full(const struct bpf_verifer_log *log)
+{
+       return log->len_used >= log->len_total - 1;
+}
+
 struct bpf_verifier_env;
 struct bpf_ext_analyzer_ops {
        int (*insn_hook)(struct bpf_verifier_env *env,
@@ -139,6 +154,8 @@ struct bpf_verifier_env {
        bool allow_ptr_leaks;
        bool seen_direct_write;
        struct bpf_insn_aux_data *insn_aux_data; /* array of per-insn state */
+
+       struct bpf_verifer_log log;
 };
 
 int bpf_analyzer(struct bpf_prog *prog, const struct bpf_ext_analyzer_ops *ops,
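bpf_verifier_log_full() deliberately leaves one byte of headroom so the user-visible log always stays NUL-terminated. A hedged sketch of how an append routine might use the structure above; the helper name and error handling are invented, and the real formatting code lives in kernel/bpf/verifier.c rather than this hunk:

/* Hypothetical append helper: format into the fixed-size kernel buffer,
 * then copy the result out to the user buffer tracked by len_used/len_total.
 */
static void verifier_vlog(struct bpf_verifer_log *log, const char *fmt,
                          va_list args)
{
        unsigned int n;

        if (!log->level || bpf_verifier_log_full(log))
                return;

        n = vscnprintf(log->kbuf, BPF_VERIFIER_TMP_LOG_SIZE, fmt, args);
        if (n > log->len_total - log->len_used - 1) {
                n = log->len_total - log->len_used - 1;
                log->kbuf[n] = '\0';            /* keep the copy terminated */
        }

        if (!copy_to_user(log->ubuf + log->len_used, log->kbuf, n + 1))
                log->len_used += n;
}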
index 3cd18ac..02639eb 100644 (file)
@@ -49,6 +49,7 @@ struct br_ip_list {
 #define BR_MULTICAST_TO_UNICAST        BIT(12)
 #define BR_VLAN_TUNNEL         BIT(13)
 #define BR_BCAST_FLOOD         BIT(14)
+#define BR_NEIGH_SUPPRESS      BIT(15)
 
 #define BR_DEFAULT_AGEING_TIME (300 * HZ)
 
@@ -63,6 +64,7 @@ int br_multicast_list_adjacent(struct net_device *dev,
 bool br_multicast_has_querier_anywhere(struct net_device *dev, int proto);
 bool br_multicast_has_querier_adjacent(struct net_device *dev, int proto);
 bool br_multicast_enabled(const struct net_device *dev);
+bool br_multicast_router(const struct net_device *dev);
 #else
 static inline int br_multicast_list_adjacent(struct net_device *dev,
                                             struct list_head *br_ip_list)
@@ -83,6 +85,10 @@ static inline bool br_multicast_enabled(const struct net_device *dev)
 {
        return false;
 }
+static inline bool br_multicast_router(const struct net_device *dev)
+{
+       return false;
+}
 #endif
 
 #if IS_ENABLED(CONFIG_BRIDGE) && IS_ENABLED(CONFIG_BRIDGE_VLAN_FILTERING)
index bbcdb0a..a118ee4 100644 (file)
@@ -10,5 +10,5 @@
 
 #include <uapi/linux/if_phonet.h>
 
-extern struct header_ops phonet_header_ops;
+extern const struct header_ops phonet_header_ops;
 #endif
index f3f2d07..9a43763 100644 (file)
@@ -316,7 +316,7 @@ struct mmc_host {
 #define MMC_CAP_UHS_SDR50      (1 << 18)       /* Host supports UHS SDR50 mode */
 #define MMC_CAP_UHS_SDR104     (1 << 19)       /* Host supports UHS SDR104 mode */
 #define MMC_CAP_UHS_DDR50      (1 << 20)       /* Host supports UHS DDR50 mode */
-#define MMC_CAP_NO_BOUNCE_BUFF (1 << 21)       /* Disable bounce buffers on host */
+/* (1 << 21) is free for reuse */
 #define MMC_CAP_DRIVER_TYPE_A  (1 << 23)       /* Host supports Driver Type A */
 #define MMC_CAP_DRIVER_TYPE_C  (1 << 24)       /* Host supports Driver Type C */
 #define MMC_CAP_DRIVER_TYPE_D  (1 << 25)       /* Host supports Driver Type D */
index 2c2a551..528b24c 100644 (file)
@@ -108,9 +108,10 @@ struct ebt_table {
 
 #define EBT_ALIGN(s) (((s) + (__alignof__(struct _xt_align)-1)) & \
                     ~(__alignof__(struct _xt_align)-1))
-extern struct ebt_table *ebt_register_table(struct net *net,
-                                           const struct ebt_table *table,
-                                           const struct nf_hook_ops *);
+extern int ebt_register_table(struct net *net,
+                             const struct ebt_table *table,
+                             const struct nf_hook_ops *ops,
+                             struct ebt_table **res);
 extern void ebt_unregister_table(struct net *net, struct ebt_table *table,
                                 const struct nf_hook_ops *);
 extern unsigned int ebt_do_table(struct sk_buff *skb,
index a36abe2..27e249e 100644 (file)
 
 #ifdef CONFIG_LOCKUP_DETECTOR
 void lockup_detector_init(void);
+void lockup_detector_soft_poweroff(void);
+void lockup_detector_cleanup(void);
+bool is_hardlockup(void);
+
+extern int watchdog_user_enabled;
+extern int nmi_watchdog_user_enabled;
+extern int soft_watchdog_user_enabled;
+extern int watchdog_thresh;
+extern unsigned long watchdog_enabled;
+
+extern struct cpumask watchdog_cpumask;
+extern unsigned long *watchdog_cpumask_bits;
+#ifdef CONFIG_SMP
+extern int sysctl_softlockup_all_cpu_backtrace;
+extern int sysctl_hardlockup_all_cpu_backtrace;
 #else
-static inline void lockup_detector_init(void)
-{
-}
-#endif
+#define sysctl_softlockup_all_cpu_backtrace 0
+#define sysctl_hardlockup_all_cpu_backtrace 0
+#endif /* !CONFIG_SMP */
+
+#else /* CONFIG_LOCKUP_DETECTOR */
+static inline void lockup_detector_init(void) { }
+static inline void lockup_detector_soft_poweroff(void) { }
+static inline void lockup_detector_cleanup(void) { }
+#endif /* !CONFIG_LOCKUP_DETECTOR */
 
 #ifdef CONFIG_SOFTLOCKUP_DETECTOR
 extern void touch_softlockup_watchdog_sched(void);
@@ -24,29 +44,17 @@ extern void touch_softlockup_watchdog(void);
 extern void touch_softlockup_watchdog_sync(void);
 extern void touch_all_softlockup_watchdogs(void);
 extern unsigned int  softlockup_panic;
-extern int soft_watchdog_enabled;
-extern atomic_t watchdog_park_in_progress;
 #else
-static inline void touch_softlockup_watchdog_sched(void)
-{
-}
-static inline void touch_softlockup_watchdog(void)
-{
-}
-static inline void touch_softlockup_watchdog_sync(void)
-{
-}
-static inline void touch_all_softlockup_watchdogs(void)
-{
-}
+static inline void touch_softlockup_watchdog_sched(void) { }
+static inline void touch_softlockup_watchdog(void) { }
+static inline void touch_softlockup_watchdog_sync(void) { }
+static inline void touch_all_softlockup_watchdogs(void) { }
 #endif
 
 #ifdef CONFIG_DETECT_HUNG_TASK
 void reset_hung_task_detector(void);
 #else
-static inline void reset_hung_task_detector(void)
-{
-}
+static inline void reset_hung_task_detector(void) { }
 #endif
 
 /*
@@ -54,12 +62,12 @@ static inline void reset_hung_task_detector(void)
  * 'watchdog_enabled' variable. Each lockup detector has its dedicated bit -
  * bit 0 for the hard lockup detector and bit 1 for the soft lockup detector.
  *
- * 'watchdog_user_enabled', 'nmi_watchdog_enabled' and 'soft_watchdog_enabled'
- * are variables that are only used as an 'interface' between the parameters
- * in /proc/sys/kernel and the internal state bits in 'watchdog_enabled'. The
- * 'watchdog_thresh' variable is handled differently because its value is not
- * boolean, and the lockup detectors are 'suspended' while 'watchdog_thresh'
- * is equal zero.
+ * 'watchdog_user_enabled', 'nmi_watchdog_user_enabled' and
+ * 'soft_watchdog_user_enabled' are variables that are only used as an
+ * 'interface' between the parameters in /proc/sys/kernel and the internal
+ * state bits in 'watchdog_enabled'. The 'watchdog_thresh' variable is
+ * handled differently because its value is not boolean, and the lockup
+ * detectors are 'suspended' while 'watchdog_thresh' is equal to zero.
  */
 #define NMI_WATCHDOG_ENABLED_BIT   0
 #define SOFT_WATCHDOG_ENABLED_BIT  1
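As the comment above says, both detectors share the single 'watchdog_enabled' word, one bit each, while the *_user_enabled variables only mirror the sysctl knobs. A tiny illustrative sketch of testing those bits (the helper names are invented and do not exist in the kernel):

/* Illustrative only. */
static inline bool nmi_watchdog_bit_set(unsigned long enabled)
{
        return enabled & (1UL << NMI_WATCHDOG_ENABLED_BIT);
}

static inline bool soft_watchdog_bit_set(unsigned long enabled)
{
        return enabled & (1UL << SOFT_WATCHDOG_ENABLED_BIT);
}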
@@ -73,17 +81,41 @@ extern unsigned int hardlockup_panic;
 static inline void hardlockup_detector_disable(void) {}
 #endif
 
+#if defined(CONFIG_HAVE_NMI_WATCHDOG) || defined(CONFIG_HARDLOCKUP_DETECTOR)
+# define NMI_WATCHDOG_SYSCTL_PERM      0644
+#else
+# define NMI_WATCHDOG_SYSCTL_PERM      0444
+#endif
+
 #if defined(CONFIG_HARDLOCKUP_DETECTOR_PERF)
 extern void arch_touch_nmi_watchdog(void);
+extern void hardlockup_detector_perf_stop(void);
+extern void hardlockup_detector_perf_restart(void);
+extern void hardlockup_detector_perf_disable(void);
+extern void hardlockup_detector_perf_enable(void);
+extern void hardlockup_detector_perf_cleanup(void);
+extern int hardlockup_detector_perf_init(void);
 #else
-#if !defined(CONFIG_HAVE_NMI_WATCHDOG)
+static inline void hardlockup_detector_perf_stop(void) { }
+static inline void hardlockup_detector_perf_restart(void) { }
+static inline void hardlockup_detector_perf_disable(void) { }
+static inline void hardlockup_detector_perf_enable(void) { }
+static inline void hardlockup_detector_perf_cleanup(void) { }
+# if !defined(CONFIG_HAVE_NMI_WATCHDOG)
+static inline int hardlockup_detector_perf_init(void) { return -ENODEV; }
 static inline void arch_touch_nmi_watchdog(void) {}
+# else
+static inline int hardlockup_detector_perf_init(void) { return 0; }
+# endif
 #endif
-#endif
+
+void watchdog_nmi_stop(void);
+void watchdog_nmi_start(void);
+int watchdog_nmi_probe(void);
 
 /**
  * touch_nmi_watchdog - restart NMI watchdog timeout.
- * 
+ *
  * If the architecture supports the NMI watchdog, touch_nmi_watchdog()
  * may be used to reset the timeout - for code which intentionally
  * disables interrupts for a long time. This call is stateless.
@@ -153,22 +185,6 @@ static inline bool trigger_single_cpu_backtrace(int cpu)
 u64 hw_nmi_get_sample_period(int watchdog_thresh);
 #endif
 
-#ifdef CONFIG_LOCKUP_DETECTOR
-extern int nmi_watchdog_enabled;
-extern int watchdog_user_enabled;
-extern int watchdog_thresh;
-extern unsigned long watchdog_enabled;
-extern struct cpumask watchdog_cpumask;
-extern unsigned long *watchdog_cpumask_bits;
-extern int __read_mostly watchdog_suspended;
-#ifdef CONFIG_SMP
-extern int sysctl_softlockup_all_cpu_backtrace;
-extern int sysctl_hardlockup_all_cpu_backtrace;
-#else
-#define sysctl_softlockup_all_cpu_backtrace 0
-#define sysctl_hardlockup_all_cpu_backtrace 0
-#endif
-
 #if defined(CONFIG_HARDLOCKUP_CHECK_TIMESTAMP) && \
     defined(CONFIG_HARDLOCKUP_DETECTOR)
 void watchdog_update_hrtimer_threshold(u64 period);
@@ -176,7 +192,6 @@ void watchdog_update_hrtimer_threshold(u64 period);
 static inline void watchdog_update_hrtimer_threshold(u64 period) { }
 #endif
 
-extern bool is_hardlockup(void);
 struct ctl_table;
 extern int proc_watchdog(struct ctl_table *, int ,
                         void __user *, size_t *, loff_t *);
@@ -188,18 +203,6 @@ extern int proc_watchdog_thresh(struct ctl_table *, int ,
                                void __user *, size_t *, loff_t *);
 extern int proc_watchdog_cpumask(struct ctl_table *, int,
                                 void __user *, size_t *, loff_t *);
-extern int lockup_detector_suspend(void);
-extern void lockup_detector_resume(void);
-#else
-static inline int lockup_detector_suspend(void)
-{
-       return 0;
-}
-
-static inline void lockup_detector_resume(void)
-{
-}
-#endif
 
 #ifdef CONFIG_HAVE_ACPI_APEI_NMI
 #include <asm/nmi.h>
index 9c98aaa..7247249 100644 (file)
@@ -5,7 +5,7 @@
 #include <linux/jump_label.h>
 
 bool __do_once_start(bool *done, unsigned long *flags);
-void __do_once_done(bool *done, struct static_key *once_key,
+void __do_once_done(bool *done, struct static_key_true *once_key,
                    unsigned long *flags);
 
 /* Call a function exactly once. The idea of DO_ONCE() is to perform
@@ -38,8 +38,8 @@ void __do_once_done(bool *done, struct static_key *once_key,
        ({                                                                   \
                bool ___ret = false;                                         \
                static bool ___done = false;                                 \
-               static struct static_key ___once_key = STATIC_KEY_INIT_TRUE; \
-               if (static_key_true(&___once_key)) {                         \
+               static DEFINE_STATIC_KEY_TRUE(___once_key);                  \
+               if (static_branch_unlikely(&___once_key)) {                  \
                        unsigned long ___flags;                              \
                        ___ret = __do_once_start(&___done, &___flags);       \
                        if (unlikely(___ret)) {                              \
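With the key now declared via DEFINE_STATIC_KEY_TRUE(), the post-initialization fast path of DO_ONCE() is a patched-out branch rather than a load and test. A hedged usage sketch; the seeding callback and state are invented, and get_random_once() is the canonical in-tree user:

#include <linux/once.h>
#include <linux/random.h>

static u32 example_seed;                        /* invented state */

static void example_seed_init(u32 *seed)
{
        get_random_bytes(seed, sizeof(*seed));
}

static u32 example_hash(u32 key)
{
        /* The callback runs exactly once; later calls skip the dead branch. */
        DO_ONCE(example_seed_init, &example_seed);
        return key ^ example_seed;
}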
index 8e22f24..79b18a2 100644 (file)
@@ -806,6 +806,7 @@ struct perf_output_handle {
 struct bpf_perf_event_data_kern {
        struct pt_regs *regs;
        struct perf_sample_data *data;
+       struct perf_event *event;
 };
 
 #ifdef CONFIG_CGROUP_PERF
@@ -884,7 +885,8 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr,
                                void *context);
 extern void perf_pmu_migrate_context(struct pmu *pmu,
                                int src_cpu, int dst_cpu);
-int perf_event_read_local(struct perf_event *event, u64 *value);
+int perf_event_read_local(struct perf_event *event, u64 *value,
+                         u64 *enabled, u64 *running);
 extern u64 perf_event_read_value(struct perf_event *event,
                                 u64 *enabled, u64 *running);
 
@@ -1286,7 +1288,8 @@ static inline const struct perf_event_attr *perf_event_attrs(struct perf_event *
 {
        return ERR_PTR(-EINVAL);
 }
-static inline int perf_event_read_local(struct perf_event *event, u64 *value)
+static inline int perf_event_read_local(struct perf_event *event, u64 *value,
+                                       u64 *enabled, u64 *running)
 {
        return -EINVAL;
 }
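perf_event_read_local() now hands back the enabled and running times in the same call; callers that only want the raw counter can pass NULL for the extra pointers, as the bpf arraymap change elsewhere in this merge does. A hedged call-site sketch (the surrounding function and the scaling step are invented):

/* Invented caller; only the perf_event_read_local() signature is from the hunk. */
static int read_scaled_count(struct perf_event *event, u64 *scaled)
{
        u64 value, enabled, running;
        int err;

        err = perf_event_read_local(event, &value, &enabled, &running);
        if (err)
                return err;

        /* estimate the full-speed count when the event was multiplexed */
        *scaled = running ? div64_u64(value * enabled, running) : value;
        return 0;
}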
index 89fa0bb..e755954 100644 (file)
@@ -64,6 +64,7 @@ enum qed_ll2_roce_flavor_type {
 enum qed_ll2_tx_dest {
        QED_LL2_TX_DEST_NW, /* Light L2 TX Destination to the Network */
        QED_LL2_TX_DEST_LB, /* Light L2 TX Destination to the Loopback */
+       QED_LL2_TX_DEST_DROP, /* Light L2 Drop the TX packet */
        QED_LL2_TX_DEST_MAX
 };
 
@@ -150,11 +151,16 @@ void (*qed_ll2_release_tx_packet_cb)(void *cxt,
                                     dma_addr_t first_frag_addr,
                                     bool b_last_fragment, bool b_last_packet);
 
+typedef
+void (*qed_ll2_slowpath_cb)(void *cxt, u8 connection_handle,
+                           u32 opaque_data_0, u32 opaque_data_1);
+
 struct qed_ll2_cbs {
        qed_ll2_complete_rx_packet_cb rx_comp_cb;
        qed_ll2_release_rx_packet_cb rx_release_cb;
        qed_ll2_complete_tx_packet_cb tx_comp_cb;
        qed_ll2_release_tx_packet_cb tx_release_cb;
+       qed_ll2_slowpath_cb slowpath_cb;
        void *cookie;
 };
 
@@ -171,6 +177,7 @@ struct qed_ll2_acquire_data_inputs {
        enum qed_ll2_tx_dest tx_dest;
        enum qed_ll2_error_handle ai_err_packet_too_big;
        enum qed_ll2_error_handle ai_err_no_buf;
+       bool secondary_queue;
        u8 gsi_enable;
 };
 
index 01a9859..03634ec 100644 (file)
@@ -3158,6 +3158,12 @@ static inline int __skb_grow_rcsum(struct sk_buff *skb, unsigned int len)
        return __skb_grow(skb, len);
 }
 
+#define rb_to_skb(rb) rb_entry_safe(rb, struct sk_buff, rbnode)
+#define skb_rb_first(root) rb_to_skb(rb_first(root))
+#define skb_rb_last(root)  rb_to_skb(rb_last(root))
+#define skb_rb_next(skb)   rb_to_skb(rb_next(&(skb)->rbnode))
+#define skb_rb_prev(skb)   rb_to_skb(rb_prev(&(skb)->rbnode))
+
 #define skb_queue_walk(queue, skb) \
                for (skb = (queue)->next;                                       \
                     skb != (struct sk_buff *)(queue);                          \
@@ -3172,6 +3178,18 @@ static inline int __skb_grow_rcsum(struct sk_buff *skb, unsigned int len)
                for (; skb != (struct sk_buff *)(queue);                        \
                     skb = skb->next)
 
+#define skb_rbtree_walk(skb, root)                                             \
+               for (skb = skb_rb_first(root); skb != NULL;                     \
+                    skb = skb_rb_next(skb))
+
+#define skb_rbtree_walk_from(skb)                                              \
+               for (; skb != NULL;                                             \
+                    skb = skb_rb_next(skb))
+
+#define skb_rbtree_walk_from_safe(skb, tmp)                                    \
+               for (; tmp = skb ? skb_rb_next(skb) : NULL, (skb != NULL);      \
+                    skb = tmp)
+
 #define skb_queue_walk_from_safe(queue, skb, tmp)                              \
                for (tmp = skb->next;                                           \
                     skb != (struct sk_buff *)(queue);                          \
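The rb-tree variants mirror the existing skb_queue_walk() helpers for skbs kept in an rb_root instead of a linked list. A hedged sketch of a read-only walker (the function itself is invented):

/* Invented helper: total payload bytes queued in an skb rb-tree. */
static unsigned int skb_rbtree_total_len(const struct rb_root *root)
{
        struct sk_buff *skb;
        unsigned int len = 0;

        skb_rbtree_walk(skb, root)
                len += skb->len;

        return len;
}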
index 12910cf..c149aa7 100644 (file)
@@ -55,7 +55,7 @@ smpboot_register_percpu_thread(struct smp_hotplug_thread *plug_thread)
 }
 
 void smpboot_unregister_percpu_thread(struct smp_hotplug_thread *plug_thread);
-int smpboot_update_cpumask_percpu_thread(struct smp_hotplug_thread *plug_thread,
-                                        const struct cpumask *);
+void smpboot_update_cpumask_percpu_thread(struct smp_hotplug_thread *plug_thread,
+                                         const struct cpumask *);
 
 #endif
index 06a6765..204c19e 100644 (file)
@@ -101,7 +101,7 @@ struct dst_entry {
        union {
                struct dst_entry        *next;
                struct rtable __rcu     *rt_next;
-               struct rt6_info         *rt6_next;
+               struct rt6_info __rcu   *rt6_next;
                struct dn_route __rcu   *dn_next;
        };
 };
index 9fba2eb..87a0bb8 100644 (file)
@@ -87,6 +87,7 @@ static inline int skb_metadata_dst_cmp(const struct sk_buff *skb_a,
 void metadata_dst_free(struct metadata_dst *);
 struct metadata_dst *metadata_dst_alloc(u8 optslen, enum metadata_type type,
                                        gfp_t flags);
+void metadata_dst_free_percpu(struct metadata_dst __percpu *md_dst);
 struct metadata_dst __percpu *
 metadata_dst_alloc_percpu(u8 optslen, enum metadata_type type, gfp_t flags);
 
index d060d71..10c9138 100644 (file)
 #define FIB6_TABLE_HASHSZ 1
 #endif
 
+#define RT6_DEBUG 2
+
+#if RT6_DEBUG >= 3
+#define RT6_TRACE(x...) pr_debug(x)
+#else
+#define RT6_TRACE(x...) do { ; } while (0)
+#endif
+
 struct rt6_info;
 
 struct fib6_config {
@@ -60,25 +68,30 @@ struct fib6_config {
 };
 
 struct fib6_node {
-       struct fib6_node        *parent;
-       struct fib6_node        *left;
-       struct fib6_node        *right;
+       struct fib6_node __rcu  *parent;
+       struct fib6_node __rcu  *left;
+       struct fib6_node __rcu  *right;
 #ifdef CONFIG_IPV6_SUBTREES
-       struct fib6_node        *subtree;
+       struct fib6_node __rcu  *subtree;
 #endif
-       struct rt6_info         *leaf;
+       struct rt6_info __rcu   *leaf;
 
        __u16                   fn_bit;         /* bit key */
        __u16                   fn_flags;
        int                     fn_sernum;
-       struct rt6_info         *rr_ptr;
+       struct rt6_info __rcu   *rr_ptr;
        struct rcu_head         rcu;
 };
 
+struct fib6_gc_args {
+       int                     timeout;
+       int                     more;
+};
+
 #ifndef CONFIG_IPV6_SUBTREES
 #define FIB6_SUBTREE(fn)       NULL
 #else
-#define FIB6_SUBTREE(fn)       ((fn)->subtree)
+#define FIB6_SUBTREE(fn)       (rcu_dereference_protected((fn)->subtree, 1))
 #endif
 
 struct mx6_config {
@@ -98,6 +111,22 @@ struct rt6key {
 
 struct fib6_table;
 
+struct rt6_exception_bucket {
+       struct hlist_head       chain;
+       int                     depth;
+};
+
+struct rt6_exception {
+       struct hlist_node       hlist;
+       struct rt6_info         *rt6i;
+       unsigned long           stamp;
+       struct rcu_head         rcu;
+};
+
+#define FIB6_EXCEPTION_BUCKET_SIZE_SHIFT 10
+#define FIB6_EXCEPTION_BUCKET_SIZE (1 << FIB6_EXCEPTION_BUCKET_SIZE_SHIFT)
+#define FIB6_MAX_DEPTH 5
+
 struct rt6_info {
        struct dst_entry                dst;
 
@@ -134,14 +163,25 @@ struct rt6_info {
 
        struct inet6_dev                *rt6i_idev;
        struct rt6_info * __percpu      *rt6i_pcpu;
+       struct rt6_exception_bucket __rcu *rt6i_exception_bucket;
 
        u32                             rt6i_metric;
        u32                             rt6i_pmtu;
        /* more non-fragment space at head required */
        unsigned short                  rt6i_nfheader_len;
        u8                              rt6i_protocol;
+       u8                              exception_bucket_flushed:1,
+                                       unused:7;
 };
 
+#define for_each_fib6_node_rt_rcu(fn)                                  \
+       for (rt = rcu_dereference((fn)->leaf); rt;                      \
+            rt = rcu_dereference(rt->dst.rt6_next))
+
+#define for_each_fib6_walker_rt(w)                                     \
+       for (rt = (w)->leaf; rt;                                        \
+            rt = rcu_dereference_protected(rt->dst.rt6_next, 1))
+
 static inline struct inet6_dev *ip6_dst_idev(struct dst_entry *dst)
 {
        return ((struct rt6_info *)dst)->rt6i_idev;
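for_each_fib6_node_rt_rcu() expects a local 'struct rt6_info *rt' and, because leaf and rt6_next are now __rcu pointers, must run inside an RCU read-side critical section; the walker variant is for callers that already hold the table lock. A hedged sketch (the counting function is invented):

/* Invented example: count the routes hanging off a single fib6 node. */
static unsigned int fib6_node_route_count(struct fib6_node *fn)
{
        struct rt6_info *rt;
        unsigned int n = 0;

        rcu_read_lock();
        for_each_fib6_node_rt_rcu(fn)
                n++;
        rcu_read_unlock();

        return n;
}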
@@ -188,6 +228,8 @@ static inline bool rt6_get_cookie_safe(const struct rt6_info *rt,
 
        if (fn) {
                *cookie = fn->fn_sernum;
+               /* pairs with smp_wmb() in fib6_update_sernum_upto_root() */
+               smp_rmb();
                status = true;
        }
 
@@ -248,7 +290,6 @@ struct fib6_walker {
        struct fib6_node *root, *node;
        struct rt6_info *leaf;
        enum fib6_walk_state state;
-       bool prune;
        unsigned int skip;
        unsigned int count;
        int (*func)(struct fib6_walker *);
@@ -256,12 +297,15 @@ struct fib6_walker {
 };
 
 struct rt6_statistics {
-       __u32           fib_nodes;
-       __u32           fib_route_nodes;
-       __u32           fib_rt_alloc;           /* permanent routes     */
-       __u32           fib_rt_entries;         /* rt entries in table  */
-       __u32           fib_rt_cache;           /* cache routes         */
-       __u32           fib_discarded_routes;
+       __u32           fib_nodes;              /* all fib6 nodes */
+       __u32           fib_route_nodes;        /* intermediate nodes */
+       __u32           fib_rt_entries;         /* rt entries in fib table */
+       __u32           fib_rt_cache;           /* cached rt entries in exception table */
+       __u32           fib_discarded_routes;   /* total number of routes deleted */
+
+       /* The following stats are not protected by any lock */
+       atomic_t        fib_rt_alloc;           /* total number of routes alloced */
+       atomic_t        fib_rt_uncache;         /* rt entries in uncached list */
 };
 
 #define RTN_TL_ROOT    0x0001
@@ -277,7 +321,7 @@ struct rt6_statistics {
 struct fib6_table {
        struct hlist_node       tb6_hlist;
        u32                     tb6_id;
-       rwlock_t                tb6_lock;
+       spinlock_t              tb6_lock;
        struct fib6_node        tb6_root;
        struct inet_peer_base   tb6_peers;
        unsigned int            flags;
@@ -325,7 +369,8 @@ struct fib6_node *fib6_lookup(struct fib6_node *root,
 
 struct fib6_node *fib6_locate(struct fib6_node *root,
                              const struct in6_addr *daddr, int dst_len,
-                             const struct in6_addr *saddr, int src_len);
+                             const struct in6_addr *saddr, int src_len,
+                             bool exact_match);
 
 void fib6_clean_all(struct net *net, int (*func)(struct rt6_info *, void *arg),
                    void *arg);
@@ -358,6 +403,8 @@ void __net_exit fib6_notifier_exit(struct net *net);
 unsigned int fib6_tables_seq_read(struct net *net);
 int fib6_tables_dump(struct net *net, struct notifier_block *nb);
 
+void fib6_update_sernum(struct rt6_info *rt);
+
 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
 int fib6_rules_init(void);
 void fib6_rules_cleanup(void);
index ee96f40..a0087fb 100644 (file)
@@ -95,6 +95,11 @@ int ip6_route_add(struct fib6_config *cfg, struct netlink_ext_ack *extack);
 int ip6_ins_rt(struct rt6_info *);
 int ip6_del_rt(struct rt6_info *);
 
+void rt6_flush_exceptions(struct rt6_info *rt);
+int rt6_remove_exception_rt(struct rt6_info *rt);
+void rt6_age_exceptions(struct rt6_info *rt, struct fib6_gc_args *gc_args,
+                       unsigned long now);
+
 static inline int ip6_route_get_saddr(struct net *net, struct rt6_info *rt,
                                      const struct in6_addr *daddr,
                                      unsigned int prefs,
index 6eac5cf..3cda3b5 100644 (file)
@@ -300,8 +300,8 @@ static inline void fl6_sock_release(struct ip6_flowlabel *fl)
 
 void icmpv6_notify(struct sk_buff *skb, u8 type, u8 code, __be32 info);
 
-int icmpv6_push_pending_frames(struct sock *sk, struct flowi6 *fl6,
-                              struct icmp6hdr *thdr, int len);
+void icmpv6_push_pending_frames(struct sock *sk, struct flowi6 *fl6,
+                               struct icmp6hdr *thdr, int len);
 
 int ip6_ra_control(struct sock *sk, int sel);
 
index 039cc29..51e1a2a 100644 (file)
@@ -108,8 +108,10 @@ struct phonet_protocol {
        int                     sock_type;
 };
 
-int phonet_proto_register(unsigned int protocol, struct phonet_protocol *pp);
-void phonet_proto_unregister(unsigned int protocol, struct phonet_protocol *pp);
+int phonet_proto_register(unsigned int protocol,
+               const struct phonet_protocol *pp);
+void phonet_proto_unregister(unsigned int protocol,
+               const struct phonet_protocol *pp);
 
 int phonet_sysctl_init(void);
 void phonet_sysctl_exit(void);
index a6b9a8d..4827094 100644 (file)
@@ -60,7 +60,7 @@
 #include <linux/sched.h>
 #include <linux/wait.h>
 #include <linux/cgroup-defs.h>
-
+#include <linux/rbtree.h>
 #include <linux/filter.h>
 #include <linux/rculist_nulls.h>
 #include <linux/poll.h>
@@ -397,7 +397,10 @@ struct sock {
        int                     sk_wmem_queued;
        refcount_t              sk_wmem_alloc;
        unsigned long           sk_tsq_flags;
-       struct sk_buff          *sk_send_head;
+       union {
+               struct sk_buff  *sk_send_head;
+               struct rb_root  tcp_rtx_queue;
+       };
        struct sk_buff_head     sk_write_queue;
        __s32                   sk_peek_off;
        int                     sk_write_pending;
index d767b79..d756fbe 100644 (file)
@@ -51,6 +51,7 @@ enum switchdev_attr_id {
        SWITCHDEV_ATTR_ID_BRIDGE_AGEING_TIME,
        SWITCHDEV_ATTR_ID_BRIDGE_VLAN_FILTERING,
        SWITCHDEV_ATTR_ID_BRIDGE_MC_DISABLED,
+       SWITCHDEV_ATTR_ID_BRIDGE_MROUTER,
 };
 
 struct switchdev_attr {
index 3b16f35..5a95e58 100644 (file)
@@ -551,7 +551,13 @@ void tcp_xmit_retransmit_queue(struct sock *);
 void tcp_simple_retransmit(struct sock *);
 void tcp_enter_recovery(struct sock *sk, bool ece_ack);
 int tcp_trim_head(struct sock *, struct sk_buff *, u32);
-int tcp_fragment(struct sock *, struct sk_buff *, u32, unsigned int, gfp_t);
+enum tcp_queue {
+       TCP_FRAG_IN_WRITE_QUEUE,
+       TCP_FRAG_IN_RTX_QUEUE,
+};
+int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue,
+                struct sk_buff *skb, u32 len,
+                unsigned int mss_now, gfp_t gfp);
 
 void tcp_send_probe0(struct sock *);
 void tcp_send_partial(struct sock *);
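tcp_fragment() callers now have to say which queue the skb sits on, so the helper can fix up the right structure after the split. A hedged call-site sketch; the wrapper is invented, and only the signature and enum come from the hunk above:

/* Invented wrapper around the new signature. */
static int split_for_retransmit(struct sock *sk, struct sk_buff *skb,
                                unsigned int mss_now)
{
        if (skb->len <= mss_now)
                return 0;

        /* this skb lives on the retransmit rb-tree, not the write queue */
        return tcp_fragment(sk, TCP_FRAG_IN_RTX_QUEUE, skb, mss_now,
                            mss_now, GFP_ATOMIC);
}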
@@ -1606,19 +1612,11 @@ static inline void tcp_skb_tsorted_anchor_cleanup(struct sk_buff *skb)
        skb->_skb_refdst = _save;               \
 }
 
-/* write queue abstraction */
-static inline void tcp_write_queue_purge(struct sock *sk)
-{
-       struct sk_buff *skb;
+void tcp_write_queue_purge(struct sock *sk);
 
-       tcp_chrono_stop(sk, TCP_CHRONO_BUSY);
-       while ((skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
-               tcp_skb_tsorted_anchor_cleanup(skb);
-               sk_wmem_free_skb(sk, skb);
-       }
-       INIT_LIST_HEAD(&tcp_sk(sk)->tsorted_sent_queue);
-       sk_mem_reclaim(sk);
-       tcp_clear_all_retrans_hints(tcp_sk(sk));
+static inline struct sk_buff *tcp_rtx_queue_head(const struct sock *sk)
+{
+       return skb_rb_first(&sk->tcp_rtx_queue);
 }
 
 static inline struct sk_buff *tcp_write_queue_head(const struct sock *sk)
@@ -1643,18 +1641,12 @@ static inline struct sk_buff *tcp_write_queue_prev(const struct sock *sk,
        return skb_queue_prev(&sk->sk_write_queue, skb);
 }
 
-#define tcp_for_write_queue(skb, sk)                                   \
-       skb_queue_walk(&(sk)->sk_write_queue, skb)
-
-#define tcp_for_write_queue_from(skb, sk)                              \
-       skb_queue_walk_from(&(sk)->sk_write_queue, skb)
-
 #define tcp_for_write_queue_from_safe(skb, tmp, sk)                    \
        skb_queue_walk_from_safe(&(sk)->sk_write_queue, skb, tmp)
 
 static inline struct sk_buff *tcp_send_head(const struct sock *sk)
 {
-       return sk->sk_send_head;
+       return skb_peek(&sk->sk_write_queue);
 }
 
 static inline bool tcp_skb_is_last(const struct sock *sk,
@@ -1663,29 +1655,30 @@ static inline bool tcp_skb_is_last(const struct sock *sk,
        return skb_queue_is_last(&sk->sk_write_queue, skb);
 }
 
-static inline void tcp_advance_send_head(struct sock *sk, const struct sk_buff *skb)
+static inline bool tcp_write_queue_empty(const struct sock *sk)
 {
-       if (tcp_skb_is_last(sk, skb))
-               sk->sk_send_head = NULL;
-       else
-               sk->sk_send_head = tcp_write_queue_next(sk, skb);
+       return skb_queue_empty(&sk->sk_write_queue);
+}
+
+static inline bool tcp_rtx_queue_empty(const struct sock *sk)
+{
+       return RB_EMPTY_ROOT(&sk->tcp_rtx_queue);
+}
+
+static inline bool tcp_rtx_and_write_queues_empty(const struct sock *sk)
+{
+       return tcp_rtx_queue_empty(sk) && tcp_write_queue_empty(sk);
 }
 
 static inline void tcp_check_send_head(struct sock *sk, struct sk_buff *skb_unlinked)
 {
-       if (sk->sk_send_head == skb_unlinked) {
-               sk->sk_send_head = NULL;
+       if (tcp_write_queue_empty(sk))
                tcp_chrono_stop(sk, TCP_CHRONO_BUSY);
-       }
+
        if (tcp_sk(sk)->highest_sack == skb_unlinked)
                tcp_sk(sk)->highest_sack = NULL;
 }
 
-static inline void tcp_init_send_head(struct sock *sk)
-{
-       sk->sk_send_head = NULL;
-}
-
 static inline void __tcp_add_write_queue_tail(struct sock *sk, struct sk_buff *skb)
 {
        __skb_queue_tail(&sk->sk_write_queue, skb);
@@ -1696,8 +1689,7 @@ static inline void tcp_add_write_queue_tail(struct sock *sk, struct sk_buff *skb
        __tcp_add_write_queue_tail(sk, skb);
 
        /* Queue it, remembering where we must start sending. */
-       if (sk->sk_send_head == NULL) {
-               sk->sk_send_head = skb;
+       if (sk->sk_write_queue.next == skb) {
                tcp_chrono_start(sk, TCP_CHRONO_BUSY);
 
                if (tcp_sk(sk)->highest_sack == NULL)
@@ -1710,35 +1702,32 @@ static inline void __tcp_add_write_queue_head(struct sock *sk, struct sk_buff *s
        __skb_queue_head(&sk->sk_write_queue, skb);
 }
 
-/* Insert buff after skb on the write queue of sk.  */
-static inline void tcp_insert_write_queue_after(struct sk_buff *skb,
-                                               struct sk_buff *buff,
-                                               struct sock *sk)
-{
-       __skb_queue_after(&sk->sk_write_queue, skb, buff);
-}
-
 /* Insert new before skb on the write queue of sk.  */
 static inline void tcp_insert_write_queue_before(struct sk_buff *new,
                                                  struct sk_buff *skb,
                                                  struct sock *sk)
 {
        __skb_queue_before(&sk->sk_write_queue, skb, new);
-
-       if (sk->sk_send_head == skb)
-               sk->sk_send_head = new;
 }
 
 static inline void tcp_unlink_write_queue(struct sk_buff *skb, struct sock *sk)
 {
-       list_del(&skb->tcp_tsorted_anchor);
-       tcp_skb_tsorted_anchor_cleanup(skb);
        __skb_unlink(skb, &sk->sk_write_queue);
 }
 
-static inline bool tcp_write_queue_empty(struct sock *sk)
+void tcp_rbtree_insert(struct rb_root *root, struct sk_buff *skb);
+
+static inline void tcp_rtx_queue_unlink(struct sk_buff *skb, struct sock *sk)
 {
-       return skb_queue_empty(&sk->sk_write_queue);
+       tcp_skb_tsorted_anchor_cleanup(skb);
+       rb_erase(&skb->rbnode, &sk->tcp_rtx_queue);
+}
+
+static inline void tcp_rtx_queue_unlink_and_free(struct sk_buff *skb, struct sock *sk)
+{
+       list_del(&skb->tcp_tsorted_anchor);
+       tcp_rtx_queue_unlink(skb, sk);
+       sk_wmem_free_skb(sk, skb);
 }
 
 static inline void tcp_push_pending_frames(struct sock *sk)
@@ -1767,8 +1756,9 @@ static inline u32 tcp_highest_sack_seq(struct tcp_sock *tp)
 
 static inline void tcp_advance_highest_sack(struct sock *sk, struct sk_buff *skb)
 {
-       tcp_sk(sk)->highest_sack = tcp_skb_is_last(sk, skb) ? NULL :
-                                               tcp_write_queue_next(sk, skb);
+       struct sk_buff *next = skb_rb_next(skb);
+
+       tcp_sk(sk)->highest_sack = next ?: tcp_send_head(sk);
 }
 
 static inline struct sk_buff *tcp_highest_sack(struct sock *sk)
@@ -1778,7 +1768,9 @@ static inline struct sk_buff *tcp_highest_sack(struct sock *sk)
 
 static inline void tcp_highest_sack_reset(struct sock *sk)
 {
-       tcp_sk(sk)->highest_sack = tcp_write_queue_head(sk);
+       struct sk_buff *skb = tcp_rtx_queue_head(sk);
+
+       tcp_sk(sk)->highest_sack = skb ?: tcp_send_head(sk);
 }
 
 /* Called when old skb is about to be deleted (to be combined with new skb) */
@@ -1948,7 +1940,7 @@ extern void tcp_rack_reo_timeout(struct sock *sk);
 /* At how many usecs into the future should the RTO fire? */
 static inline s64 tcp_rto_delta_us(const struct sock *sk)
 {
-       const struct sk_buff *skb = tcp_write_queue_head(sk);
+       const struct sk_buff *skb = tcp_rtx_queue_head(sk);
        u32 rto = inet_csk(sk)->icsk_rto;
        u64 rto_time_stamp_us = skb->skb_mstamp + jiffies_to_usecs(rto);
 
index 82e93ee..67c5a9f 100644 (file)
@@ -192,6 +192,7 @@ struct scsi_device {
        unsigned no_dif:1;      /* T10 PI (DIF) should be disabled */
        unsigned broken_fua:1;          /* Don't set FUA bit */
        unsigned lun_in_cdb:1;          /* Store LUN bits in CDB[1] */
+       unsigned unmap_limit_for_ws:1;  /* Use the UNMAP limit for WRITE SAME */
 
        atomic_t disk_events_disable_depth; /* disable depth for disk events */
 
index 9592570..36b0301 100644 (file)
@@ -29,5 +29,6 @@
 #define BLIST_TRY_VPD_PAGES    0x10000000 /* Attempt to read VPD pages */
 #define BLIST_NO_RSOC          0x20000000 /* don't try to issue RSOC */
 #define BLIST_MAX_1024         0x40000000 /* maximum 1024 sector cdb length */
+#define BLIST_UNMAP_LIMIT_WS   0x80000000 /* Use UNMAP limit for WRITE SAME */
 
 #endif
index 6183d20..b266d2a 100644 (file)
@@ -434,7 +434,6 @@ extern struct iscsi_cls_session *iscsi_create_session(struct Scsi_Host *shost,
                                                unsigned int target_id);
 extern void iscsi_remove_session(struct iscsi_cls_session *session);
 extern void iscsi_free_session(struct iscsi_cls_session *session);
-extern int iscsi_destroy_session(struct iscsi_cls_session *session);
 extern struct iscsi_cls_conn *iscsi_create_conn(struct iscsi_cls_session *sess,
                                                int dd_size, uint32_t cid);
 extern int iscsi_destroy_conn(struct iscsi_cls_conn *conn);
index 6082faf..6db9e1d 100644 (file)
@@ -230,7 +230,7 @@ union bpf_attr {
                __u32   numa_node;      /* numa node (effective only if
                                         * BPF_F_NUMA_NODE is set).
                                         */
-               __u8    map_name[BPF_OBJ_NAME_LEN];
+               char    map_name[BPF_OBJ_NAME_LEN];
        };
 
        struct { /* anonymous struct used by BPF_MAP_*_ELEM commands */
@@ -253,7 +253,7 @@ union bpf_attr {
                __aligned_u64   log_buf;        /* user supplied buffer */
                __u32           kern_version;   /* checked when prog_type=kprobe */
                __u32           prog_flags;
-               __u8            prog_name[BPF_OBJ_NAME_LEN];
+               char            prog_name[BPF_OBJ_NAME_LEN];
        };
 
        struct { /* anonymous struct used by BPF_OBJ_* commands */
@@ -641,6 +641,21 @@ union bpf_attr {
  *     @xdp_md: pointer to xdp_md
  *     @delta: A positive/negative integer to be added to xdp_md.data_meta
  *     Return: 0 on success or negative on error
+ *
+ * int bpf_perf_event_read_value(map, flags, buf, buf_size)
+ *     read perf event counter value and perf event enabled/running time
+ *     @map: pointer to perf_event_array map
+ *     @flags: index of event in the map or bitmask flags
+ *     @buf: buf to fill
+ *     @buf_size: size of the buf
+ *     Return: 0 on success or negative error code
+ *
+ * int bpf_perf_prog_read_value(ctx, buf, buf_size)
+ *     read perf prog attached perf event counter and enabled/running time
+ *     @ctx: pointer to ctx
+ *     @buf: buf to fill
+ *     @buf_size: size of the buf
+ *     Return: 0 on success or negative error code
  */
 #define __BPF_FUNC_MAPPER(FN)          \
        FN(unspec),                     \
@@ -697,7 +712,9 @@ union bpf_attr {
        FN(redirect_map),               \
        FN(sk_redirect_map),            \
        FN(sock_map_update),            \
-       FN(xdp_adjust_meta),
+       FN(xdp_adjust_meta),            \
+       FN(perf_event_read_value),      \
+       FN(perf_prog_read_value),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
@@ -741,7 +758,9 @@ enum bpf_func_id {
 #define BPF_F_ZERO_CSUM_TX             (1ULL << 1)
 #define BPF_F_DONT_FRAGMENT            (1ULL << 2)
 
-/* BPF_FUNC_perf_event_output and BPF_FUNC_perf_event_read flags. */
+/* BPF_FUNC_perf_event_output, BPF_FUNC_perf_event_read and
+ * BPF_FUNC_perf_event_read_value flags.
+ */
 #define BPF_F_INDEX_MASK               0xffffffffULL
 #define BPF_F_CURRENT_CPU              BPF_F_INDEX_MASK
 /* BPF_FUNC_perf_event_output for sk_buff input context. */
@@ -869,7 +888,7 @@ struct bpf_prog_info {
        __u32 created_by_uid;
        __u32 nr_map_ids;
        __aligned_u64 map_ids;
-       __u8  name[BPF_OBJ_NAME_LEN];
+       char name[BPF_OBJ_NAME_LEN];
 } __attribute__((aligned(8)));
 
 struct bpf_map_info {
@@ -879,7 +898,7 @@ struct bpf_map_info {
        __u32 value_size;
        __u32 max_entries;
        __u32 map_flags;
-       __u8  name[BPF_OBJ_NAME_LEN];
+       char  name[BPF_OBJ_NAME_LEN];
 } __attribute__((aligned(8)));
 
 /* User bpf_sock_ops struct to access socket values and specify request ops
@@ -934,4 +953,10 @@ enum {
 #define TCP_BPF_IW             1001    /* Set TCP initial congestion window */
 #define TCP_BPF_SNDCWND_CLAMP  1002    /* Set sndcwnd_clamp */
 
+struct bpf_perf_event_value {
+       __u64 counter;
+       __u64 enabled;
+       __u64 running;
+};
+
 #endif /* _UAPI__LINUX_BPF_H__ */
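A hedged sketch of a tracing program using the new bpf_perf_event_read_value() helper. The map-definition macros, section names, bpf_helpers.h wrapper and probe target follow the sample conventions of the era and are assumptions; only the helper, BPF_F_CURRENT_CPU and struct bpf_perf_event_value come from this header:

#include <uapi/linux/bpf.h>
#include <uapi/linux/ptrace.h>
#include "bpf_helpers.h"        /* helper wrappers from samples/bpf (assumed) */

struct bpf_map_def SEC("maps") counters = {
        .type        = BPF_MAP_TYPE_PERF_EVENT_ARRAY,
        .key_size    = sizeof(int),
        .value_size  = sizeof(__u32),
        .max_entries = 64,              /* one slot per possible CPU (assumed) */
};

SEC("kprobe/some_traced_function")      /* hypothetical probe target */
int read_counter(struct pt_regs *ctx)
{
        struct bpf_perf_event_value v = {};

        /* counter for the current CPU plus its enabled/running times */
        if (bpf_perf_event_read_value(&counters, BPF_F_CURRENT_CPU,
                                      &v, sizeof(v)))
                return 0;

        /* v.counter, v.enabled and v.running now hold one snapshot */
        return 0;
}

char _license[] SEC("license") = "GPL";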
index cd580fc..b037e0a 100644 (file)
@@ -327,6 +327,7 @@ enum {
        IFLA_BRPORT_VLAN_TUNNEL,
        IFLA_BRPORT_BCAST_FLOOD,
        IFLA_BRPORT_GROUP_FWD_MASK,
+       IFLA_BRPORT_NEIGH_SUPPRESS,
        __IFLA_BRPORT_MAX
 };
 #define IFLA_BRPORT_MAX (__IFLA_BRPORT_MAX - 1)
index 2e52088..a2f48c0 100644 (file)
@@ -84,6 +84,7 @@ enum tunnel_encap_types {
        TUNNEL_ENCAP_NONE,
        TUNNEL_ENCAP_FOU,
        TUNNEL_ENCAP_GUE,
+       TUNNEL_ENCAP_MPLS,
 };
 
 #define TUNNEL_ENCAP_FLAG_CSUM         (1<<0)
index b97725a..da161b5 100644 (file)
@@ -23,6 +23,7 @@ enum xt_bpf_modes {
        XT_BPF_MODE_FD_PINNED,
        XT_BPF_MODE_FD_ELF,
 };
+#define XT_BPF_MODE_PATH_PINNED XT_BPF_MODE_FD_PINNED
 
 struct xt_bpf_info_v1 {
        __u16 mode;
index 156ee4c..0cd6f88 100644 (file)
@@ -359,6 +359,7 @@ enum ovs_tunnel_key_attr {
        OVS_TUNNEL_KEY_ATTR_IPV6_SRC,           /* struct in6_addr src IPv6 address. */
        OVS_TUNNEL_KEY_ATTR_IPV6_DST,           /* struct in6_addr dst IPv6 address. */
        OVS_TUNNEL_KEY_ATTR_PAD,
+       OVS_TUNNEL_KEY_ATTR_ERSPAN_OPTS,        /* be32 ERSPAN index. */
        __OVS_TUNNEL_KEY_ATTR_MAX
 };
 
@@ -806,6 +807,7 @@ struct ovs_action_push_eth {
  * packet.
  * @OVS_ACTION_ATTR_POP_ETH: Pop the outermost Ethernet header off the
  * packet.
+ * @OVS_ACTION_ATTR_CT_CLEAR: Clear conntrack state from the packet.
  *
  * Only a single header can be set with a single %OVS_ACTION_ATTR_SET.  Not all
  * fields within a header are modifiable, e.g. the IPv4 protocol and fragment
@@ -835,6 +837,7 @@ enum ovs_action_attr {
        OVS_ACTION_ATTR_TRUNC,        /* u32 struct ovs_action_trunc. */
        OVS_ACTION_ATTR_PUSH_ETH,     /* struct ovs_action_push_eth. */
        OVS_ACTION_ATTR_POP_ETH,      /* No argument. */
+       OVS_ACTION_ATTR_CT_CLEAR,     /* No argument. */
 
        __OVS_ACTION_ATTR_MAX,        /* Nothing past this will be accepted
                                       * from userspace. */
index 897daa0..53fb09f 100644 (file)
@@ -2,6 +2,7 @@ obj-y := core.o
 
 obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o
 obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o
+obj-$(CONFIG_BPF_SYSCALL) += disasm.o
 ifeq ($(CONFIG_NET),y)
 obj-$(CONFIG_BPF_SYSCALL) += devmap.o
 ifeq ($(CONFIG_STREAM_PARSER),y)
index 98c0f00..68d8666 100644 (file)
@@ -492,7 +492,7 @@ static void *perf_event_fd_array_get_ptr(struct bpf_map *map,
 
        ee = ERR_PTR(-EOPNOTSUPP);
        event = perf_file->private_data;
-       if (perf_event_read_local(event, &value) == -EOPNOTSUPP)
+       if (perf_event_read_local(event, &value, NULL, NULL) == -EOPNOTSUPP)
                goto err_out;
 
        ee = bpf_event_entry_gen(perf_file, map_file);
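
For reference, a sketch of the extended perf_event_read_local() interface this call site now targets; the prototype below comes from the perf side of this series and is reproduced here as an assumption. Callers that only need the counter, like the one above, pass NULL for the timing out-parameters:

	/* assumed prototype after the accompanying perf change */
	int perf_event_read_local(struct perf_event *event, u64 *value,
				  u64 *enabled, u64 *running);

	err = perf_event_read_local(event, &value, NULL, NULL);	  /* counter only */
	err = perf_event_read_local(event, &value, &enabled, &running);  /* with times */
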
index c6be15a..248961a 100644 (file)
@@ -309,12 +309,25 @@ bpf_get_prog_addr_region(const struct bpf_prog *prog,
 
 static void bpf_get_prog_name(const struct bpf_prog *prog, char *sym)
 {
+       const char *end = sym + KSYM_NAME_LEN;
+
        BUILD_BUG_ON(sizeof("bpf_prog_") +
-                    sizeof(prog->tag) * 2 + 1 > KSYM_NAME_LEN);
+                    sizeof(prog->tag) * 2 +
+                    /* name has been null terminated.
+                     * We should need +1 for the '_' preceding
+                     * the name.  However, the null character
+                     * is double counted between the name and the
+                     * sizeof("bpf_prog_") above, so we omit
+                     * the +1 here.
+                     */
+                    sizeof(prog->aux->name) > KSYM_NAME_LEN);
 
        sym += snprintf(sym, KSYM_NAME_LEN, "bpf_prog_");
        sym  = bin2hex(sym, prog->tag, sizeof(prog->tag));
-       *sym = 0;
+       if (prog->aux->name[0])
+               snprintf(sym, (size_t)(end - sym), "_%s", prog->aux->name);
+       else
+               *sym = 0;
 }
 
 static __always_inline unsigned long
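
With this change the generated ksym still starts with "bpf_prog_" plus the 16 hex characters of the program tag and, when the program was loaded with a name, gains a "_<name>" suffix. Illustrative symbols (made-up tag) as they might appear in /proc/kallsyms:

	bpf_prog_8937c8e14d9cf8e5		(program loaded without a name)
	bpf_prog_8937c8e14d9cf8e5_my_filter	(program loaded with name "my_filter")
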
diff --git a/kernel/bpf/disasm.c b/kernel/bpf/disasm.c
new file mode 100644 (file)
index 0000000..e682850
--- /dev/null
@@ -0,0 +1,214 @@
+/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
+ * Copyright (c) 2016 Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+
+#include <linux/bpf.h>
+
+#include "disasm.h"
+
+#define __BPF_FUNC_STR_FN(x) [BPF_FUNC_ ## x] = __stringify(bpf_ ## x)
+static const char * const func_id_str[] = {
+       __BPF_FUNC_MAPPER(__BPF_FUNC_STR_FN)
+};
+#undef __BPF_FUNC_STR_FN
+
+const char *func_id_name(int id)
+{
+       BUILD_BUG_ON(ARRAY_SIZE(func_id_str) != __BPF_FUNC_MAX_ID);
+
+       if (id >= 0 && id < __BPF_FUNC_MAX_ID && func_id_str[id])
+               return func_id_str[id];
+       else
+               return "unknown";
+}
+
+const char *const bpf_class_string[8] = {
+       [BPF_LD]    = "ld",
+       [BPF_LDX]   = "ldx",
+       [BPF_ST]    = "st",
+       [BPF_STX]   = "stx",
+       [BPF_ALU]   = "alu",
+       [BPF_JMP]   = "jmp",
+       [BPF_RET]   = "BUG",
+       [BPF_ALU64] = "alu64",
+};
+
+const char *const bpf_alu_string[16] = {
+       [BPF_ADD >> 4]  = "+=",
+       [BPF_SUB >> 4]  = "-=",
+       [BPF_MUL >> 4]  = "*=",
+       [BPF_DIV >> 4]  = "/=",
+       [BPF_OR  >> 4]  = "|=",
+       [BPF_AND >> 4]  = "&=",
+       [BPF_LSH >> 4]  = "<<=",
+       [BPF_RSH >> 4]  = ">>=",
+       [BPF_NEG >> 4]  = "neg",
+       [BPF_MOD >> 4]  = "%=",
+       [BPF_XOR >> 4]  = "^=",
+       [BPF_MOV >> 4]  = "=",
+       [BPF_ARSH >> 4] = "s>>=",
+       [BPF_END >> 4]  = "endian",
+};
+
+static const char *const bpf_ldst_string[] = {
+       [BPF_W >> 3]  = "u32",
+       [BPF_H >> 3]  = "u16",
+       [BPF_B >> 3]  = "u8",
+       [BPF_DW >> 3] = "u64",
+};
+
+static const char *const bpf_jmp_string[16] = {
+       [BPF_JA >> 4]   = "jmp",
+       [BPF_JEQ >> 4]  = "==",
+       [BPF_JGT >> 4]  = ">",
+       [BPF_JLT >> 4]  = "<",
+       [BPF_JGE >> 4]  = ">=",
+       [BPF_JLE >> 4]  = "<=",
+       [BPF_JSET >> 4] = "&",
+       [BPF_JNE >> 4]  = "!=",
+       [BPF_JSGT >> 4] = "s>",
+       [BPF_JSLT >> 4] = "s<",
+       [BPF_JSGE >> 4] = "s>=",
+       [BPF_JSLE >> 4] = "s<=",
+       [BPF_CALL >> 4] = "call",
+       [BPF_EXIT >> 4] = "exit",
+};
+
+static void print_bpf_end_insn(bpf_insn_print_cb verbose,
+                              struct bpf_verifier_env *env,
+                              const struct bpf_insn *insn)
+{
+       verbose(env, "(%02x) r%d = %s%d r%d\n", insn->code, insn->dst_reg,
+               BPF_SRC(insn->code) == BPF_TO_BE ? "be" : "le",
+               insn->imm, insn->dst_reg);
+}
+
+void print_bpf_insn(bpf_insn_print_cb verbose, struct bpf_verifier_env *env,
+                   const struct bpf_insn *insn, bool allow_ptr_leaks)
+{
+       u8 class = BPF_CLASS(insn->code);
+
+       if (class == BPF_ALU || class == BPF_ALU64) {
+               if (BPF_OP(insn->code) == BPF_END) {
+                       if (class == BPF_ALU64)
+                               verbose(env, "BUG_alu64_%02x\n", insn->code);
+                       else
+                               print_bpf_end_insn(verbose, env, insn);
+               } else if (BPF_OP(insn->code) == BPF_NEG) {
+                       verbose(env, "(%02x) r%d = %s-r%d\n",
+                               insn->code, insn->dst_reg,
+                               class == BPF_ALU ? "(u32) " : "",
+                               insn->dst_reg);
+               } else if (BPF_SRC(insn->code) == BPF_X) {
+                       verbose(env, "(%02x) %sr%d %s %sr%d\n",
+                               insn->code, class == BPF_ALU ? "(u32) " : "",
+                               insn->dst_reg,
+                               bpf_alu_string[BPF_OP(insn->code) >> 4],
+                               class == BPF_ALU ? "(u32) " : "",
+                               insn->src_reg);
+               } else {
+                       verbose(env, "(%02x) %sr%d %s %s%d\n",
+                               insn->code, class == BPF_ALU ? "(u32) " : "",
+                               insn->dst_reg,
+                               bpf_alu_string[BPF_OP(insn->code) >> 4],
+                               class == BPF_ALU ? "(u32) " : "",
+                               insn->imm);
+               }
+       } else if (class == BPF_STX) {
+               if (BPF_MODE(insn->code) == BPF_MEM)
+                       verbose(env, "(%02x) *(%s *)(r%d %+d) = r%d\n",
+                               insn->code,
+                               bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
+                               insn->dst_reg,
+                               insn->off, insn->src_reg);
+               else if (BPF_MODE(insn->code) == BPF_XADD)
+                       verbose(env, "(%02x) lock *(%s *)(r%d %+d) += r%d\n",
+                               insn->code,
+                               bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
+                               insn->dst_reg, insn->off,
+                               insn->src_reg);
+               else
+                       verbose(env, "BUG_%02x\n", insn->code);
+       } else if (class == BPF_ST) {
+               if (BPF_MODE(insn->code) != BPF_MEM) {
+                       verbose(env, "BUG_st_%02x\n", insn->code);
+                       return;
+               }
+               verbose(env, "(%02x) *(%s *)(r%d %+d) = %d\n",
+                       insn->code,
+                       bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
+                       insn->dst_reg,
+                       insn->off, insn->imm);
+       } else if (class == BPF_LDX) {
+               if (BPF_MODE(insn->code) != BPF_MEM) {
+                       verbose(env, "BUG_ldx_%02x\n", insn->code);
+                       return;
+               }
+               verbose(env, "(%02x) r%d = *(%s *)(r%d %+d)\n",
+                       insn->code, insn->dst_reg,
+                       bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
+                       insn->src_reg, insn->off);
+       } else if (class == BPF_LD) {
+               if (BPF_MODE(insn->code) == BPF_ABS) {
+                       verbose(env, "(%02x) r0 = *(%s *)skb[%d]\n",
+                               insn->code,
+                               bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
+                               insn->imm);
+               } else if (BPF_MODE(insn->code) == BPF_IND) {
+                       verbose(env, "(%02x) r0 = *(%s *)skb[r%d + %d]\n",
+                               insn->code,
+                               bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
+                               insn->src_reg, insn->imm);
+               } else if (BPF_MODE(insn->code) == BPF_IMM &&
+                          BPF_SIZE(insn->code) == BPF_DW) {
+                       /* At this point, we already made sure that the second
+                        * part of the ldimm64 insn is accessible.
+                        */
+                       u64 imm = ((u64)(insn + 1)->imm << 32) | (u32)insn->imm;
+                       bool map_ptr = insn->src_reg == BPF_PSEUDO_MAP_FD;
+
+                       if (map_ptr && !allow_ptr_leaks)
+                               imm = 0;
+
+                       verbose(env, "(%02x) r%d = 0x%llx\n", insn->code,
+                               insn->dst_reg, (unsigned long long)imm);
+               } else {
+                       verbose(env, "BUG_ld_%02x\n", insn->code);
+                       return;
+               }
+       } else if (class == BPF_JMP) {
+               u8 opcode = BPF_OP(insn->code);
+
+               if (opcode == BPF_CALL) {
+                       verbose(env, "(%02x) call %s#%d\n", insn->code,
+                               func_id_name(insn->imm), insn->imm);
+               } else if (insn->code == (BPF_JMP | BPF_JA)) {
+                       verbose(env, "(%02x) goto pc%+d\n",
+                               insn->code, insn->off);
+               } else if (insn->code == (BPF_JMP | BPF_EXIT)) {
+                       verbose(env, "(%02x) exit\n", insn->code);
+               } else if (BPF_SRC(insn->code) == BPF_X) {
+                       verbose(env, "(%02x) if r%d %s r%d goto pc%+d\n",
+                               insn->code, insn->dst_reg,
+                               bpf_jmp_string[BPF_OP(insn->code) >> 4],
+                               insn->src_reg, insn->off);
+               } else {
+                       verbose(env, "(%02x) if r%d %s 0x%x goto pc%+d\n",
+                               insn->code, insn->dst_reg,
+                               bpf_jmp_string[BPF_OP(insn->code) >> 4],
+                               insn->imm, insn->off);
+               }
+       } else {
+               verbose(env, "(%02x) %s\n",
+                       insn->code, bpf_class_string[class]);
+       }
+}
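
As a concrete illustration (not taken from the patch), a trivial two-instruction program

	BPF_MOV64_IMM(BPF_REG_0, 0),
	BPF_EXIT_INSN(),

would be rendered by print_bpf_insn() roughly as:

	(b7) r0 = 0
	(95) exit

The per-instruction "idx: " prefix seen in verifier logs is printed by the verifier before it calls print_bpf_insn(), not by this file.
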
diff --git a/kernel/bpf/disasm.h b/kernel/bpf/disasm.h
new file mode 100644 (file)
index 0000000..8de977e
--- /dev/null
@@ -0,0 +1,32 @@
+/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
+ * Copyright (c) 2016 Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+
+#ifndef __BPF_DISASM_H__
+#define __BPF_DISASM_H__
+
+#include <linux/bpf.h>
+#include <linux/kernel.h>
+#include <linux/stringify.h>
+
+extern const char *const bpf_alu_string[16];
+extern const char *const bpf_class_string[8];
+
+const char *func_id_name(int id);
+
+struct bpf_verifier_env;
+typedef void (*bpf_insn_print_cb)(struct bpf_verifier_env *env,
+                                 const char *, ...);
+void print_bpf_insn(bpf_insn_print_cb verbose, struct bpf_verifier_env *env,
+                   const struct bpf_insn *insn, bool allow_ptr_leaks);
+
+#endif
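
A hypothetical sketch (not part of this series) of another in-kernel consumer reusing the now-shared disassembler through the bpf_insn_print_cb type; the env argument is only passed through to the callback, so a callback that ignores it can take NULL:

	#include <linux/printk.h>
	#include "disasm.h"

	static void dump_insn_cb(struct bpf_verifier_env *env, const char *fmt, ...)
	{
		va_list args;

		va_start(args, fmt);
		vprintk(fmt, args);
		va_end(args);
	}

	static void dump_insn(const struct bpf_insn *insn)
	{
		print_bpf_insn(dump_insn_cb, NULL, insn, false);
	}
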
index e833ed9..be1dde9 100644 (file)
@@ -363,6 +363,7 @@ out:
        putname(pname);
        return ret;
 }
+EXPORT_SYMBOL_GPL(bpf_obj_get_user);
 
 static void bpf_evict_inode(struct inode *inode)
 {
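
The new export presumably allows modular users, such as the netfilter xt_bpf match gaining XT_BPF_MODE_PATH_PINNED earlier in this series, to resolve a pinned BPF object by path; that linkage is an inference, not stated in the hunk itself.
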
index 0048cb2..d124e70 100644 (file)
@@ -322,6 +322,8 @@ static int bpf_obj_name_cpy(char *dst, const char *src)
 {
        const char *end = src + BPF_OBJ_NAME_LEN;
 
+       memset(dst, 0, BPF_OBJ_NAME_LEN);
+
        /* Copy all isalnum() and '_' char */
        while (src < end && *src) {
                if (!isalnum(*src) && *src != '_')
@@ -333,9 +335,6 @@ static int bpf_obj_name_cpy(char *dst, const char *src)
        if (src == end)
                return -EINVAL;
 
-       /* '\0' terminates dst */
-       *dst = 0;
-
        return 0;
 }
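
Zeroing the destination up front means a short name is fully NUL-padded rather than merely NUL-terminated, regardless of how the caller allocated the buffer; this matters now that the name is copied verbatim into bpf_prog_info/bpf_map_info and appended to the program's ksym. A sketch of the effect, assuming BPF_OBJ_NAME_LEN is 16 and an illustrative name "xdp1":

	/* before: dst = "xdp1\0" + whatever happened to be in dst[5..15]   */
	/* after:  dst = "xdp1\0\0\0\0\0\0\0\0\0\0\0\0"      (fully padded) */
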
 
index 52b0223..2cdbcc4 100644 (file)
@@ -21,6 +21,8 @@
 #include <linux/vmalloc.h>
 #include <linux/stringify.h>
 
+#include "disasm.h"
+
 /* bpf_check() is a static code analyzer that walks eBPF program
  * instruction by instruction and updates register/stack state.
  * All paths of conditional branches are analyzed until 'bpf_exit' insn.
@@ -153,28 +155,36 @@ struct bpf_call_arg_meta {
        int access_size;
 };
 
-/* verbose verifier prints what it's seeing
- * bpf_check() is called under lock, so no race to access these global vars
- */
-static u32 log_level, log_size, log_len;
-static char *log_buf;
-
 static DEFINE_MUTEX(bpf_verifier_lock);
 
 /* log_level controls verbosity level of eBPF verifier.
  * verbose() is used to dump the verification trace to the log, so the user
  * can figure out what's wrong with the program
  */
-static __printf(1, 2) void verbose(const char *fmt, ...)
+static __printf(2, 3) void verbose(struct bpf_verifier_env *env,
+                                  const char *fmt, ...)
 {
+       struct bpf_verifer_log *log = &env->log;
+       unsigned int n;
        va_list args;
 
-       if (log_level == 0 || log_len >= log_size - 1)
+       if (!log->level || !log->ubuf || bpf_verifier_log_full(log))
                return;
 
        va_start(args, fmt);
-       log_len += vscnprintf(log_buf + log_len, log_size - log_len, fmt, args);
+       n = vscnprintf(log->kbuf, BPF_VERIFIER_TMP_LOG_SIZE, fmt, args);
        va_end(args);
+
+       WARN_ONCE(n >= BPF_VERIFIER_TMP_LOG_SIZE - 1,
+                 "verifier log line truncated - local buffer too short\n");
+
+       n = min(log->len_total - log->len_used - 1, n);
+       log->kbuf[n] = '\0';
+
+       if (!copy_to_user(log->ubuf + log->len_used, log->kbuf, n + 1))
+               log->len_used += n;
+       else
+               log->ubuf = NULL;
 }
 
 static bool type_is_pkt_pointer(enum bpf_reg_type type)
@@ -197,23 +207,8 @@ static const char * const reg_type_str[] = {
        [PTR_TO_PACKET_END]     = "pkt_end",
 };
 
-#define __BPF_FUNC_STR_FN(x) [BPF_FUNC_ ## x] = __stringify(bpf_ ## x)
-static const char * const func_id_str[] = {
-       __BPF_FUNC_MAPPER(__BPF_FUNC_STR_FN)
-};
-#undef __BPF_FUNC_STR_FN
-
-static const char *func_id_name(int id)
-{
-       BUILD_BUG_ON(ARRAY_SIZE(func_id_str) != __BPF_FUNC_MAX_ID);
-
-       if (id >= 0 && id < __BPF_FUNC_MAX_ID && func_id_str[id])
-               return func_id_str[id];
-       else
-               return "unknown";
-}
-
-static void print_verifier_state(struct bpf_verifier_state *state)
+static void print_verifier_state(struct bpf_verifier_env *env,
+                                struct bpf_verifier_state *state)
 {
        struct bpf_reg_state *reg;
        enum bpf_reg_type t;
@@ -224,21 +219,21 @@ static void print_verifier_state(struct bpf_verifier_state *state)
                t = reg->type;
                if (t == NOT_INIT)
                        continue;
-               verbose(" R%d=%s", i, reg_type_str[t]);
+               verbose(env, " R%d=%s", i, reg_type_str[t]);
                if ((t == SCALAR_VALUE || t == PTR_TO_STACK) &&
                    tnum_is_const(reg->var_off)) {
                        /* reg->off should be 0 for SCALAR_VALUE */
-                       verbose("%lld", reg->var_off.value + reg->off);
+                       verbose(env, "%lld", reg->var_off.value + reg->off);
                } else {
-                       verbose("(id=%d", reg->id);
+                       verbose(env, "(id=%d", reg->id);
                        if (t != SCALAR_VALUE)
-                               verbose(",off=%d", reg->off);
+                               verbose(env, ",off=%d", reg->off);
                        if (type_is_pkt_pointer(t))
-                               verbose(",r=%d", reg->range);
+                               verbose(env, ",r=%d", reg->range);
                        else if (t == CONST_PTR_TO_MAP ||
                                 t == PTR_TO_MAP_VALUE ||
                                 t == PTR_TO_MAP_VALUE_OR_NULL)
-                               verbose(",ks=%d,vs=%d",
+                               verbose(env, ",ks=%d,vs=%d",
                                        reg->map_ptr->key_size,
                                        reg->map_ptr->value_size);
                        if (tnum_is_const(reg->var_off)) {
@@ -246,218 +241,38 @@ static void print_verifier_state(struct bpf_verifier_state *state)
                                 * could be a pointer whose offset is too big
                                 * for reg->off
                                 */
-                               verbose(",imm=%llx", reg->var_off.value);
+                               verbose(env, ",imm=%llx", reg->var_off.value);
                        } else {
                                if (reg->smin_value != reg->umin_value &&
                                    reg->smin_value != S64_MIN)
-                                       verbose(",smin_value=%lld",
+                                       verbose(env, ",smin_value=%lld",
                                                (long long)reg->smin_value);
                                if (reg->smax_value != reg->umax_value &&
                                    reg->smax_value != S64_MAX)
-                                       verbose(",smax_value=%lld",
+                                       verbose(env, ",smax_value=%lld",
                                                (long long)reg->smax_value);
                                if (reg->umin_value != 0)
-                                       verbose(",umin_value=%llu",
+                                       verbose(env, ",umin_value=%llu",
                                                (unsigned long long)reg->umin_value);
                                if (reg->umax_value != U64_MAX)
-                                       verbose(",umax_value=%llu",
+                                       verbose(env, ",umax_value=%llu",
                                                (unsigned long long)reg->umax_value);
                                if (!tnum_is_unknown(reg->var_off)) {
                                        char tn_buf[48];
 
                                        tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
-                                       verbose(",var_off=%s", tn_buf);
+                                       verbose(env, ",var_off=%s", tn_buf);
                                }
                        }
-                       verbose(")");
+                       verbose(env, ")");
                }
        }
        for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) {
                if (state->stack_slot_type[i] == STACK_SPILL)
-                       verbose(" fp%d=%s", -MAX_BPF_STACK + i,
+                       verbose(env, " fp%d=%s", -MAX_BPF_STACK + i,
                                reg_type_str[state->spilled_regs[i / BPF_REG_SIZE].type]);
        }
-       verbose("\n");
-}
-
-static const char *const bpf_class_string[] = {
-       [BPF_LD]    = "ld",
-       [BPF_LDX]   = "ldx",
-       [BPF_ST]    = "st",
-       [BPF_STX]   = "stx",
-       [BPF_ALU]   = "alu",
-       [BPF_JMP]   = "jmp",
-       [BPF_RET]   = "BUG",
-       [BPF_ALU64] = "alu64",
-};
-
-static const char *const bpf_alu_string[16] = {
-       [BPF_ADD >> 4]  = "+=",
-       [BPF_SUB >> 4]  = "-=",
-       [BPF_MUL >> 4]  = "*=",
-       [BPF_DIV >> 4]  = "/=",
-       [BPF_OR  >> 4]  = "|=",
-       [BPF_AND >> 4]  = "&=",
-       [BPF_LSH >> 4]  = "<<=",
-       [BPF_RSH >> 4]  = ">>=",
-       [BPF_NEG >> 4]  = "neg",
-       [BPF_MOD >> 4]  = "%=",
-       [BPF_XOR >> 4]  = "^=",
-       [BPF_MOV >> 4]  = "=",
-       [BPF_ARSH >> 4] = "s>>=",
-       [BPF_END >> 4]  = "endian",
-};
-
-static const char *const bpf_ldst_string[] = {
-       [BPF_W >> 3]  = "u32",
-       [BPF_H >> 3]  = "u16",
-       [BPF_B >> 3]  = "u8",
-       [BPF_DW >> 3] = "u64",
-};
-
-static const char *const bpf_jmp_string[16] = {
-       [BPF_JA >> 4]   = "jmp",
-       [BPF_JEQ >> 4]  = "==",
-       [BPF_JGT >> 4]  = ">",
-       [BPF_JLT >> 4]  = "<",
-       [BPF_JGE >> 4]  = ">=",
-       [BPF_JLE >> 4]  = "<=",
-       [BPF_JSET >> 4] = "&",
-       [BPF_JNE >> 4]  = "!=",
-       [BPF_JSGT >> 4] = "s>",
-       [BPF_JSLT >> 4] = "s<",
-       [BPF_JSGE >> 4] = "s>=",
-       [BPF_JSLE >> 4] = "s<=",
-       [BPF_CALL >> 4] = "call",
-       [BPF_EXIT >> 4] = "exit",
-};
-
-static void print_bpf_end_insn(const struct bpf_verifier_env *env,
-                              const struct bpf_insn *insn)
-{
-       verbose("(%02x) r%d = %s%d r%d\n", insn->code, insn->dst_reg,
-               BPF_SRC(insn->code) == BPF_TO_BE ? "be" : "le",
-               insn->imm, insn->dst_reg);
-}
-
-static void print_bpf_insn(const struct bpf_verifier_env *env,
-                          const struct bpf_insn *insn)
-{
-       u8 class = BPF_CLASS(insn->code);
-
-       if (class == BPF_ALU || class == BPF_ALU64) {
-               if (BPF_OP(insn->code) == BPF_END) {
-                       if (class == BPF_ALU64)
-                               verbose("BUG_alu64_%02x\n", insn->code);
-                       else
-                               print_bpf_end_insn(env, insn);
-               } else if (BPF_OP(insn->code) == BPF_NEG) {
-                       verbose("(%02x) r%d = %s-r%d\n",
-                               insn->code, insn->dst_reg,
-                               class == BPF_ALU ? "(u32) " : "",
-                               insn->dst_reg);
-               } else if (BPF_SRC(insn->code) == BPF_X) {
-                       verbose("(%02x) %sr%d %s %sr%d\n",
-                               insn->code, class == BPF_ALU ? "(u32) " : "",
-                               insn->dst_reg,
-                               bpf_alu_string[BPF_OP(insn->code) >> 4],
-                               class == BPF_ALU ? "(u32) " : "",
-                               insn->src_reg);
-               } else {
-                       verbose("(%02x) %sr%d %s %s%d\n",
-                               insn->code, class == BPF_ALU ? "(u32) " : "",
-                               insn->dst_reg,
-                               bpf_alu_string[BPF_OP(insn->code) >> 4],
-                               class == BPF_ALU ? "(u32) " : "",
-                               insn->imm);
-               }
-       } else if (class == BPF_STX) {
-               if (BPF_MODE(insn->code) == BPF_MEM)
-                       verbose("(%02x) *(%s *)(r%d %+d) = r%d\n",
-                               insn->code,
-                               bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
-                               insn->dst_reg,
-                               insn->off, insn->src_reg);
-               else if (BPF_MODE(insn->code) == BPF_XADD)
-                       verbose("(%02x) lock *(%s *)(r%d %+d) += r%d\n",
-                               insn->code,
-                               bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
-                               insn->dst_reg, insn->off,
-                               insn->src_reg);
-               else
-                       verbose("BUG_%02x\n", insn->code);
-       } else if (class == BPF_ST) {
-               if (BPF_MODE(insn->code) != BPF_MEM) {
-                       verbose("BUG_st_%02x\n", insn->code);
-                       return;
-               }
-               verbose("(%02x) *(%s *)(r%d %+d) = %d\n",
-                       insn->code,
-                       bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
-                       insn->dst_reg,
-                       insn->off, insn->imm);
-       } else if (class == BPF_LDX) {
-               if (BPF_MODE(insn->code) != BPF_MEM) {
-                       verbose("BUG_ldx_%02x\n", insn->code);
-                       return;
-               }
-               verbose("(%02x) r%d = *(%s *)(r%d %+d)\n",
-                       insn->code, insn->dst_reg,
-                       bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
-                       insn->src_reg, insn->off);
-       } else if (class == BPF_LD) {
-               if (BPF_MODE(insn->code) == BPF_ABS) {
-                       verbose("(%02x) r0 = *(%s *)skb[%d]\n",
-                               insn->code,
-                               bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
-                               insn->imm);
-               } else if (BPF_MODE(insn->code) == BPF_IND) {
-                       verbose("(%02x) r0 = *(%s *)skb[r%d + %d]\n",
-                               insn->code,
-                               bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
-                               insn->src_reg, insn->imm);
-               } else if (BPF_MODE(insn->code) == BPF_IMM &&
-                          BPF_SIZE(insn->code) == BPF_DW) {
-                       /* At this point, we already made sure that the second
-                        * part of the ldimm64 insn is accessible.
-                        */
-                       u64 imm = ((u64)(insn + 1)->imm << 32) | (u32)insn->imm;
-                       bool map_ptr = insn->src_reg == BPF_PSEUDO_MAP_FD;
-
-                       if (map_ptr && !env->allow_ptr_leaks)
-                               imm = 0;
-
-                       verbose("(%02x) r%d = 0x%llx\n", insn->code,
-                               insn->dst_reg, (unsigned long long)imm);
-               } else {
-                       verbose("BUG_ld_%02x\n", insn->code);
-                       return;
-               }
-       } else if (class == BPF_JMP) {
-               u8 opcode = BPF_OP(insn->code);
-
-               if (opcode == BPF_CALL) {
-                       verbose("(%02x) call %s#%d\n", insn->code,
-                               func_id_name(insn->imm), insn->imm);
-               } else if (insn->code == (BPF_JMP | BPF_JA)) {
-                       verbose("(%02x) goto pc%+d\n",
-                               insn->code, insn->off);
-               } else if (insn->code == (BPF_JMP | BPF_EXIT)) {
-                       verbose("(%02x) exit\n", insn->code);
-               } else if (BPF_SRC(insn->code) == BPF_X) {
-                       verbose("(%02x) if r%d %s r%d goto pc%+d\n",
-                               insn->code, insn->dst_reg,
-                               bpf_jmp_string[BPF_OP(insn->code) >> 4],
-                               insn->src_reg, insn->off);
-               } else {
-                       verbose("(%02x) if r%d %s 0x%x goto pc%+d\n",
-                               insn->code, insn->dst_reg,
-                               bpf_jmp_string[BPF_OP(insn->code) >> 4],
-                               insn->imm, insn->off);
-               }
-       } else {
-               verbose("(%02x) %s\n", insn->code, bpf_class_string[class]);
-       }
+       verbose(env, "\n");
 }
 
 static int pop_stack(struct bpf_verifier_env *env, int *prev_insn_idx)
@@ -495,7 +310,7 @@ static struct bpf_verifier_state *push_stack(struct bpf_verifier_env *env,
        env->head = elem;
        env->stack_size++;
        if (env->stack_size > BPF_COMPLEXITY_LIMIT_STACK) {
-               verbose("BPF program is too complex\n");
+               verbose(env, "BPF program is too complex\n");
                goto err;
        }
        return &elem->st;
@@ -533,10 +348,11 @@ static void __mark_reg_known_zero(struct bpf_reg_state *reg)
        __mark_reg_known(reg, 0);
 }
 
-static void mark_reg_known_zero(struct bpf_reg_state *regs, u32 regno)
+static void mark_reg_known_zero(struct bpf_verifier_env *env,
+                               struct bpf_reg_state *regs, u32 regno)
 {
        if (WARN_ON(regno >= MAX_BPF_REG)) {
-               verbose("mark_reg_known_zero(regs, %u)\n", regno);
+               verbose(env, "mark_reg_known_zero(regs, %u)\n", regno);
                /* Something bad happened, let's kill all regs */
                for (regno = 0; regno < MAX_BPF_REG; regno++)
                        __mark_reg_not_init(regs + regno);
@@ -646,10 +462,11 @@ static void __mark_reg_unknown(struct bpf_reg_state *reg)
        __mark_reg_unbounded(reg);
 }
 
-static void mark_reg_unknown(struct bpf_reg_state *regs, u32 regno)
+static void mark_reg_unknown(struct bpf_verifier_env *env,
+                            struct bpf_reg_state *regs, u32 regno)
 {
        if (WARN_ON(regno >= MAX_BPF_REG)) {
-               verbose("mark_reg_unknown(regs, %u)\n", regno);
+               verbose(env, "mark_reg_unknown(regs, %u)\n", regno);
                /* Something bad happened, let's kill all regs */
                for (regno = 0; regno < MAX_BPF_REG; regno++)
                        __mark_reg_not_init(regs + regno);
@@ -664,10 +481,11 @@ static void __mark_reg_not_init(struct bpf_reg_state *reg)
        reg->type = NOT_INIT;
 }
 
-static void mark_reg_not_init(struct bpf_reg_state *regs, u32 regno)
+static void mark_reg_not_init(struct bpf_verifier_env *env,
+                             struct bpf_reg_state *regs, u32 regno)
 {
        if (WARN_ON(regno >= MAX_BPF_REG)) {
-               verbose("mark_reg_not_init(regs, %u)\n", regno);
+               verbose(env, "mark_reg_not_init(regs, %u)\n", regno);
                /* Something bad happened, let's kill all regs */
                for (regno = 0; regno < MAX_BPF_REG; regno++)
                        __mark_reg_not_init(regs + regno);
@@ -676,22 +494,23 @@ static void mark_reg_not_init(struct bpf_reg_state *regs, u32 regno)
        __mark_reg_not_init(regs + regno);
 }
 
-static void init_reg_state(struct bpf_reg_state *regs)
+static void init_reg_state(struct bpf_verifier_env *env,
+                          struct bpf_reg_state *regs)
 {
        int i;
 
        for (i = 0; i < MAX_BPF_REG; i++) {
-               mark_reg_not_init(regs, i);
+               mark_reg_not_init(env, regs, i);
                regs[i].live = REG_LIVE_NONE;
        }
 
        /* frame pointer */
        regs[BPF_REG_FP].type = PTR_TO_STACK;
-       mark_reg_known_zero(regs, BPF_REG_FP);
+       mark_reg_known_zero(env, regs, BPF_REG_FP);
 
        /* 1st arg to a function */
        regs[BPF_REG_1].type = PTR_TO_CTX;
-       mark_reg_known_zero(regs, BPF_REG_1);
+       mark_reg_known_zero(env, regs, BPF_REG_1);
 }
 
 enum reg_arg_type {
@@ -704,6 +523,10 @@ static void mark_reg_read(const struct bpf_verifier_state *state, u32 regno)
 {
        struct bpf_verifier_state *parent = state->parent;
 
+       if (regno == BPF_REG_FP)
+               /* We don't need to worry about FP liveness because it's read-only */
+               return;
+
        while (parent) {
                /* if read wasn't screened by an earlier write ... */
                if (state->regs[regno].live & REG_LIVE_WRITTEN)
@@ -721,26 +544,26 @@ static int check_reg_arg(struct bpf_verifier_env *env, u32 regno,
        struct bpf_reg_state *regs = env->cur_state.regs;
 
        if (regno >= MAX_BPF_REG) {
-               verbose("R%d is invalid\n", regno);
+               verbose(env, "R%d is invalid\n", regno);
                return -EINVAL;
        }
 
        if (t == SRC_OP) {
                /* check whether register used as source operand can be read */
                if (regs[regno].type == NOT_INIT) {
-                       verbose("R%d !read_ok\n", regno);
+                       verbose(env, "R%d !read_ok\n", regno);
                        return -EACCES;
                }
                mark_reg_read(&env->cur_state, regno);
        } else {
                /* check whether register used as dest operand can be written to */
                if (regno == BPF_REG_FP) {
-                       verbose("frame pointer is read only\n");
+                       verbose(env, "frame pointer is read only\n");
                        return -EACCES;
                }
                regs[regno].live |= REG_LIVE_WRITTEN;
                if (t == DST_OP)
-                       mark_reg_unknown(regs, regno);
+                       mark_reg_unknown(env, regs, regno);
        }
        return 0;
 }
@@ -765,7 +588,8 @@ static bool is_spillable_regtype(enum bpf_reg_type type)
 /* check_stack_read/write functions track spill/fill of registers,
  * stack boundary and alignment are checked in check_mem_access()
  */
-static int check_stack_write(struct bpf_verifier_state *state, int off,
+static int check_stack_write(struct bpf_verifier_env *env,
+                            struct bpf_verifier_state *state, int off,
                             int size, int value_regno)
 {
        int i, spi = (MAX_BPF_STACK + off) / BPF_REG_SIZE;
@@ -778,7 +602,7 @@ static int check_stack_write(struct bpf_verifier_state *state, int off,
 
                /* register containing pointer is being spilled into stack */
                if (size != BPF_REG_SIZE) {
-                       verbose("invalid size of register spill\n");
+                       verbose(env, "invalid size of register spill\n");
                        return -EACCES;
                }
 
@@ -813,7 +637,8 @@ static void mark_stack_slot_read(const struct bpf_verifier_state *state, int slo
        }
 }
 
-static int check_stack_read(struct bpf_verifier_state *state, int off, int size,
+static int check_stack_read(struct bpf_verifier_env *env,
+                           struct bpf_verifier_state *state, int off, int size,
                            int value_regno)
 {
        u8 *slot_type;
@@ -823,12 +648,12 @@ static int check_stack_read(struct bpf_verifier_state *state, int off, int size,
 
        if (slot_type[0] == STACK_SPILL) {
                if (size != BPF_REG_SIZE) {
-                       verbose("invalid size of register spill\n");
+                       verbose(env, "invalid size of register spill\n");
                        return -EACCES;
                }
                for (i = 1; i < BPF_REG_SIZE; i++) {
                        if (slot_type[i] != STACK_SPILL) {
-                               verbose("corrupted spill memory\n");
+                               verbose(env, "corrupted spill memory\n");
                                return -EACCES;
                        }
                }
@@ -844,14 +669,14 @@ static int check_stack_read(struct bpf_verifier_state *state, int off, int size,
        } else {
                for (i = 0; i < size; i++) {
                        if (slot_type[i] != STACK_MISC) {
-                               verbose("invalid read from stack off %d+%d size %d\n",
+                               verbose(env, "invalid read from stack off %d+%d size %d\n",
                                        off, i, size);
                                return -EACCES;
                        }
                }
                if (value_regno >= 0)
                        /* have read misc data from the stack */
-                       mark_reg_unknown(state->regs, value_regno);
+                       mark_reg_unknown(env, state->regs, value_regno);
                return 0;
        }
 }
@@ -863,7 +688,7 @@ static int __check_map_access(struct bpf_verifier_env *env, u32 regno, int off,
        struct bpf_map *map = env->cur_state.regs[regno].map_ptr;
 
        if (off < 0 || size <= 0 || off + size > map->value_size) {
-               verbose("invalid access to map value, value_size=%d off=%d size=%d\n",
+               verbose(env, "invalid access to map value, value_size=%d off=%d size=%d\n",
                        map->value_size, off, size);
                return -EACCES;
        }
@@ -882,8 +707,8 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno,
         * need to try adding each of min_value and max_value to off
         * to make sure our theoretical access will be safe.
         */
-       if (log_level)
-               print_verifier_state(state);
+       if (env->log.level)
+               print_verifier_state(env, state);
        /* The minimum value is only important with signed
         * comparisons where we can't assume the floor of a
         * value is 0.  If we are using signed variables for our
@@ -891,13 +716,14 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno,
         * will have a set floor within our range.
         */
        if (reg->smin_value < 0) {
-               verbose("R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n",
+               verbose(env, "R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n",
                        regno);
                return -EACCES;
        }
        err = __check_map_access(env, regno, reg->smin_value + off, size);
        if (err) {
-               verbose("R%d min value is outside of the array range\n", regno);
+               verbose(env, "R%d min value is outside of the array range\n",
+                       regno);
                return err;
        }
 
@@ -906,13 +732,14 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno,
         * If reg->umax_value + off could overflow, treat that as unbounded too.
         */
        if (reg->umax_value >= BPF_MAX_VAR_OFF) {
-               verbose("R%d unbounded memory access, make sure to bounds check any array access into a map\n",
+               verbose(env, "R%d unbounded memory access, make sure to bounds check any array access into a map\n",
                        regno);
                return -EACCES;
        }
        err = __check_map_access(env, regno, reg->umax_value + off, size);
        if (err)
-               verbose("R%d max value is outside of the array range\n", regno);
+               verbose(env, "R%d max value is outside of the array range\n",
+                       regno);
        return err;
 }
 
@@ -951,7 +778,7 @@ static int __check_packet_access(struct bpf_verifier_env *env, u32 regno,
        struct bpf_reg_state *reg = &regs[regno];
 
        if (off < 0 || size <= 0 || (u64)off + size > reg->range) {
-               verbose("invalid access to packet, off=%d size=%d, R%d(id=%d,off=%d,r=%d)\n",
+               verbose(env, "invalid access to packet, off=%d size=%d, R%d(id=%d,off=%d,r=%d)\n",
                        off, size, regno, reg->id, reg->off, reg->range);
                return -EACCES;
        }
@@ -974,13 +801,13 @@ static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off,
         * detail to prove they're safe.
         */
        if (reg->smin_value < 0) {
-               verbose("R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n",
+               verbose(env, "R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n",
                        regno);
                return -EACCES;
        }
        err = __check_packet_access(env, regno, off, size);
        if (err) {
-               verbose("R%d offset is outside of the packet\n", regno);
+               verbose(env, "R%d offset is outside of the packet\n", regno);
                return err;
        }
        return err;
@@ -1016,7 +843,7 @@ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off,
                return 0;
        }
 
-       verbose("invalid bpf_context access off=%d size=%d\n", off, size);
+       verbose(env, "invalid bpf_context access off=%d size=%d\n", off, size);
        return -EACCES;
 }
 
@@ -1034,7 +861,8 @@ static bool is_pointer_value(struct bpf_verifier_env *env, int regno)
        return __is_pointer_value(env->allow_ptr_leaks, &env->cur_state.regs[regno]);
 }
 
-static int check_pkt_ptr_alignment(const struct bpf_reg_state *reg,
+static int check_pkt_ptr_alignment(struct bpf_verifier_env *env,
+                                  const struct bpf_reg_state *reg,
                                   int off, int size, bool strict)
 {
        struct tnum reg_off;
@@ -1059,7 +887,8 @@ static int check_pkt_ptr_alignment(const struct bpf_reg_state *reg,
                char tn_buf[48];
 
                tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
-               verbose("misaligned packet access off %d+%s+%d+%d size %d\n",
+               verbose(env,
+                       "misaligned packet access off %d+%s+%d+%d size %d\n",
                        ip_align, tn_buf, reg->off, off, size);
                return -EACCES;
        }
@@ -1067,7 +896,8 @@ static int check_pkt_ptr_alignment(const struct bpf_reg_state *reg,
        return 0;
 }
 
-static int check_generic_ptr_alignment(const struct bpf_reg_state *reg,
+static int check_generic_ptr_alignment(struct bpf_verifier_env *env,
+                                      const struct bpf_reg_state *reg,
                                       const char *pointer_desc,
                                       int off, int size, bool strict)
 {
@@ -1082,7 +912,7 @@ static int check_generic_ptr_alignment(const struct bpf_reg_state *reg,
                char tn_buf[48];
 
                tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
-               verbose("misaligned %saccess off %s+%d+%d size %d\n",
+               verbose(env, "misaligned %saccess off %s+%d+%d size %d\n",
                        pointer_desc, tn_buf, reg->off, off, size);
                return -EACCES;
        }
@@ -1103,7 +933,7 @@ static int check_ptr_alignment(struct bpf_verifier_env *env,
                /* Special case, because of NET_IP_ALIGN. Given metadata sits
                 * right in front, treat it the very same way.
                 */
-               return check_pkt_ptr_alignment(reg, off, size, strict);
+               return check_pkt_ptr_alignment(env, reg, off, size, strict);
        case PTR_TO_MAP_VALUE:
                pointer_desc = "value ";
                break;
@@ -1116,7 +946,8 @@ static int check_ptr_alignment(struct bpf_verifier_env *env,
        default:
                break;
        }
-       return check_generic_ptr_alignment(reg, pointer_desc, off, size, strict);
+       return check_generic_ptr_alignment(env, reg, pointer_desc, off, size,
+                                          strict);
 }
 
 /* check whether memory at (regno + off) is accessible for t = (read | write)
@@ -1148,20 +979,20 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
        if (reg->type == PTR_TO_MAP_VALUE) {
                if (t == BPF_WRITE && value_regno >= 0 &&
                    is_pointer_value(env, value_regno)) {
-                       verbose("R%d leaks addr into map\n", value_regno);
+                       verbose(env, "R%d leaks addr into map\n", value_regno);
                        return -EACCES;
                }
 
                err = check_map_access(env, regno, off, size);
                if (!err && t == BPF_READ && value_regno >= 0)
-                       mark_reg_unknown(state->regs, value_regno);
+                       mark_reg_unknown(env, state->regs, value_regno);
 
        } else if (reg->type == PTR_TO_CTX) {
                enum bpf_reg_type reg_type = SCALAR_VALUE;
 
                if (t == BPF_WRITE && value_regno >= 0 &&
                    is_pointer_value(env, value_regno)) {
-                       verbose("R%d leaks addr into ctx\n", value_regno);
+                       verbose(env, "R%d leaks addr into ctx\n", value_regno);
                        return -EACCES;
                }
                /* ctx accesses must be at a fixed offset, so that we can
@@ -1171,7 +1002,8 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
                        char tn_buf[48];
 
                        tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
-                       verbose("variable ctx access var_off=%s off=%d size=%d",
+                       verbose(env,
+                               "variable ctx access var_off=%s off=%d size=%d",
                                tn_buf, off, size);
                        return -EACCES;
                }
@@ -1183,9 +1015,10 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
                         * case, we know the offset is zero.
                         */
                        if (reg_type == SCALAR_VALUE)
-                               mark_reg_unknown(state->regs, value_regno);
+                               mark_reg_unknown(env, state->regs, value_regno);
                        else
-                               mark_reg_known_zero(state->regs, value_regno);
+                               mark_reg_known_zero(env, state->regs,
+                                                   value_regno);
                        state->regs[value_regno].id = 0;
                        state->regs[value_regno].off = 0;
                        state->regs[value_regno].range = 0;
@@ -1201,13 +1034,14 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
                        char tn_buf[48];
 
                        tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
-                       verbose("variable stack access var_off=%s off=%d size=%d",
+                       verbose(env, "variable stack access var_off=%s off=%d size=%d",
                                tn_buf, off, size);
                        return -EACCES;
                }
                off += reg->var_off.value;
                if (off >= 0 || off < -MAX_BPF_STACK) {
-                       verbose("invalid stack off=%d size=%d\n", off, size);
+                       verbose(env, "invalid stack off=%d size=%d\n", off,
+                               size);
                        return -EACCES;
                }
 
@@ -1218,29 +1052,32 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
                        if (!env->allow_ptr_leaks &&
                            state->stack_slot_type[MAX_BPF_STACK + off] == STACK_SPILL &&
                            size != BPF_REG_SIZE) {
-                               verbose("attempt to corrupt spilled pointer on stack\n");
+                               verbose(env, "attempt to corrupt spilled pointer on stack\n");
                                return -EACCES;
                        }
-                       err = check_stack_write(state, off, size, value_regno);
+                       err = check_stack_write(env, state, off, size,
+                                               value_regno);
                } else {
-                       err = check_stack_read(state, off, size, value_regno);
+                       err = check_stack_read(env, state, off, size,
+                                              value_regno);
                }
        } else if (reg_is_pkt_pointer(reg)) {
                if (t == BPF_WRITE && !may_access_direct_pkt_data(env, NULL, t)) {
-                       verbose("cannot write into packet\n");
+                       verbose(env, "cannot write into packet\n");
                        return -EACCES;
                }
                if (t == BPF_WRITE && value_regno >= 0 &&
                    is_pointer_value(env, value_regno)) {
-                       verbose("R%d leaks addr into packet\n", value_regno);
+                       verbose(env, "R%d leaks addr into packet\n",
+                               value_regno);
                        return -EACCES;
                }
                err = check_packet_access(env, regno, off, size);
                if (!err && t == BPF_READ && value_regno >= 0)
-                       mark_reg_unknown(state->regs, value_regno);
+                       mark_reg_unknown(env, state->regs, value_regno);
        } else {
-               verbose("R%d invalid mem access '%s'\n",
-                       regno, reg_type_str[reg->type]);
+               verbose(env, "R%d invalid mem access '%s'\n", regno,
+                       reg_type_str[reg->type]);
                return -EACCES;
        }
 
@@ -1260,7 +1097,7 @@ static int check_xadd(struct bpf_verifier_env *env, int insn_idx, struct bpf_ins
 
        if ((BPF_SIZE(insn->code) != BPF_W && BPF_SIZE(insn->code) != BPF_DW) ||
            insn->imm != 0) {
-               verbose("BPF_XADD uses reserved fields\n");
+               verbose(env, "BPF_XADD uses reserved fields\n");
                return -EINVAL;
        }
 
@@ -1275,7 +1112,7 @@ static int check_xadd(struct bpf_verifier_env *env, int insn_idx, struct bpf_ins
                return err;
 
        if (is_pointer_value(env, insn->src_reg)) {
-               verbose("R%d leaks addr into mem\n", insn->src_reg);
+               verbose(env, "R%d leaks addr into mem\n", insn->src_reg);
                return -EACCES;
        }
 
@@ -1316,7 +1153,7 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno,
                    register_is_null(regs[regno]))
                        return 0;
 
-               verbose("R%d type=%s expected=%s\n", regno,
+               verbose(env, "R%d type=%s expected=%s\n", regno,
                        reg_type_str[regs[regno].type],
                        reg_type_str[PTR_TO_STACK]);
                return -EACCES;
@@ -1327,13 +1164,13 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno,
                char tn_buf[48];
 
                tnum_strn(tn_buf, sizeof(tn_buf), regs[regno].var_off);
-               verbose("invalid variable stack read R%d var_off=%s\n",
+               verbose(env, "invalid variable stack read R%d var_off=%s\n",
                        regno, tn_buf);
        }
        off = regs[regno].off + regs[regno].var_off.value;
        if (off >= 0 || off < -MAX_BPF_STACK || off + access_size > 0 ||
            access_size <= 0) {
-               verbose("invalid stack type R%d off=%d access_size=%d\n",
+               verbose(env, "invalid stack type R%d off=%d access_size=%d\n",
                        regno, off, access_size);
                return -EACCES;
        }
@@ -1349,7 +1186,7 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno,
 
        for (i = 0; i < access_size; i++) {
                if (state->stack_slot_type[MAX_BPF_STACK + off + i] != STACK_MISC) {
-                       verbose("invalid indirect read from stack off %d+%d size %d\n",
+                       verbose(env, "invalid indirect read from stack off %d+%d size %d\n",
                                off, i, access_size);
                        return -EACCES;
                }
@@ -1392,7 +1229,8 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
 
        if (arg_type == ARG_ANYTHING) {
                if (is_pointer_value(env, regno)) {
-                       verbose("R%d leaks addr into helper function\n", regno);
+                       verbose(env, "R%d leaks addr into helper function\n",
+                               regno);
                        return -EACCES;
                }
                return 0;
@@ -1400,7 +1238,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
 
        if (type_is_pkt_pointer(type) &&
            !may_access_direct_pkt_data(env, meta, BPF_READ)) {
-               verbose("helper access to the packet is not allowed\n");
+               verbose(env, "helper access to the packet is not allowed\n");
                return -EACCES;
        }
 
@@ -1438,7 +1276,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
                        goto err_type;
                meta->raw_mode = arg_type == ARG_PTR_TO_UNINIT_MEM;
        } else {
-               verbose("unsupported arg_type %d\n", arg_type);
+               verbose(env, "unsupported arg_type %d\n", arg_type);
                return -EFAULT;
        }
 
@@ -1456,7 +1294,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
                         * we have to check map_key here. Otherwise it means
                         * that kernel subsystem misconfigured verifier
                         */
-                       verbose("invalid map_ptr to access map->key\n");
+                       verbose(env, "invalid map_ptr to access map->key\n");
                        return -EACCES;
                }
                if (type_is_pkt_pointer(type))
@@ -1472,7 +1310,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
                 */
                if (!meta->map_ptr) {
                        /* kernel subsystem misconfigured verifier */
-                       verbose("invalid map_ptr to access map->value\n");
+                       verbose(env, "invalid map_ptr to access map->value\n");
                        return -EACCES;
                }
                if (type_is_pkt_pointer(type))
@@ -1492,7 +1330,8 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
                 */
                if (regno == 0) {
                        /* kernel subsystem misconfigured verifier */
-                       verbose("ARG_CONST_SIZE cannot be first argument\n");
+                       verbose(env,
+                               "ARG_CONST_SIZE cannot be first argument\n");
                        return -EACCES;
                }
 
@@ -1509,7 +1348,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
                        meta = NULL;
 
                if (reg->smin_value < 0) {
-                       verbose("R%d min value is negative, either use unsigned or 'var &= const'\n",
+                       verbose(env, "R%d min value is negative, either use unsigned or 'var &= const'\n",
                                regno);
                        return -EACCES;
                }
@@ -1523,7 +1362,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
                }
 
                if (reg->umax_value >= BPF_MAX_VAR_SIZ) {
-                       verbose("R%d unbounded memory access, use 'var &= const' or 'if (var < const)'\n",
+                       verbose(env, "R%d unbounded memory access, use 'var &= const' or 'if (var < const)'\n",
                                regno);
                        return -EACCES;
                }
@@ -1534,12 +1373,13 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
 
        return err;
 err_type:
-       verbose("R%d type=%s expected=%s\n", regno,
+       verbose(env, "R%d type=%s expected=%s\n", regno,
                reg_type_str[type], reg_type_str[expected_type]);
        return -EACCES;
 }
 
-static int check_map_func_compatibility(struct bpf_map *map, int func_id)
+static int check_map_func_compatibility(struct bpf_verifier_env *env,
+                                       struct bpf_map *map, int func_id)
 {
        if (!map)
                return 0;
@@ -1552,7 +1392,8 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id)
                break;
        case BPF_MAP_TYPE_PERF_EVENT_ARRAY:
                if (func_id != BPF_FUNC_perf_event_read &&
-                   func_id != BPF_FUNC_perf_event_output)
+                   func_id != BPF_FUNC_perf_event_output &&
+                   func_id != BPF_FUNC_perf_event_read_value)
                        goto error;
                break;
        case BPF_MAP_TYPE_STACK_TRACE:
@@ -1595,6 +1436,7 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id)
                break;
        case BPF_FUNC_perf_event_read:
        case BPF_FUNC_perf_event_output:
+       case BPF_FUNC_perf_event_read_value:
                if (map->map_type != BPF_MAP_TYPE_PERF_EVENT_ARRAY)
                        goto error;
                break;
@@ -1625,7 +1467,7 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id)
 
        return 0;
 error:
-       verbose("cannot pass map_type %d into func %s#%d\n",
+       verbose(env, "cannot pass map_type %d into func %s#%d\n",
                map->map_type, func_id_name(func_id), func_id);
        return -EINVAL;
 }
@@ -1659,7 +1501,7 @@ static void clear_all_pkt_pointers(struct bpf_verifier_env *env)
 
        for (i = 0; i < MAX_BPF_REG; i++)
                if (reg_is_pkt_pointer_any(&regs[i]))
-                       mark_reg_unknown(regs, i);
+                       mark_reg_unknown(env, regs, i);
 
        for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) {
                if (state->stack_slot_type[i] != STACK_SPILL)
@@ -1681,7 +1523,8 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx)
 
        /* find function prototype */
        if (func_id < 0 || func_id >= __BPF_FUNC_MAX_ID) {
-               verbose("invalid func %s#%d\n", func_id_name(func_id), func_id);
+               verbose(env, "invalid func %s#%d\n", func_id_name(func_id),
+                       func_id);
                return -EINVAL;
        }
 
@@ -1689,13 +1532,14 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx)
                fn = env->prog->aux->ops->get_func_proto(func_id);
 
        if (!fn) {
-               verbose("unknown func %s#%d\n", func_id_name(func_id), func_id);
+               verbose(env, "unknown func %s#%d\n", func_id_name(func_id),
+                       func_id);
                return -EINVAL;
        }
 
        /* eBPF programs must be GPL compatible to use GPL-ed functions */
        if (!env->prog->gpl_compatible && fn->gpl_only) {
-               verbose("cannot call GPL only function from proprietary program\n");
+               verbose(env, "cannot call GPL only function from proprietary program\n");
                return -EINVAL;
        }
 
@@ -1709,7 +1553,7 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx)
         */
        err = check_raw_mode(fn);
        if (err) {
-               verbose("kernel subsystem misconfigured func %s#%d\n",
+               verbose(env, "kernel subsystem misconfigured func %s#%d\n",
                        func_id_name(func_id), func_id);
                return err;
        }
@@ -1742,14 +1586,14 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx)
 
        /* reset caller saved regs */
        for (i = 0; i < CALLER_SAVED_REGS; i++) {
-               mark_reg_not_init(regs, caller_saved[i]);
+               mark_reg_not_init(env, regs, caller_saved[i]);
                check_reg_arg(env, caller_saved[i], DST_OP_NO_MARK);
        }
 
        /* update return register (already marked as written above) */
        if (fn->ret_type == RET_INTEGER) {
                /* sets type to SCALAR_VALUE */
-               mark_reg_unknown(regs, BPF_REG_0);
+               mark_reg_unknown(env, regs, BPF_REG_0);
        } else if (fn->ret_type == RET_VOID) {
                regs[BPF_REG_0].type = NOT_INIT;
        } else if (fn->ret_type == RET_PTR_TO_MAP_VALUE_OR_NULL) {
@@ -1757,14 +1601,15 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx)
 
                regs[BPF_REG_0].type = PTR_TO_MAP_VALUE_OR_NULL;
                /* There is no offset yet applied, variable or fixed */
-               mark_reg_known_zero(regs, BPF_REG_0);
+               mark_reg_known_zero(env, regs, BPF_REG_0);
                regs[BPF_REG_0].off = 0;
                /* remember map_ptr, so that check_map_access()
                 * can check 'value_size' boundary of memory access
                 * to map element returned from bpf_map_lookup_elem()
                 */
                if (meta.map_ptr == NULL) {
-                       verbose("kernel subsystem misconfigured verifier\n");
+                       verbose(env,
+                               "kernel subsystem misconfigured verifier\n");
                        return -EINVAL;
                }
                regs[BPF_REG_0].map_ptr = meta.map_ptr;
@@ -1775,12 +1620,12 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx)
                else if (insn_aux->map_ptr != meta.map_ptr)
                        insn_aux->map_ptr = BPF_MAP_PTR_POISON;
        } else {
-               verbose("unknown return type %d of func %s#%d\n",
+               verbose(env, "unknown return type %d of func %s#%d\n",
                        fn->ret_type, func_id_name(func_id), func_id);
                return -EINVAL;
        }
 
-       err = check_map_func_compatibility(meta.map_ptr, func_id);
+       err = check_map_func_compatibility(env, meta.map_ptr, func_id);
        if (err)
                return err;
 
@@ -1839,39 +1684,42 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
        dst_reg = &regs[dst];
 
        if (WARN_ON_ONCE(known && (smin_val != smax_val))) {
-               print_verifier_state(&env->cur_state);
-               verbose("verifier internal error: known but bad sbounds\n");
+               print_verifier_state(env, &env->cur_state);
+               verbose(env,
+                       "verifier internal error: known but bad sbounds\n");
                return -EINVAL;
        }
        if (WARN_ON_ONCE(known && (umin_val != umax_val))) {
-               print_verifier_state(&env->cur_state);
-               verbose("verifier internal error: known but bad ubounds\n");
+               print_verifier_state(env, &env->cur_state);
+               verbose(env,
+                       "verifier internal error: known but bad ubounds\n");
                return -EINVAL;
        }
 
        if (BPF_CLASS(insn->code) != BPF_ALU64) {
                /* 32-bit ALU ops on pointers produce (meaningless) scalars */
                if (!env->allow_ptr_leaks)
-                       verbose("R%d 32-bit pointer arithmetic prohibited\n",
+                       verbose(env,
+                               "R%d 32-bit pointer arithmetic prohibited\n",
                                dst);
                return -EACCES;
        }
 
        if (ptr_reg->type == PTR_TO_MAP_VALUE_OR_NULL) {
                if (!env->allow_ptr_leaks)
-                       verbose("R%d pointer arithmetic on PTR_TO_MAP_VALUE_OR_NULL prohibited, null-check it first\n",
+                       verbose(env, "R%d pointer arithmetic on PTR_TO_MAP_VALUE_OR_NULL prohibited, null-check it first\n",
                                dst);
                return -EACCES;
        }
        if (ptr_reg->type == CONST_PTR_TO_MAP) {
                if (!env->allow_ptr_leaks)
-                       verbose("R%d pointer arithmetic on CONST_PTR_TO_MAP prohibited\n",
+                       verbose(env, "R%d pointer arithmetic on CONST_PTR_TO_MAP prohibited\n",
                                dst);
                return -EACCES;
        }
        if (ptr_reg->type == PTR_TO_PACKET_END) {
                if (!env->allow_ptr_leaks)
-                       verbose("R%d pointer arithmetic on PTR_TO_PACKET_END prohibited\n",
+                       verbose(env, "R%d pointer arithmetic on PTR_TO_PACKET_END prohibited\n",
                                dst);
                return -EACCES;
        }
@@ -1936,7 +1784,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
                if (dst_reg == off_reg) {
                        /* scalar -= pointer.  Creates an unknown scalar */
                        if (!env->allow_ptr_leaks)
-                               verbose("R%d tried to subtract pointer from scalar\n",
+                               verbose(env, "R%d tried to subtract pointer from scalar\n",
                                        dst);
                        return -EACCES;
                }
@@ -1946,7 +1794,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
                 */
                if (ptr_reg->type == PTR_TO_STACK) {
                        if (!env->allow_ptr_leaks)
-                               verbose("R%d subtraction from stack pointer prohibited\n",
+                               verbose(env, "R%d subtraction from stack pointer prohibited\n",
                                        dst);
                        return -EACCES;
                }
@@ -2001,13 +1849,13 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
                 * ptr &= ~3 which would reduce min_value by 3.)
                 */
                if (!env->allow_ptr_leaks)
-                       verbose("R%d bitwise operator %s on pointer prohibited\n",
+                       verbose(env, "R%d bitwise operator %s on pointer prohibited\n",
                                dst, bpf_alu_string[opcode >> 4]);
                return -EACCES;
        default:
                /* other operators (e.g. MUL,LSH) produce non-pointer results */
                if (!env->allow_ptr_leaks)
-                       verbose("R%d pointer arithmetic with %s operator prohibited\n",
+                       verbose(env, "R%d pointer arithmetic with %s operator prohibited\n",
                                dst, bpf_alu_string[opcode >> 4]);
                return -EACCES;
        }
@@ -2173,7 +2021,7 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
                        /* Shifts greater than 63 are undefined.  This includes
                         * shifts by a negative number.
                         */
-                       mark_reg_unknown(regs, insn->dst_reg);
+                       mark_reg_unknown(env, regs, insn->dst_reg);
                        break;
                }
                /* We lose all sign bit information (except what we can pick
@@ -2201,7 +2049,7 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
                        /* Shifts greater than 63 are undefined.  This includes
                         * shifts by a negative number.
                         */
-                       mark_reg_unknown(regs, insn->dst_reg);
+                       mark_reg_unknown(env, regs, insn->dst_reg);
                        break;
                }
                /* BPF_RSH is an unsigned shift, so make the appropriate casts */
@@ -2229,7 +2077,7 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
                __update_reg_bounds(dst_reg);
                break;
        default:
-               mark_reg_unknown(regs, insn->dst_reg);
+               mark_reg_unknown(env, regs, insn->dst_reg);
                break;
        }
 
@@ -2261,12 +2109,12 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env,
                                 * an arbitrary scalar.
                                 */
                                if (!env->allow_ptr_leaks) {
-                                       verbose("R%d pointer %s pointer prohibited\n",
+                                       verbose(env, "R%d pointer %s pointer prohibited\n",
                                                insn->dst_reg,
                                                bpf_alu_string[opcode >> 4]);
                                        return -EACCES;
                                }
-                               mark_reg_unknown(regs, insn->dst_reg);
+                               mark_reg_unknown(env, regs, insn->dst_reg);
                                return 0;
                        } else {
                                /* scalar += pointer
@@ -2318,13 +2166,13 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env,
 
        /* Got here implies adding two SCALAR_VALUEs */
        if (WARN_ON_ONCE(ptr_reg)) {
-               print_verifier_state(&env->cur_state);
-               verbose("verifier internal error: unexpected ptr_reg\n");
+               print_verifier_state(env, &env->cur_state);
+               verbose(env, "verifier internal error: unexpected ptr_reg\n");
                return -EINVAL;
        }
        if (WARN_ON(!src_reg)) {
-               print_verifier_state(&env->cur_state);
-               verbose("verifier internal error: no src_reg\n");
+               print_verifier_state(env, &env->cur_state);
+               verbose(env, "verifier internal error: no src_reg\n");
                return -EINVAL;
        }
        return adjust_scalar_min_max_vals(env, insn, dst_reg, *src_reg);
@@ -2342,14 +2190,14 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
                        if (BPF_SRC(insn->code) != 0 ||
                            insn->src_reg != BPF_REG_0 ||
                            insn->off != 0 || insn->imm != 0) {
-                               verbose("BPF_NEG uses reserved fields\n");
+                               verbose(env, "BPF_NEG uses reserved fields\n");
                                return -EINVAL;
                        }
                } else {
                        if (insn->src_reg != BPF_REG_0 || insn->off != 0 ||
                            (insn->imm != 16 && insn->imm != 32 && insn->imm != 64) ||
                            BPF_CLASS(insn->code) == BPF_ALU64) {
-                               verbose("BPF_END uses reserved fields\n");
+                               verbose(env, "BPF_END uses reserved fields\n");
                                return -EINVAL;
                        }
                }
@@ -2360,7 +2208,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
                        return err;
 
                if (is_pointer_value(env, insn->dst_reg)) {
-                       verbose("R%d pointer arithmetic prohibited\n",
+                       verbose(env, "R%d pointer arithmetic prohibited\n",
                                insn->dst_reg);
                        return -EACCES;
                }
@@ -2374,7 +2222,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
 
                if (BPF_SRC(insn->code) == BPF_X) {
                        if (insn->imm != 0 || insn->off != 0) {
-                               verbose("BPF_MOV uses reserved fields\n");
+                               verbose(env, "BPF_MOV uses reserved fields\n");
                                return -EINVAL;
                        }
 
@@ -2384,7 +2232,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
                                return err;
                } else {
                        if (insn->src_reg != BPF_REG_0 || insn->off != 0) {
-                               verbose("BPF_MOV uses reserved fields\n");
+                               verbose(env, "BPF_MOV uses reserved fields\n");
                                return -EINVAL;
                        }
                }
@@ -2400,14 +2248,16 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
                                 * copy register state to dest reg
                                 */
                                regs[insn->dst_reg] = regs[insn->src_reg];
+                               regs[insn->dst_reg].live |= REG_LIVE_WRITTEN;
                        } else {
                                /* R1 = (u32) R2 */
                                if (is_pointer_value(env, insn->src_reg)) {
-                                       verbose("R%d partial copy of pointer\n",
+                                       verbose(env,
+                                               "R%d partial copy of pointer\n",
                                                insn->src_reg);
                                        return -EACCES;
                                }
-                               mark_reg_unknown(regs, insn->dst_reg);
+                               mark_reg_unknown(env, regs, insn->dst_reg);
                                /* high 32 bits are known zero. */
                                regs[insn->dst_reg].var_off = tnum_cast(
                                                regs[insn->dst_reg].var_off, 4);
@@ -2422,14 +2272,14 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
                }
 
        } else if (opcode > BPF_END) {
-               verbose("invalid BPF_ALU opcode %x\n", opcode);
+               verbose(env, "invalid BPF_ALU opcode %x\n", opcode);
                return -EINVAL;
 
        } else {        /* all other ALU ops: and, sub, xor, add, ... */
 
                if (BPF_SRC(insn->code) == BPF_X) {
                        if (insn->imm != 0 || insn->off != 0) {
-                               verbose("BPF_ALU uses reserved fields\n");
+                               verbose(env, "BPF_ALU uses reserved fields\n");
                                return -EINVAL;
                        }
                        /* check src1 operand */
@@ -2438,7 +2288,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
                                return err;
                } else {
                        if (insn->src_reg != BPF_REG_0 || insn->off != 0) {
-                               verbose("BPF_ALU uses reserved fields\n");
+                               verbose(env, "BPF_ALU uses reserved fields\n");
                                return -EINVAL;
                        }
                }
@@ -2450,7 +2300,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
 
                if ((opcode == BPF_MOD || opcode == BPF_DIV) &&
                    BPF_SRC(insn->code) == BPF_K && insn->imm == 0) {
-                       verbose("div by zero\n");
+                       verbose(env, "div by zero\n");
                        return -EINVAL;
                }
 
@@ -2459,7 +2309,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
                        int size = BPF_CLASS(insn->code) == BPF_ALU64 ? 64 : 32;
 
                        if (insn->imm < 0 || insn->imm >= size) {
-                               verbose("invalid shift %d\n", insn->imm);
+                               verbose(env, "invalid shift %d\n", insn->imm);
                                return -EINVAL;
                        }
                }
@@ -2812,13 +2662,13 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
        int err;
 
        if (opcode > BPF_JSLE) {
-               verbose("invalid BPF_JMP opcode %x\n", opcode);
+               verbose(env, "invalid BPF_JMP opcode %x\n", opcode);
                return -EINVAL;
        }
 
        if (BPF_SRC(insn->code) == BPF_X) {
                if (insn->imm != 0) {
-                       verbose("BPF_JMP uses reserved fields\n");
+                       verbose(env, "BPF_JMP uses reserved fields\n");
                        return -EINVAL;
                }
 
@@ -2828,13 +2678,13 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
                        return err;
 
                if (is_pointer_value(env, insn->src_reg)) {
-                       verbose("R%d pointer comparison prohibited\n",
+                       verbose(env, "R%d pointer comparison prohibited\n",
                                insn->src_reg);
                        return -EACCES;
                }
        } else {
                if (insn->src_reg != BPF_REG_0) {
-                       verbose("BPF_JMP uses reserved fields\n");
+                       verbose(env, "BPF_JMP uses reserved fields\n");
                        return -EINVAL;
                }
        }
@@ -2946,11 +2796,12 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
                find_good_pkt_pointers(this_branch, &regs[insn->src_reg],
                                       PTR_TO_PACKET_META);
        } else if (is_pointer_value(env, insn->dst_reg)) {
-               verbose("R%d pointer comparison prohibited\n", insn->dst_reg);
+               verbose(env, "R%d pointer comparison prohibited\n",
+                       insn->dst_reg);
                return -EACCES;
        }
-       if (log_level)
-               print_verifier_state(this_branch);
+       if (env->log.level)
+               print_verifier_state(env, this_branch);
        return 0;
 }
 
@@ -2969,11 +2820,11 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn)
        int err;
 
        if (BPF_SIZE(insn->code) != BPF_DW) {
-               verbose("invalid BPF_LD_IMM insn\n");
+               verbose(env, "invalid BPF_LD_IMM insn\n");
                return -EINVAL;
        }
        if (insn->off != 0) {
-               verbose("BPF_LD_IMM64 uses reserved fields\n");
+               verbose(env, "BPF_LD_IMM64 uses reserved fields\n");
                return -EINVAL;
        }
 
@@ -3031,14 +2882,14 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn)
        int i, err;
 
        if (!may_access_skb(env->prog->type)) {
-               verbose("BPF_LD_[ABS|IND] instructions not allowed for this program type\n");
+               verbose(env, "BPF_LD_[ABS|IND] instructions not allowed for this program type\n");
                return -EINVAL;
        }
 
        if (insn->dst_reg != BPF_REG_0 || insn->off != 0 ||
            BPF_SIZE(insn->code) == BPF_DW ||
            (mode == BPF_ABS && insn->src_reg != BPF_REG_0)) {
-               verbose("BPF_LD_[ABS|IND] uses reserved fields\n");
+               verbose(env, "BPF_LD_[ABS|IND] uses reserved fields\n");
                return -EINVAL;
        }
 
@@ -3048,7 +2899,8 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn)
                return err;
 
        if (regs[BPF_REG_6].type != PTR_TO_CTX) {
-               verbose("at the time of BPF_LD_ABS|IND R6 != pointer to skb\n");
+               verbose(env,
+                       "at the time of BPF_LD_ABS|IND R6 != pointer to skb\n");
                return -EINVAL;
        }
 
@@ -3061,7 +2913,7 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn)
 
        /* reset caller saved regs to unreadable */
        for (i = 0; i < CALLER_SAVED_REGS; i++) {
-               mark_reg_not_init(regs, caller_saved[i]);
+               mark_reg_not_init(env, regs, caller_saved[i]);
                check_reg_arg(env, caller_saved[i], DST_OP_NO_MARK);
        }
 
@@ -3069,7 +2921,7 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn)
         * the value fetched from the packet.
         * Already marked as written above.
         */
-       mark_reg_unknown(regs, BPF_REG_0);
+       mark_reg_unknown(env, regs, BPF_REG_0);
        return 0;
 }
 
@@ -3089,22 +2941,22 @@ static int check_return_code(struct bpf_verifier_env *env)
 
        reg = &env->cur_state.regs[BPF_REG_0];
        if (reg->type != SCALAR_VALUE) {
-               verbose("At program exit the register R0 is not a known value (%s)\n",
+               verbose(env, "At program exit the register R0 is not a known value (%s)\n",
                        reg_type_str[reg->type]);
                return -EINVAL;
        }
 
        if (!tnum_in(range, reg->var_off)) {
-               verbose("At program exit the register R0 ");
+               verbose(env, "At program exit the register R0 ");
                if (!tnum_is_unknown(reg->var_off)) {
                        char tn_buf[48];
 
                        tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
-                       verbose("has value %s", tn_buf);
+                       verbose(env, "has value %s", tn_buf);
                } else {
-                       verbose("has unknown scalar value");
+                       verbose(env, "has unknown scalar value");
                }
-               verbose(" should have been 0 or 1\n");
+               verbose(env, " should have been 0 or 1\n");
                return -EINVAL;
        }
        return 0;
@@ -3170,7 +3022,7 @@ static int push_insn(int t, int w, int e, struct bpf_verifier_env *env)
                return 0;
 
        if (w < 0 || w >= env->prog->len) {
-               verbose("jump out of range from insn %d to %d\n", t, w);
+               verbose(env, "jump out of range from insn %d to %d\n", t, w);
                return -EINVAL;
        }
 
@@ -3187,13 +3039,13 @@ static int push_insn(int t, int w, int e, struct bpf_verifier_env *env)
                insn_stack[cur_stack++] = w;
                return 1;
        } else if ((insn_state[w] & 0xF0) == DISCOVERED) {
-               verbose("back-edge from insn %d to %d\n", t, w);
+               verbose(env, "back-edge from insn %d to %d\n", t, w);
                return -EINVAL;
        } else if (insn_state[w] == EXPLORED) {
                /* forward- or cross-edge */
                insn_state[t] = DISCOVERED | e;
        } else {
-               verbose("insn state internal bug\n");
+               verbose(env, "insn state internal bug\n");
                return -EFAULT;
        }
        return 0;
@@ -3287,7 +3139,7 @@ peek_stack:
 mark_explored:
        insn_state[t] = EXPLORED;
        if (cur_stack-- <= 0) {
-               verbose("pop stack internal bug\n");
+               verbose(env, "pop stack internal bug\n");
                ret = -EFAULT;
                goto err_free;
        }
@@ -3296,7 +3148,7 @@ mark_explored:
 check_state:
        for (i = 0; i < insn_cnt; i++) {
                if (insn_state[i] != EXPLORED) {
-                       verbose("unreachable insn %d\n", i);
+                       verbose(env, "unreachable insn %d\n", i);
                        ret = -EINVAL;
                        goto err_free;
                }
@@ -3677,7 +3529,7 @@ static int do_check(struct bpf_verifier_env *env)
        int insn_processed = 0;
        bool do_print_state = false;
 
-       init_reg_state(regs);
+       init_reg_state(env, regs);
        state->parent = NULL;
        insn_idx = 0;
        for (;;) {
@@ -3686,7 +3538,7 @@ static int do_check(struct bpf_verifier_env *env)
                int err;
 
                if (insn_idx >= insn_cnt) {
-                       verbose("invalid insn idx %d insn_cnt %d\n",
+                       verbose(env, "invalid insn idx %d insn_cnt %d\n",
                                insn_idx, insn_cnt);
                        return -EFAULT;
                }
@@ -3695,7 +3547,8 @@ static int do_check(struct bpf_verifier_env *env)
                class = BPF_CLASS(insn->code);
 
                if (++insn_processed > BPF_COMPLEXITY_LIMIT_INSNS) {
-                       verbose("BPF program is too large. Processed %d insn\n",
+                       verbose(env,
+                               "BPF program is too large. Processed %d insn\n",
                                insn_processed);
                        return -E2BIG;
                }
@@ -3705,12 +3558,12 @@ static int do_check(struct bpf_verifier_env *env)
                        return err;
                if (err == 1) {
                        /* found equivalent state, can prune the search */
-                       if (log_level) {
+                       if (env->log.level) {
                                if (do_print_state)
-                                       verbose("\nfrom %d to %d: safe\n",
+                                       verbose(env, "\nfrom %d to %d: safe\n",
                                                prev_insn_idx, insn_idx);
                                else
-                                       verbose("%d: safe\n", insn_idx);
+                                       verbose(env, "%d: safe\n", insn_idx);
                        }
                        goto process_bpf_exit;
                }
@@ -3718,19 +3571,20 @@ static int do_check(struct bpf_verifier_env *env)
                if (need_resched())
                        cond_resched();
 
-               if (log_level > 1 || (log_level && do_print_state)) {
-                       if (log_level > 1)
-                               verbose("%d:", insn_idx);
+               if (env->log.level > 1 || (env->log.level && do_print_state)) {
+                       if (env->log.level > 1)
+                               verbose(env, "%d:", insn_idx);
                        else
-                               verbose("\nfrom %d to %d:",
+                               verbose(env, "\nfrom %d to %d:",
                                        prev_insn_idx, insn_idx);
-                       print_verifier_state(&env->cur_state);
+                       print_verifier_state(env, &env->cur_state);
                        do_print_state = false;
                }
 
-               if (log_level) {
-                       verbose("%d: ", insn_idx);
-                       print_bpf_insn(env, insn);
+               if (env->log.level) {
+                       verbose(env, "%d: ", insn_idx);
+                       print_bpf_insn(verbose, env, insn,
+                                      env->allow_ptr_leaks);
                }
 
                err = ext_analyzer_insn_hook(env, insn_idx, prev_insn_idx);
@@ -3786,7 +3640,7 @@ static int do_check(struct bpf_verifier_env *env)
                                 * src_reg == stack|map in some other branch.
                                 * Reject it.
                                 */
-                               verbose("same insn cannot be used with different pointers\n");
+                               verbose(env, "same insn cannot be used with different pointers\n");
                                return -EINVAL;
                        }
 
@@ -3826,14 +3680,14 @@ static int do_check(struct bpf_verifier_env *env)
                        } else if (dst_reg_type != *prev_dst_type &&
                                   (dst_reg_type == PTR_TO_CTX ||
                                    *prev_dst_type == PTR_TO_CTX)) {
-                               verbose("same insn cannot be used with different pointers\n");
+                               verbose(env, "same insn cannot be used with different pointers\n");
                                return -EINVAL;
                        }
 
                } else if (class == BPF_ST) {
                        if (BPF_MODE(insn->code) != BPF_MEM ||
                            insn->src_reg != BPF_REG_0) {
-                               verbose("BPF_ST uses reserved fields\n");
+                               verbose(env, "BPF_ST uses reserved fields\n");
                                return -EINVAL;
                        }
                        /* check src operand */
@@ -3856,7 +3710,7 @@ static int do_check(struct bpf_verifier_env *env)
                                    insn->off != 0 ||
                                    insn->src_reg != BPF_REG_0 ||
                                    insn->dst_reg != BPF_REG_0) {
-                                       verbose("BPF_CALL uses reserved fields\n");
+                                       verbose(env, "BPF_CALL uses reserved fields\n");
                                        return -EINVAL;
                                }
 
@@ -3869,7 +3723,7 @@ static int do_check(struct bpf_verifier_env *env)
                                    insn->imm != 0 ||
                                    insn->src_reg != BPF_REG_0 ||
                                    insn->dst_reg != BPF_REG_0) {
-                                       verbose("BPF_JA uses reserved fields\n");
+                                       verbose(env, "BPF_JA uses reserved fields\n");
                                        return -EINVAL;
                                }
 
@@ -3881,7 +3735,7 @@ static int do_check(struct bpf_verifier_env *env)
                                    insn->imm != 0 ||
                                    insn->src_reg != BPF_REG_0 ||
                                    insn->dst_reg != BPF_REG_0) {
-                                       verbose("BPF_EXIT uses reserved fields\n");
+                                       verbose(env, "BPF_EXIT uses reserved fields\n");
                                        return -EINVAL;
                                }
 
@@ -3896,7 +3750,7 @@ static int do_check(struct bpf_verifier_env *env)
                                        return err;
 
                                if (is_pointer_value(env, BPF_REG_0)) {
-                                       verbose("R0 leaks addr as return value\n");
+                                       verbose(env, "R0 leaks addr as return value\n");
                                        return -EACCES;
                                }
 
@@ -3931,19 +3785,19 @@ process_bpf_exit:
 
                                insn_idx++;
                        } else {
-                               verbose("invalid BPF_LD mode\n");
+                               verbose(env, "invalid BPF_LD mode\n");
                                return -EINVAL;
                        }
                } else {
-                       verbose("unknown insn class %d\n", class);
+                       verbose(env, "unknown insn class %d\n", class);
                        return -EINVAL;
                }
 
                insn_idx++;
        }
 
-       verbose("processed %d insns, stack depth %d\n",
-               insn_processed, env->prog->aux->stack_depth);
+       verbose(env, "processed %d insns, stack depth %d\n", insn_processed,
+               env->prog->aux->stack_depth);
        return 0;
 }
 
@@ -3955,7 +3809,8 @@ static int check_map_prealloc(struct bpf_map *map)
                !(map->map_flags & BPF_F_NO_PREALLOC);
 }
 
-static int check_map_prog_compatibility(struct bpf_map *map,
+static int check_map_prog_compatibility(struct bpf_verifier_env *env,
+                                       struct bpf_map *map,
                                        struct bpf_prog *prog)
 
 {
@@ -3966,12 +3821,12 @@ static int check_map_prog_compatibility(struct bpf_map *map,
         */
        if (prog->type == BPF_PROG_TYPE_PERF_EVENT) {
                if (!check_map_prealloc(map)) {
-                       verbose("perf_event programs can only use preallocated hash map\n");
+                       verbose(env, "perf_event programs can only use preallocated hash map\n");
                        return -EINVAL;
                }
                if (map->inner_map_meta &&
                    !check_map_prealloc(map->inner_map_meta)) {
-                       verbose("perf_event programs can only use preallocated inner hash map\n");
+                       verbose(env, "perf_event programs can only use preallocated inner hash map\n");
                        return -EINVAL;
                }
        }
@@ -3994,14 +3849,14 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env)
        for (i = 0; i < insn_cnt; i++, insn++) {
                if (BPF_CLASS(insn->code) == BPF_LDX &&
                    (BPF_MODE(insn->code) != BPF_MEM || insn->imm != 0)) {
-                       verbose("BPF_LDX uses reserved fields\n");
+                       verbose(env, "BPF_LDX uses reserved fields\n");
                        return -EINVAL;
                }
 
                if (BPF_CLASS(insn->code) == BPF_STX &&
                    ((BPF_MODE(insn->code) != BPF_MEM &&
                      BPF_MODE(insn->code) != BPF_XADD) || insn->imm != 0)) {
-                       verbose("BPF_STX uses reserved fields\n");
+                       verbose(env, "BPF_STX uses reserved fields\n");
                        return -EINVAL;
                }
 
@@ -4012,7 +3867,7 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env)
                        if (i == insn_cnt - 1 || insn[1].code != 0 ||
                            insn[1].dst_reg != 0 || insn[1].src_reg != 0 ||
                            insn[1].off != 0) {
-                               verbose("invalid bpf_ld_imm64 insn\n");
+                               verbose(env, "invalid bpf_ld_imm64 insn\n");
                                return -EINVAL;
                        }
 
@@ -4021,19 +3876,20 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env)
                                goto next_insn;
 
                        if (insn->src_reg != BPF_PSEUDO_MAP_FD) {
-                               verbose("unrecognized bpf_ld_imm64 insn\n");
+                               verbose(env,
+                                       "unrecognized bpf_ld_imm64 insn\n");
                                return -EINVAL;
                        }
 
                        f = fdget(insn->imm);
                        map = __bpf_map_get(f);
                        if (IS_ERR(map)) {
-                               verbose("fd %d is not pointing to valid bpf_map\n",
+                               verbose(env, "fd %d is not pointing to valid bpf_map\n",
                                        insn->imm);
                                return PTR_ERR(map);
                        }
 
-                       err = check_map_prog_compatibility(map, env->prog);
+                       err = check_map_prog_compatibility(env, map, env->prog);
                        if (err) {
                                fdput(f);
                                return err;
@@ -4155,7 +4011,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
                cnt = ops->gen_prologue(insn_buf, env->seen_direct_write,
                                        env->prog);
                if (cnt >= ARRAY_SIZE(insn_buf)) {
-                       verbose("bpf verifier is misconfigured\n");
+                       verbose(env, "bpf verifier is misconfigured\n");
                        return -EINVAL;
                } else if (cnt) {
                        new_prog = bpf_patch_insn_data(env, 0, insn_buf, cnt);
@@ -4203,7 +4059,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
                        u8 size_code;
 
                        if (type == BPF_WRITE) {
-                               verbose("bpf verifier narrow ctx access misconfigured\n");
+                               verbose(env, "bpf verifier narrow ctx access misconfigured\n");
                                return -EINVAL;
                        }
 
@@ -4222,7 +4078,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
                                              &target_size);
                if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf) ||
                    (ctx_field_size && !target_size)) {
-                       verbose("bpf verifier is misconfigured\n");
+                       verbose(env, "bpf verifier is misconfigured\n");
                        return -EINVAL;
                }
 
@@ -4304,7 +4160,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
 
                        cnt = map_ptr->ops->map_gen_lookup(map_ptr, insn_buf);
                        if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf)) {
-                               verbose("bpf verifier is misconfigured\n");
+                               verbose(env, "bpf verifier is misconfigured\n");
                                return -EINVAL;
                        }
 
@@ -4348,7 +4204,8 @@ patch_call_imm:
                 * programs to call them, must be real in-kernel functions
                 */
                if (!fn->func) {
-                       verbose("kernel subsystem misconfigured func %s#%d\n",
+                       verbose(env,
+                               "kernel subsystem misconfigured func %s#%d\n",
                                func_id_name(insn->imm), insn->imm);
                        return -EFAULT;
                }
@@ -4382,8 +4239,8 @@ static void free_states(struct bpf_verifier_env *env)
 
 int bpf_check(struct bpf_prog **prog, union bpf_attr *attr)
 {
-       char __user *log_ubuf = NULL;
        struct bpf_verifier_env *env;
+       struct bpf_verifer_log *log;
        int ret = -EINVAL;
 
        /* 'struct bpf_verifier_env' can be global, but since it's not small,
@@ -4392,6 +4249,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr)
        env = kzalloc(sizeof(struct bpf_verifier_env), GFP_KERNEL);
        if (!env)
                return -ENOMEM;
+       log = &env->log;
 
        env->insn_aux_data = vzalloc(sizeof(struct bpf_insn_aux_data) *
                                     (*prog)->len);
@@ -4407,23 +4265,15 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr)
                /* user requested verbose verifier output
                 * and supplied buffer to store the verification trace
                 */
-               log_level = attr->log_level;
-               log_ubuf = (char __user *) (unsigned long) attr->log_buf;
-               log_size = attr->log_size;
-               log_len = 0;
+               log->level = attr->log_level;
+               log->ubuf = (char __user *) (unsigned long) attr->log_buf;
+               log->len_total = attr->log_size;
 
                ret = -EINVAL;
-               /* log_* values have to be sane */
-               if (log_size < 128 || log_size > UINT_MAX >> 8 ||
-                   log_level == 0 || log_ubuf == NULL)
-                       goto err_unlock;
-
-               ret = -ENOMEM;
-               log_buf = vmalloc(log_size);
-               if (!log_buf)
+               /* log attributes have to be sane */
+               if (log->len_total < 128 || log->len_total > UINT_MAX >> 8 ||
+                   !log->level || !log->ubuf)
                        goto err_unlock;
-       } else {
-               log_level = 0;
        }
 
        env->strict_alignment = !!(attr->prog_flags & BPF_F_STRICT_ALIGNMENT);
@@ -4460,17 +4310,11 @@ skip_full_check:
        if (ret == 0)
                ret = fixup_bpf_calls(env);
 
-       if (log_level && log_len >= log_size - 1) {
-               BUG_ON(log_len >= log_size);
-               /* verifier log exceeded user supplied buffer */
+       if (log->level && bpf_verifier_log_full(log))
                ret = -ENOSPC;
-               /* fall through to return what was recorded */
-       }
-
-       /* copy verifier log back to user space including trailing zero */
-       if (log_level && copy_to_user(log_ubuf, log_buf, log_len + 1) != 0) {
+       if (log->level && !log->ubuf) {
                ret = -EFAULT;
-               goto free_log_buf;
+               goto err_release_maps;
        }
 
        if (ret == 0 && env->used_map_cnt) {
@@ -4481,7 +4325,7 @@ skip_full_check:
 
                if (!env->prog->aux->used_maps) {
                        ret = -ENOMEM;
-                       goto free_log_buf;
+                       goto err_release_maps;
                }
 
                memcpy(env->prog->aux->used_maps, env->used_maps,
@@ -4494,9 +4338,7 @@ skip_full_check:
                convert_pseudo_ld_imm64(env);
        }
 
-free_log_buf:
-       if (log_level)
-               vfree(log_buf);
+err_release_maps:
        if (!env->prog->aux->used_maps)
                /* if we didn't copy map pointers into bpf_prog_info, release
                 * them now. Otherwise free_bpf_prog_info() will release them.
@@ -4533,8 +4375,6 @@ int bpf_analyzer(struct bpf_prog *prog, const struct bpf_ext_analyzer_ops *ops,
        /* grab the mutex to protect few globals used by verifier */
        mutex_lock(&bpf_verifier_lock);
 
-       log_level = 0;
-
        env->strict_alignment = false;
        if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS))
                env->strict_alignment = true;
index 8de11a2..d851df2 100644
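
The verifier hunks above thread the verifier environment through every verbose() call, so the log state (level, user buffer, total length) lives in env->log instead of file-scope globals, and bpf_check() no longer vmallocs a staging buffer for the whole trace. A rough, hedged sketch of what such an env-scoped writer can look like; field names beyond the level/ubuf/len_total visible in this diff (e.g. kbuf, len_used) are assumptions, not necessarily the kernel's actual layout:

struct bpf_verifer_log {
	u32 level;
	char kbuf[1024];	/* assumed small staging buffer */
	char __user *ubuf;	/* user buffer supplied via bpf_attr */
	u32 len_used;		/* assumed running length counter */
	u32 len_total;
};

static bool bpf_verifier_log_full(const struct bpf_verifer_log *log)
{
	/* leave room for the trailing NUL copied below */
	return log->len_used >= log->len_total - 1;
}

__printf(2, 3) static void verbose(struct bpf_verifier_env *env,
				   const char *fmt, ...)
{
	struct bpf_verifer_log *log = &env->log;
	unsigned int n;
	va_list args;

	if (!log->level || !log->ubuf || bpf_verifier_log_full(log))
		return;

	va_start(args, fmt);
	n = vscnprintf(log->kbuf, sizeof(log->kbuf), fmt, args);
	va_end(args);

	/* Stream each chunk (plus NUL) straight to the user buffer; on a
	 * failed copy, clear ubuf so bpf_check() can return -EFAULT, which
	 * is what the "log->level && !log->ubuf" check near the end relies on.
	 */
	if (copy_to_user(log->ubuf + log->len_used, log->kbuf, n + 1))
		log->ubuf = NULL;
	else
		log->len_used += n;
}
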
@@ -24,6 +24,7 @@
 #include <linux/lockdep.h>
 #include <linux/tick.h>
 #include <linux/irq.h>
+#include <linux/nmi.h>
 #include <linux/smpboot.h>
 #include <linux/relay.h>
 #include <linux/slab.h>
@@ -897,6 +898,11 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen,
 
 out:
        cpus_write_unlock();
+       /*
+        * Do post unplug cleanup. This is still protected against
+        * concurrent CPU hotplug via cpu_add_remove_lock.
+        */
+       lockup_detector_cleanup();
        return ret;
 }
 
index 6bc21e2..902149f 100644
@@ -3684,10 +3684,12 @@ static inline u64 perf_event_count(struct perf_event *event)
  *     will not be local and we cannot read them atomically
  *   - must not have a pmu::count method
  */
-int perf_event_read_local(struct perf_event *event, u64 *value)
+int perf_event_read_local(struct perf_event *event, u64 *value,
+                         u64 *enabled, u64 *running)
 {
        unsigned long flags;
        int ret = 0;
+       u64 now;
 
        /*
         * Disabling interrupts avoids all counter scheduling (context
@@ -3718,13 +3720,21 @@ int perf_event_read_local(struct perf_event *event, u64 *value)
                goto out;
        }
 
+       now = event->shadow_ctx_time + perf_clock();
+       if (enabled)
+               *enabled = now - event->tstamp_enabled;
        /*
         * If the event is currently on this CPU, its either a per-task event,
         * or local to this CPU. Furthermore it means its ACTIVE (otherwise
         * oncpu == -1).
         */
-       if (event->oncpu == smp_processor_id())
+       if (event->oncpu == smp_processor_id()) {
                event->pmu->read(event);
+               if (running)
+                       *running = now - event->tstamp_running;
+       } else if (running) {
+               *running = event->total_time_running;
+       }
 
        *value = local64_read(&event->count);
 out:
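
perf_event_read_local() now takes optional enabled/running output pointers in addition to the counter value; either may be NULL when the caller only wants the raw count, which is what bpf_perf_event_read() does further down in this diff. A minimal illustrative caller (the function name and error handling here are assumptions, shown only to make the new calling convention concrete):

static int example_read_local(struct perf_event *event, u64 *scaled)
{
	u64 value, enabled, running;
	int err;

	/* Ask for the counter plus its enabled/running times. */
	err = perf_event_read_local(event, &value, &enabled, &running);
	if (err)
		return err;

	/* Scale for time the event was not actually on the PMU
	 * (multiplexing); running is 0 only if the event never ran.
	 */
	*scaled = running ? div64_u64(value * enabled, running) : value;
	return 0;
}
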
@@ -8072,6 +8082,7 @@ static void bpf_overflow_handler(struct perf_event *event,
        struct bpf_perf_event_data_kern ctx = {
                .data = data,
                .regs = regs,
+               .event = event,
        };
        int ret = 0;
 
index 1d71c05..5043e74 100644
@@ -344,39 +344,30 @@ EXPORT_SYMBOL_GPL(smpboot_unregister_percpu_thread);
  * by the client, but only by calling this function.
  * This function can only be called on a registered smp_hotplug_thread.
  */
-int smpboot_update_cpumask_percpu_thread(struct smp_hotplug_thread *plug_thread,
-                                        const struct cpumask *new)
+void smpboot_update_cpumask_percpu_thread(struct smp_hotplug_thread *plug_thread,
+                                         const struct cpumask *new)
 {
        struct cpumask *old = plug_thread->cpumask;
-       cpumask_var_t tmp;
+       static struct cpumask tmp;
        unsigned int cpu;
 
-       if (!alloc_cpumask_var(&tmp, GFP_KERNEL))
-               return -ENOMEM;
-
-       get_online_cpus();
+       lockdep_assert_cpus_held();
        mutex_lock(&smpboot_threads_lock);
 
        /* Park threads that were exclusively enabled on the old mask. */
-       cpumask_andnot(tmp, old, new);
-       for_each_cpu_and(cpu, tmp, cpu_online_mask)
+       cpumask_andnot(&tmp, old, new);
+       for_each_cpu_and(cpu, &tmp, cpu_online_mask)
                smpboot_park_thread(plug_thread, cpu);
 
        /* Unpark threads that are exclusively enabled on the new mask. */
-       cpumask_andnot(tmp, new, old);
-       for_each_cpu_and(cpu, tmp, cpu_online_mask)
+       cpumask_andnot(&tmp, new, old);
+       for_each_cpu_and(cpu, &tmp, cpu_online_mask)
                smpboot_unpark_thread(plug_thread, cpu);
 
        cpumask_copy(old, new);
 
        mutex_unlock(&smpboot_threads_lock);
-       put_online_cpus();
-
-       free_cpumask_var(tmp);
-
-       return 0;
 }
-EXPORT_SYMBOL_GPL(smpboot_update_cpumask_percpu_thread);
 
 static DEFINE_PER_CPU(atomic_t, cpu_hotplug_state) = ATOMIC_INIT(CPU_POST_DEAD);
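
smpboot_update_cpumask_percpu_thread() no longer takes the hotplug lock or allocates a temporary mask itself; it now asserts lockdep_assert_cpus_held() and returns void, so callers must hold the CPU hotplug lock across the update. A hedged sketch of the expected caller pattern (the wrapper name is illustrative only):

/* Illustrative wrapper; the locking requirement is the point here. */
static void example_set_thread_mask(struct smp_hotplug_thread *thread,
				    const struct cpumask *new_mask)
{
	cpus_read_lock();	/* satisfies lockdep_assert_cpus_held() */
	smpboot_update_cpumask_percpu_thread(thread, new_mask);
	cpus_read_unlock();
}
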
 
index 4da9e62..d9c31bc 100644
@@ -872,9 +872,9 @@ static struct ctl_table kern_table[] = {
 #if defined(CONFIG_LOCKUP_DETECTOR)
        {
                .procname       = "watchdog",
-               .data           = &watchdog_user_enabled,
-               .maxlen         = sizeof (int),
-               .mode           = 0644,
+               .data           = &watchdog_user_enabled,
+               .maxlen         = sizeof(int),
+               .mode           = 0644,
                .proc_handler   = proc_watchdog,
                .extra1         = &zero,
                .extra2         = &one,
@@ -890,16 +890,12 @@ static struct ctl_table kern_table[] = {
        },
        {
                .procname       = "nmi_watchdog",
-               .data           = &nmi_watchdog_enabled,
-               .maxlen         = sizeof (int),
-               .mode           = 0644,
+               .data           = &nmi_watchdog_user_enabled,
+               .maxlen         = sizeof(int),
+               .mode           = NMI_WATCHDOG_SYSCTL_PERM,
                .proc_handler   = proc_nmi_watchdog,
                .extra1         = &zero,
-#if defined(CONFIG_HAVE_NMI_WATCHDOG) || defined(CONFIG_HARDLOCKUP_DETECTOR)
                .extra2         = &one,
-#else
-               .extra2         = &zero,
-#endif
        },
        {
                .procname       = "watchdog_cpumask",
@@ -911,9 +907,9 @@ static struct ctl_table kern_table[] = {
 #ifdef CONFIG_SOFTLOCKUP_DETECTOR
        {
                .procname       = "soft_watchdog",
-               .data           = &soft_watchdog_enabled,
-               .maxlen         = sizeof (int),
-               .mode           = 0644,
+               .data           = &soft_watchdog_user_enabled,
+               .maxlen         = sizeof(int),
+               .mode           = 0644,
                .proc_handler   = proc_soft_watchdog,
                .extra1         = &zero,
                .extra2         = &one,
index dc498b6..04ea531 100644
@@ -255,14 +255,14 @@ const struct bpf_func_proto *bpf_get_trace_printk_proto(void)
        return &bpf_trace_printk_proto;
 }
 
-BPF_CALL_2(bpf_perf_event_read, struct bpf_map *, map, u64, flags)
+static __always_inline int
+get_map_perf_counter(struct bpf_map *map, u64 flags,
+                    u64 *value, u64 *enabled, u64 *running)
 {
        struct bpf_array *array = container_of(map, struct bpf_array, map);
        unsigned int cpu = smp_processor_id();
        u64 index = flags & BPF_F_INDEX_MASK;
        struct bpf_event_entry *ee;
-       u64 value = 0;
-       int err;
 
        if (unlikely(flags & ~(BPF_F_INDEX_MASK)))
                return -EINVAL;
@@ -275,7 +275,15 @@ BPF_CALL_2(bpf_perf_event_read, struct bpf_map *, map, u64, flags)
        if (!ee)
                return -ENOENT;
 
-       err = perf_event_read_local(ee->event, &value);
+       return perf_event_read_local(ee->event, value, enabled, running);
+}
+
+BPF_CALL_2(bpf_perf_event_read, struct bpf_map *, map, u64, flags)
+{
+       u64 value = 0;
+       int err;
+
+       err = get_map_perf_counter(map, flags, &value, NULL, NULL);
        /*
         * this api is ugly since we miss [-22..-2] range of valid
         * counter values, but that's uapi
@@ -293,6 +301,33 @@ static const struct bpf_func_proto bpf_perf_event_read_proto = {
        .arg2_type      = ARG_ANYTHING,
 };
 
+BPF_CALL_4(bpf_perf_event_read_value, struct bpf_map *, map, u64, flags,
+          struct bpf_perf_event_value *, buf, u32, size)
+{
+       int err = -EINVAL;
+
+       if (unlikely(size != sizeof(struct bpf_perf_event_value)))
+               goto clear;
+       err = get_map_perf_counter(map, flags, &buf->counter, &buf->enabled,
+                                  &buf->running);
+       if (unlikely(err))
+               goto clear;
+       return 0;
+clear:
+       memset(buf, 0, size);
+       return err;
+}
+
+static const struct bpf_func_proto bpf_perf_event_read_value_proto = {
+       .func           = bpf_perf_event_read_value,
+       .gpl_only       = true,
+       .ret_type       = RET_INTEGER,
+       .arg1_type      = ARG_CONST_MAP_PTR,
+       .arg2_type      = ARG_ANYTHING,
+       .arg3_type      = ARG_PTR_TO_UNINIT_MEM,
+       .arg4_type      = ARG_CONST_SIZE,
+};
+
 static DEFINE_PER_CPU(struct perf_sample_data, bpf_sd);
 
 static __always_inline u64
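
bpf_perf_event_read_value() is the map-based counterpart of bpf_perf_event_read(): instead of returning only the raw count, it fills a struct bpf_perf_event_value (counter, enabled, running) and zeroes the buffer on error. A hedged sample of how a kprobe program might use it; the map layout, section names and the helper wrapper declarations (samples/bpf bpf_helpers.h style) are assumptions:

#include <linux/bpf.h>
#include <linux/ptrace.h>
#include "bpf_helpers.h"	/* assumed: SEC() and helper wrappers */

struct bpf_map_def SEC("maps") counters = {
	.type		= BPF_MAP_TYPE_PERF_EVENT_ARRAY,
	.key_size	= sizeof(int),
	.value_size	= sizeof(__u32),
	.max_entries	= 64,	/* one slot per possible CPU; size assumed */
};

SEC("kprobe/sys_write")
int read_counter(struct pt_regs *ctx)
{
	struct bpf_perf_event_value buf = {};

	/* Read the counter in the current CPU's slot, together with its
	 * enabled/running times for multiplexing-aware scaling.
	 */
	if (bpf_perf_event_read_value(&counters, BPF_F_CURRENT_CPU,
				      &buf, sizeof(buf)))
		return 0;

	/* buf.counter, buf.enabled and buf.running are now valid. */
	return 0;
}

char _license[] SEC("license") = "GPL";	/* the helper is gpl_only */
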
@@ -499,6 +534,8 @@ static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func
                return &bpf_perf_event_output_proto;
        case BPF_FUNC_get_stackid:
                return &bpf_get_stackid_proto;
+       case BPF_FUNC_perf_event_read_value:
+               return &bpf_perf_event_read_value_proto;
        default:
                return tracing_func_proto(func_id);
        }
@@ -576,6 +613,32 @@ static const struct bpf_func_proto bpf_get_stackid_proto_tp = {
        .arg3_type      = ARG_ANYTHING,
 };
 
+BPF_CALL_3(bpf_perf_prog_read_value_tp, struct bpf_perf_event_data_kern *, ctx,
+          struct bpf_perf_event_value *, buf, u32, size)
+{
+       int err = -EINVAL;
+
+       if (unlikely(size != sizeof(struct bpf_perf_event_value)))
+               goto clear;
+       err = perf_event_read_local(ctx->event, &buf->counter, &buf->enabled,
+                                   &buf->running);
+       if (unlikely(err))
+               goto clear;
+       return 0;
+clear:
+       memset(buf, 0, size);
+       return err;
+}
+
+static const struct bpf_func_proto bpf_perf_prog_read_value_proto_tp = {
+         .func           = bpf_perf_prog_read_value_tp,
+         .gpl_only       = true,
+         .ret_type       = RET_INTEGER,
+         .arg1_type      = ARG_PTR_TO_CTX,
+         .arg2_type      = ARG_PTR_TO_UNINIT_MEM,
+         .arg3_type      = ARG_CONST_SIZE,
+};
+
 static const struct bpf_func_proto *tp_prog_func_proto(enum bpf_func_id func_id)
 {
        switch (func_id) {
@@ -583,6 +646,8 @@ static const struct bpf_func_proto *tp_prog_func_proto(enum bpf_func_id func_id)
                return &bpf_perf_event_output_proto_tp;
        case BPF_FUNC_get_stackid:
                return &bpf_get_stackid_proto_tp;
+       case BPF_FUNC_perf_prog_read_value:
+               return &bpf_perf_prog_read_value_proto_tp;
        default:
                return tracing_func_proto(func_id);
        }
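
bpf_perf_prog_read_value() (wired up via bpf_perf_prog_read_value_proto_tp above) does the same for the event that triggered the program itself, using the event pointer now stashed in bpf_perf_event_data_kern by bpf_overflow_handler(). A brief hedged sketch for a perf-event-attached program, which shares this helper set; section name and wrappers are again assumptions:

#include <linux/bpf.h>
#include <linux/bpf_perf_event.h>
#include "bpf_helpers.h"	/* assumed wrappers, as above */

SEC("perf_event")
int on_sample(struct bpf_perf_event_data *ctx)
{
	struct bpf_perf_event_value buf = {};
	__u64 scaled;

	/* Read the triggering event's counter plus enabled/running times. */
	if (bpf_perf_prog_read_value(ctx, &buf, sizeof(buf)))
		return 0;

	/* Typical use: scale the count for time the event was multiplexed out. */
	scaled = buf.running ? buf.counter * buf.enabled / buf.running
			     : buf.counter;

	/* Placeholder use; a real program would stash 'scaled' in a map.
	 * Returning non-zero lets the event's original overflow handler run.
	 */
	return scaled != 0;
}

char _license[] SEC("license") = "GPL";
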
index f5d5202..6bcb854 100644
 #include <linux/kvm_para.h>
 #include <linux/kthread.h>
 
-/* Watchdog configuration */
-static DEFINE_MUTEX(watchdog_proc_mutex);
-
-int __read_mostly nmi_watchdog_enabled;
+static DEFINE_MUTEX(watchdog_mutex);
 
 #if defined(CONFIG_HARDLOCKUP_DETECTOR) || defined(CONFIG_HAVE_NMI_WATCHDOG)
-unsigned long __read_mostly watchdog_enabled = SOFT_WATCHDOG_ENABLED |
-                                               NMI_WATCHDOG_ENABLED;
+# define WATCHDOG_DEFAULT      (SOFT_WATCHDOG_ENABLED | NMI_WATCHDOG_ENABLED)
+# define NMI_WATCHDOG_DEFAULT  1
 #else
-unsigned long __read_mostly watchdog_enabled = SOFT_WATCHDOG_ENABLED;
+# define WATCHDOG_DEFAULT      (SOFT_WATCHDOG_ENABLED)
+# define NMI_WATCHDOG_DEFAULT  0
 #endif
 
+unsigned long __read_mostly watchdog_enabled;
+int __read_mostly watchdog_user_enabled = 1;
+int __read_mostly nmi_watchdog_user_enabled = NMI_WATCHDOG_DEFAULT;
+int __read_mostly soft_watchdog_user_enabled = 1;
+int __read_mostly watchdog_thresh = 10;
+int __read_mostly nmi_watchdog_available;
+
+struct cpumask watchdog_allowed_mask __read_mostly;
+
+struct cpumask watchdog_cpumask __read_mostly;
+unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask);
+
 #ifdef CONFIG_HARDLOCKUP_DETECTOR
-/* boot commands */
 /*
  * Should we panic when a soft-lockup or hard-lockup occurs:
  */
@@ -56,9 +65,9 @@ unsigned int __read_mostly hardlockup_panic =
  * kernel command line parameters are parsed, because otherwise it is not
  * possible to override this in hardlockup_panic_setup().
  */
-void hardlockup_detector_disable(void)
+void __init hardlockup_detector_disable(void)
 {
-       watchdog_enabled &= ~NMI_WATCHDOG_ENABLED;
+       nmi_watchdog_user_enabled = 0;
 }
 
 static int __init hardlockup_panic_setup(char *str)
@@ -68,48 +77,24 @@ static int __init hardlockup_panic_setup(char *str)
        else if (!strncmp(str, "nopanic", 7))
                hardlockup_panic = 0;
        else if (!strncmp(str, "0", 1))
-               watchdog_enabled &= ~NMI_WATCHDOG_ENABLED;
+               nmi_watchdog_user_enabled = 0;
        else if (!strncmp(str, "1", 1))
-               watchdog_enabled |= NMI_WATCHDOG_ENABLED;
+               nmi_watchdog_user_enabled = 1;
        return 1;
 }
 __setup("nmi_watchdog=", hardlockup_panic_setup);
 
-#endif
-
-#ifdef CONFIG_SOFTLOCKUP_DETECTOR
-int __read_mostly soft_watchdog_enabled;
-#endif
-
-int __read_mostly watchdog_user_enabled;
-int __read_mostly watchdog_thresh = 10;
-
-#ifdef CONFIG_SMP
-int __read_mostly sysctl_softlockup_all_cpu_backtrace;
+# ifdef CONFIG_SMP
 int __read_mostly sysctl_hardlockup_all_cpu_backtrace;
-#endif
-struct cpumask watchdog_cpumask __read_mostly;
-unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask);
 
-/*
- * The 'watchdog_running' variable is set to 1 when the watchdog threads
- * are registered/started and is set to 0 when the watchdog threads are
- * unregistered/stopped, so it is an indicator whether the threads exist.
- */
-static int __read_mostly watchdog_running;
-/*
- * If a subsystem has a need to deactivate the watchdog temporarily, it
- * can use the suspend/resume interface to achieve this. The content of
- * the 'watchdog_suspended' variable reflects this state. Existing threads
- * are parked/unparked by the lockup_detector_{suspend|resume} functions
- * (see comment blocks pertaining to those functions for further details).
- *
- * 'watchdog_suspended' also prevents threads from being registered/started
- * or unregistered/stopped via parameters in /proc/sys/kernel, so the state
- * of 'watchdog_running' cannot change while the watchdog is deactivated
- * temporarily (see related code in 'proc' handlers).
- */
-int __read_mostly watchdog_suspended;
+static int __init hardlockup_all_cpu_backtrace_setup(char *str)
+{
+       sysctl_hardlockup_all_cpu_backtrace = !!simple_strtol(str, NULL, 0);
+       return 1;
+}
+__setup("hardlockup_all_cpu_backtrace=", hardlockup_all_cpu_backtrace_setup);
+# endif /* CONFIG_SMP */
+#endif /* CONFIG_HARDLOCKUP_DETECTOR */
 
 /*
  * These functions can be overridden if an architecture implements its
@@ -121,36 +106,68 @@ int __read_mostly watchdog_suspended;
  */
 int __weak watchdog_nmi_enable(unsigned int cpu)
 {
+       hardlockup_detector_perf_enable();
        return 0;
 }
+
 void __weak watchdog_nmi_disable(unsigned int cpu)
 {
+       hardlockup_detector_perf_disable();
 }
 
-/*
- * watchdog_nmi_reconfigure can be implemented to be notified after any
- * watchdog configuration change. The arch hardlockup watchdog should
- * respond to the following variables:
- * - nmi_watchdog_enabled
+/* Return 0 if an NMI watchdog is available. Error code otherwise */
+int __weak __init watchdog_nmi_probe(void)
+{
+       return hardlockup_detector_perf_init();
+}
+
+/**
+ * watchdog_nmi_stop - Stop the watchdog for reconfiguration
+ *
+ * The reconfiguration steps are:
+ * watchdog_nmi_stop();
+ * update_variables();
+ * watchdog_nmi_start();
+ */
+void __weak watchdog_nmi_stop(void) { }
+
+/**
+ * watchdog_nmi_start - Start the watchdog after reconfiguration
+ *
+ * Counterpart to watchdog_nmi_stop().
+ *
+ * The following variables have been updated in update_variables() and
+ * contain the currently valid configuration:
+ * - watchdog_enabled
  * - watchdog_thresh
  * - watchdog_cpumask
- * - sysctl_hardlockup_all_cpu_backtrace
- * - hardlockup_panic
- * - watchdog_suspended
  */
-void __weak watchdog_nmi_reconfigure(void)
+void __weak watchdog_nmi_start(void) { }
+
+/**
+ * lockup_detector_update_enable - Update the sysctl enable bit
+ *
+ * Caller needs to make sure that the NMI/perf watchdogs are off, so this
+ * can't race with watchdog_nmi_disable().
+ */
+static void lockup_detector_update_enable(void)
 {
+       watchdog_enabled = 0;
+       if (!watchdog_user_enabled)
+               return;
+       if (nmi_watchdog_available && nmi_watchdog_user_enabled)
+               watchdog_enabled |= NMI_WATCHDOG_ENABLED;
+       if (soft_watchdog_user_enabled)
+               watchdog_enabled |= SOFT_WATCHDOG_ENABLED;
 }
 
-
 #ifdef CONFIG_SOFTLOCKUP_DETECTOR
 
-/* Helper for online, unparked cpus. */
-#define for_each_watchdog_cpu(cpu) \
-       for_each_cpu_and((cpu), cpu_online_mask, &watchdog_cpumask)
-
-atomic_t watchdog_park_in_progress = ATOMIC_INIT(0);
+/* Global variables, exported for sysctl */
+unsigned int __read_mostly softlockup_panic =
+                       CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE;
 
+static bool softlockup_threads_initialized __read_mostly;
 static u64 __read_mostly sample_period;
 
 static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts);
@@ -164,50 +181,40 @@ static DEFINE_PER_CPU(struct task_struct *, softlockup_task_ptr_saved);
 static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved);
 static unsigned long soft_lockup_nmi_warn;
 
-unsigned int __read_mostly softlockup_panic =
-                       CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE;
-
 static int __init softlockup_panic_setup(char *str)
 {
        softlockup_panic = simple_strtoul(str, NULL, 0);
-
        return 1;
 }
 __setup("softlockup_panic=", softlockup_panic_setup);
 
 static int __init nowatchdog_setup(char *str)
 {
-       watchdog_enabled = 0;
+       watchdog_user_enabled = 0;
        return 1;
 }
 __setup("nowatchdog", nowatchdog_setup);
 
 static int __init nosoftlockup_setup(char *str)
 {
-       watchdog_enabled &= ~SOFT_WATCHDOG_ENABLED;
+       soft_watchdog_user_enabled = 0;
        return 1;
 }
 __setup("nosoftlockup", nosoftlockup_setup);
 
 #ifdef CONFIG_SMP
+int __read_mostly sysctl_softlockup_all_cpu_backtrace;
+
 static int __init softlockup_all_cpu_backtrace_setup(char *str)
 {
-       sysctl_softlockup_all_cpu_backtrace =
-               !!simple_strtol(str, NULL, 0);
+       sysctl_softlockup_all_cpu_backtrace = !!simple_strtol(str, NULL, 0);
        return 1;
 }
 __setup("softlockup_all_cpu_backtrace=", softlockup_all_cpu_backtrace_setup);
-#ifdef CONFIG_HARDLOCKUP_DETECTOR
-static int __init hardlockup_all_cpu_backtrace_setup(char *str)
-{
-       sysctl_hardlockup_all_cpu_backtrace =
-               !!simple_strtol(str, NULL, 0);
-       return 1;
-}
-__setup("hardlockup_all_cpu_backtrace=", hardlockup_all_cpu_backtrace_setup);
-#endif
 #endif
 
+static void __lockup_detector_cleanup(void);
+
 /*
  * Hard-lockup warnings should be triggered after just a few seconds. Soft-
  * lockups can have false positives under extreme conditions. So we generally
@@ -278,11 +285,15 @@ void touch_all_softlockup_watchdogs(void)
        int cpu;
 
        /*
-        * this is done lockless
-        * do we care if a 0 races with a timestamp?
-        * all it means is the softlock check starts one cycle later
+        * watchdog_mutex cannot be taken here, as this might be called
+        * from (soft)interrupt context, so the access to
+        * watchdog_allowed_mask might race with a concurrent update.
+        *
+        * The watchdog time stamp can race against a concurrent real
+        * update as well, the only side effect might be a cycle delay for
+        * the softlockup check.
         */
-       for_each_watchdog_cpu(cpu)
+       for_each_cpu(cpu, &watchdog_allowed_mask)
                per_cpu(watchdog_touch_ts, cpu) = 0;
        wq_watchdog_touch(-1);
 }
@@ -322,9 +333,6 @@ static void watchdog_interrupt_count(void)
        __this_cpu_inc(hrtimer_interrupts);
 }
 
-static int watchdog_enable_all_cpus(void);
-static void watchdog_disable_all_cpus(void);
-
 /* watchdog kicker functions */
 static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
 {
@@ -333,7 +341,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
        int duration;
        int softlockup_all_cpu_backtrace = sysctl_softlockup_all_cpu_backtrace;
 
-       if (atomic_read(&watchdog_park_in_progress) != 0)
+       if (!watchdog_enabled)
                return HRTIMER_NORESTART;
 
        /* kick the hardlockup detector */
@@ -447,32 +455,38 @@ static void watchdog_set_prio(unsigned int policy, unsigned int prio)
 
 static void watchdog_enable(unsigned int cpu)
 {
-       struct hrtimer *hrtimer = raw_cpu_ptr(&watchdog_hrtimer);
+       struct hrtimer *hrtimer = this_cpu_ptr(&watchdog_hrtimer);
 
-       /* kick off the timer for the hardlockup detector */
+       /*
+        * Start the timer first to prevent the NMI watchdog triggering
+        * before the timer has a chance to fire.
+        */
        hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
        hrtimer->function = watchdog_timer_fn;
-
-       /* Enable the perf event */
-       watchdog_nmi_enable(cpu);
-
-       /* done here because hrtimer_start can only pin to smp_processor_id() */
        hrtimer_start(hrtimer, ns_to_ktime(sample_period),
                      HRTIMER_MODE_REL_PINNED);
 
-       /* initialize timestamp */
-       watchdog_set_prio(SCHED_FIFO, MAX_RT_PRIO - 1);
+       /* Initialize timestamp */
        __touch_watchdog();
+       /* Enable the perf event */
+       if (watchdog_enabled & NMI_WATCHDOG_ENABLED)
+               watchdog_nmi_enable(cpu);
+
+       watchdog_set_prio(SCHED_FIFO, MAX_RT_PRIO - 1);
 }
 
 static void watchdog_disable(unsigned int cpu)
 {
-       struct hrtimer *hrtimer = raw_cpu_ptr(&watchdog_hrtimer);
+       struct hrtimer *hrtimer = this_cpu_ptr(&watchdog_hrtimer);
 
        watchdog_set_prio(SCHED_NORMAL, 0);
-       hrtimer_cancel(hrtimer);
-       /* disable the perf event */
+       /*
+        * Disable the perf event first, so that a large delay between
+        * disabling the timer and disabling the perf event cannot cause
+        * the perf NMI to detect a false positive.
+        */
        watchdog_nmi_disable(cpu);
+       hrtimer_cancel(hrtimer);
 }
 
 static void watchdog_cleanup(unsigned int cpu, bool online)
@@ -499,21 +513,6 @@ static void watchdog(unsigned int cpu)
        __this_cpu_write(soft_lockup_hrtimer_cnt,
                         __this_cpu_read(hrtimer_interrupts));
        __touch_watchdog();
-
-       /*
-        * watchdog_nmi_enable() clears the NMI_WATCHDOG_ENABLED bit in the
-        * failure path. Check for failures that can occur asynchronously -
-        * for example, when CPUs are on-lined - and shut down the hardware
-        * perf event on each CPU accordingly.
-        *
-        * The only non-obvious place this bit can be cleared is through
-        * watchdog_nmi_enable(), so a pr_info() is placed there.  Placing a
-        * pr_info here would be too noisy as it would result in a message
-        * every few seconds if the hardlockup was disabled but the softlockup
-        * enabled.
-        */
-       if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED))
-               watchdog_nmi_disable(cpu);
 }
 
 static struct smp_hotplug_thread watchdog_threads = {
@@ -527,295 +526,174 @@ static struct smp_hotplug_thread watchdog_threads = {
        .unpark                 = watchdog_enable,
 };
 
-/*
- * park all watchdog threads that are specified in 'watchdog_cpumask'
- *
- * This function returns an error if kthread_park() of a watchdog thread
- * fails. In this situation, the watchdog threads of some CPUs can already
- * be parked and the watchdog threads of other CPUs can still be runnable.
- * Callers are expected to handle this special condition as appropriate in
- * their context.
- *
- * This function may only be called in a context that is protected against
- * races with CPU hotplug - for example, via get_online_cpus().
- */
-static int watchdog_park_threads(void)
+static void softlockup_update_smpboot_threads(void)
 {
-       int cpu, ret = 0;
+       lockdep_assert_held(&watchdog_mutex);
 
-       atomic_set(&watchdog_park_in_progress, 1);
+       if (!softlockup_threads_initialized)
+               return;
 
-       for_each_watchdog_cpu(cpu) {
-               ret = kthread_park(per_cpu(softlockup_watchdog, cpu));
-               if (ret)
-                       break;
-       }
-
-       atomic_set(&watchdog_park_in_progress, 0);
-
-       return ret;
+       smpboot_update_cpumask_percpu_thread(&watchdog_threads,
+                                            &watchdog_allowed_mask);
 }
 
-/*
- * unpark all watchdog threads that are specified in 'watchdog_cpumask'
- *
- * This function may only be called in a context that is protected against
- * races with CPU hotplug - for example, via get_online_cpus().
- */
-static void watchdog_unpark_threads(void)
+/* Temporarily park all watchdog threads */
+static void softlockup_park_all_threads(void)
 {
-       int cpu;
-
-       for_each_watchdog_cpu(cpu)
-               kthread_unpark(per_cpu(softlockup_watchdog, cpu));
+       cpumask_clear(&watchdog_allowed_mask);
+       softlockup_update_smpboot_threads();
 }
 
-static int update_watchdog_all_cpus(void)
+/* Unpark enabled threads */
+static void softlockup_unpark_threads(void)
 {
-       int ret;
-
-       ret = watchdog_park_threads();
-       if (ret)
-               return ret;
-
-       watchdog_unpark_threads();
-
-       return 0;
+       cpumask_copy(&watchdog_allowed_mask, &watchdog_cpumask);
+       softlockup_update_smpboot_threads();
 }
 
-static int watchdog_enable_all_cpus(void)
+static void lockup_detector_reconfigure(void)
 {
-       int err = 0;
-
-       if (!watchdog_running) {
-               err = smpboot_register_percpu_thread_cpumask(&watchdog_threads,
-                                                            &watchdog_cpumask);
-               if (err)
-                       pr_err("Failed to create watchdog threads, disabled\n");
-               else
-                       watchdog_running = 1;
-       } else {
-               /*
-                * Enable/disable the lockup detectors or
-                * change the sample period 'on the fly'.
-                */
-               err = update_watchdog_all_cpus();
-
-               if (err) {
-                       watchdog_disable_all_cpus();
-                       pr_err("Failed to update lockup detectors, disabled\n");
-               }
-       }
-
-       if (err)
-               watchdog_enabled = 0;
-
-       return err;
+       cpus_read_lock();
+       watchdog_nmi_stop();
+       softlockup_park_all_threads();
+       set_sample_period();
+       lockup_detector_update_enable();
+       if (watchdog_enabled && watchdog_thresh)
+               softlockup_unpark_threads();
+       watchdog_nmi_start();
+       cpus_read_unlock();
+       /*
+        * Must be called outside the cpus locked section to prevent
+        * recursive locking in the perf code.
+        */
+       __lockup_detector_cleanup();
 }
 
-static void watchdog_disable_all_cpus(void)
+/*
+ * Create the watchdog thread infrastructure and configure the detector(s).
+ *
+ * The threads are not unparked as watchdog_allowed_mask is empty. When
+ * the threads are successfully initialized, take the proper locks and
+ * unpark the threads in the watchdog_cpumask if the watchdog is enabled.
+ */
+static __init void lockup_detector_setup(void)
 {
-       if (watchdog_running) {
-               watchdog_running = 0;
-               smpboot_unregister_percpu_thread(&watchdog_threads);
-       }
-}
+       int ret;
 
-#ifdef CONFIG_SYSCTL
-static int watchdog_update_cpus(void)
-{
-       return smpboot_update_cpumask_percpu_thread(
-                   &watchdog_threads, &watchdog_cpumask);
-}
-#endif
+       /*
+        * If sysctl is off and watchdog got disabled on the command line,
+        * nothing to do here.
+        */
+       lockup_detector_update_enable();
 
-#else /* SOFTLOCKUP */
-static int watchdog_park_threads(void)
-{
-       return 0;
-}
+       if (!IS_ENABLED(CONFIG_SYSCTL) &&
+           !(watchdog_enabled && watchdog_thresh))
+               return;
 
-static void watchdog_unpark_threads(void)
-{
-}
+       ret = smpboot_register_percpu_thread_cpumask(&watchdog_threads,
+                                                    &watchdog_allowed_mask);
+       if (ret) {
+               pr_err("Failed to initialize soft lockup detector threads\n");
+               return;
+       }
 
-static int watchdog_enable_all_cpus(void)
-{
-       return 0;
+       mutex_lock(&watchdog_mutex);
+       softlockup_threads_initialized = true;
+       lockup_detector_reconfigure();
+       mutex_unlock(&watchdog_mutex);
 }
 
-static void watchdog_disable_all_cpus(void)
+#else /* CONFIG_SOFTLOCKUP_DETECTOR */
+static inline int watchdog_park_threads(void) { return 0; }
+static inline void watchdog_unpark_threads(void) { }
+static inline int watchdog_enable_all_cpus(void) { return 0; }
+static inline void watchdog_disable_all_cpus(void) { }
+static void lockup_detector_reconfigure(void)
 {
+       cpus_read_lock();
+       watchdog_nmi_stop();
+       lockup_detector_update_enable();
+       watchdog_nmi_start();
+       cpus_read_unlock();
 }
-
-#ifdef CONFIG_SYSCTL
-static int watchdog_update_cpus(void)
+static inline void lockup_detector_setup(void)
 {
-       return 0;
+       lockup_detector_reconfigure();
 }
-#endif
+#endif /* !CONFIG_SOFTLOCKUP_DETECTOR */
 
-static void set_sample_period(void)
+static void __lockup_detector_cleanup(void)
 {
+       lockdep_assert_held(&watchdog_mutex);
+       hardlockup_detector_perf_cleanup();
 }
-#endif /* SOFTLOCKUP */
 
-/*
- * Suspend the hard and soft lockup detector by parking the watchdog threads.
+/**
+ * lockup_detector_cleanup - Cleanup after cpu hotplug or sysctl changes
+ *
+ * Caller must not hold the cpu hotplug rwsem.
  */
-int lockup_detector_suspend(void)
+void lockup_detector_cleanup(void)
 {
-       int ret = 0;
-
-       get_online_cpus();
-       mutex_lock(&watchdog_proc_mutex);
-       /*
-        * Multiple suspend requests can be active in parallel (counted by
-        * the 'watchdog_suspended' variable). If the watchdog threads are
-        * running, the first caller takes care that they will be parked.
-        * The state of 'watchdog_running' cannot change while a suspend
-        * request is active (see related code in 'proc' handlers).
-        */
-       if (watchdog_running && !watchdog_suspended)
-               ret = watchdog_park_threads();
-
-       if (ret == 0)
-               watchdog_suspended++;
-       else {
-               watchdog_disable_all_cpus();
-               pr_err("Failed to suspend lockup detectors, disabled\n");
-               watchdog_enabled = 0;
-       }
-
-       watchdog_nmi_reconfigure();
-
-       mutex_unlock(&watchdog_proc_mutex);
-
-       return ret;
+       mutex_lock(&watchdog_mutex);
+       __lockup_detector_cleanup();
+       mutex_unlock(&watchdog_mutex);
 }
 
-/*
- * Resume the hard and soft lockup detector by unparking the watchdog threads.
+/**
+ * lockup_detector_soft_poweroff - Interface to stop lockup detector(s)
+ *
+ * Special interface for parisc. It prevents lockup detector warnings from
+ * the default pm_power_off() function, which busy-loops forever.
  */
-void lockup_detector_resume(void)
+void lockup_detector_soft_poweroff(void)
 {
-       mutex_lock(&watchdog_proc_mutex);
-
-       watchdog_suspended--;
-       /*
-        * The watchdog threads are unparked if they were previously running
-        * and if there is no more active suspend request.
-        */
-       if (watchdog_running && !watchdog_suspended)
-               watchdog_unpark_threads();
-
-       watchdog_nmi_reconfigure();
-
-       mutex_unlock(&watchdog_proc_mutex);
-       put_online_cpus();
+       watchdog_enabled = 0;
 }
 
 #ifdef CONFIG_SYSCTL
 
-/*
- * Update the run state of the lockup detectors.
- */
-static int proc_watchdog_update(void)
+/* Propagate any changes to the watchdog threads */
+static void proc_watchdog_update(void)
 {
-       int err = 0;
-
-       /*
-        * Watchdog threads won't be started if they are already active.
-        * The 'watchdog_running' variable in watchdog_*_all_cpus() takes
-        * care of this. If those threads are already active, the sample
-        * period will be updated and the lockup detectors will be enabled
-        * or disabled 'on the fly'.
-        */
-       if (watchdog_enabled && watchdog_thresh)
-               err = watchdog_enable_all_cpus();
-       else
-               watchdog_disable_all_cpus();
-
-       watchdog_nmi_reconfigure();
-
-       return err;
-
+       /* Remove impossible cpus to keep sysctl output clean. */
+       cpumask_and(&watchdog_cpumask, &watchdog_cpumask, cpu_possible_mask);
+       lockup_detector_reconfigure();
 }
 
 /*
  * common function for watchdog, nmi_watchdog and soft_watchdog parameter
  *
- * caller             | table->data points to | 'which' contains the flag(s)
- * -------------------|-----------------------|-----------------------------
- * proc_watchdog      | watchdog_user_enabled | NMI_WATCHDOG_ENABLED or'ed
- *                    |                       | with SOFT_WATCHDOG_ENABLED
- * -------------------|-----------------------|-----------------------------
- * proc_nmi_watchdog  | nmi_watchdog_enabled  | NMI_WATCHDOG_ENABLED
- * -------------------|-----------------------|-----------------------------
- * proc_soft_watchdog | soft_watchdog_enabled | SOFT_WATCHDOG_ENABLED
+ * caller             | table->data points to      | 'which'
+ * -------------------|----------------------------|--------------------------
+ * proc_watchdog      | watchdog_user_enabled      | NMI_WATCHDOG_ENABLED |
+ *                    |                            | SOFT_WATCHDOG_ENABLED
+ * -------------------|----------------------------|--------------------------
+ * proc_nmi_watchdog  | nmi_watchdog_user_enabled  | NMI_WATCHDOG_ENABLED
+ * -------------------|----------------------------|--------------------------
+ * proc_soft_watchdog | soft_watchdog_user_enabled | SOFT_WATCHDOG_ENABLED
  */
 static int proc_watchdog_common(int which, struct ctl_table *table, int write,
                                void __user *buffer, size_t *lenp, loff_t *ppos)
 {
-       int err, old, new;
-       int *watchdog_param = (int *)table->data;
+       int err, old, *param = table->data;
 
-       get_online_cpus();
-       mutex_lock(&watchdog_proc_mutex);
+       mutex_lock(&watchdog_mutex);
 
-       if (watchdog_suspended) {
-               /* no parameter changes allowed while watchdog is suspended */
-               err = -EAGAIN;
-               goto out;
-       }
-
-       /*
-        * If the parameter is being read return the state of the corresponding
-        * bit(s) in 'watchdog_enabled', else update 'watchdog_enabled' and the
-        * run state of the lockup detectors.
-        */
        if (!write) {
-               *watchdog_param = (watchdog_enabled & which) != 0;
+               /*
+                * On read synchronize the userspace interface. This is a
+                * racy snapshot.
+                */
+               *param = (watchdog_enabled & which) != 0;
                err = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
        } else {
+               old = READ_ONCE(*param);
                err = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
-               if (err)
-                       goto out;
-
-               /*
-                * There is a race window between fetching the current value
-                * from 'watchdog_enabled' and storing the new value. During
-                * this race window, watchdog_nmi_enable() can sneak in and
-                * clear the NMI_WATCHDOG_ENABLED bit in 'watchdog_enabled'.
-                * The 'cmpxchg' detects this race and the loop retries.
-                */
-               do {
-                       old = watchdog_enabled;
-                       /*
-                        * If the parameter value is not zero set the
-                        * corresponding bit(s), else clear it(them).
-                        */
-                       if (*watchdog_param)
-                               new = old | which;
-                       else
-                               new = old & ~which;
-               } while (cmpxchg(&watchdog_enabled, old, new) != old);
-
-               /*
-                * Update the run state of the lockup detectors. There is _no_
-                * need to check the value returned by proc_watchdog_update()
-                * and to restore the previous value of 'watchdog_enabled' as
-                * both lockup detectors are disabled if proc_watchdog_update()
-                * returns an error.
-                */
-               if (old == new)
-                       goto out;
-
-               err = proc_watchdog_update();
+               if (!err && old != READ_ONCE(*param))
+                       proc_watchdog_update();
        }
-out:
-       mutex_unlock(&watchdog_proc_mutex);
-       put_online_cpus();
+       mutex_unlock(&watchdog_mutex);
        return err;
 }
 
@@ -835,6 +713,8 @@ int proc_watchdog(struct ctl_table *table, int write,
 int proc_nmi_watchdog(struct ctl_table *table, int write,
                      void __user *buffer, size_t *lenp, loff_t *ppos)
 {
+       if (!nmi_watchdog_available && write)
+               return -ENOTSUPP;
        return proc_watchdog_common(NMI_WATCHDOG_ENABLED,
                                    table, write, buffer, lenp, ppos);
 }
@@ -855,39 +735,17 @@ int proc_soft_watchdog(struct ctl_table *table, int write,
 int proc_watchdog_thresh(struct ctl_table *table, int write,
                         void __user *buffer, size_t *lenp, loff_t *ppos)
 {
-       int err, old, new;
-
-       get_online_cpus();
-       mutex_lock(&watchdog_proc_mutex);
+       int err, old;
 
-       if (watchdog_suspended) {
-               /* no parameter changes allowed while watchdog is suspended */
-               err = -EAGAIN;
-               goto out;
-       }
+       mutex_lock(&watchdog_mutex);
 
-       old = ACCESS_ONCE(watchdog_thresh);
+       old = READ_ONCE(watchdog_thresh);
        err = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
 
-       if (err || !write)
-               goto out;
-
-       /*
-        * Update the sample period. Restore on failure.
-        */
-       new = ACCESS_ONCE(watchdog_thresh);
-       if (old == new)
-               goto out;
+       if (!err && write && old != READ_ONCE(watchdog_thresh))
+               proc_watchdog_update();
 
-       set_sample_period();
-       err = proc_watchdog_update();
-       if (err) {
-               watchdog_thresh = old;
-               set_sample_period();
-       }
-out:
-       mutex_unlock(&watchdog_proc_mutex);
-       put_online_cpus();
+       mutex_unlock(&watchdog_mutex);
        return err;
 }
 
@@ -902,45 +760,19 @@ int proc_watchdog_cpumask(struct ctl_table *table, int write,
 {
        int err;
 
-       get_online_cpus();
-       mutex_lock(&watchdog_proc_mutex);
-
-       if (watchdog_suspended) {
-               /* no parameter changes allowed while watchdog is suspended */
-               err = -EAGAIN;
-               goto out;
-       }
+       mutex_lock(&watchdog_mutex);
 
        err = proc_do_large_bitmap(table, write, buffer, lenp, ppos);
-       if (!err && write) {
-               /* Remove impossible cpus to keep sysctl output cleaner. */
-               cpumask_and(&watchdog_cpumask, &watchdog_cpumask,
-                           cpu_possible_mask);
-
-               if (watchdog_running) {
-                       /*
-                        * Failure would be due to being unable to allocate
-                        * a temporary cpumask, so we are likely not in a
-                        * position to do much else to make things better.
-                        */
-                       if (watchdog_update_cpus() != 0)
-                               pr_err("cpumask update failed\n");
-               }
+       if (!err && write)
+               proc_watchdog_update();
 
-               watchdog_nmi_reconfigure();
-       }
-out:
-       mutex_unlock(&watchdog_proc_mutex);
-       put_online_cpus();
+       mutex_unlock(&watchdog_mutex);
        return err;
 }
-
 #endif /* CONFIG_SYSCTL */
 
 void __init lockup_detector_init(void)
 {
-       set_sample_period();
-
 #ifdef CONFIG_NO_HZ_FULL
        if (tick_nohz_full_enabled()) {
                pr_info("Disabling watchdog on nohz_full cores by default\n");
@@ -951,6 +783,7 @@ void __init lockup_detector_init(void)
        cpumask_copy(&watchdog_cpumask, cpu_possible_mask);
 #endif
 
-       if (watchdog_enabled)
-               watchdog_enable_all_cpus();
+       if (!watchdog_nmi_probe())
+               nmi_watchdog_available = true;
+       lockup_detector_setup();
 }
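
The weak watchdog_nmi_probe()/watchdog_nmi_stop()/watchdog_nmi_start() hooks above define the reconfiguration contract for architectures that provide their own NMI watchdog. A rough, hypothetical arch-side sketch follows; the my_platform_*() functions are made-up placeholders, not an existing API.

/* Hypothetical arch-side sketch of the weak hooks documented above. */
#include <linux/nmi.h>

int __init watchdog_nmi_probe(void)
{
        /* Return 0 only if the platform NMI timer is actually usable. */
        return my_platform_nmi_init() ? -ENODEV : 0;
}

void watchdog_nmi_stop(void)
{
        /* Quiesce the NMI watchdog while the variables are updated. */
        my_platform_nmi_quiesce();
}

void watchdog_nmi_start(void)
{
        /* Re-arm from the now-consistent configuration. */
        if (watchdog_enabled & NMI_WATCHDOG_ENABLED)
                my_platform_nmi_arm(watchdog_thresh, &watchdog_cpumask);
        else
                my_platform_nmi_quiesce();
}
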
index 3a09ea1..71a62ce 100644 (file)
 static DEFINE_PER_CPU(bool, hard_watchdog_warn);
 static DEFINE_PER_CPU(bool, watchdog_nmi_touch);
 static DEFINE_PER_CPU(struct perf_event *, watchdog_ev);
+static struct cpumask dead_events_mask;
 
 static unsigned long hardlockup_allcpu_dumped;
+static unsigned int watchdog_cpus;
 
 void arch_touch_nmi_watchdog(void)
 {
@@ -103,15 +105,12 @@ static struct perf_event_attr wd_hw_attr = {
 
 /* Callback function for perf event subsystem */
 static void watchdog_overflow_callback(struct perf_event *event,
-                struct perf_sample_data *data,
-                struct pt_regs *regs)
+                                      struct perf_sample_data *data,
+                                      struct pt_regs *regs)
 {
        /* Ensure the watchdog never gets throttled */
        event->hw.interrupts = 0;
 
-       if (atomic_read(&watchdog_park_in_progress) != 0)
-               return;
-
        if (__this_cpu_read(watchdog_nmi_touch) == true) {
                __this_cpu_write(watchdog_nmi_touch, false);
                return;
@@ -160,104 +159,131 @@ static void watchdog_overflow_callback(struct perf_event *event,
        return;
 }
 
-/*
- * People like the simple clean cpu node info on boot.
- * Reduce the watchdog noise by only printing messages
- * that are different from what cpu0 displayed.
- */
-static unsigned long firstcpu_err;
-static atomic_t watchdog_cpus;
-
-int watchdog_nmi_enable(unsigned int cpu)
+static int hardlockup_detector_event_create(void)
 {
+       unsigned int cpu = smp_processor_id();
        struct perf_event_attr *wd_attr;
-       struct perf_event *event = per_cpu(watchdog_ev, cpu);
-       int firstcpu = 0;
-
-       /* nothing to do if the hard lockup detector is disabled */
-       if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED))
-               goto out;
-
-       /* is it already setup and enabled? */
-       if (event && event->state > PERF_EVENT_STATE_OFF)
-               goto out;
-
-       /* it is setup but not enabled */
-       if (event != NULL)
-               goto out_enable;
-
-       if (atomic_inc_return(&watchdog_cpus) == 1)
-               firstcpu = 1;
+       struct perf_event *evt;
 
        wd_attr = &wd_hw_attr;
        wd_attr->sample_period = hw_nmi_get_sample_period(watchdog_thresh);
 
        /* Try to register using hardware perf events */
-       event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback, NULL);
+       evt = perf_event_create_kernel_counter(wd_attr, cpu, NULL,
+                                              watchdog_overflow_callback, NULL);
+       if (IS_ERR(evt)) {
+               pr_info("Perf event create on CPU %d failed with %ld\n", cpu,
+                       PTR_ERR(evt));
+               return PTR_ERR(evt);
+       }
+       this_cpu_write(watchdog_ev, evt);
+       return 0;
+}
 
-       /* save the first cpu's error for future comparision */
-       if (firstcpu && IS_ERR(event))
-               firstcpu_err = PTR_ERR(event);
+/**
+ * hardlockup_detector_perf_enable - Enable the local event
+ */
+void hardlockup_detector_perf_enable(void)
+{
+       if (hardlockup_detector_event_create())
+               return;
 
-       if (!IS_ERR(event)) {
-               /* only print for the first cpu initialized */
-               if (firstcpu || firstcpu_err)
-                       pr_info("enabled on all CPUs, permanently consumes one hw-PMU counter.\n");
-               goto out_save;
-       }
+       if (!watchdog_cpus++)
+               pr_info("Enabled. Permanently consumes one hw-PMU counter.\n");
 
-       /*
-        * Disable the hard lockup detector if _any_ CPU fails to set up
-        * set up the hardware perf event. The watchdog() function checks
-        * the NMI_WATCHDOG_ENABLED bit periodically.
-        *
-        * The barriers are for syncing up watchdog_enabled across all the
-        * cpus, as clear_bit() does not use barriers.
-        */
-       smp_mb__before_atomic();
-       clear_bit(NMI_WATCHDOG_ENABLED_BIT, &watchdog_enabled);
-       smp_mb__after_atomic();
-
-       /* skip displaying the same error again */
-       if (!firstcpu && (PTR_ERR(event) == firstcpu_err))
-               return PTR_ERR(event);
-
-       /* vary the KERN level based on the returned errno */
-       if (PTR_ERR(event) == -EOPNOTSUPP)
-               pr_info("disabled (cpu%i): not supported (no LAPIC?)\n", cpu);
-       else if (PTR_ERR(event) == -ENOENT)
-               pr_warn("disabled (cpu%i): hardware events not enabled\n",
-                        cpu);
-       else
-               pr_err("disabled (cpu%i): unable to create perf event: %ld\n",
-                       cpu, PTR_ERR(event));
-
-       pr_info("Shutting down hard lockup detector on all cpus\n");
-
-       return PTR_ERR(event);
-
-       /* success path */
-out_save:
-       per_cpu(watchdog_ev, cpu) = event;
-out_enable:
-       perf_event_enable(per_cpu(watchdog_ev, cpu));
-out:
-       return 0;
+       perf_event_enable(this_cpu_read(watchdog_ev));
 }
 
-void watchdog_nmi_disable(unsigned int cpu)
+/**
+ * hardlockup_detector_perf_disable - Disable the local event
+ */
+void hardlockup_detector_perf_disable(void)
 {
-       struct perf_event *event = per_cpu(watchdog_ev, cpu);
+       struct perf_event *event = this_cpu_read(watchdog_ev);
 
        if (event) {
                perf_event_disable(event);
+               cpumask_set_cpu(smp_processor_id(), &dead_events_mask);
+               watchdog_cpus--;
+       }
+}
+
+/**
+ * hardlockup_detector_perf_cleanup - Cleanup disabled events and destroy them
+ *
+ * Called from lockup_detector_cleanup(). Serialized by the caller.
+ */
+void hardlockup_detector_perf_cleanup(void)
+{
+       int cpu;
+
+       for_each_cpu(cpu, &dead_events_mask) {
+               struct perf_event *event = per_cpu(watchdog_ev, cpu);
+
+               /*
+                * Required because for_each_cpu() unconditionally reports
+                * CPU0 as set on UP kernels. Sigh.
+                */
+               if (event)
+                       perf_event_release_kernel(event);
                per_cpu(watchdog_ev, cpu) = NULL;
+       }
+       cpumask_clear(&dead_events_mask);
+}
+
+/**
+ * hardlockup_detector_perf_stop - Globally stop watchdog events
+ *
+ * Special interface for x86 to handle the perf HT bug.
+ */
+void __init hardlockup_detector_perf_stop(void)
+{
+       int cpu;
+
+       lockdep_assert_cpus_held();
+
+       for_each_online_cpu(cpu) {
+               struct perf_event *event = per_cpu(watchdog_ev, cpu);
+
+               if (event)
+                       perf_event_disable(event);
+       }
+}
 
-               /* should be in cleanup, but blocks oprofile */
-               perf_event_release_kernel(event);
+/**
+ * hardlockup_detector_perf_restart - Globally restart watchdog events
+ *
+ * Special interface for x86 to handle the perf HT bug.
+ */
+void __init hardlockup_detector_perf_restart(void)
+{
+       int cpu;
+
+       lockdep_assert_cpus_held();
+
+       if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED))
+               return;
+
+       for_each_online_cpu(cpu) {
+               struct perf_event *event = per_cpu(watchdog_ev, cpu);
+
+               if (event)
+                       perf_event_enable(event);
+       }
+}
+
+/**
+ * hardlockup_detector_perf_init - Probe whether NMI event is available at all
+ */
+int __init hardlockup_detector_perf_init(void)
+{
+       int ret = hardlockup_detector_event_create();
 
-               /* watchdog_nmi_enable() expects this to be zero initially. */
-               if (atomic_dec_and_test(&watchdog_cpus))
-                       firstcpu_err = 0;
+       if (ret) {
+               pr_info("Perf NMI watchdog permanently disabled\n");
+       } else {
+               perf_event_release_kernel(this_cpu_read(watchdog_ev));
+               this_cpu_write(watchdog_ev, NULL);
        }
+       return ret;
 }
index 05c8604..831c5a6 100644 (file)
@@ -5,7 +5,7 @@
 
 struct once_work {
        struct work_struct work;
-       struct static_key *key;
+       struct static_key_true *key;
 };
 
 static void once_deferred(struct work_struct *w)
@@ -14,11 +14,11 @@ static void once_deferred(struct work_struct *w)
 
        work = container_of(w, struct once_work, work);
        BUG_ON(!static_key_enabled(work->key));
-       static_key_slow_dec(work->key);
+       static_branch_disable(work->key);
        kfree(work);
 }
 
-static void once_disable_jump(struct static_key *key)
+static void once_disable_jump(struct static_key_true *key)
 {
        struct once_work *w;
 
@@ -51,7 +51,7 @@ bool __do_once_start(bool *done, unsigned long *flags)
 }
 EXPORT_SYMBOL(__do_once_start);
 
-void __do_once_done(bool *done, struct static_key *once_key,
+void __do_once_done(bool *done, struct static_key_true *once_key,
                    unsigned long *flags)
        __releases(once_lock)
 {
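
The switch to static_key_true above tracks the DO_ONCE() machinery in <linux/once.h>: the first caller runs the function, then the deferred work flips the static branch so later callers skip it entirely. A small, hypothetical caller-side sketch; the identifiers are illustrative only.

/* Hypothetical sketch of the caller-side pattern backed by these helpers. */
#include <linux/jhash.h>
#include <linux/once.h>
#include <linux/random.h>
#include <linux/types.h>

static u32 hash_seed;

static u32 flow_hash(u32 key)
{
        /* get_random_once() expands to DO_ONCE(get_random_bytes, ...):
         * only the first call seeds hash_seed; once_disable_jump()
         * then patches the static branch so later calls fall through. */
        get_random_once(&hash_seed, sizeof(hash_seed));
        return jhash_1word(key, hash_seed);
}
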
index 83ba548..1b659ab 100644 (file)
@@ -916,8 +916,8 @@ static void batadv_iv_ogm_schedule(struct batadv_hard_iface *hard_iface)
        u16 tvlv_len = 0;
        unsigned long send_time;
 
-       if ((hard_iface->if_status == BATADV_IF_NOT_IN_USE) ||
-           (hard_iface->if_status == BATADV_IF_TO_BE_REMOVED))
+       if (hard_iface->if_status == BATADV_IF_NOT_IN_USE ||
+           hard_iface->if_status == BATADV_IF_TO_BE_REMOVED)
                return;
 
        /* the interface gets activated here to avoid race conditions between
@@ -1264,7 +1264,7 @@ static bool batadv_iv_ogm_calc_tq(struct batadv_orig_node *orig_node,
         * drops as they can't send and receive at the same time.
         */
        tq_iface_penalty = BATADV_TQ_MAX_VALUE;
-       if (if_outgoing && (if_incoming == if_outgoing) &&
+       if (if_outgoing && if_incoming == if_outgoing &&
            batadv_is_wifi_hardif(if_outgoing))
                tq_iface_penalty = batadv_hop_penalty(BATADV_TQ_MAX_VALUE,
                                                      bat_priv);
@@ -1369,7 +1369,7 @@ batadv_iv_ogm_update_seqnos(const struct ethhdr *ethhdr,
                                ret = BATADV_NEIGH_DUP;
                } else {
                        set_mark = 0;
-                       if (is_dup && (ret != BATADV_NEIGH_DUP))
+                       if (is_dup && ret != BATADV_NEIGH_DUP)
                                ret = BATADV_ORIG_DUP;
                }
 
@@ -1515,7 +1515,7 @@ batadv_iv_ogm_process_per_outif(const struct sk_buff *skb, int ogm_offset,
        /* drop packet if sender is not a direct neighbor and if we
         * don't route towards it
         */
-       if (!is_single_hop_neigh && (!orig_neigh_router)) {
+       if (!is_single_hop_neigh && !orig_neigh_router) {
                batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
                           "Drop packet: OGM via unknown neighbor!\n");
                goto out_neigh;
@@ -1535,7 +1535,7 @@ batadv_iv_ogm_process_per_outif(const struct sk_buff *skb, int ogm_offset,
        sameseq = orig_ifinfo->last_real_seqno == ntohl(ogm_packet->seqno);
        similar_ttl = (orig_ifinfo->last_ttl - 3) <= ogm_packet->ttl;
 
-       if (is_bidirect && ((dup_status == BATADV_NO_DUP) ||
+       if (is_bidirect && (dup_status == BATADV_NO_DUP ||
                            (sameseq && similar_ttl))) {
                batadv_iv_ogm_orig_update(bat_priv, orig_node,
                                          orig_ifinfo, ethhdr,
@@ -1553,8 +1553,8 @@ batadv_iv_ogm_process_per_outif(const struct sk_buff *skb, int ogm_offset,
                /* OGMs from secondary interfaces should only be scheduled once
                 * per interface where it has been received, not multiple times
                 */
-               if ((ogm_packet->ttl <= 2) &&
-                   (if_incoming != if_outgoing)) {
+               if (ogm_packet->ttl <= 2 &&
+                   if_incoming != if_outgoing) {
                        batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
                                   "Drop packet: OGM from secondary interface and wrong outgoing interface\n");
                        goto out_neigh;
@@ -1590,7 +1590,7 @@ batadv_iv_ogm_process_per_outif(const struct sk_buff *skb, int ogm_offset,
                              if_incoming, if_outgoing);
 
 out_neigh:
-       if ((orig_neigh_node) && (!is_single_hop_neigh))
+       if (orig_neigh_node && !is_single_hop_neigh)
                batadv_orig_node_put(orig_neigh_node);
 out:
        if (router_ifinfo)
@@ -2523,9 +2523,9 @@ batadv_iv_gw_get_best_gw_node(struct batadv_priv *bat_priv)
                        tmp_gw_factor *= 100 * 100;
                        tmp_gw_factor >>= 18;
 
-                       if ((tmp_gw_factor > max_gw_factor) ||
-                           ((tmp_gw_factor == max_gw_factor) &&
-                            (tq_avg > max_tq))) {
+                       if (tmp_gw_factor > max_gw_factor ||
+                           (tmp_gw_factor == max_gw_factor &&
+                            tq_avg > max_tq)) {
                                if (curr_gw)
                                        batadv_gw_node_put(curr_gw);
                                curr_gw = gw_node;
index 4e2724c..93ef1c0 100644 (file)
@@ -767,7 +767,7 @@ batadv_v_gw_get_best_gw_node(struct batadv_priv *bat_priv)
                if (batadv_v_gw_throughput_get(gw_node, &bw) < 0)
                        goto next;
 
-               if (curr_gw && (bw <= max_bw))
+               if (curr_gw && bw <= max_bw)
                        goto next;
 
                if (curr_gw)
index bd1064d..1de992c 100644 (file)
@@ -134,7 +134,7 @@ static u32 batadv_v_elp_get_throughput(struct batadv_hardif_neigh_node *neigh)
                        hard_iface->bat_v.flags &= ~BATADV_FULL_DUPLEX;
 
                throughput = link_settings.base.speed;
-               if (throughput && (throughput != SPEED_UNKNOWN))
+               if (throughput && throughput != SPEED_UNKNOWN)
                        return throughput * 10;
        }
 
@@ -263,8 +263,8 @@ static void batadv_v_elp_periodic_work(struct work_struct *work)
                goto out;
 
        /* we are in the process of shutting this interface down */
-       if ((hard_iface->if_status == BATADV_IF_NOT_IN_USE) ||
-           (hard_iface->if_status == BATADV_IF_TO_BE_REMOVED))
+       if (hard_iface->if_status == BATADV_IF_NOT_IN_USE ||
+           hard_iface->if_status == BATADV_IF_TO_BE_REMOVED)
                goto out;
 
        /* the interface was enabled but may not be ready yet */
index 8be6173..c251445 100644 (file)
@@ -304,8 +304,8 @@ static u32 batadv_v_forward_penalty(struct batadv_priv *bat_priv,
         * due to the store & forward characteristics of WIFI.
         * Very low throughput values are the exception.
         */
-       if ((throughput > 10) &&
-           (if_incoming == if_outgoing) &&
+       if (throughput > 10 &&
+           if_incoming == if_outgoing &&
            !(if_incoming->bat_v.flags & BATADV_FULL_DUPLEX))
                return throughput / 2;
 
@@ -455,7 +455,7 @@ static int batadv_v_ogm_metric_update(struct batadv_priv *bat_priv,
        /* drop packets with old seqnos, however accept the first packet after
         * a host has been rebooted.
         */
-       if ((seq_diff < 0) && !protection_started)
+       if (seq_diff < 0 && !protection_started)
                goto out;
 
        neigh_node->last_seen = jiffies;
@@ -568,8 +568,8 @@ static bool batadv_v_ogm_route_update(struct batadv_priv *bat_priv,
                router_throughput = router_ifinfo->bat_v.throughput;
                neigh_throughput = neigh_ifinfo->bat_v.throughput;
 
-               if ((neigh_seq_diff < BATADV_OGM_MAX_ORIGDIFF) &&
-                   (router_throughput >= neigh_throughput))
+               if (neigh_seq_diff < BATADV_OGM_MAX_ORIGDIFF &&
+                   router_throughput >= neigh_throughput)
                        goto out;
        }
 
@@ -621,7 +621,7 @@ batadv_v_ogm_process_per_outif(struct batadv_priv *bat_priv,
                return;
 
        /* only unknown & newer OGMs contain TVLVs we are interested in */
-       if ((seqno_age > 0) && (if_outgoing == BATADV_IF_DEFAULT))
+       if (seqno_age > 0 && if_outgoing == BATADV_IF_DEFAULT)
                batadv_tvlv_containers_process(bat_priv, true, orig_node,
                                               NULL, NULL,
                                               (unsigned char *)(ogm2 + 1),
index b6cfa78..760c0de 100644 (file)
@@ -492,8 +492,8 @@ static bool batadv_is_orig_node_eligible(struct batadv_dat_candidate *res,
        /* this is a hash collision with the temporarily selected node. Choose
         * the one with the lowest address
         */
-       if ((tmp_max == max) && max_orig_node &&
-           (batadv_compare_eth(candidate->orig, max_orig_node->orig) > 0))
+       if (tmp_max == max && max_orig_node &&
+           batadv_compare_eth(candidate->orig, max_orig_node->orig) > 0)
                goto out;
 
        ret = true;
index de9955d..10d521f 100644 (file)
@@ -248,12 +248,12 @@ void batadv_gw_election(struct batadv_priv *bat_priv)
                }
        }
 
-       if ((curr_gw) && (!next_gw)) {
+       if (curr_gw && !next_gw) {
                batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
                           "Removing selected gateway - no gateway in range\n");
                batadv_throw_uevent(bat_priv, BATADV_UEV_GW, BATADV_UEV_DEL,
                                    NULL);
-       } else if ((!curr_gw) && (next_gw)) {
+       } else if (!curr_gw && next_gw) {
                batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
                           "Adding route to gateway %pM (bandwidth: %u.%u/%u.%u MBit, tq: %i)\n",
                           next_gw->orig_node->orig,
@@ -411,8 +411,8 @@ void batadv_gw_node_update(struct batadv_priv *bat_priv,
                goto out;
        }
 
-       if ((gw_node->bandwidth_down == ntohl(gateway->bandwidth_down)) &&
-           (gw_node->bandwidth_up == ntohl(gateway->bandwidth_up)))
+       if (gw_node->bandwidth_down == ntohl(gateway->bandwidth_down) &&
+           gw_node->bandwidth_up == ntohl(gateway->bandwidth_up))
                goto out;
 
        batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
index 33940c5..2c26039 100644 (file)
@@ -56,8 +56,8 @@ bool batadv_parse_throughput(struct net_device *net_dev, char *buff,
                if (strncasecmp(tmp_ptr, "mbit", 4) == 0)
                        bw_unit_type = BATADV_BW_UNIT_MBIT;
 
-               if ((strncasecmp(tmp_ptr, "kbit", 4) == 0) ||
-                   (bw_unit_type == BATADV_BW_UNIT_MBIT))
+               if (strncasecmp(tmp_ptr, "kbit", 4) == 0 ||
+                   bw_unit_type == BATADV_BW_UNIT_MBIT)
                        *tmp_ptr = '\0';
        }
 
@@ -190,7 +190,7 @@ ssize_t batadv_gw_bandwidth_set(struct net_device *net_dev, char *buff,
        if (!up_new)
                up_new = 1;
 
-       if ((down_curr == down_new) && (up_curr == up_new))
+       if (down_curr == down_new && up_curr == up_new)
                return count;
 
        batadv_gw_reselect(bat_priv);
@@ -224,16 +224,16 @@ static void batadv_gw_tvlv_ogm_handler_v1(struct batadv_priv *bat_priv,
        /* only fetch the tvlv value if the handler wasn't called via the
         * CIFNOTFND flag and if there is data to fetch
         */
-       if ((flags & BATADV_TVLV_HANDLER_OGM_CIFNOTFND) ||
-           (tvlv_value_len < sizeof(gateway))) {
+       if (flags & BATADV_TVLV_HANDLER_OGM_CIFNOTFND ||
+           tvlv_value_len < sizeof(gateway)) {
                gateway.bandwidth_down = 0;
                gateway.bandwidth_up = 0;
        } else {
                gateway_ptr = tvlv_value;
                gateway.bandwidth_down = gateway_ptr->bandwidth_down;
                gateway.bandwidth_up = gateway_ptr->bandwidth_up;
-               if ((gateway.bandwidth_down == 0) ||
-                   (gateway.bandwidth_up == 0)) {
+               if (gateway.bandwidth_down == 0 ||
+                   gateway.bandwidth_up == 0) {
                        gateway.bandwidth_down = 0;
                        gateway.bandwidth_up = 0;
                }
@@ -242,8 +242,8 @@ static void batadv_gw_tvlv_ogm_handler_v1(struct batadv_priv *bat_priv,
        batadv_gw_node_update(bat_priv, orig, &gateway);
 
        /* restart gateway selection */
-       if ((gateway.bandwidth_down != 0) &&
-           (atomic_read(&bat_priv->gw.mode) == BATADV_GW_MODE_CLIENT))
+       if (gateway.bandwidth_down != 0 &&
+           atomic_read(&bat_priv->gw.mode) == BATADV_GW_MODE_CLIENT)
                batadv_gw_check_election(bat_priv, orig);
 }
 
index f7b413b..4e3d534 100644 (file)
@@ -504,8 +504,8 @@ static void batadv_check_known_mac_addr(const struct net_device *net_dev)
 
        rcu_read_lock();
        list_for_each_entry_rcu(hard_iface, &batadv_hardif_list, list) {
-               if ((hard_iface->if_status != BATADV_IF_ACTIVE) &&
-                   (hard_iface->if_status != BATADV_IF_TO_BE_ACTIVATED))
+               if (hard_iface->if_status != BATADV_IF_ACTIVE &&
+                   hard_iface->if_status != BATADV_IF_TO_BE_ACTIVATED)
                        continue;
 
                if (hard_iface->net_dev == net_dev)
@@ -568,8 +568,8 @@ int batadv_hardif_min_mtu(struct net_device *soft_iface)
 
        rcu_read_lock();
        list_for_each_entry_rcu(hard_iface, &batadv_hardif_list, list) {
-               if ((hard_iface->if_status != BATADV_IF_ACTIVE) &&
-                   (hard_iface->if_status != BATADV_IF_TO_BE_ACTIVATED))
+               if (hard_iface->if_status != BATADV_IF_ACTIVE &&
+                   hard_iface->if_status != BATADV_IF_TO_BE_ACTIVATED)
                        continue;
 
                if (hard_iface->soft_iface != soft_iface)
@@ -654,8 +654,8 @@ out:
 static void
 batadv_hardif_deactivate_interface(struct batadv_hard_iface *hard_iface)
 {
-       if ((hard_iface->if_status != BATADV_IF_ACTIVE) &&
-           (hard_iface->if_status != BATADV_IF_TO_BE_ACTIVATED))
+       if (hard_iface->if_status != BATADV_IF_ACTIVE &&
+           hard_iface->if_status != BATADV_IF_TO_BE_ACTIVATED)
                return;
 
        hard_iface->if_status = BATADV_IF_INACTIVE;
index 8ead292..bded311 100644 (file)
@@ -132,10 +132,10 @@ static ssize_t batadv_socket_read(struct file *file, char __user *buf,
        size_t packet_len;
        int error;
 
-       if ((file->f_flags & O_NONBLOCK) && (socket_client->queue_len == 0))
+       if ((file->f_flags & O_NONBLOCK) && socket_client->queue_len == 0)
                return -EAGAIN;
 
-       if ((!buf) || (count < sizeof(struct batadv_icmp_packet)))
+       if (!buf || count < sizeof(struct batadv_icmp_packet))
                return -EINVAL;
 
        if (!access_ok(VERIFY_WRITE, buf, count))
index fb381fb..4daed7a 100644 (file)
@@ -73,8 +73,8 @@
  * list traversals just rcu-locked
  */
 struct list_head batadv_hardif_list;
-static int (*batadv_rx_handler[256])(struct sk_buff *,
-                                    struct batadv_hard_iface *);
+static int (*batadv_rx_handler[256])(struct sk_buff *skb,
+                                    struct batadv_hard_iface *recv_if);
 
 unsigned char batadv_broadcast_addr[] = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
 
@@ -540,12 +540,12 @@ batadv_recv_handler_register(u8 packet_type,
                             int (*recv_handler)(struct sk_buff *,
                                                 struct batadv_hard_iface *))
 {
-       int (*curr)(struct sk_buff *,
-                   struct batadv_hard_iface *);
+       int (*curr)(struct sk_buff *skb,
+                   struct batadv_hard_iface *recv_if);
        curr = batadv_rx_handler[packet_type];
 
-       if ((curr != batadv_recv_unhandled_packet) &&
-           (curr != batadv_recv_unhandled_unicast_packet))
+       if (curr != batadv_recv_unhandled_packet &&
+           curr != batadv_recv_unhandled_unicast_packet)
                return -EBUSY;
 
        batadv_rx_handler[packet_type] = recv_handler;
index 05cc763..edb2f23 100644 (file)
@@ -24,7 +24,7 @@
 #define BATADV_DRIVER_DEVICE "batman-adv"
 
 #ifndef BATADV_SOURCE_VERSION
-#define BATADV_SOURCE_VERSION "2017.3"
+#define BATADV_SOURCE_VERSION "2017.4"
 #endif
 
 /* B.A.T.M.A.N. parameters */
index d327670..e553a87 100644 (file)
@@ -1126,7 +1126,7 @@ static void batadv_mcast_tvlv_ogm_handler(struct batadv_priv *bat_priv,
        bool orig_initialized;
 
        if (orig_mcast_enabled && tvlv_value &&
-           (tvlv_value_len >= sizeof(mcast_flags)))
+           tvlv_value_len >= sizeof(mcast_flags))
                mcast_flags = *(u8 *)tvlv_value;
 
        spin_lock_bh(&orig->mcast_handler_lock);
index 8e2a4b2..2967b86 100644 (file)
@@ -1062,9 +1062,9 @@ batadv_purge_neigh_ifinfo(struct batadv_priv *bat_priv,
                        continue;
 
                /* don't purge if the interface is not (going) down */
-               if ((if_outgoing->if_status != BATADV_IF_INACTIVE) &&
-                   (if_outgoing->if_status != BATADV_IF_NOT_IN_USE) &&
-                   (if_outgoing->if_status != BATADV_IF_TO_BE_REMOVED))
+               if (if_outgoing->if_status != BATADV_IF_INACTIVE &&
+                   if_outgoing->if_status != BATADV_IF_NOT_IN_USE &&
+                   if_outgoing->if_status != BATADV_IF_TO_BE_REMOVED)
                        continue;
 
                batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
@@ -1106,9 +1106,9 @@ batadv_purge_orig_ifinfo(struct batadv_priv *bat_priv,
                        continue;
 
                /* don't purge if the interface is not (going) down */
-               if ((if_outgoing->if_status != BATADV_IF_INACTIVE) &&
-                   (if_outgoing->if_status != BATADV_IF_NOT_IN_USE) &&
-                   (if_outgoing->if_status != BATADV_IF_TO_BE_REMOVED))
+               if (if_outgoing->if_status != BATADV_IF_INACTIVE &&
+                   if_outgoing->if_status != BATADV_IF_NOT_IN_USE &&
+                   if_outgoing->if_status != BATADV_IF_TO_BE_REMOVED)
                        continue;
 
                batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
@@ -1155,13 +1155,13 @@ batadv_purge_orig_neighbors(struct batadv_priv *bat_priv,
                last_seen = neigh_node->last_seen;
                if_incoming = neigh_node->if_incoming;
 
-               if ((batadv_has_timed_out(last_seen, BATADV_PURGE_TIMEOUT)) ||
-                   (if_incoming->if_status == BATADV_IF_INACTIVE) ||
-                   (if_incoming->if_status == BATADV_IF_NOT_IN_USE) ||
-                   (if_incoming->if_status == BATADV_IF_TO_BE_REMOVED)) {
-                       if ((if_incoming->if_status == BATADV_IF_INACTIVE) ||
-                           (if_incoming->if_status == BATADV_IF_NOT_IN_USE) ||
-                           (if_incoming->if_status == BATADV_IF_TO_BE_REMOVED))
+               if (batadv_has_timed_out(last_seen, BATADV_PURGE_TIMEOUT) ||
+                   if_incoming->if_status == BATADV_IF_INACTIVE ||
+                   if_incoming->if_status == BATADV_IF_NOT_IN_USE ||
+                   if_incoming->if_status == BATADV_IF_TO_BE_REMOVED) {
+                       if (if_incoming->if_status == BATADV_IF_INACTIVE ||
+                           if_incoming->if_status == BATADV_IF_NOT_IN_USE ||
+                           if_incoming->if_status == BATADV_IF_TO_BE_REMOVED)
                                batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
                                           "neighbor purge: originator %pM, neighbor: %pM, iface: %s\n",
                                           orig_node->orig, neigh_node->addr,
index f10e3ff..40d9bf3 100644 (file)
@@ -93,14 +93,14 @@ static void _batadv_update_route(struct batadv_priv *bat_priv,
        batadv_orig_ifinfo_put(orig_ifinfo);
 
        /* route deleted */
-       if ((curr_router) && (!neigh_node)) {
+       if (curr_router && !neigh_node) {
                batadv_dbg(BATADV_DBG_ROUTES, bat_priv,
                           "Deleting route towards: %pM\n", orig_node->orig);
                batadv_tt_global_del_orig(bat_priv, orig_node, -1,
                                          "Deleted route towards originator");
 
        /* route added */
-       } else if ((!curr_router) && (neigh_node)) {
+       } else if (!curr_router && neigh_node) {
                batadv_dbg(BATADV_DBG_ROUTES, bat_priv,
                           "Adding route towards: %pM (via %pM)\n",
                           orig_node->orig, neigh_node->addr);
@@ -381,7 +381,7 @@ int batadv_recv_icmp_packet(struct sk_buff *skb,
        /* add record route information if not full */
        if ((icmph->msg_type == BATADV_ECHO_REPLY ||
             icmph->msg_type == BATADV_ECHO_REQUEST) &&
-           (skb->len >= sizeof(struct batadv_icmp_packet_rr))) {
+           skb->len >= sizeof(struct batadv_icmp_packet_rr)) {
                if (skb_linearize(skb) < 0)
                        goto free_skb;
 
index 054a65e..7895323 100644
@@ -142,7 +142,7 @@ int batadv_send_unicast_skb(struct sk_buff *skb,
 #ifdef CONFIG_BATMAN_ADV_BATMAN_V
        hardif_neigh = batadv_hardif_neigh_get(neigh->if_incoming, neigh->addr);
 
-       if ((hardif_neigh) && (ret != NET_XMIT_DROP))
+       if (hardif_neigh && ret != NET_XMIT_DROP)
                hardif_neigh->bat_v.last_unicast_tx = jiffies;
 
        if (hardif_neigh)
@@ -615,8 +615,8 @@ batadv_forw_packet_list_steal(struct hlist_head *forw_list,
                 * we delete only packets belonging to the given interface
                 */
                if (hard_iface &&
-                   (forw_packet->if_incoming != hard_iface) &&
-                   (forw_packet->if_outgoing != hard_iface))
+                   forw_packet->if_incoming != hard_iface &&
+                   forw_packet->if_outgoing != hard_iface)
                        continue;
 
                hlist_del(&forw_packet->list);
index e7d5fbb..543d2c3 100644
@@ -69,8 +69,8 @@ int batadv_skb_head_push(struct sk_buff *skb, unsigned int len)
        int result;
 
        /* TODO: We must check if we can release all references to non-payload
-        * data using __skb_header_release in our skbs to allow skb_cow_header to
-        * work optimally. This means that those skbs are not allowed to read
+        * data using __skb_header_release in our skbs to allow skb_cow_header
+        * to work optimally. This means that those skbs are not allowed to read
         * or write any data which is before the current position of skb->data
         * after that call and thus allow other skbs with the same data buffer
         * to write freely in that area.
@@ -160,7 +160,7 @@ static int batadv_interface_set_mac_addr(struct net_device *dev, void *p)
 static int batadv_interface_change_mtu(struct net_device *dev, int new_mtu)
 {
        /* check ranges */
-       if ((new_mtu < 68) || (new_mtu > batadv_hardif_min_mtu(dev)))
+       if (new_mtu < 68 || new_mtu > batadv_hardif_min_mtu(dev))
                return -EINVAL;
 
        dev->mtu = new_mtu;
index 0ae8b30..aa187fd 100644
@@ -925,8 +925,8 @@ static int batadv_store_mesh_iface_finish(struct net_device *net_dev,
        if (hard_iface->if_status == status_tmp)
                goto out;
 
-       if ((hard_iface->soft_iface) &&
-           (strncmp(hard_iface->soft_iface->name, ifname, IFNAMSIZ) == 0))
+       if (hard_iface->soft_iface &&
+           strncmp(hard_iface->soft_iface->name, ifname, IFNAMSIZ) == 0)
                goto out;
 
        if (status_tmp == BATADV_IF_NOT_IN_USE) {
index bfe8eff..4b90033 100644
@@ -1206,7 +1206,7 @@ static int batadv_tp_send_ack(struct batadv_priv *bat_priv, const u8 *dst,
 
        /* send the ack */
        r = batadv_send_skb_to_orig(skb, orig_node, NULL);
-       if (unlikely(r < 0) || (r == NET_XMIT_DROP)) {
+       if (unlikely(r < 0) || r == NET_XMIT_DROP) {
                ret = BATADV_TP_REASON_DST_UNREACHABLE;
                goto out;
        }
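
Note: the batman-adv hunks above are a parenthesis cleanup only; comparison operators already bind tighter than && and ||, so dropping the extra parentheses cannot change evaluation. A minimal userspace C sketch (illustrative only, not part of the series) showing that both forms parse identically:

#include <assert.h>

int main(void)
{
        /* mirrors the if_status checks above with made-up values */
        for (int status = 0; status < 4; status++)
                assert(((status != 1) && (status != 3)) ==
                       (status != 1 && status != 3));
        return 0;
}
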
index 40b1ede..4aee55f 100644
@@ -7,7 +7,7 @@ obj-$(CONFIG_BRIDGE) += bridge.o
 bridge-y       := br.o br_device.o br_fdb.o br_forward.o br_if.o br_input.o \
                        br_ioctl.o br_stp.o br_stp_bpdu.o \
                        br_stp_if.o br_stp_timer.o br_netlink.o \
-                       br_netlink_tunnel.o
+                       br_netlink_tunnel.o br_arp_nd_proxy.o
 
 bridge-$(CONFIG_SYSFS) += br_sysfs_if.o br_sysfs_br.o
 
diff --git a/net/bridge/br_arp_nd_proxy.c b/net/bridge/br_arp_nd_proxy.c
new file mode 100644
index 0000000..2cf7716
--- /dev/null
@@ -0,0 +1,469 @@
+/*
+ *  Handle bridge arp/nd proxy/suppress
+ *
+ *  Copyright (C) 2017 Cumulus Networks
+ *  Copyright (c) 2017 Roopa Prabhu <roopa@cumulusnetworks.com>
+ *
+ *  Authors:
+ *     Roopa Prabhu <roopa@cumulusnetworks.com>
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License
+ *  as published by the Free Software Foundation; either version
+ *  2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/kernel.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/neighbour.h>
+#include <net/arp.h>
+#include <linux/if_vlan.h>
+#include <linux/inetdevice.h>
+#include <net/addrconf.h>
+#if IS_ENABLED(CONFIG_IPV6)
+#include <net/ip6_checksum.h>
+#endif
+
+#include "br_private.h"
+
+void br_recalculate_neigh_suppress_enabled(struct net_bridge *br)
+{
+       struct net_bridge_port *p;
+       bool neigh_suppress = false;
+
+       list_for_each_entry(p, &br->port_list, list) {
+               if (p->flags & BR_NEIGH_SUPPRESS) {
+                       neigh_suppress = true;
+                       break;
+               }
+       }
+
+       br->neigh_suppress_enabled = neigh_suppress;
+}
+
+#if IS_ENABLED(CONFIG_INET)
+static void br_arp_send(struct net_bridge *br, struct net_bridge_port *p,
+                       struct net_device *dev, __be32 dest_ip, __be32 src_ip,
+                       const unsigned char *dest_hw,
+                       const unsigned char *src_hw,
+                       const unsigned char *target_hw,
+                       __be16 vlan_proto, u16 vlan_tci)
+{
+       struct net_bridge_vlan_group *vg;
+       struct sk_buff *skb;
+       u16 pvid;
+
+       netdev_dbg(dev, "arp send dev %s dst %pI4 dst_hw %pM src %pI4 src_hw %pM\n",
+                  dev->name, &dest_ip, dest_hw, &src_ip, src_hw);
+
+       if (!vlan_tci) {
+               arp_send(ARPOP_REPLY, ETH_P_ARP, dest_ip, dev, src_ip,
+                        dest_hw, src_hw, target_hw);
+               return;
+       }
+
+       skb = arp_create(ARPOP_REPLY, ETH_P_ARP, dest_ip, dev, src_ip,
+                        dest_hw, src_hw, target_hw);
+       if (!skb)
+               return;
+
+       if (p)
+               vg = nbp_vlan_group_rcu(p);
+       else
+               vg = br_vlan_group_rcu(br);
+       pvid = br_get_pvid(vg);
+       if (pvid == (vlan_tci & VLAN_VID_MASK))
+               vlan_tci = 0;
+
+       if (vlan_tci)
+               __vlan_hwaccel_put_tag(skb, vlan_proto, vlan_tci);
+
+       if (p) {
+               arp_xmit(skb);
+       } else {
+               skb_reset_mac_header(skb);
+               __skb_pull(skb, skb_network_offset(skb));
+               skb->ip_summed = CHECKSUM_UNNECESSARY;
+               skb->pkt_type = PACKET_HOST;
+
+               netif_rx_ni(skb);
+       }
+}
+
+static int br_chk_addr_ip(struct net_device *dev, void *data)
+{
+       __be32 ip = *(__be32 *)data;
+       struct in_device *in_dev;
+       __be32 addr = 0;
+
+       in_dev = __in_dev_get_rcu(dev);
+       if (in_dev)
+               addr = inet_confirm_addr(dev_net(dev), in_dev, 0, ip,
+                                        RT_SCOPE_HOST);
+
+       if (addr == ip)
+               return 1;
+
+       return 0;
+}
+
+static bool br_is_local_ip(struct net_device *dev, __be32 ip)
+{
+       if (br_chk_addr_ip(dev, &ip))
+               return true;
+
+       /* check if ip is configured on upper dev */
+       if (netdev_walk_all_upper_dev_rcu(dev, br_chk_addr_ip, &ip))
+               return true;
+
+       return false;
+}
+
+void br_do_proxy_suppress_arp(struct sk_buff *skb, struct net_bridge *br,
+                             u16 vid, struct net_bridge_port *p)
+{
+       struct net_device *dev = br->dev;
+       struct net_device *vlandev = dev;
+       struct neighbour *n;
+       struct arphdr *parp;
+       u8 *arpptr, *sha;
+       __be32 sip, tip;
+
+       BR_INPUT_SKB_CB(skb)->proxyarp_replied = false;
+
+       if ((dev->flags & IFF_NOARP) ||
+           !pskb_may_pull(skb, arp_hdr_len(dev)))
+               return;
+
+       parp = arp_hdr(skb);
+
+       if (parp->ar_pro != htons(ETH_P_IP) ||
+           parp->ar_hln != dev->addr_len ||
+           parp->ar_pln != 4)
+               return;
+
+       arpptr = (u8 *)parp + sizeof(struct arphdr);
+       sha = arpptr;
+       arpptr += dev->addr_len;        /* sha */
+       memcpy(&sip, arpptr, sizeof(sip));
+       arpptr += sizeof(sip);
+       arpptr += dev->addr_len;        /* tha */
+       memcpy(&tip, arpptr, sizeof(tip));
+
+       if (ipv4_is_loopback(tip) ||
+           ipv4_is_multicast(tip))
+               return;
+
+       if (br->neigh_suppress_enabled) {
+               if (p && (p->flags & BR_NEIGH_SUPPRESS))
+                       return;
+               if (ipv4_is_zeronet(sip) || sip == tip) {
+                       /* prevent flooding to neigh suppress ports */
+                       BR_INPUT_SKB_CB(skb)->proxyarp_replied = true;
+                       return;
+               }
+       }
+
+       if (parp->ar_op != htons(ARPOP_REQUEST))
+               return;
+
+       if (vid != 0) {
+               vlandev = __vlan_find_dev_deep_rcu(br->dev, skb->vlan_proto,
+                                                  vid);
+               if (!vlandev)
+                       return;
+       }
+
+       if (br->neigh_suppress_enabled && br_is_local_ip(vlandev, tip)) {
+               /* it's our local ip, so don't proxy reply
+                * and don't forward to neigh suppress ports
+                */
+               BR_INPUT_SKB_CB(skb)->proxyarp_replied = true;
+               return;
+       }
+
+       n = neigh_lookup(&arp_tbl, &tip, vlandev);
+       if (n) {
+               struct net_bridge_fdb_entry *f;
+
+               if (!(n->nud_state & NUD_VALID)) {
+                       neigh_release(n);
+                       return;
+               }
+
+               f = br_fdb_find_rcu(br, n->ha, vid);
+               if (f) {
+                       bool replied = false;
+
+                       if ((p && (p->flags & BR_PROXYARP)) ||
+                           (f->dst && (f->dst->flags & (BR_PROXYARP_WIFI |
+                                                        BR_NEIGH_SUPPRESS)))) {
+                               if (!vid)
+                                       br_arp_send(br, p, skb->dev, sip, tip,
+                                                   sha, n->ha, sha, 0, 0);
+                               else
+                                       br_arp_send(br, p, skb->dev, sip, tip,
+                                                   sha, n->ha, sha,
+                                                   skb->vlan_proto,
+                                                   skb_vlan_tag_get(skb));
+                               replied = true;
+                       }
+
+                       /* If we have replied or as long as we know the
+                        * mac, indicate that arp has been replied to
+                        */
+                       if (replied || br->neigh_suppress_enabled)
+                               BR_INPUT_SKB_CB(skb)->proxyarp_replied = true;
+               }
+
+               neigh_release(n);
+       }
+}
+#endif
+
+#if IS_ENABLED(CONFIG_IPV6)
+struct nd_msg *br_is_nd_neigh_msg(struct sk_buff *skb, struct nd_msg *msg)
+{
+       struct nd_msg *m;
+
+       m = skb_header_pointer(skb, skb_network_offset(skb) +
+                              sizeof(struct ipv6hdr), sizeof(*msg), msg);
+       if (!m)
+               return NULL;
+
+       if (m->icmph.icmp6_code != 0 ||
+           (m->icmph.icmp6_type != NDISC_NEIGHBOUR_SOLICITATION &&
+            m->icmph.icmp6_type != NDISC_NEIGHBOUR_ADVERTISEMENT))
+               return NULL;
+
+       return m;
+}
+
+static void br_nd_send(struct net_bridge *br, struct net_bridge_port *p,
+                      struct sk_buff *request, struct neighbour *n,
+                      __be16 vlan_proto, u16 vlan_tci, struct nd_msg *ns)
+{
+       struct net_device *dev = request->dev;
+       struct net_bridge_vlan_group *vg;
+       struct sk_buff *reply;
+       struct nd_msg *na;
+       struct ipv6hdr *pip6;
+       int na_olen = 8; /* opt hdr + ETH_ALEN for target */
+       int ns_olen;
+       int i, len;
+       u8 *daddr;
+       u16 pvid;
+
+       if (!dev)
+               return;
+
+       len = LL_RESERVED_SPACE(dev) + sizeof(struct ipv6hdr) +
+               sizeof(*na) + na_olen + dev->needed_tailroom;
+
+       reply = alloc_skb(len, GFP_ATOMIC);
+       if (!reply)
+               return;
+
+       reply->protocol = htons(ETH_P_IPV6);
+       reply->dev = dev;
+       skb_reserve(reply, LL_RESERVED_SPACE(dev));
+       skb_push(reply, sizeof(struct ethhdr));
+       skb_set_mac_header(reply, 0);
+
+       daddr = eth_hdr(request)->h_source;
+
+       /* Do we need option processing ? */
+       ns_olen = request->len - (skb_network_offset(request) +
+                                 sizeof(struct ipv6hdr)) - sizeof(*ns);
+       for (i = 0; i < ns_olen - 1; i += (ns->opt[i + 1] << 3)) {
+               if (ns->opt[i] == ND_OPT_SOURCE_LL_ADDR) {
+                       daddr = ns->opt + i + sizeof(struct nd_opt_hdr);
+                       break;
+               }
+       }
+
+       /* Ethernet header */
+       ether_addr_copy(eth_hdr(reply)->h_dest, daddr);
+       ether_addr_copy(eth_hdr(reply)->h_source, n->ha);
+       eth_hdr(reply)->h_proto = htons(ETH_P_IPV6);
+       reply->protocol = htons(ETH_P_IPV6);
+
+       skb_pull(reply, sizeof(struct ethhdr));
+       skb_set_network_header(reply, 0);
+       skb_put(reply, sizeof(struct ipv6hdr));
+
+       /* IPv6 header */
+       pip6 = ipv6_hdr(reply);
+       memset(pip6, 0, sizeof(struct ipv6hdr));
+       pip6->version = 6;
+       pip6->priority = ipv6_hdr(request)->priority;
+       pip6->nexthdr = IPPROTO_ICMPV6;
+       pip6->hop_limit = 255;
+       pip6->daddr = ipv6_hdr(request)->saddr;
+       pip6->saddr = *(struct in6_addr *)n->primary_key;
+
+       skb_pull(reply, sizeof(struct ipv6hdr));
+       skb_set_transport_header(reply, 0);
+
+       na = (struct nd_msg *)skb_put(reply, sizeof(*na) + na_olen);
+
+       /* Neighbor Advertisement */
+       memset(na, 0, sizeof(*na) + na_olen);
+       na->icmph.icmp6_type = NDISC_NEIGHBOUR_ADVERTISEMENT;
+       na->icmph.icmp6_router = 0; /* XXX: should be 1 ? */
+       na->icmph.icmp6_override = 1;
+       na->icmph.icmp6_solicited = 1;
+       na->target = ns->target;
+       ether_addr_copy(&na->opt[2], n->ha);
+       na->opt[0] = ND_OPT_TARGET_LL_ADDR;
+       na->opt[1] = na_olen >> 3;
+
+       na->icmph.icmp6_cksum = csum_ipv6_magic(&pip6->saddr,
+                                               &pip6->daddr,
+                                               sizeof(*na) + na_olen,
+                                               IPPROTO_ICMPV6,
+                                               csum_partial(na, sizeof(*na) + na_olen, 0));
+
+       pip6->payload_len = htons(sizeof(*na) + na_olen);
+
+       skb_push(reply, sizeof(struct ipv6hdr));
+       skb_push(reply, sizeof(struct ethhdr));
+
+       reply->ip_summed = CHECKSUM_UNNECESSARY;
+
+       if (p)
+               vg = nbp_vlan_group_rcu(p);
+       else
+               vg = br_vlan_group_rcu(br);
+       pvid = br_get_pvid(vg);
+       if (pvid == (vlan_tci & VLAN_VID_MASK))
+               vlan_tci = 0;
+
+       if (vlan_tci)
+               __vlan_hwaccel_put_tag(reply, vlan_proto, vlan_tci);
+
+       netdev_dbg(dev, "nd send dev %s dst %pI6 dst_hw %pM src %pI6 src_hw %pM\n",
+                  dev->name, &pip6->daddr, daddr, &pip6->saddr, n->ha);
+
+       if (p) {
+               dev_queue_xmit(reply);
+       } else {
+               skb_reset_mac_header(reply);
+               __skb_pull(reply, skb_network_offset(reply));
+               reply->ip_summed = CHECKSUM_UNNECESSARY;
+               reply->pkt_type = PACKET_HOST;
+
+               netif_rx_ni(reply);
+       }
+}
+
+static int br_chk_addr_ip6(struct net_device *dev, void *data)
+{
+       struct in6_addr *addr = (struct in6_addr *)data;
+
+       if (ipv6_chk_addr(dev_net(dev), addr, dev, 0))
+               return 1;
+
+       return 0;
+}
+
+static bool br_is_local_ip6(struct net_device *dev, struct in6_addr *addr)
+
+{
+       if (br_chk_addr_ip6(dev, addr))
+               return true;
+
+       /* check if ip is configured on upper dev */
+       if (netdev_walk_all_upper_dev_rcu(dev, br_chk_addr_ip6, addr))
+               return true;
+
+       return false;
+}
+
+void br_do_suppress_nd(struct sk_buff *skb, struct net_bridge *br,
+                      u16 vid, struct net_bridge_port *p, struct nd_msg *msg)
+{
+       struct net_device *dev = br->dev;
+       struct net_device *vlandev = NULL;
+       struct in6_addr *saddr, *daddr;
+       struct ipv6hdr *iphdr;
+       struct neighbour *n;
+
+       BR_INPUT_SKB_CB(skb)->proxyarp_replied = false;
+
+       if (p && (p->flags & BR_NEIGH_SUPPRESS))
+               return;
+
+       if (msg->icmph.icmp6_type == NDISC_NEIGHBOUR_ADVERTISEMENT &&
+           !msg->icmph.icmp6_solicited) {
+               /* prevent flooding to neigh suppress ports */
+               BR_INPUT_SKB_CB(skb)->proxyarp_replied = true;
+               return;
+       }
+
+       if (msg->icmph.icmp6_type != NDISC_NEIGHBOUR_SOLICITATION)
+               return;
+
+       iphdr = ipv6_hdr(skb);
+       saddr = &iphdr->saddr;
+       daddr = &iphdr->daddr;
+
+       if (ipv6_addr_any(saddr) || !ipv6_addr_cmp(saddr, daddr)) {
+               /* prevent flooding to neigh suppress ports */
+               BR_INPUT_SKB_CB(skb)->proxyarp_replied = true;
+               return;
+       }
+
+       if (vid != 0) {
+               /* build neigh table lookup on the vlan device */
+               vlandev = __vlan_find_dev_deep_rcu(br->dev, skb->vlan_proto,
+                                                  vid);
+               if (!vlandev)
+                       return;
+       } else {
+               vlandev = dev;
+       }
+
+       if (br_is_local_ip6(vlandev, &msg->target)) {
+               /* it's our own ip, so don't proxy reply
+                * and don't forward to arp suppress ports
+                */
+               BR_INPUT_SKB_CB(skb)->proxyarp_replied = true;
+               return;
+       }
+
+       n = neigh_lookup(ipv6_stub->nd_tbl, &msg->target, vlandev);
+       if (n) {
+               struct net_bridge_fdb_entry *f;
+
+               if (!(n->nud_state & NUD_VALID)) {
+                       neigh_release(n);
+                       return;
+               }
+
+               f = br_fdb_find_rcu(br, n->ha, vid);
+               if (f) {
+                       bool replied = false;
+
+                       if (f->dst && (f->dst->flags & BR_NEIGH_SUPPRESS)) {
+                               if (vid != 0)
+                                       br_nd_send(br, p, skb, n,
+                                                  skb->vlan_proto,
+                                                  skb_vlan_tag_get(skb), msg);
+                               else
+                                       br_nd_send(br, p, skb, n, 0, 0, msg);
+                               replied = true;
+                       }
+
+                       /* If we have replied or as long as we know the
+                        * mac, indicate to NEIGH_SUPPRESS ports that we
+                        * have replied
+                        */
+                       if (replied || br->neigh_suppress_enabled)
+                               BR_INPUT_SKB_CB(skb)->proxyarp_replied = true;
+               }
+               neigh_release(n);
+       }
+}
+#endif
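
br_recalculate_neigh_suppress_enabled() above turns on bridge-wide neighbour suppression as soon as any port carries BR_NEIGH_SUPPRESS and turns it off when none does. A minimal userspace sketch of the same any-port aggregation (the ex_* names are invented for the example; this is not kernel code):

#include <stdbool.h>
#include <stdio.h>

#define EX_NEIGH_SUPPRESS 0x1   /* stand-in for BR_NEIGH_SUPPRESS */

struct ex_port { unsigned long flags; };

/* true as soon as one port requests suppression */
static bool ex_recalc_suppress(const struct ex_port *ports, int n)
{
        for (int i = 0; i < n; i++)
                if (ports[i].flags & EX_NEIGH_SUPPRESS)
                        return true;
        return false;
}

int main(void)
{
        struct ex_port ports[] = { { 0 }, { EX_NEIGH_SUPPRESS }, { 0 } };

        printf("bridge suppress enabled: %d\n",
               ex_recalc_suppress(ports, 3));
        return 0;
}
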
index 7acb77c..28bb221 100644
@@ -39,6 +39,7 @@ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev)
        struct pcpu_sw_netstats *brstats = this_cpu_ptr(br->stats);
        const struct nf_br_ops *nf_ops;
        const unsigned char *dest;
+       struct ethhdr *eth;
        u16 vid = 0;
 
        rcu_read_lock();
@@ -57,11 +58,30 @@ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev)
        BR_INPUT_SKB_CB(skb)->brdev = dev;
 
        skb_reset_mac_header(skb);
+       eth = eth_hdr(skb);
        skb_pull(skb, ETH_HLEN);
 
        if (!br_allowed_ingress(br, br_vlan_group_rcu(br), skb, &vid))
                goto out;
 
+       if (IS_ENABLED(CONFIG_INET) &&
+           (eth->h_proto == htons(ETH_P_ARP) ||
+            eth->h_proto == htons(ETH_P_RARP)) &&
+           br->neigh_suppress_enabled) {
+               br_do_proxy_suppress_arp(skb, br, vid, NULL);
+       } else if (IS_ENABLED(CONFIG_IPV6) &&
+                  skb->protocol == htons(ETH_P_IPV6) &&
+                  br->neigh_suppress_enabled &&
+                  pskb_may_pull(skb, sizeof(struct ipv6hdr) +
+                                sizeof(struct nd_msg)) &&
+                  ipv6_hdr(skb)->nexthdr == IPPROTO_ICMPV6) {
+                       struct nd_msg *msg, _msg;
+
+                       msg = br_is_nd_neigh_msg(skb, &_msg);
+                       if (msg)
+                               br_do_suppress_nd(skb, br, vid, NULL, msg);
+       }
+
        dest = eth_hdr(skb)->h_dest;
        if (is_broadcast_ether_addr(dest)) {
                br_flood(br, skb, BR_PKT_BROADCAST, false, true);
index 48fb174..b4eed11 100644
@@ -204,7 +204,7 @@ void br_flood(struct net_bridge *br, struct sk_buff *skb,
                /* Do not flood to ports that enable proxy ARP */
                if (p->flags & BR_PROXYARP)
                        continue;
-               if ((p->flags & BR_PROXYARP_WIFI) &&
+               if ((p->flags & (BR_PROXYARP_WIFI | BR_NEIGH_SUPPRESS)) &&
                    BR_INPUT_SKB_CB(skb)->proxyarp_replied)
                        continue;
 
index 59a74a4..ae38547 100644
@@ -310,6 +310,8 @@ void br_dev_delete(struct net_device *dev, struct list_head *head)
                del_nbp(p);
        }
 
+       br_recalculate_neigh_suppress_enabled(br);
+
        br_fdb_delete_by_port(br, NULL, 0, 1);
 
        cancel_delayed_work_sync(&br->gc_work);
@@ -660,4 +662,7 @@ void br_port_flags_change(struct net_bridge_port *p, unsigned long mask)
 
        if (mask & BR_AUTO_MASK)
                nbp_update_port_count(br);
+
+       if (mask & BR_NEIGH_SUPPRESS)
+               br_recalculate_neigh_suppress_enabled(br);
 }
index 7cb6137..a096d3e 100644
@@ -71,62 +71,6 @@ static int br_pass_frame_up(struct sk_buff *skb)
                       br_netif_receive_skb);
 }
 
-static void br_do_proxy_arp(struct sk_buff *skb, struct net_bridge *br,
-                           u16 vid, struct net_bridge_port *p)
-{
-       struct net_device *dev = br->dev;
-       struct neighbour *n;
-       struct arphdr *parp;
-       u8 *arpptr, *sha;
-       __be32 sip, tip;
-
-       BR_INPUT_SKB_CB(skb)->proxyarp_replied = false;
-
-       if ((dev->flags & IFF_NOARP) ||
-           !pskb_may_pull(skb, arp_hdr_len(dev)))
-               return;
-
-       parp = arp_hdr(skb);
-
-       if (parp->ar_pro != htons(ETH_P_IP) ||
-           parp->ar_op != htons(ARPOP_REQUEST) ||
-           parp->ar_hln != dev->addr_len ||
-           parp->ar_pln != 4)
-               return;
-
-       arpptr = (u8 *)parp + sizeof(struct arphdr);
-       sha = arpptr;
-       arpptr += dev->addr_len;        /* sha */
-       memcpy(&sip, arpptr, sizeof(sip));
-       arpptr += sizeof(sip);
-       arpptr += dev->addr_len;        /* tha */
-       memcpy(&tip, arpptr, sizeof(tip));
-
-       if (ipv4_is_loopback(tip) ||
-           ipv4_is_multicast(tip))
-               return;
-
-       n = neigh_lookup(&arp_tbl, &tip, dev);
-       if (n) {
-               struct net_bridge_fdb_entry *f;
-
-               if (!(n->nud_state & NUD_VALID)) {
-                       neigh_release(n);
-                       return;
-               }
-
-               f = br_fdb_find_rcu(br, n->ha, vid);
-               if (f && ((p->flags & BR_PROXYARP) ||
-                         (f->dst && (f->dst->flags & BR_PROXYARP_WIFI)))) {
-                       arp_send(ARPOP_REPLY, ETH_P_ARP, sip, skb->dev, tip,
-                                sha, n->ha, sha);
-                       BR_INPUT_SKB_CB(skb)->proxyarp_replied = true;
-               }
-
-               neigh_release(n);
-       }
-}
-
 /* note: already called with rcu_read_lock */
 int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
 {
@@ -171,8 +115,22 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb
 
        BR_INPUT_SKB_CB(skb)->brdev = br->dev;
 
-       if (IS_ENABLED(CONFIG_INET) && skb->protocol == htons(ETH_P_ARP))
-               br_do_proxy_arp(skb, br, vid, p);
+       if (IS_ENABLED(CONFIG_INET) &&
+           (skb->protocol == htons(ETH_P_ARP) ||
+            skb->protocol == htons(ETH_P_RARP))) {
+               br_do_proxy_suppress_arp(skb, br, vid, p);
+       } else if (IS_ENABLED(CONFIG_IPV6) &&
+                  skb->protocol == htons(ETH_P_IPV6) &&
+                  br->neigh_suppress_enabled &&
+                  pskb_may_pull(skb, sizeof(struct ipv6hdr) +
+                                sizeof(struct nd_msg)) &&
+                  ipv6_hdr(skb)->nexthdr == IPPROTO_ICMPV6) {
+                       struct nd_msg *msg, _msg;
+
+                       msg = br_is_nd_neigh_msg(skb, &_msg);
+                       if (msg)
+                               br_do_suppress_nd(skb, br, vid, p, msg);
+       }
 
        switch (pkt_type) {
        case BR_PKT_MULTICAST:
index 8dc5c8d..7947e04 100644
@@ -859,8 +859,32 @@ out:
        spin_unlock(&br->multicast_lock);
 }
 
+static void br_mc_router_state_change(struct net_bridge *p,
+                                     bool is_mc_router)
+{
+       struct switchdev_attr attr = {
+               .orig_dev = p->dev,
+               .id = SWITCHDEV_ATTR_ID_BRIDGE_MROUTER,
+               .flags = SWITCHDEV_F_DEFER,
+               .u.mrouter = is_mc_router,
+       };
+
+       switchdev_port_attr_set(p->dev, &attr);
+}
+
 static void br_multicast_local_router_expired(unsigned long data)
 {
+       struct net_bridge *br = (struct net_bridge *)data;
+
+       spin_lock(&br->multicast_lock);
+       if (br->multicast_router == MDB_RTR_TYPE_DISABLED ||
+           br->multicast_router == MDB_RTR_TYPE_PERM ||
+           timer_pending(&br->multicast_router_timer))
+               goto out;
+
+       br_mc_router_state_change(br, false);
+out:
+       spin_unlock(&br->multicast_lock);
 }
 
 static void br_multicast_querier_expired(struct net_bridge *br,
@@ -1364,9 +1388,12 @@ static void br_multicast_mark_router(struct net_bridge *br,
        unsigned long now = jiffies;
 
        if (!port) {
-               if (br->multicast_router == MDB_RTR_TYPE_TEMP_QUERY)
+               if (br->multicast_router == MDB_RTR_TYPE_TEMP_QUERY) {
+                       if (!timer_pending(&br->multicast_router_timer))
+                               br_mc_router_state_change(br, true);
                        mod_timer(&br->multicast_router_timer,
                                  now + br->multicast_querier_interval);
+               }
                return;
        }
 
@@ -1952,7 +1979,7 @@ void br_multicast_init(struct net_bridge *br)
 
        spin_lock_init(&br->multicast_lock);
        setup_timer(&br->multicast_router_timer,
-                   br_multicast_local_router_expired, 0);
+                   br_multicast_local_router_expired, (unsigned long)br);
        setup_timer(&br->ip4_other_query.timer,
                    br_ip4_multicast_querier_expired, (unsigned long)br);
        setup_timer(&br->ip4_own_query.timer, br_ip4_multicast_query_expired,
@@ -2042,9 +2069,14 @@ int br_multicast_set_router(struct net_bridge *br, unsigned long val)
        switch (val) {
        case MDB_RTR_TYPE_DISABLED:
        case MDB_RTR_TYPE_PERM:
+               br_mc_router_state_change(br, val == MDB_RTR_TYPE_PERM);
                del_timer(&br->multicast_router_timer);
-               /* fall through */
+               br->multicast_router = val;
+               err = 0;
+               break;
        case MDB_RTR_TYPE_TEMP_QUERY:
+               if (br->multicast_router != MDB_RTR_TYPE_TEMP_QUERY)
+                       br_mc_router_state_change(br, false);
                br->multicast_router = val;
                err = 0;
                break;
@@ -2184,6 +2216,18 @@ bool br_multicast_enabled(const struct net_device *dev)
 }
 EXPORT_SYMBOL_GPL(br_multicast_enabled);
 
+bool br_multicast_router(const struct net_device *dev)
+{
+       struct net_bridge *br = netdev_priv(dev);
+       bool is_router;
+
+       spin_lock_bh(&br->multicast_lock);
+       is_router = br_multicast_is_router(br);
+       spin_unlock_bh(&br->multicast_lock);
+       return is_router;
+}
+EXPORT_SYMBOL_GPL(br_multicast_router);
+
 int br_multicast_set_querier(struct net_bridge *br, unsigned long val)
 {
        unsigned long max_delay;
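
The br_multicast.c changes above report the bridge's multicast-router state to switchdev (SWITCHDEV_ATTR_ID_BRIDGE_MROUTER) only when it actually changes; for instance, the timer path signals "router" only when the timer was not already pending. A rough userspace sketch of that edge-triggered notification (invented ex_* names, not kernel code):

#include <stdbool.h>
#include <stdio.h>

struct ex_bridge { bool timer_pending; };

static void ex_mc_router_state_change(bool is_router)
{
        printf("notify offload: mrouter=%d\n", is_router);
}

static void ex_mark_router(struct ex_bridge *br)
{
        if (!br->timer_pending)         /* notify on the 0 -> 1 edge only */
                ex_mc_router_state_change(true);
        br->timer_pending = true;       /* (re)arm the timer */
}

int main(void)
{
        struct ex_bridge br = { false };

        ex_mark_router(&br);    /* notifies */
        ex_mark_router(&br);    /* refresh only, no second notification */
        return 0;
}
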
index dea88a2..f0e8268 100644
@@ -138,6 +138,7 @@ static inline size_t br_port_info_size(void)
                + nla_total_size(1)     /* IFLA_BRPORT_PROXYARP */
                + nla_total_size(1)     /* IFLA_BRPORT_PROXYARP_WIFI */
                + nla_total_size(1)     /* IFLA_BRPORT_VLAN_TUNNEL */
+               + nla_total_size(1)     /* IFLA_BRPORT_NEIGH_SUPPRESS */
                + nla_total_size(sizeof(struct ifla_bridge_id)) /* IFLA_BRPORT_ROOT_ID */
                + nla_total_size(sizeof(struct ifla_bridge_id)) /* IFLA_BRPORT_BRIDGE_ID */
                + nla_total_size(sizeof(u16))   /* IFLA_BRPORT_DESIGNATED_PORT */
@@ -210,7 +211,9 @@ static int br_port_fill_attrs(struct sk_buff *skb,
            nla_put_u8(skb, IFLA_BRPORT_CONFIG_PENDING, p->config_pending) ||
            nla_put_u8(skb, IFLA_BRPORT_VLAN_TUNNEL, !!(p->flags &
                                                        BR_VLAN_TUNNEL)) ||
-           nla_put_u16(skb, IFLA_BRPORT_GROUP_FWD_MASK, p->group_fwd_mask))
+           nla_put_u16(skb, IFLA_BRPORT_GROUP_FWD_MASK, p->group_fwd_mask) ||
+           nla_put_u8(skb, IFLA_BRPORT_NEIGH_SUPPRESS,
+                      !!(p->flags & BR_NEIGH_SUPPRESS)))
                return -EMSGSIZE;
 
        timerval = br_timer_value(&p->message_age_timer);
@@ -785,6 +788,11 @@ static int br_setport(struct net_bridge_port *p, struct nlattr *tb[])
                p->group_fwd_mask = fwd_mask;
        }
 
+       err = br_set_port_flag(p, tb, IFLA_BRPORT_NEIGH_SUPPRESS,
+                              BR_NEIGH_SUPPRESS);
+       if (err)
+               return err;
+
        br_port_flags_change(p, old_flags ^ p->flags);
        return 0;
 }
index ab4df24..fa0039f 100644
@@ -404,6 +404,7 @@ struct net_bridge {
 #ifdef CONFIG_NET_SWITCHDEV
        int offload_fwd_mark;
 #endif
+       bool                            neigh_suppress_enabled;
 };
 
 struct br_input_skb_cb {
@@ -1139,4 +1140,11 @@ static inline void br_switchdev_frame_unmark(struct sk_buff *skb)
 }
 #endif /* CONFIG_NET_SWITCHDEV */
 
+/* br_arp_nd_proxy.c */
+void br_recalculate_neigh_suppress_enabled(struct net_bridge *br);
+void br_do_proxy_suppress_arp(struct sk_buff *skb, struct net_bridge *br,
+                             u16 vid, struct net_bridge_port *p);
+void br_do_suppress_nd(struct sk_buff *skb, struct net_bridge *br,
+                      u16 vid, struct net_bridge_port *p, struct nd_msg *msg);
+struct nd_msg *br_is_nd_neigh_msg(struct sk_buff *skb, struct nd_msg *m);
 #endif
index 9110d5e..0a1fa9c 100644
@@ -191,6 +191,7 @@ BRPORT_ATTR_FLAG(proxyarp, BR_PROXYARP);
 BRPORT_ATTR_FLAG(proxyarp_wifi, BR_PROXYARP_WIFI);
 BRPORT_ATTR_FLAG(multicast_flood, BR_MCAST_FLOOD);
 BRPORT_ATTR_FLAG(broadcast_flood, BR_BCAST_FLOOD);
+BRPORT_ATTR_FLAG(neigh_suppress, BR_NEIGH_SUPPRESS);
 
 #ifdef CONFIG_BRIDGE_IGMP_SNOOPING
 static ssize_t show_multicast_router(struct net_bridge_port *p, char *buf)
@@ -241,6 +242,7 @@ static const struct brport_attribute *brport_attrs[] = {
        &brport_attr_multicast_flood,
        &brport_attr_broadcast_flood,
        &brport_attr_group_fwd_mask,
+       &brport_attr_neigh_suppress,
        NULL
 };
 
index 2585b10..276b602 100644
@@ -65,8 +65,8 @@ static int ebt_broute(struct sk_buff *skb)
 
 static int __net_init broute_net_init(struct net *net)
 {
-       net->xt.broute_table = ebt_register_table(net, &broute_table, NULL);
-       return PTR_ERR_OR_ZERO(net->xt.broute_table);
+       return ebt_register_table(net, &broute_table, NULL,
+                                 &net->xt.broute_table);
 }
 
 static void __net_exit broute_net_exit(struct net *net)
index 45a00db..c41da5f 100644
@@ -93,8 +93,8 @@ static const struct nf_hook_ops ebt_ops_filter[] = {
 
 static int __net_init frame_filter_net_init(struct net *net)
 {
-       net->xt.frame_filter = ebt_register_table(net, &frame_filter, ebt_ops_filter);
-       return PTR_ERR_OR_ZERO(net->xt.frame_filter);
+       return ebt_register_table(net, &frame_filter, ebt_ops_filter,
+                                 &net->xt.frame_filter);
 }
 
 static void __net_exit frame_filter_net_exit(struct net *net)
index 57cd5bb..08df740 100644
@@ -93,8 +93,8 @@ static const struct nf_hook_ops ebt_ops_nat[] = {
 
 static int __net_init frame_nat_net_init(struct net *net)
 {
-       net->xt.frame_nat = ebt_register_table(net, &frame_nat, ebt_ops_nat);
-       return PTR_ERR_OR_ZERO(net->xt.frame_nat);
+       return ebt_register_table(net, &frame_nat, ebt_ops_nat,
+                                 &net->xt.frame_nat);
 }
 
 static void __net_exit frame_nat_net_exit(struct net *net)
index 83951f9..3b3dcf7 100644
@@ -1169,9 +1169,8 @@ static void __ebt_unregister_table(struct net *net, struct ebt_table *table)
        kfree(table);
 }
 
-struct ebt_table *
-ebt_register_table(struct net *net, const struct ebt_table *input_table,
-                  const struct nf_hook_ops *ops)
+int ebt_register_table(struct net *net, const struct ebt_table *input_table,
+                      const struct nf_hook_ops *ops, struct ebt_table **res)
 {
        struct ebt_table_info *newinfo;
        struct ebt_table *t, *table;
@@ -1183,7 +1182,7 @@ ebt_register_table(struct net *net, const struct ebt_table *input_table,
            repl->entries == NULL || repl->entries_size == 0 ||
            repl->counters != NULL || input_table->private != NULL) {
                BUGPRINT("Bad table data for ebt_register_table!!!\n");
-               return ERR_PTR(-EINVAL);
+               return -EINVAL;
        }
 
        /* Don't add one table to multiple lists. */
@@ -1252,16 +1251,18 @@ ebt_register_table(struct net *net, const struct ebt_table *input_table,
        list_add(&table->list, &net->xt.tables[NFPROTO_BRIDGE]);
        mutex_unlock(&ebt_mutex);
 
+       WRITE_ONCE(*res, table);
+
        if (!ops)
-               return table;
+               return 0;
 
        ret = nf_register_net_hooks(net, ops, hweight32(table->valid_hooks));
        if (ret) {
                __ebt_unregister_table(net, table);
-               return ERR_PTR(ret);
+               *res = NULL;
        }
 
-       return table;
+       return ret;
 free_unlock:
        mutex_unlock(&ebt_mutex);
 free_chainstack:
@@ -1276,7 +1277,7 @@ free_newinfo:
 free_table:
        kfree(table);
 out:
-       return ERR_PTR(ret);
+       return ret;
 }
 
 void ebt_unregister_table(struct net *net, struct ebt_table *table,
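
ebt_register_table() above stops returning an ERR_PTR() and instead returns an errno while publishing the table through a **res out-parameter, clearing it again when hook registration fails. A simplified userspace sketch of that API shape (ex_* names invented, failure reduced to a single point):

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

struct ex_table { int valid_hooks; };

static int ex_register_table(int valid_hooks, struct ex_table **res)
{
        struct ex_table *t = malloc(sizeof(*t));

        if (!t)
                return -ENOMEM;
        t->valid_hooks = valid_hooks;
        *res = t;                       /* publish before the fallible step */

        if (valid_hooks == 0) {         /* stand-in for hook registration failing */
                free(t);
                *res = NULL;            /* never leave a dangling pointer behind */
                return -EINVAL;
        }
        return 0;
}

int main(void)
{
        struct ex_table *tbl = NULL;
        int err = ex_register_table(3, &tbl);

        printf("err=%d table=%p\n", err, (void *)tbl);
        free(tbl);
        return 0;
}
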
index a6c47da..662a2d4 100644
@@ -322,3 +322,19 @@ metadata_dst_alloc_percpu(u8 optslen, enum metadata_type type, gfp_t flags)
        return md_dst;
 }
 EXPORT_SYMBOL_GPL(metadata_dst_alloc_percpu);
+
+void metadata_dst_free_percpu(struct metadata_dst __percpu *md_dst)
+{
+#ifdef CONFIG_DST_CACHE
+       int cpu;
+
+       for_each_possible_cpu(cpu) {
+               struct metadata_dst *one_md_dst = per_cpu_ptr(md_dst, cpu);
+
+               if (one_md_dst->type == METADATA_IP_TUNNEL)
+                       dst_cache_destroy(&one_md_dst->u.tun_info.dst_cache);
+       }
+#endif
+       free_percpu(md_dst);
+}
+EXPORT_SYMBOL_GPL(metadata_dst_free_percpu);
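
metadata_dst_free_percpu() above releases the dst_cache held by every per-CPU slot before freeing the per-CPU allocation itself. A rough userspace analogue with a plain array standing in for per-CPU data (illustrative only):

#include <stdlib.h>

struct ex_dst { void *cache; };

static void ex_dst_free_all(struct ex_dst *slots, int nslots)
{
        for (int i = 0; i < nslots; i++)
                free(slots[i].cache);   /* per-slot state first */
        free(slots);                    /* then the backing allocation */
}

int main(void)
{
        int n = 4;
        struct ex_dst *slots = calloc(n, sizeof(*slots));

        if (!slots)
                return 1;
        for (int i = 0; i < n; i++)
                slots[i].cache = malloc(32);
        ex_dst_free_all(slots, n);
        return 0;
}
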
index b7e8caa..140fa9f 100644
@@ -43,6 +43,7 @@
 #include <linux/timer.h>
 #include <linux/uaccess.h>
 #include <asm/unaligned.h>
+#include <asm/cmpxchg.h>
 #include <linux/filter.h>
 #include <linux/ratelimit.h>
 #include <linux/seccomp.h>
@@ -2987,14 +2988,15 @@ static const struct bpf_func_proto *
 bpf_get_skb_set_tunnel_proto(enum bpf_func_id which)
 {
        if (!md_dst) {
-               /* Race is not possible, since it's called from verifier
-                * that is holding verifier mutex.
-                */
-               md_dst = metadata_dst_alloc_percpu(IP_TUNNEL_OPTS_MAX,
-                                                  METADATA_IP_TUNNEL,
-                                                  GFP_KERNEL);
-               if (!md_dst)
+               struct metadata_dst __percpu *tmp;
+
+               tmp = metadata_dst_alloc_percpu(IP_TUNNEL_OPTS_MAX,
+                                               METADATA_IP_TUNNEL,
+                                               GFP_KERNEL);
+               if (!tmp)
                        return NULL;
+               if (cmpxchg(&md_dst, NULL, tmp))
+                       metadata_dst_free_percpu(tmp);
        }
 
        switch (which) {
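
The filter.c hunk above drops the "verifier mutex protects us" assumption and makes md_dst initialisation race-free: allocate a candidate, install it with cmpxchg() against NULL, and free the candidate if another caller won the race. A minimal userspace sketch of the same lazy-init pattern using C11 atomics (not kernel code):

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

static _Atomic(void *) shared;

static void *get_shared(size_t size)
{
        void *expected = NULL;
        void *cur = atomic_load(&shared);
        void *tmp;

        if (cur)
                return cur;

        tmp = calloc(1, size);
        if (!tmp)
                return NULL;

        /* try to install our candidate; on failure 'expected' holds the winner */
        if (!atomic_compare_exchange_strong(&shared, &expected, tmp)) {
                free(tmp);
                return expected;
        }
        return tmp;
}

int main(void)
{
        printf("%p\n", get_shared(64));
        printf("%p\n", get_shared(64));   /* same pointer the second time */
        return 0;
}
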
index e84d108..6a09f3d 100644
@@ -3066,21 +3066,21 @@ int ndo_dflt_fdb_add(struct ndmsg *ndm,
 }
 EXPORT_SYMBOL(ndo_dflt_fdb_add);
 
-static int fdb_vid_parse(struct nlattr *vlan_attr, u16 *p_vid)
+static int fdb_vid_parse(struct nlattr *vlan_attr, u16 *p_vid,
+                        struct netlink_ext_ack *extack)
 {
        u16 vid = 0;
 
        if (vlan_attr) {
                if (nla_len(vlan_attr) != sizeof(u16)) {
-                       pr_info("PF_BRIDGE: RTM_NEWNEIGH with invalid vlan\n");
+                       NL_SET_ERR_MSG(extack, "invalid vlan attribute size");
                        return -EINVAL;
                }
 
                vid = nla_get_u16(vlan_attr);
 
                if (!vid || vid >= VLAN_VID_MASK) {
-                       pr_info("PF_BRIDGE: RTM_NEWNEIGH with invalid vlan id %d\n",
-                               vid);
+                       NL_SET_ERR_MSG(extack, "invalid vlan id");
                        return -EINVAL;
                }
        }
@@ -3105,24 +3105,24 @@ static int rtnl_fdb_add(struct sk_buff *skb, struct nlmsghdr *nlh,
 
        ndm = nlmsg_data(nlh);
        if (ndm->ndm_ifindex == 0) {
-               pr_info("PF_BRIDGE: RTM_NEWNEIGH with invalid ifindex\n");
+               NL_SET_ERR_MSG(extack, "invalid ifindex");
                return -EINVAL;
        }
 
        dev = __dev_get_by_index(net, ndm->ndm_ifindex);
        if (dev == NULL) {
-               pr_info("PF_BRIDGE: RTM_NEWNEIGH with unknown ifindex\n");
+               NL_SET_ERR_MSG(extack, "unknown ifindex");
                return -ENODEV;
        }
 
        if (!tb[NDA_LLADDR] || nla_len(tb[NDA_LLADDR]) != ETH_ALEN) {
-               pr_info("PF_BRIDGE: RTM_NEWNEIGH with invalid address\n");
+               NL_SET_ERR_MSG(extack, "invalid address");
                return -EINVAL;
        }
 
        addr = nla_data(tb[NDA_LLADDR]);
 
-       err = fdb_vid_parse(tb[NDA_VLAN], &vid);
+       err = fdb_vid_parse(tb[NDA_VLAN], &vid, extack);
        if (err)
                return err;
 
@@ -3209,24 +3209,24 @@ static int rtnl_fdb_del(struct sk_buff *skb, struct nlmsghdr *nlh,
 
        ndm = nlmsg_data(nlh);
        if (ndm->ndm_ifindex == 0) {
-               pr_info("PF_BRIDGE: RTM_DELNEIGH with invalid ifindex\n");
+               NL_SET_ERR_MSG(extack, "invalid ifindex");
                return -EINVAL;
        }
 
        dev = __dev_get_by_index(net, ndm->ndm_ifindex);
        if (dev == NULL) {
-               pr_info("PF_BRIDGE: RTM_DELNEIGH with unknown ifindex\n");
+               NL_SET_ERR_MSG(extack, "unknown ifindex");
                return -ENODEV;
        }
 
        if (!tb[NDA_LLADDR] || nla_len(tb[NDA_LLADDR]) != ETH_ALEN) {
-               pr_info("PF_BRIDGE: RTM_DELNEIGH with invalid address\n");
+               NL_SET_ERR_MSG(extack, "invalid address");
                return -EINVAL;
        }
 
        addr = nla_data(tb[NDA_LLADDR]);
 
-       err = fdb_vid_parse(tb[NDA_VLAN], &vid);
+       err = fdb_vid_parse(tb[NDA_VLAN], &vid, extack);
        if (err)
                return err;
 
@@ -3666,7 +3666,7 @@ static int rtnl_bridge_setlink(struct sk_buff *skb, struct nlmsghdr *nlh,
 
        dev = __dev_get_by_index(net, ifm->ifi_index);
        if (!dev) {
-               pr_info("PF_BRIDGE: RTM_SETLINK with unknown ifindex\n");
+               NL_SET_ERR_MSG(extack, "unknown ifindex");
                return -ENODEV;
        }
 
@@ -3741,7 +3741,7 @@ static int rtnl_bridge_dellink(struct sk_buff *skb, struct nlmsghdr *nlh,
 
        dev = __dev_get_by_index(net, ifm->ifi_index);
        if (!dev) {
-               pr_info("PF_BRIDGE: RTM_SETLINK with unknown ifindex\n");
+               NL_SET_ERR_MSG(extack, "unknown ifindex");
                return -ENODEV;
        }
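
The rtnetlink.c changes above report parse failures through the netlink extended ack (NL_SET_ERR_MSG) instead of pr_info(), so the message travels back to the requesting socket rather than only the kernel log. A toy userspace sketch of the idea, with an invented ex_ext_ack structure standing in for struct netlink_ext_ack:

#include <errno.h>
#include <stdio.h>

struct ex_ext_ack { const char *msg; };

#define EX_SET_ERR_MSG(ack, text) ((ack)->msg = (text))

static int ex_fdb_vid_parse(int vid_attr_len, struct ex_ext_ack *extack)
{
        if (vid_attr_len != (int)sizeof(unsigned short)) {
                EX_SET_ERR_MSG(extack, "invalid vlan attribute size");
                return -EINVAL;
        }
        return 0;
}

int main(void)
{
        struct ex_ext_ack ack = { NULL };

        if (ex_fdb_vid_parse(1, &ack))
                printf("error: %s\n", ack.msg);
        return 0;
}
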
 
index 822a90e..4071750 100644
@@ -1350,8 +1350,7 @@ struct sk_buff *skb_copy(const struct sk_buff *skb, gfp_t gfp_mask)
        /* Set the tail pointer and length */
        skb_put(n, skb->len);
 
-       if (skb_copy_bits(skb, -headerlen, n->head, headerlen + skb->len))
-               BUG();
+       BUG_ON(skb_copy_bits(skb, -headerlen, n->head, headerlen + skb->len));
 
        copy_skb_header(n, skb);
        return n;
@@ -1449,8 +1448,7 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
 
        BUG_ON(nhead < 0);
 
-       if (skb_shared(skb))
-               BUG();
+       BUG_ON(skb_shared(skb));
 
        size = SKB_DATA_ALIGN(size);
 
@@ -1595,9 +1593,8 @@ struct sk_buff *skb_copy_expand(const struct sk_buff *skb,
                head_copy_off = newheadroom - head_copy_len;
 
        /* Copy the linear header and data. */
-       if (skb_copy_bits(skb, -head_copy_len, n->head + head_copy_off,
-                         skb->len + head_copy_len))
-               BUG();
+       BUG_ON(skb_copy_bits(skb, -head_copy_len, n->head + head_copy_off,
+                            skb->len + head_copy_len));
 
        copy_skb_header(n, skb);
 
@@ -1878,8 +1875,8 @@ void *__pskb_pull_tail(struct sk_buff *skb, int delta)
                        return NULL;
        }
 
-       if (skb_copy_bits(skb, skb_headlen(skb), skb_tail_pointer(skb), delta))
-               BUG();
+       BUG_ON(skb_copy_bits(skb, skb_headlen(skb),
+                            skb_tail_pointer(skb), delta));
 
        /* Optimization: no fragments, no reasons to preestimate
         * size of pulled pages. Superb.
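
The skbuff.c hunks fold "if (expr) BUG();" into BUG_ON(expr). Unlike assert() under NDEBUG, BUG_ON() always evaluates its argument, so wrapping calls with side effects such as skb_copy_bits() is behaviour-neutral here; only the source gets shorter. A small userspace sketch with a stand-in macro (illustrative only):

#include <stdio.h>
#include <stdlib.h>

#define EX_BUG_ON(cond)                                         \
        do {                                                    \
                if (cond) {                                     \
                        fprintf(stderr, "BUG at %s:%d\n",       \
                                __FILE__, __LINE__);            \
                        abort();                                \
                }                                               \
        } while (0)

static int copy_bits(int *copied)   /* stand-in for skb_copy_bits(), 0 on success */
{
        *copied = 1;
        return 0;
}

int main(void)
{
        int copied = 0;

        EX_BUG_ON(copy_bits(&copied));  /* the call still happens */
        printf("copied=%d\n", copied);
        return 0;
}
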
index 416bb30..1859c47 100644
@@ -86,7 +86,7 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb,
                greh = (struct gre_base_hdr *)skb_transport_header(skb);
                pcsum = (__sum16 *)(greh + 1);
 
-               if (gso_partial) {
+               if (gso_partial && skb_is_gso(skb)) {
                        unsigned int partial_adj;
 
                        /* Adjust checksum to account for the fact that
index dc23177..c105a31 100644
@@ -579,8 +579,8 @@ static void erspan_fb_xmit(struct sk_buff *skb, struct net_device *dev,
        if (gre_handle_offloads(skb, false))
                goto err_free_rt;
 
-       if (skb->len > dev->mtu) {
-               pskb_trim(skb, dev->mtu);
+       if (skb->len > dev->mtu + dev->hard_header_len) {
+               pskb_trim(skb, dev->mtu + dev->hard_header_len);
                truncate = true;
        }
 
@@ -731,8 +731,8 @@ static netdev_tx_t erspan_xmit(struct sk_buff *skb,
        if (skb_cow_head(skb, dev->needed_headroom))
                goto free_skb;
 
-       if (skb->len - dev->hard_header_len > dev->mtu) {
-               pskb_trim(skb, dev->mtu);
+       if (skb->len > dev->mtu + dev->hard_header_len) {
+               pskb_trim(skb, dev->mtu + dev->hard_header_len);
                truncate = true;
        }
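
The ip_gre erspan hunks above compare skb->len, treated here as still including the hard (Ethernet) header, against dev->mtu + dev->hard_header_len instead of dev->mtu alone, and trim to the same bound. A tiny arithmetic illustration with example values (1500-byte MTU, 14-byte header; not kernel code):

#include <stdbool.h>
#include <stdio.h>

int main(void)
{
        unsigned int mtu = 1500, hard_header_len = 14;
        unsigned int skb_len = 1510;    /* 1496 bytes of payload + 14-byte header */

        bool old_truncate = skb_len > mtu;                      /* 1: spurious trim */
        bool new_truncate = skb_len > mtu + hard_header_len;    /* 0: fits */

        printf("old=%d new=%d\n", old_truncate, new_truncate);
        return 0;
}
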
 
index 811689e..f75fc6b 100644
@@ -330,7 +330,8 @@ static unsigned int ipv4_synproxy_hook(void *priv,
        if (synproxy == NULL)
                return NF_ACCEPT;
 
-       if (nf_is_loopback_packet(skb))
+       if (nf_is_loopback_packet(skb) ||
+           ip_hdr(skb)->protocol != IPPROTO_TCP)
                return NF_ACCEPT;
 
        thoff = ip_hdrlen(skb);
index 1c7ed77..4306db8 100644
@@ -2513,7 +2513,7 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or
        struct rtable *ort = (struct rtable *) dst_orig;
        struct rtable *rt;
 
-       rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_NONE, 0);
+       rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_DEAD, 0);
        if (rt) {
                struct dst_entry *new = &rt->dst;
 
index 8cf742f..3b34850 100644
@@ -413,6 +413,7 @@ void tcp_init_sock(struct sock *sk)
        struct tcp_sock *tp = tcp_sk(sk);
 
        tp->out_of_order_queue = RB_ROOT;
+       sk->tcp_rtx_queue = RB_ROOT;
        tcp_init_xmit_timers(sk);
        INIT_LIST_HEAD(&tp->tsq_node);
        INIT_LIST_HEAD(&tp->tsorted_sent_queue);
@@ -469,8 +470,10 @@ void tcp_init_transfer(struct sock *sk, int bpf_op)
        tcp_init_buffer_space(sk);
 }
 
-static void tcp_tx_timestamp(struct sock *sk, u16 tsflags, struct sk_buff *skb)
+static void tcp_tx_timestamp(struct sock *sk, u16 tsflags)
 {
+       struct sk_buff *skb = tcp_write_queue_tail(sk);
+
        if (tsflags && skb) {
                struct skb_shared_info *shinfo = skb_shinfo(skb);
                struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
@@ -699,10 +702,9 @@ static void tcp_push(struct sock *sk, int flags, int mss_now,
        struct tcp_sock *tp = tcp_sk(sk);
        struct sk_buff *skb;
 
-       if (!tcp_send_head(sk))
-               return;
-
        skb = tcp_write_queue_tail(sk);
+       if (!skb)
+               return;
        if (!(flags & MSG_MORE) || forced_push(tp))
                tcp_mark_push(tp, skb);
 
@@ -962,14 +964,14 @@ ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset,
                int copy, i;
                bool can_coalesce;
 
-               if (!tcp_send_head(sk) || (copy = size_goal - skb->len) <= 0 ||
+               if (!skb || (copy = size_goal - skb->len) <= 0 ||
                    !tcp_skb_can_collapse_to(skb)) {
 new_segment:
                        if (!sk_stream_memory_free(sk))
                                goto wait_for_sndbuf;
 
                        skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation,
-                                                 skb_queue_empty(&sk->sk_write_queue));
+                                       tcp_rtx_and_write_queues_empty(sk));
                        if (!skb)
                                goto wait_for_memory;
 
@@ -1041,7 +1043,7 @@ wait_for_memory:
 
 out:
        if (copied) {
-               tcp_tx_timestamp(sk, sk->sk_tsflags, tcp_write_queue_tail(sk));
+               tcp_tx_timestamp(sk, sk->sk_tsflags);
                if (!(flags & MSG_SENDPAGE_NOTLAST))
                        tcp_push(sk, flags, mss_now, tp->nonagle, size_goal);
        }
@@ -1197,7 +1199,7 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)
                        goto out_err;
                }
 
-               skb = tcp_send_head(sk) ? tcp_write_queue_tail(sk) : NULL;
+               skb = tcp_write_queue_tail(sk);
                uarg = sock_zerocopy_realloc(sk, size, skb_zcopy(skb));
                if (!uarg) {
                        err = -ENOBUFS;
@@ -1273,7 +1275,7 @@ restart:
                int max = size_goal;
 
                skb = tcp_write_queue_tail(sk);
-               if (tcp_send_head(sk)) {
+               if (skb) {
                        if (skb->ip_summed == CHECKSUM_NONE)
                                max = mss_now;
                        copy = max - skb->len;
@@ -1293,7 +1295,7 @@ new_segment:
                                process_backlog = false;
                                goto restart;
                        }
-                       first_skb = skb_queue_empty(&sk->sk_write_queue);
+                       first_skb = tcp_rtx_and_write_queues_empty(sk);
                        skb = sk_stream_alloc_skb(sk,
                                                  select_size(sk, sg, first_skb),
                                                  sk->sk_allocation,
@@ -1418,7 +1420,7 @@ wait_for_memory:
 
 out:
        if (copied) {
-               tcp_tx_timestamp(sk, sockc.tsflags, tcp_write_queue_tail(sk));
+               tcp_tx_timestamp(sk, sockc.tsflags);
                tcp_push(sk, flags, mss_now, tp->nonagle, size_goal);
        }
 out_nopush:
@@ -1519,6 +1521,13 @@ static int tcp_peek_sndq(struct sock *sk, struct msghdr *msg, int len)
 
        /* XXX -- need to support SO_PEEK_OFF */
 
+       skb_rbtree_walk(skb, &sk->tcp_rtx_queue) {
+               err = skb_copy_datagram_msg(skb, 0, msg, skb->len);
+               if (err)
+                       return err;
+               copied += skb->len;
+       }
+
        skb_queue_walk(&sk->sk_write_queue, skb) {
                err = skb_copy_datagram_msg(skb, 0, msg, skb->len);
                if (err)
@@ -2318,6 +2327,37 @@ static inline bool tcp_need_reset(int state)
                TCPF_FIN_WAIT2 | TCPF_SYN_RECV);
 }
 
+static void tcp_rtx_queue_purge(struct sock *sk)
+{
+       struct rb_node *p = rb_first(&sk->tcp_rtx_queue);
+
+       while (p) {
+               struct sk_buff *skb = rb_to_skb(p);
+
+               p = rb_next(p);
+               /* Since we are deleting whole queue, no need to
+                * list_del(&skb->tcp_tsorted_anchor)
+                */
+               tcp_rtx_queue_unlink(skb, sk);
+               sk_wmem_free_skb(sk, skb);
+       }
+}
+
+void tcp_write_queue_purge(struct sock *sk)
+{
+       struct sk_buff *skb;
+
+       tcp_chrono_stop(sk, TCP_CHRONO_BUSY);
+       while ((skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
+               tcp_skb_tsorted_anchor_cleanup(skb);
+               sk_wmem_free_skb(sk, skb);
+       }
+       tcp_rtx_queue_purge(sk);
+       INIT_LIST_HEAD(&tcp_sk(sk)->tsorted_sent_queue);
+       sk_mem_reclaim(sk);
+       tcp_clear_all_retrans_hints(tcp_sk(sk));
+}
+
 int tcp_disconnect(struct sock *sk, int flags)
 {
        struct inet_sock *inet = inet_sk(sk);
@@ -2376,7 +2416,6 @@ int tcp_disconnect(struct sock *sk, int flags)
         * issue in __tcp_select_window()
         */
        icsk->icsk_ack.rcv_mss = TCP_MIN_MSS;
-       tcp_init_send_head(sk);
        memset(&tp->rx_opt, 0, sizeof(tp->rx_opt));
        __sk_dst_reset(sk);
        dst_release(sk->sk_rx_dst);
index 29fff14..7ee4aad 100644
@@ -465,17 +465,15 @@ bool tcp_fastopen_active_should_disable(struct sock *sk)
 void tcp_fastopen_active_disable_ofo_check(struct sock *sk)
 {
        struct tcp_sock *tp = tcp_sk(sk);
-       struct rb_node *p;
-       struct sk_buff *skb;
        struct dst_entry *dst;
+       struct sk_buff *skb;
 
        if (!tp->syn_fastopen)
                return;
 
        if (!tp->data_segs_in) {
-               p = rb_first(&tp->out_of_order_queue);
-               if (p && !rb_next(p)) {
-                       skb = rb_entry(p, struct sk_buff, rbnode);
+               skb = skb_rb_first(&tp->out_of_order_queue);
+               if (skb && !skb_rb_next(skb)) {
                        if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) {
                                tcp_fastopen_active_disable(sk);
                                return;
index fb0d7ed..d0682ce 100644
@@ -1142,6 +1142,7 @@ struct tcp_sacktag_state {
        u64     last_sackt;
        struct rate_sample *rate;
        int     flag;
+       unsigned int mss_now;
 };
 
 /* Check if skb is fully within the SACK block. In presence of GSO skbs,
@@ -1191,7 +1192,8 @@ static int tcp_match_skb_to_sack(struct sock *sk, struct sk_buff *skb,
                if (pkt_len >= skb->len && !in_sack)
                        return 0;
 
-               err = tcp_fragment(sk, skb, pkt_len, mss, GFP_ATOMIC);
+               err = tcp_fragment(sk, TCP_FRAG_IN_RTX_QUEUE, skb,
+                                  pkt_len, mss, GFP_ATOMIC);
                if (err < 0)
                        return err;
        }
@@ -1288,13 +1290,13 @@ static u8 tcp_sacktag_one(struct sock *sk,
 /* Shift newly-SACKed bytes from this skb to the immediately previous
  * already-SACKed sk_buff. Mark the newly-SACKed bytes as such.
  */
-static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb,
+static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *prev,
+                           struct sk_buff *skb,
                            struct tcp_sacktag_state *state,
                            unsigned int pcount, int shifted, int mss,
                            bool dup_sack)
 {
        struct tcp_sock *tp = tcp_sk(sk);
-       struct sk_buff *prev = tcp_write_queue_prev(sk, skb);
        u32 start_seq = TCP_SKB_CB(skb)->seq;   /* start of newly-SACKed */
        u32 end_seq = start_seq + shifted;      /* end of newly-SACKed */
 
@@ -1363,8 +1365,7 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb,
        if (unlikely(TCP_SKB_CB(prev)->tx.delivered_mstamp))
                TCP_SKB_CB(prev)->tx.delivered_mstamp = 0;
 
-       tcp_unlink_write_queue(skb, sk);
-       sk_wmem_free_skb(sk, skb);
+       tcp_rtx_queue_unlink_and_free(skb, sk);
 
        NET_INC_STATS(sock_net(sk), LINUX_MIB_SACKMERGED);
 
@@ -1414,9 +1415,9 @@ static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb,
                goto fallback;
 
        /* Can only happen with delayed DSACK + discard craziness */
-       if (unlikely(skb == tcp_write_queue_head(sk)))
+       prev = skb_rb_prev(skb);
+       if (!prev)
                goto fallback;
-       prev = tcp_write_queue_prev(sk, skb);
 
        if ((TCP_SKB_CB(prev)->sacked & TCPCB_TAGBITS) != TCPCB_SACKED_ACKED)
                goto fallback;
@@ -1495,18 +1496,17 @@ static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb,
 
        if (!skb_shift(prev, skb, len))
                goto fallback;
-       if (!tcp_shifted_skb(sk, skb, state, pcount, len, mss, dup_sack))
+       if (!tcp_shifted_skb(sk, prev, skb, state, pcount, len, mss, dup_sack))
                goto out;
 
        /* Hole filled allows collapsing with the next as well, this is very
         * useful when hole on every nth skb pattern happens
         */
-       if (prev == tcp_write_queue_tail(sk))
+       skb = skb_rb_next(prev);
+       if (!skb)
                goto out;
-       skb = tcp_write_queue_next(sk, prev);
 
        if (!skb_can_shift(skb) ||
-           (skb == tcp_send_head(sk)) ||
            ((TCP_SKB_CB(skb)->sacked & TCPCB_TAGBITS) != TCPCB_SACKED_ACKED) ||
            (mss != tcp_skb_seglen(skb)))
                goto out;
@@ -1514,7 +1514,8 @@ static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb,
        len = skb->len;
        if (skb_shift(prev, skb, len)) {
                pcount += tcp_skb_pcount(skb);
-               tcp_shifted_skb(sk, skb, state, tcp_skb_pcount(skb), len, mss, 0);
+               tcp_shifted_skb(sk, prev, skb, state, tcp_skb_pcount(skb),
+                               len, mss, 0);
        }
 
 out:
@@ -1538,13 +1539,10 @@ static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk,
        struct tcp_sock *tp = tcp_sk(sk);
        struct sk_buff *tmp;
 
-       tcp_for_write_queue_from(skb, sk) {
+       skb_rbtree_walk_from(skb) {
                int in_sack = 0;
                bool dup_sack = dup_sack_in;
 
-               if (skb == tcp_send_head(sk))
-                       break;
-
                /* queue is in-order => we can short-circuit the walk early */
                if (!before(TCP_SKB_CB(skb)->seq, end_seq))
                        break;
@@ -1606,23 +1604,44 @@ static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk,
        return skb;
 }
 
-/* Avoid all extra work that is being done by sacktag while walking in
- * a normal way
- */
+static struct sk_buff *tcp_sacktag_bsearch(struct sock *sk,
+                                          struct tcp_sacktag_state *state,
+                                          u32 seq)
+{
+       struct rb_node *parent, **p = &sk->tcp_rtx_queue.rb_node;
+       struct sk_buff *skb;
+       int unack_bytes;
+
+       while (*p) {
+               parent = *p;
+               skb = rb_to_skb(parent);
+               if (before(seq, TCP_SKB_CB(skb)->seq)) {
+                       p = &parent->rb_left;
+                       continue;
+               }
+               if (!before(seq, TCP_SKB_CB(skb)->end_seq)) {
+                       p = &parent->rb_right;
+                       continue;
+               }
+
+               state->fack_count = 0;
+               unack_bytes = TCP_SKB_CB(skb)->seq - tcp_sk(sk)->snd_una;
+               if (state->mss_now && unack_bytes > 0)
+                       state->fack_count = unack_bytes / state->mss_now;
+
+               return skb;
+       }
+       return NULL;
+}
+
 static struct sk_buff *tcp_sacktag_skip(struct sk_buff *skb, struct sock *sk,
                                        struct tcp_sacktag_state *state,
                                        u32 skip_to_seq)
 {
-       tcp_for_write_queue_from(skb, sk) {
-               if (skb == tcp_send_head(sk))
-                       break;
-
-               if (after(TCP_SKB_CB(skb)->end_seq, skip_to_seq))
-                       break;
+       if (skb && after(TCP_SKB_CB(skb)->seq, skip_to_seq))
+               return skb;
 
-               state->fack_count += tcp_skb_pcount(skb);
-       }
-       return skb;
+       return tcp_sacktag_bsearch(sk, state, skip_to_seq);
 }
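tcp_sacktag_skip() above now falls back to tcp_sacktag_bsearch(), which descends the retransmit tree comparing the target sequence against each skb's [seq, end_seq) range and recomputes fack_count from the unacked byte count divided by mss_now. The sketch below reproduces only the search, over a sorted array instead of the kernel's rbtree; segment values and names are illustrative.

/* "Find the segment covering seq" as in tcp_sacktag_bsearch(),
 * done over a sorted array rather than an rbtree. */
#include <stdint.h>
#include <stdio.h>

struct seg { uint32_t seq, end_seq; };	/* [seq, end_seq) */

static int seq_before(uint32_t a, uint32_t b)
{
	return (int32_t)(a - b) < 0;
}

static int find_segment(const struct seg *s, int n, uint32_t seq)
{
	int lo = 0, hi = n - 1;

	while (lo <= hi) {
		int mid = lo + (hi - lo) / 2;

		if (seq_before(seq, s[mid].seq))
			hi = mid - 1;			/* go left */
		else if (!seq_before(seq, s[mid].end_seq))
			lo = mid + 1;			/* go right */
		else
			return mid;			/* seq falls inside */
	}
	return -1;
}

int main(void)
{
	struct seg rtx[] = { { 1000, 2000 }, { 2000, 3448 }, { 3448, 4896 } };

	printf("%d\n", find_segment(rtx, 3, 2500));	/* 1 */
	printf("%d\n", find_segment(rtx, 3, 5000));	/* -1 */
	return 0;
}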
 
 static struct sk_buff *tcp_maybe_skipping_dsack(struct sk_buff *skb,
@@ -1744,8 +1763,9 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb,
                }
        }
 
-       skb = tcp_write_queue_head(sk);
+       state->mss_now = tcp_current_mss(sk);
        state->fack_count = 0;
+       skb = NULL;
        i = 0;
 
        if (!tp->sacked_out) {
@@ -1969,7 +1989,7 @@ void tcp_enter_loss(struct sock *sk)
        if (tcp_is_reno(tp))
                tcp_reset_reno_sack(tp);
 
-       skb = tcp_write_queue_head(sk);
+       skb = tcp_rtx_queue_head(sk);
        is_reneg = skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED);
        if (is_reneg) {
                NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSACKRENEGING);
@@ -1978,10 +1998,7 @@ void tcp_enter_loss(struct sock *sk)
        }
        tcp_clear_all_retrans_hints(tp);
 
-       tcp_for_write_queue(skb, sk) {
-               if (skb == tcp_send_head(sk))
-                       break;
-
+       skb_rbtree_walk_from(skb) {
                mark_lost = (!(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) ||
                             is_reneg);
                if (mark_lost)
@@ -2207,20 +2224,18 @@ static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head)
        const u32 loss_high = tcp_is_sack(tp) ?  tp->snd_nxt : tp->high_seq;
 
        WARN_ON(packets > tp->packets_out);
-       if (tp->lost_skb_hint) {
-               skb = tp->lost_skb_hint;
-               cnt = tp->lost_cnt_hint;
+       skb = tp->lost_skb_hint;
+       if (skb) {
                /* Head already handled? */
-               if (mark_head && skb != tcp_write_queue_head(sk))
+               if (mark_head && after(TCP_SKB_CB(skb)->seq, tp->snd_una))
                        return;
+               cnt = tp->lost_cnt_hint;
        } else {
-               skb = tcp_write_queue_head(sk);
+               skb = tcp_rtx_queue_head(sk);
                cnt = 0;
        }
 
-       tcp_for_write_queue_from(skb, sk) {
-               if (skb == tcp_send_head(sk))
-                       break;
+       skb_rbtree_walk_from(skb) {
                /* TODO: do this better */
                /* this is not the most efficient way to do this... */
                tp->lost_skb_hint = skb;
@@ -2244,7 +2259,8 @@ static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head)
                        /* If needed, chop off the prefix to mark as lost. */
                        lost = (packets - oldcnt) * mss;
                        if (lost < skb->len &&
-                           tcp_fragment(sk, skb, lost, mss, GFP_ATOMIC) < 0)
+                           tcp_fragment(sk, TCP_FRAG_IN_RTX_QUEUE, skb,
+                                        lost, mss, GFP_ATOMIC) < 0)
                                break;
                        cnt = packets;
                }
@@ -2328,7 +2344,7 @@ static bool tcp_any_retrans_done(const struct sock *sk)
        if (tp->retrans_out)
                return true;
 
-       skb = tcp_write_queue_head(sk);
+       skb = tcp_rtx_queue_head(sk);
        if (unlikely(skb && TCP_SKB_CB(skb)->sacked & TCPCB_EVER_RETRANS))
                return true;
 
@@ -2369,9 +2385,7 @@ static void tcp_undo_cwnd_reduction(struct sock *sk, bool unmark_loss)
        if (unmark_loss) {
                struct sk_buff *skb;
 
-               tcp_for_write_queue(skb, sk) {
-                       if (skb == tcp_send_head(sk))
-                               break;
+               skb_rbtree_walk(skb, &sk->tcp_rtx_queue) {
                        TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST;
                }
                tp->lost_out = 0;
@@ -2616,9 +2630,7 @@ void tcp_simple_retransmit(struct sock *sk)
        unsigned int mss = tcp_current_mss(sk);
        u32 prior_lost = tp->lost_out;
 
-       tcp_for_write_queue(skb, sk) {
-               if (skb == tcp_send_head(sk))
-                       break;
+       skb_rbtree_walk(skb, &sk->tcp_rtx_queue) {
                if (tcp_skb_seglen(skb) > mss &&
                    !(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) {
                        if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) {
@@ -2712,7 +2724,7 @@ static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack,
                         * is updated in tcp_ack()). Otherwise fall back to
                         * the conventional recovery.
                         */
-                       if (tcp_send_head(sk) &&
+                       if (!tcp_write_queue_empty(sk) &&
                            after(tcp_wnd_end(tp), tp->snd_nxt)) {
                                *rexmit = REXMIT_NEW;
                                return;
@@ -2804,9 +2816,9 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked,
        bool do_lost = is_dupack || ((flag & FLAG_DATA_SACKED) &&
                                    (tcp_fackets_out(tp) > tp->reordering));
 
-       if (WARN_ON(!tp->packets_out && tp->sacked_out))
+       if (!tp->packets_out && tp->sacked_out)
                tp->sacked_out = 0;
-       if (WARN_ON(!tp->sacked_out && tp->fackets_out))
+       if (!tp->sacked_out && tp->fackets_out)
                tp->fackets_out = 0;
 
        /* Now state machine starts.
@@ -3076,11 +3088,11 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
        struct tcp_sock *tp = tcp_sk(sk);
        u32 prior_sacked = tp->sacked_out;
        u32 reord = tp->packets_out;
+       struct sk_buff *skb, *next;
        bool fully_acked = true;
        long sack_rtt_us = -1L;
        long seq_rtt_us = -1L;
        long ca_rtt_us = -1L;
-       struct sk_buff *skb;
        u32 pkts_acked = 0;
        u32 last_in_flight = 0;
        bool rtt_update;
@@ -3088,7 +3100,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
 
        first_ackt = 0;
 
-       while ((skb = tcp_write_queue_head(sk)) && skb != tcp_send_head(sk)) {
+       for (skb = skb_rb_first(&sk->tcp_rtx_queue); skb; skb = next) {
                struct tcp_skb_cb *scb = TCP_SKB_CB(skb);
                u8 sacked = scb->sacked;
                u32 acked_pcount;
@@ -3106,8 +3118,6 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
                                break;
                        fully_acked = false;
                } else {
-                       /* Speedup tcp_unlink_write_queue() and next loop */
-                       prefetchw(skb->next);
                        acked_pcount = tcp_skb_pcount(skb);
                }
 
@@ -3159,12 +3169,12 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
                if (!fully_acked)
                        break;
 
-               tcp_unlink_write_queue(skb, sk);
-               sk_wmem_free_skb(sk, skb);
+               next = skb_rb_next(skb);
                if (unlikely(skb == tp->retransmit_skb_hint))
                        tp->retransmit_skb_hint = NULL;
                if (unlikely(skb == tp->lost_skb_hint))
                        tp->lost_skb_hint = NULL;
+               tcp_rtx_queue_unlink_and_free(skb, sk);
        }
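In the tcp_clean_rtx_queue() hunk above the walk moves to skb_rb_first()/skb_rb_next(), so the successor has to be fetched before tcp_rtx_queue_unlink_and_free() releases the current skb. The standalone sketch below shows that "grab next, then free" pattern on a plain singly linked list; the list is only a stand-in for the rtx rbtree.

/* Free fully-acked entries while walking: read ->next before free(). */
#include <stdio.h>
#include <stdlib.h>

struct node {
	unsigned int end_seq;
	struct node *next;
};

static struct node *push(struct node *head, unsigned int end_seq)
{
	struct node *n = malloc(sizeof(*n));

	n->end_seq = end_seq;
	n->next = head;
	return n;
}

int main(void)
{
	struct node *head = NULL, *n, *next;
	unsigned int snd_una = 3000;	/* everything below this is acked */

	head = push(push(push(NULL, 4000), 3000), 1500);	/* 1500 -> 3000 -> 4000 */

	for (n = head; n; n = next) {
		next = n->next;			/* must be read before free(n) */
		if (n->end_seq > snd_una)
			break;			/* first not-fully-acked entry */
		printf("freeing entry ending at %u\n", n->end_seq);
		free(n);
		head = next;
	}

	for (n = head; n; n = n->next)
		printf("still queued up to %u\n", n->end_seq);

	while (head) {				/* cleanup */
		next = head->next;
		free(head);
		head = next;
	}
	return 0;
}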
 
        if (!skb)
@@ -3256,12 +3266,14 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
 
 static void tcp_ack_probe(struct sock *sk)
 {
-       const struct tcp_sock *tp = tcp_sk(sk);
        struct inet_connection_sock *icsk = inet_csk(sk);
+       struct sk_buff *head = tcp_send_head(sk);
+       const struct tcp_sock *tp = tcp_sk(sk);
 
        /* Was it a usable window open? */
-
-       if (!after(TCP_SKB_CB(tcp_send_head(sk))->end_seq, tcp_wnd_end(tp))) {
+       if (!head)
+               return;
+       if (!after(TCP_SKB_CB(head)->end_seq, tcp_wnd_end(tp))) {
                icsk->icsk_backoff = 0;
                inet_csk_clear_xmit_timer(sk, ICSK_TIME_PROBE0);
                /* Socket must be waked up by subsequent tcp_data_snd_check().
@@ -3381,7 +3393,7 @@ static int tcp_ack_update_window(struct sock *sk, const struct sk_buff *skb, u32
                        tp->pred_flags = 0;
                        tcp_fast_path_check(sk);
 
-                       if (tcp_send_head(sk))
+                       if (!tcp_write_queue_empty(sk))
                                tcp_slow_start_after_idle_check(sk);
 
                        if (nwin > tp->max_window) {
@@ -3566,8 +3578,8 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
        sack_state.first_sackt = 0;
        sack_state.rate = &rs;
 
-       /* We very likely will need to access write queue head. */
-       prefetchw(sk->sk_write_queue.next);
+       /* We very likely will need to access rtx queue. */
+       prefetch(sk->tcp_rtx_queue.rb_node);
 
        /* If the ack is older than previous acks
         * then we can probably ignore it.
@@ -3681,8 +3693,7 @@ no_queue:
         * being used to time the probes, and is probably far higher than
         * it needs to be for normal retransmission.
         */
-       if (tcp_send_head(sk))
-               tcp_ack_probe(sk);
+       tcp_ack_probe(sk);
 
        if (tp->tlp_high_seq)
                tcp_process_tlp_ack(sk, ack, flag);
@@ -4335,7 +4346,7 @@ static void tcp_ofo_queue(struct sock *sk)
 
        p = rb_first(&tp->out_of_order_queue);
        while (p) {
-               skb = rb_entry(p, struct sk_buff, rbnode);
+               skb = rb_to_skb(p);
                if (after(TCP_SKB_CB(skb)->seq, tp->rcv_nxt))
                        break;
 
@@ -4399,7 +4410,7 @@ static int tcp_try_rmem_schedule(struct sock *sk, struct sk_buff *skb,
 static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
 {
        struct tcp_sock *tp = tcp_sk(sk);
-       struct rb_node **p, *q, *parent;
+       struct rb_node **p, *parent;
        struct sk_buff *skb1;
        u32 seq, end_seq;
        bool fragstolen;
@@ -4458,7 +4469,7 @@ coalesce_done:
        parent = NULL;
        while (*p) {
                parent = *p;
-               skb1 = rb_entry(parent, struct sk_buff, rbnode);
+               skb1 = rb_to_skb(parent);
                if (before(seq, TCP_SKB_CB(skb1)->seq)) {
                        p = &parent->rb_left;
                        continue;
@@ -4503,9 +4514,7 @@ insert:
 
 merge_right:
        /* Remove other segments covered by skb. */
-       while ((q = rb_next(&skb->rbnode)) != NULL) {
-               skb1 = rb_entry(q, struct sk_buff, rbnode);
-
+       while ((skb1 = skb_rb_next(skb)) != NULL) {
                if (!after(end_seq, TCP_SKB_CB(skb1)->seq))
                        break;
                if (before(end_seq, TCP_SKB_CB(skb1)->end_seq)) {
@@ -4520,7 +4529,7 @@ merge_right:
                tcp_drop(sk, skb1);
        }
        /* If there is no skb after us, we are the last_skb ! */
-       if (!q)
+       if (!skb1)
                tp->ooo_last_skb = skb;
 
 add_sack:
@@ -4706,7 +4715,7 @@ static struct sk_buff *tcp_skb_next(struct sk_buff *skb, struct sk_buff_head *li
        if (list)
                return !skb_queue_is_last(list, skb) ? skb->next : NULL;
 
-       return rb_entry_safe(rb_next(&skb->rbnode), struct sk_buff, rbnode);
+       return skb_rb_next(skb);
 }
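The hunks above fold the open-coded rb_entry(p, struct sk_buff, rbnode) lookups into rb_to_skb()/skb_rb_next() helpers; rb_entry() itself is container_of() over the rb_node embedded in the sk_buff. The toy program below re-derives that pointer arithmetic for a made-up struct so the mechanism is visible; the names here are illustrative, not the kernel's.

/* container_of()-style recovery of the enclosing struct from a
 * pointer to its embedded node, the mechanism behind rb_to_skb(). */
#include <stddef.h>
#include <stdio.h>

struct toy_node {			/* stand-in for struct rb_node */
	struct toy_node *left, *right;
};

struct toy_skb {			/* stand-in for struct sk_buff */
	unsigned int seq;
	struct toy_node rbnode;		/* embedded, like skb->rbnode */
};

#define toy_container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))
#define toy_to_skb(node) toy_container_of(node, struct toy_skb, rbnode)

int main(void)
{
	struct toy_skb skb = { .seq = 12345 };
	struct toy_node *node = &skb.rbnode;	/* what a tree walk hands back */

	printf("recovered seq = %u\n", toy_to_skb(node)->seq);	/* 12345 */
	return 0;
}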
 
 static struct sk_buff *tcp_collapse_one(struct sock *sk, struct sk_buff *skb,
@@ -4727,7 +4736,7 @@ static struct sk_buff *tcp_collapse_one(struct sock *sk, struct sk_buff *skb,
 }
 
 /* Insert skb into rb tree, ordered by TCP_SKB_CB(skb)->seq */
-static void tcp_rbtree_insert(struct rb_root *root, struct sk_buff *skb)
+void tcp_rbtree_insert(struct rb_root *root, struct sk_buff *skb)
 {
        struct rb_node **p = &root->rb_node;
        struct rb_node *parent = NULL;
@@ -4735,7 +4744,7 @@ static void tcp_rbtree_insert(struct rb_root *root, struct sk_buff *skb)
 
        while (*p) {
                parent = *p;
-               skb1 = rb_entry(parent, struct sk_buff, rbnode);
+               skb1 = rb_to_skb(parent);
                if (before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb1)->seq))
                        p = &parent->rb_left;
                else
@@ -4854,26 +4863,19 @@ static void tcp_collapse_ofo_queue(struct sock *sk)
 {
        struct tcp_sock *tp = tcp_sk(sk);
        struct sk_buff *skb, *head;
-       struct rb_node *p;
        u32 start, end;
 
-       p = rb_first(&tp->out_of_order_queue);
-       skb = rb_entry_safe(p, struct sk_buff, rbnode);
+       skb = skb_rb_first(&tp->out_of_order_queue);
 new_range:
        if (!skb) {
-               p = rb_last(&tp->out_of_order_queue);
-               /* Note: This is possible p is NULL here. We do not
-                * use rb_entry_safe(), as ooo_last_skb is valid only
-                * if rbtree is not empty.
-                */
-               tp->ooo_last_skb = rb_entry(p, struct sk_buff, rbnode);
+               tp->ooo_last_skb = skb_rb_last(&tp->out_of_order_queue);
                return;
        }
        start = TCP_SKB_CB(skb)->seq;
        end = TCP_SKB_CB(skb)->end_seq;
 
        for (head = skb;;) {
-               skb = tcp_skb_next(skb, NULL);
+               skb = skb_rb_next(skb);
 
                /* Range is terminated when we see a gap or when
                 * we are at the queue end.
@@ -4916,14 +4918,14 @@ static bool tcp_prune_ofo_queue(struct sock *sk)
        do {
                prev = rb_prev(node);
                rb_erase(node, &tp->out_of_order_queue);
-               tcp_drop(sk, rb_entry(node, struct sk_buff, rbnode));
+               tcp_drop(sk, rb_to_skb(node));
                sk_mem_reclaim(sk);
                if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf &&
                    !tcp_under_memory_pressure(sk))
                        break;
                node = prev;
        } while (node);
-       tp->ooo_last_skb = rb_entry(prev, struct sk_buff, rbnode);
+       tp->ooo_last_skb = rb_to_skb(prev);
 
        /* Reset SACK state.  A conforming SACK implementation will
         * do the same at a timeout based retransmit.  When a connection
@@ -5538,7 +5540,7 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack,
                                    struct tcp_fastopen_cookie *cookie)
 {
        struct tcp_sock *tp = tcp_sk(sk);
-       struct sk_buff *data = tp->syn_data ? tcp_write_queue_head(sk) : NULL;
+       struct sk_buff *data = tp->syn_data ? tcp_rtx_queue_head(sk) : NULL;
        u16 mss = tp->rx_opt.mss_clamp, try_exp = 0;
        bool syn_drop = false;
 
@@ -5573,9 +5575,8 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack,
        tcp_fastopen_cache_set(sk, mss, cookie, syn_drop, try_exp);
 
        if (data) { /* Retransmit unacked data in SYN */
-               tcp_for_write_queue_from(data, sk) {
-                       if (data == tcp_send_head(sk) ||
-                           __tcp_retransmit_skb(sk, data, 1))
+               skb_rbtree_walk_from(data) {
+                       if (__tcp_retransmit_skb(sk, data, 1))
                                break;
                }
                tcp_rearm_rto(sk);
index c7460fd..5418ecf 100644
@@ -480,7 +480,7 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
                                               TCP_TIMEOUT_INIT;
                icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
 
-               skb = tcp_write_queue_head(sk);
+               skb = tcp_rtx_queue_head(sk);
                BUG_ON(!skb);
 
                tcp_mstamp_refresh(tp);
index 8162e28..696b0a1 100644
@@ -66,15 +66,17 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
                           int push_one, gfp_t gfp);
 
 /* Account for new data that has been sent to the network. */
-static void tcp_event_new_data_sent(struct sock *sk, const struct sk_buff *skb)
+static void tcp_event_new_data_sent(struct sock *sk, struct sk_buff *skb)
 {
        struct inet_connection_sock *icsk = inet_csk(sk);
        struct tcp_sock *tp = tcp_sk(sk);
        unsigned int prior_packets = tp->packets_out;
 
-       tcp_advance_send_head(sk, skb);
        tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
 
+       __skb_unlink(skb, &sk->sk_write_queue);
+       tcp_rbtree_insert(&sk->tcp_rtx_queue, skb);
+
        tp->packets_out += tcp_skb_pcount(skb);
        if (!prior_packets || icsk->icsk_pending == ICSK_TIME_LOSS_PROBE)
                tcp_rearm_rto(sk);
@@ -1249,12 +1251,25 @@ static void tcp_skb_fragment_eor(struct sk_buff *skb, struct sk_buff *skb2)
        TCP_SKB_CB(skb)->eor = 0;
 }
 
+/* Insert buff after skb on the write or rtx queue of sk.  */
+static void tcp_insert_write_queue_after(struct sk_buff *skb,
+                                        struct sk_buff *buff,
+                                        struct sock *sk,
+                                        enum tcp_queue tcp_queue)
+{
+       if (tcp_queue == TCP_FRAG_IN_WRITE_QUEUE)
+               __skb_queue_after(&sk->sk_write_queue, skb, buff);
+       else
+               tcp_rbtree_insert(&sk->tcp_rtx_queue, buff);
+}
+
 /* Function to create two new TCP segments.  Shrinks the given segment
  * to the specified size and appends a new segment with the rest of the
  * packet to the list.  This won't be called frequently, I hope.
  * Remember, these are still headerless SKBs at this point.
  */
-int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len,
+int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue,
+                struct sk_buff *skb, u32 len,
                 unsigned int mss_now, gfp_t gfp)
 {
        struct tcp_sock *tp = tcp_sk(sk);
@@ -1337,7 +1352,7 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len,
 
        /* Link BUFF into the send queue. */
        __skb_header_release(buff);
-       tcp_insert_write_queue_after(skb, buff, sk);
+       tcp_insert_write_queue_after(skb, buff, sk, tcp_queue);
        list_add(&buff->tcp_tsorted_anchor, &skb->tcp_tsorted_anchor);
 
        return 0;
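With tcp_fragment() and tso_fragment() now taking an enum tcp_queue, the second half of a split skb is linked onto whichever structure the original lives on: behind it on the FIFO write queue, or inserted by sequence into the rtx tree. The standalone sketch below mirrors only that dispatch; an append-only array and an insertion-sorted array stand in for sk_write_queue and the rbtree.

/* Queue-selection sketch mirroring TCP_FRAG_IN_WRITE_QUEUE vs
 * TCP_FRAG_IN_RTX_QUEUE; containers are deliberately simplistic. */
#include <stdio.h>

enum frag_queue { FRAG_IN_WRITE_QUEUE, FRAG_IN_RTX_QUEUE };

#define MAX_SEGS 16

static unsigned int write_q[MAX_SEGS], rtx_q[MAX_SEGS];
static int write_n, rtx_n;

static void link_split_segment(enum frag_queue q, unsigned int seq)
{
	int i;

	if (q == FRAG_IN_WRITE_QUEUE) {
		write_q[write_n++] = seq;	/* plain tail append */
		return;
	}
	i = rtx_n++;				/* rtx side stays seq-ordered */
	while (i > 0 && rtx_q[i - 1] > seq) {
		rtx_q[i] = rtx_q[i - 1];
		i--;
	}
	rtx_q[i] = seq;
}

int main(void)
{
	int i;

	link_split_segment(FRAG_IN_RTX_QUEUE, 3000);
	link_split_segment(FRAG_IN_RTX_QUEUE, 1000);	/* lands before 3000 */
	link_split_segment(FRAG_IN_WRITE_QUEUE, 5000);

	for (i = 0; i < rtx_n; i++)
		printf("rtx[%d] = %u\n", i, rtx_q[i]);
	printf("write tail = %u\n", write_q[write_n - 1]);
	return 0;
}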
@@ -1625,10 +1640,10 @@ static void tcp_cwnd_validate(struct sock *sk, bool is_cwnd_limited)
                 * is caused by insufficient sender buffer:
                 * 1) just sent some data (see tcp_write_xmit)
                 * 2) not cwnd limited (this else condition)
-                * 3) no more data to send (null tcp_send_head )
+                * 3) no more data to send (tcp_write_queue_empty())
                 * 4) application is hitting buffer limit (SOCK_NOSPACE)
                 */
-               if (!tcp_send_head(sk) && sk->sk_socket &&
+               if (tcp_write_queue_empty(sk) && sk->sk_socket &&
                    test_bit(SOCK_NOSPACE, &sk->sk_socket->flags) &&
                    (1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
                        tcp_chrono_start(sk, TCP_CHRONO_SNDBUF_LIMITED);
@@ -1824,7 +1839,8 @@ static bool tcp_snd_wnd_test(const struct tcp_sock *tp,
  * know that all the data is in scatter-gather pages, and that the
  * packet has never been sent out before (and thus is not cloned).
  */
-static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
+static int tso_fragment(struct sock *sk, enum tcp_queue tcp_queue,
+                       struct sk_buff *skb, unsigned int len,
                        unsigned int mss_now, gfp_t gfp)
 {
        struct sk_buff *buff;
@@ -1833,7 +1849,7 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
 
        /* All of a TSO frame must be composed of paged data.  */
        if (skb->len != skb->data_len)
-               return tcp_fragment(sk, skb, len, mss_now, gfp);
+               return tcp_fragment(sk, tcp_queue, skb, len, mss_now, gfp);
 
        buff = sk_stream_alloc_skb(sk, 0, gfp, true);
        if (unlikely(!buff))
@@ -1869,7 +1885,7 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
 
        /* Link BUFF into the send queue. */
        __skb_header_release(buff);
-       tcp_insert_write_queue_after(skb, buff, sk);
+       tcp_insert_write_queue_after(skb, buff, sk, tcp_queue);
 
        return 0;
 }
@@ -1939,8 +1955,10 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb,
                        goto send_now;
        }
 
-       head = tcp_write_queue_head(sk);
-
+       /* TODO : use tsorted_sent_queue ? */
+       head = tcp_rtx_queue_head(sk);
+       if (!head)
+               goto send_now;
        age = tcp_stamp_us_delta(tp->tcp_mstamp, head->skb_mstamp);
        /* If next ACK is likely to come too late (half srtt), do not defer */
        if (age < (tp->srtt_us >> 4))
@@ -2158,13 +2176,12 @@ static bool tcp_small_queue_check(struct sock *sk, const struct sk_buff *skb,
        limit <<= factor;
 
        if (refcount_read(&sk->sk_wmem_alloc) > limit) {
-               /* Always send the 1st or 2nd skb in write queue.
+               /* Always send skb if rtx queue is empty.
                 * No need to wait for TX completion to call us back,
                 * after softirq/tasklet schedule.
                 * This helps when TX completions are delayed too much.
                 */
-               if (skb == sk->sk_write_queue.next ||
-                   skb->prev == sk->sk_write_queue.next)
+               if (tcp_rtx_queue_empty(sk))
                        return false;
 
                set_bit(TSQ_THROTTLED, &sk->sk_tsq_flags);
@@ -2215,7 +2232,7 @@ void tcp_chrono_stop(struct sock *sk, const enum tcp_chrono type)
         * it's the "most interesting" or current chrono we are
         * tracking and starts busy chrono if we have pending data.
         */
-       if (tcp_write_queue_empty(sk))
+       if (tcp_rtx_and_write_queues_empty(sk))
                tcp_chrono_set(tp, TCP_CHRONO_UNSPEC);
        else if (type == tp->chrono_type)
                tcp_chrono_set(tp, TCP_CHRONO_BUSY);
@@ -2310,7 +2327,8 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
                                                    nonagle);
 
                if (skb->len > limit &&
-                   unlikely(tso_fragment(sk, skb, limit, mss_now, gfp)))
+                   unlikely(tso_fragment(sk, TCP_FRAG_IN_WRITE_QUEUE,
+                                         skb, limit, mss_now, gfp)))
                        break;
 
                if (test_bit(TCP_TSQ_DEFERRED, &sk->sk_tsq_flags))
@@ -2350,7 +2368,7 @@ repair:
                tcp_cwnd_validate(sk, is_cwnd_limited);
                return false;
        }
-       return !tp->packets_out && tcp_send_head(sk);
+       return !tp->packets_out && !tcp_write_queue_empty(sk);
 }
 
 bool tcp_schedule_loss_probe(struct sock *sk)
@@ -2374,7 +2392,7 @@ bool tcp_schedule_loss_probe(struct sock *sk)
                return false;
 
        if ((tp->snd_cwnd > tcp_packets_in_flight(tp)) &&
-            tcp_send_head(sk))
+            !tcp_write_queue_empty(sk))
                return false;
 
        /* Probe timeout is 2*rtt. Add minimum RTO to account
@@ -2427,18 +2445,14 @@ void tcp_send_loss_probe(struct sock *sk)
        int mss = tcp_current_mss(sk);
 
        skb = tcp_send_head(sk);
-       if (skb) {
-               if (tcp_snd_wnd_test(tp, skb, mss)) {
-                       pcount = tp->packets_out;
-                       tcp_write_xmit(sk, mss, TCP_NAGLE_OFF, 2, GFP_ATOMIC);
-                       if (tp->packets_out > pcount)
-                               goto probe_sent;
-                       goto rearm_timer;
-               }
-               skb = tcp_write_queue_prev(sk, skb);
-       } else {
-               skb = tcp_write_queue_tail(sk);
+       if (skb && tcp_snd_wnd_test(tp, skb, mss)) {
+               pcount = tp->packets_out;
+               tcp_write_xmit(sk, mss, TCP_NAGLE_OFF, 2, GFP_ATOMIC);
+               if (tp->packets_out > pcount)
+                       goto probe_sent;
+               goto rearm_timer;
        }
+       skb = skb_rb_last(&sk->tcp_rtx_queue);
 
        /* At most one outstanding TLP retransmission. */
        if (tp->tlp_high_seq)
@@ -2456,10 +2470,11 @@ void tcp_send_loss_probe(struct sock *sk)
                goto rearm_timer;
 
        if ((pcount > 1) && (skb->len > (pcount - 1) * mss)) {
-               if (unlikely(tcp_fragment(sk, skb, (pcount - 1) * mss, mss,
+               if (unlikely(tcp_fragment(sk, TCP_FRAG_IN_RTX_QUEUE, skb,
+                                         (pcount - 1) * mss, mss,
                                          GFP_ATOMIC)))
                        goto rearm_timer;
-               skb = tcp_write_queue_next(sk, skb);
+               skb = skb_rb_next(skb);
        }
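In the tcp_send_loss_probe() hunk above, a multi-segment tail skb is cut at (pcount - 1) * mss so that the probe retransmits only its final MSS-sized piece, and the walk then steps to it with skb_rb_next(). The numbers-only sketch below works through that split point; pcount is simply derived as a ceiling here, whereas the kernel reads it from the skb.

/* Split-point arithmetic for the loss probe: keep (pcount - 1) full
 * segments in place, probe with whatever remains. */
#include <stdio.h>

int main(void)
{
	unsigned int mss = 1448;
	unsigned int skb_len = 4000;			/* spans 3 segments */
	unsigned int pcount = (skb_len + mss - 1) / mss;

	if (pcount > 1 && skb_len > (pcount - 1) * mss) {
		unsigned int split = (pcount - 1) * mss;

		printf("fragment at %u bytes, probe carries %u bytes\n",
		       split, skb_len - split);		/* 2896 and 1104 */
	}
	return 0;
}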
 
        if (WARN_ON(!skb || !tcp_skb_pcount(skb)))
@@ -2659,7 +2674,7 @@ void tcp_skb_collapse_tstamp(struct sk_buff *skb,
 static bool tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb)
 {
        struct tcp_sock *tp = tcp_sk(sk);
-       struct sk_buff *next_skb = tcp_write_queue_next(sk, skb);
+       struct sk_buff *next_skb = skb_rb_next(skb);
        int skb_size, next_skb_size;
 
        skb_size = skb->len;
@@ -2676,8 +2691,6 @@ static bool tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb)
        }
        tcp_highest_sack_combine(sk, next_skb, skb);
 
-       tcp_unlink_write_queue(next_skb, sk);
-
        if (next_skb->ip_summed == CHECKSUM_PARTIAL)
                skb->ip_summed = CHECKSUM_PARTIAL;
 
@@ -2705,7 +2718,7 @@ static bool tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb)
 
        tcp_skb_collapse_tstamp(skb, next_skb);
 
-       sk_wmem_free_skb(sk, next_skb);
+       tcp_rtx_queue_unlink_and_free(next_skb, sk);
        return true;
 }
 
@@ -2716,8 +2729,6 @@ static bool tcp_can_collapse(const struct sock *sk, const struct sk_buff *skb)
                return false;
        if (skb_cloned(skb))
                return false;
-       if (skb == tcp_send_head(sk))
-               return false;
        /* Some heuristics for collapsing over SACK'd could be invented */
        if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)
                return false;
@@ -2740,7 +2751,7 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *to,
        if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)
                return;
 
-       tcp_for_write_queue_from_safe(skb, tmp, sk) {
+       skb_rbtree_walk_from_safe(skb, tmp) {
                if (!tcp_can_collapse(sk, skb))
                        break;
 
@@ -2815,7 +2826,8 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
 
        len = cur_mss * segs;
        if (skb->len > len) {
-               if (tcp_fragment(sk, skb, len, cur_mss, GFP_ATOMIC))
+               if (tcp_fragment(sk, TCP_FRAG_IN_RTX_QUEUE, skb, len,
+                                cur_mss, GFP_ATOMIC))
                        return -ENOMEM; /* We'll try again later. */
        } else {
                if (skb_unclone(skb, GFP_ATOMIC))
@@ -2906,29 +2918,24 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
 void tcp_xmit_retransmit_queue(struct sock *sk)
 {
        const struct inet_connection_sock *icsk = inet_csk(sk);
+       struct sk_buff *skb, *rtx_head = NULL, *hole = NULL;
        struct tcp_sock *tp = tcp_sk(sk);
-       struct sk_buff *skb;
-       struct sk_buff *hole = NULL;
        u32 max_segs;
        int mib_idx;
 
        if (!tp->packets_out)
                return;
 
-       if (tp->retransmit_skb_hint) {
-               skb = tp->retransmit_skb_hint;
-       } else {
-               skb = tcp_write_queue_head(sk);
+       skb = tp->retransmit_skb_hint;
+       if (!skb) {
+               rtx_head = tcp_rtx_queue_head(sk);
+               skb = rtx_head;
        }
-
        max_segs = tcp_tso_segs(sk, tcp_current_mss(sk));
-       tcp_for_write_queue_from(skb, sk) {
+       skb_rbtree_walk_from(skb) {
                __u8 sacked;
                int segs;
 
-               if (skb == tcp_send_head(sk))
-                       break;
-
                if (tcp_pacing_check(sk))
                        break;
 
@@ -2973,7 +2980,7 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
                if (tcp_in_cwnd_reduction(sk))
                        tp->prr_out += tcp_skb_pcount(skb);
 
-               if (skb == tcp_write_queue_head(sk) &&
+               if (skb == rtx_head &&
                    icsk->icsk_pending != ICSK_TIME_REO_TIMEOUT)
                        inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
                                                  inet_csk(sk)->icsk_rto,
@@ -3015,12 +3022,15 @@ void tcp_send_fin(struct sock *sk)
         * Note: in the latter case, FIN packet will be sent after a timeout,
         * as TCP stack thinks it has already been transmitted.
         */
-       if (tskb && (tcp_send_head(sk) || tcp_under_memory_pressure(sk))) {
+       if (!tskb && tcp_under_memory_pressure(sk))
+               tskb = skb_rb_last(&sk->tcp_rtx_queue);
+
+       if (tskb) {
 coalesce:
                TCP_SKB_CB(tskb)->tcp_flags |= TCPHDR_FIN;
                TCP_SKB_CB(tskb)->end_seq++;
                tp->write_seq++;
-               if (!tcp_send_head(sk)) {
+               if (tcp_write_queue_empty(sk)) {
                        /* This means tskb was already sent.
                         * Pretend we included the FIN on previous transmit.
                         * We need to set tp->snd_nxt to the value it would have
@@ -3086,9 +3096,9 @@ int tcp_send_synack(struct sock *sk)
 {
        struct sk_buff *skb;
 
-       skb = tcp_write_queue_head(sk);
+       skb = tcp_rtx_queue_head(sk);
        if (!skb || !(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) {
-               pr_debug("%s: wrong queue state\n", __func__);
+               pr_err("%s: wrong queue state\n", __func__);
                return -EFAULT;
        }
        if (!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_ACK)) {
@@ -3101,10 +3111,9 @@ int tcp_send_synack(struct sock *sk)
                        if (!nskb)
                                return -ENOMEM;
                        INIT_LIST_HEAD(&nskb->tcp_tsorted_anchor);
-                       tcp_unlink_write_queue(skb, sk);
+                       tcp_rtx_queue_unlink_and_free(skb, sk);
                        __skb_header_release(nskb);
-                       __tcp_add_write_queue_head(sk, nskb);
-                       sk_wmem_free_skb(sk, skb);
+                       tcp_rbtree_insert(&sk->tcp_rtx_queue, nskb);
                        sk->sk_wmem_queued += nskb->truesize;
                        sk_mem_charge(sk, nskb->truesize);
                        skb = nskb;
@@ -3327,7 +3336,6 @@ static void tcp_connect_queue_skb(struct sock *sk, struct sk_buff *skb)
 
        tcb->end_seq += skb->len;
        __skb_header_release(skb);
-       __tcp_add_write_queue_tail(sk, skb);
        sk->sk_wmem_queued += skb->truesize;
        sk_mem_charge(sk, skb->truesize);
        tp->write_seq = tcb->end_seq;
@@ -3405,12 +3413,13 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn)
        TCP_SKB_CB(syn_data)->tcp_flags = TCPHDR_ACK | TCPHDR_PSH;
        if (!err) {
                tp->syn_data = (fo->copied > 0);
+               tcp_rbtree_insert(&sk->tcp_rtx_queue, syn_data);
                NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPORIGDATASENT);
                goto done;
        }
 
-       /* data was not sent, this is our new send_head */
-       sk->sk_send_head = syn_data;
+       /* data was not sent, put it in write_queue */
+       __skb_queue_tail(&sk->sk_write_queue, syn_data);
        tp->packets_out -= tcp_skb_pcount(syn_data);
 
 fallback:
@@ -3453,6 +3462,7 @@ int tcp_connect(struct sock *sk)
        tp->retrans_stamp = tcp_time_stamp(tp);
        tcp_connect_queue_skb(sk, buff);
        tcp_ecn_send_syn(sk, buff);
+       tcp_rbtree_insert(&sk->tcp_rtx_queue, buff);
 
        /* Send off SYN; include data in Fast Open. */
        err = tp->fastopen_req ? tcp_send_syn_data(sk, buff) :
@@ -3647,7 +3657,8 @@ int tcp_write_wakeup(struct sock *sk, int mib)
                    skb->len > mss) {
                        seg_size = min(seg_size, mss);
                        TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH;
-                       if (tcp_fragment(sk, skb, seg_size, mss, GFP_ATOMIC))
+                       if (tcp_fragment(sk, TCP_FRAG_IN_WRITE_QUEUE,
+                                        skb, seg_size, mss, GFP_ATOMIC))
                                return -1;
                } else if (!tcp_skb_pcount(skb))
                        tcp_set_skb_tso_segs(skb, mss);
@@ -3677,7 +3688,7 @@ void tcp_send_probe0(struct sock *sk)
 
        err = tcp_write_wakeup(sk, LINUX_MIB_TCPWINPROBE);
 
-       if (tp->packets_out || !tcp_send_head(sk)) {
+       if (tp->packets_out || tcp_write_queue_empty(sk)) {
                /* Cancel probe timer, if it is not required. */
                icsk->icsk_probes_out = 0;
                icsk->icsk_backoff = 0;
index 655dd8d..7014cc0 100644
@@ -156,8 +156,13 @@ static bool retransmits_timed_out(struct sock *sk,
                return false;
 
        start_ts = tcp_sk(sk)->retrans_stamp;
-       if (unlikely(!start_ts))
-               start_ts = tcp_skb_timestamp(tcp_write_queue_head(sk));
+       if (unlikely(!start_ts)) {
+               struct sk_buff *head = tcp_rtx_queue_head(sk);
+
+               if (!head)
+                       return false;
+               start_ts = tcp_skb_timestamp(head);
+       }
 
        if (likely(timeout == 0)) {
                linear_backoff_thresh = ilog2(TCP_RTO_MAX/rto_base);
@@ -304,11 +309,12 @@ static void tcp_delack_timer(unsigned long data)
 static void tcp_probe_timer(struct sock *sk)
 {
        struct inet_connection_sock *icsk = inet_csk(sk);
+       struct sk_buff *skb = tcp_send_head(sk);
        struct tcp_sock *tp = tcp_sk(sk);
        int max_probes;
        u32 start_ts;
 
-       if (tp->packets_out || !tcp_send_head(sk)) {
+       if (tp->packets_out || !skb) {
                icsk->icsk_probes_out = 0;
                return;
        }
@@ -321,9 +327,9 @@ static void tcp_probe_timer(struct sock *sk)
         * corresponding system limit. We also implement similar policy when
         * we use RTO to probe window in tcp_retransmit_timer().
         */
-       start_ts = tcp_skb_timestamp(tcp_send_head(sk));
+       start_ts = tcp_skb_timestamp(skb);
        if (!start_ts)
-               tcp_send_head(sk)->skb_mstamp = tp->tcp_mstamp;
+               skb->skb_mstamp = tp->tcp_mstamp;
        else if (icsk->icsk_user_timeout &&
                 (s32)(tcp_time_stamp(tp) - start_ts) >
                 jiffies_to_msecs(icsk->icsk_user_timeout))
@@ -408,7 +414,7 @@ void tcp_retransmit_timer(struct sock *sk)
        if (!tp->packets_out)
                goto out;
 
-       WARN_ON(tcp_write_queue_empty(sk));
+       WARN_ON(tcp_rtx_queue_empty(sk));
 
        tp->tlp_high_seq = 0;
 
@@ -441,7 +447,7 @@ void tcp_retransmit_timer(struct sock *sk)
                        goto out;
                }
                tcp_enter_loss(sk);
-               tcp_retransmit_skb(sk, tcp_write_queue_head(sk), 1);
+               tcp_retransmit_skb(sk, tcp_rtx_queue_head(sk), 1);
                __sk_dst_reset(sk);
                goto out_reset_timer;
        }
@@ -473,7 +479,7 @@ void tcp_retransmit_timer(struct sock *sk)
 
        tcp_enter_loss(sk);
 
-       if (tcp_retransmit_skb(sk, tcp_write_queue_head(sk), 1) > 0) {
+       if (tcp_retransmit_skb(sk, tcp_rtx_queue_head(sk), 1) > 0) {
                /* Retransmission failed because of local congestion,
                 * do not backoff.
                 */
@@ -647,7 +653,7 @@ static void tcp_keepalive_timer (unsigned long data)
        elapsed = keepalive_time_when(tp);
 
        /* It is alive without keepalive 8) */
-       if (tp->packets_out || tcp_send_head(sk))
+       if (tp->packets_out || !tcp_write_queue_empty(sk))
                goto resched;
 
        elapsed = keepalive_time_elapsed(tp);
index eb0359b..7c9a6e4 100644
@@ -2239,20 +2239,16 @@ int udp_v4_early_demux(struct sk_buff *skb)
        iph = ip_hdr(skb);
        uh = udp_hdr(skb);
 
-       if (skb->pkt_type == PACKET_BROADCAST ||
-           skb->pkt_type == PACKET_MULTICAST) {
+       if (skb->pkt_type == PACKET_MULTICAST) {
                in_dev = __in_dev_get_rcu(skb->dev);
 
                if (!in_dev)
                        return 0;
 
-               /* we are supposed to accept bcast packets */
-               if (skb->pkt_type == PACKET_MULTICAST) {
-                       ours = ip_check_mc_rcu(in_dev, iph->daddr, iph->saddr,
-                                              iph->protocol);
-                       if (!ours)
-                               return 0;
-               }
+               ours = ip_check_mc_rcu(in_dev, iph->daddr, iph->saddr,
+                                      iph->protocol);
+               if (!ours)
+                       return 0;
 
                sk = __udp4_lib_mcast_demux_lookup(net, uh->dest, iph->daddr,
                                                   uh->source, iph->saddr,
index 97658bf..e360d55 100644
@@ -120,7 +120,7 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb,
                 * will be using a length value equal to only one MSS sized
                 * segment instead of the entire frame.
                 */
-               if (gso_partial) {
+               if (gso_partial && skb_is_gso(skb)) {
                        uh->len = htons(skb_shinfo(skb)->gso_size +
                                        SKB_GSO_CB(skb)->data_offset +
                                        skb->head - (unsigned char *)uh);
index 837418f..d9f6226 100644
@@ -152,7 +152,7 @@ static void ipv6_regen_rndid(struct inet6_dev *idev);
 static void ipv6_try_regen_rndid(struct inet6_dev *idev, struct in6_addr *tmpaddr);
 
 static int ipv6_generate_eui64(u8 *eui, struct net_device *dev);
-static int ipv6_count_addresses(struct inet6_dev *idev);
+static int ipv6_count_addresses(const struct inet6_dev *idev);
 static int ipv6_generate_stable_address(struct in6_addr *addr,
                                        u8 dad_count,
                                        const struct inet6_dev *idev);
@@ -945,7 +945,7 @@ ipv6_link_dev_addr(struct inet6_dev *idev, struct inet6_ifaddr *ifp)
                        break;
        }
 
-       list_add_tail(&ifp->if_list, p);
+       list_add_tail_rcu(&ifp->if_list, p);
 }
 
 static u32 inet6_addr_hash(const struct in6_addr *addr)
@@ -1204,7 +1204,7 @@ static void ipv6_del_addr(struct inet6_ifaddr *ifp)
        if (ifp->flags & IFA_F_PERMANENT && !(ifp->flags & IFA_F_NOPREFIXROUTE))
                action = check_cleanup_prefix_route(ifp, &expires);
 
-       list_del_init(&ifp->if_list);
+       list_del_rcu(&ifp->if_list);
        __in6_ifa_put(ifp);
 
        write_unlock_bh(&ifp->idev->lock);
@@ -1558,8 +1558,7 @@ static int __ipv6_dev_get_saddr(struct net *net,
 {
        struct ipv6_saddr_score *score = &scores[1 - hiscore_idx], *hiscore = &scores[hiscore_idx];
 
-       read_lock_bh(&idev->lock);
-       list_for_each_entry(score->ifa, &idev->addr_list, if_list) {
+       list_for_each_entry_rcu(score->ifa, &idev->addr_list, if_list) {
                int i;
 
                /*
@@ -1609,11 +1608,6 @@ static int __ipv6_dev_get_saddr(struct net *net,
                                }
                                break;
                        } else if (minihiscore < miniscore) {
-                               if (hiscore->ifa)
-                                       in6_ifa_put(hiscore->ifa);
-
-                               in6_ifa_hold(score->ifa);
-
                                swap(hiscore, score);
                                hiscore_idx = 1 - hiscore_idx;
 
@@ -1625,7 +1619,6 @@ static int __ipv6_dev_get_saddr(struct net *net,
                }
        }
 out:
-       read_unlock_bh(&idev->lock);
        return hiscore_idx;
 }
 
@@ -1662,6 +1655,7 @@ int ipv6_dev_get_saddr(struct net *net, const struct net_device *dst_dev,
        int dst_type;
        bool use_oif_addr = false;
        int hiscore_idx = 0;
+       int ret = 0;
 
        dst_type = __ipv6_addr_type(daddr);
        dst.addr = daddr;
@@ -1737,15 +1731,14 @@ int ipv6_dev_get_saddr(struct net *net, const struct net_device *dst_dev,
        }
 
 out:
-       rcu_read_unlock();
-
        hiscore = &scores[hiscore_idx];
        if (!hiscore->ifa)
-               return -EADDRNOTAVAIL;
+               ret = -EADDRNOTAVAIL;
+       else
+               *saddr = hiscore->ifa->addr;
 
-       *saddr = hiscore->ifa->addr;
-       in6_ifa_put(hiscore->ifa);
-       return 0;
+       rcu_read_unlock();
+       return ret;
 }
 EXPORT_SYMBOL(ipv6_dev_get_saddr);
 
@@ -1785,15 +1778,15 @@ int ipv6_get_lladdr(struct net_device *dev, struct in6_addr *addr,
        return err;
 }
 
-static int ipv6_count_addresses(struct inet6_dev *idev)
+static int ipv6_count_addresses(const struct inet6_dev *idev)
 {
+       const struct inet6_ifaddr *ifp;
        int cnt = 0;
-       struct inet6_ifaddr *ifp;
 
-       read_lock_bh(&idev->lock);
-       list_for_each_entry(ifp, &idev->addr_list, if_list)
+       rcu_read_lock();
+       list_for_each_entry_rcu(ifp, &idev->addr_list, if_list)
                cnt++;
-       read_unlock_bh(&idev->lock);
+       rcu_read_unlock();
        return cnt;
 }
 
@@ -1859,20 +1852,18 @@ static bool ipv6_chk_same_addr(struct net *net, const struct in6_addr *addr,
 bool ipv6_chk_custom_prefix(const struct in6_addr *addr,
        const unsigned int prefix_len, struct net_device *dev)
 {
-       struct inet6_dev *idev;
-       struct inet6_ifaddr *ifa;
+       const struct inet6_ifaddr *ifa;
+       const struct inet6_dev *idev;
        bool ret = false;
 
        rcu_read_lock();
        idev = __in6_dev_get(dev);
        if (idev) {
-               read_lock_bh(&idev->lock);
-               list_for_each_entry(ifa, &idev->addr_list, if_list) {
+               list_for_each_entry_rcu(ifa, &idev->addr_list, if_list) {
                        ret = ipv6_prefix_equal(addr, &ifa->addr, prefix_len);
                        if (ret)
                                break;
                }
-               read_unlock_bh(&idev->lock);
        }
        rcu_read_unlock();
 
@@ -1882,22 +1873,20 @@ EXPORT_SYMBOL(ipv6_chk_custom_prefix);
 
 int ipv6_chk_prefix(const struct in6_addr *addr, struct net_device *dev)
 {
-       struct inet6_dev *idev;
-       struct inet6_ifaddr *ifa;
+       const struct inet6_ifaddr *ifa;
+       const struct inet6_dev *idev;
        int     onlink;
 
        onlink = 0;
        rcu_read_lock();
        idev = __in6_dev_get(dev);
        if (idev) {
-               read_lock_bh(&idev->lock);
-               list_for_each_entry(ifa, &idev->addr_list, if_list) {
+               list_for_each_entry_rcu(ifa, &idev->addr_list, if_list) {
                        onlink = ipv6_prefix_equal(addr, &ifa->addr,
                                                   ifa->prefix_len);
                        if (onlink)
                                break;
                }
-               read_unlock_bh(&idev->lock);
        }
        rcu_read_unlock();
        return onlink;
@@ -2321,24 +2310,24 @@ static struct rt6_info *addrconf_get_prefix_route(const struct in6_addr *pfx,
        if (!table)
                return NULL;
 
-       read_lock_bh(&table->tb6_lock);
-       fn = fib6_locate(&table->tb6_root, pfx, plen, NULL, 0);
+       rcu_read_lock();
+       fn = fib6_locate(&table->tb6_root, pfx, plen, NULL, 0, true);
        if (!fn)
                goto out;
 
-       noflags |= RTF_CACHE;
-       for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
+       for_each_fib6_node_rt_rcu(fn) {
                if (rt->dst.dev->ifindex != dev->ifindex)
                        continue;
                if ((rt->rt6i_flags & flags) != flags)
                        continue;
                if ((rt->rt6i_flags & noflags) != 0)
                        continue;
-               dst_hold(&rt->dst);
+               if (!dst_hold_safe(&rt->dst))
+                       rt = NULL;
                break;
        }
 out:
-       read_unlock_bh(&table->tb6_lock);
+       rcu_read_unlock();
        return rt;
 }
 
@@ -3562,7 +3551,6 @@ static int addrconf_ifdown(struct net_device *dev, int how)
        struct net *net = dev_net(dev);
        struct inet6_dev *idev;
        struct inet6_ifaddr *ifa, *tmp;
-       struct list_head del_list;
        int _keep_addr;
        bool keep_addr;
        int state, i;
@@ -3654,7 +3642,6 @@ restart:
         */
        keep_addr = (!how && _keep_addr > 0 && !idev->cnf.disable_ipv6);
 
-       INIT_LIST_HEAD(&del_list);
        list_for_each_entry_safe(ifa, tmp, &idev->addr_list, if_list) {
                struct rt6_info *rt = NULL;
                bool keep;
@@ -3663,8 +3650,6 @@ restart:
 
                keep = keep_addr && (ifa->flags & IFA_F_PERMANENT) &&
                        !addr_is_local(&ifa->addr);
-               if (!keep)
-                       list_move(&ifa->if_list, &del_list);
 
                write_unlock_bh(&idev->lock);
                spin_lock_bh(&ifa->lock);
@@ -3698,19 +3683,14 @@ restart:
                }
 
                write_lock_bh(&idev->lock);
+               if (!keep) {
+                       list_del_rcu(&ifa->if_list);
+                       in6_ifa_put(ifa);
+               }
        }
 
        write_unlock_bh(&idev->lock);
 
-       /* now clean up addresses to be removed */
-       while (!list_empty(&del_list)) {
-               ifa = list_first_entry(&del_list,
-                                      struct inet6_ifaddr, if_list);
-               list_del(&ifa->if_list);
-
-               in6_ifa_put(ifa);
-       }
-
        /* Step 5: Discard anycast and multicast list */
        if (how) {
                ipv6_ac_destroy_dev(idev);
@@ -3820,8 +3800,8 @@ static void addrconf_dad_begin(struct inet6_ifaddr *ifp)
                goto out;
 
        if (dev->flags&(IFF_NOARP|IFF_LOOPBACK) ||
-           dev_net(dev)->ipv6.devconf_all->accept_dad < 1 ||
-           idev->cnf.accept_dad < 1 ||
+           (dev_net(dev)->ipv6.devconf_all->accept_dad < 1 &&
+            idev->cnf.accept_dad < 1) ||
            !(ifp->flags&IFA_F_TENTATIVE) ||
            ifp->flags & IFA_F_NODAD) {
                bump_id = ifp->flags & IFA_F_TENTATIVE;
@@ -5898,10 +5878,9 @@ void addrconf_disable_policy_idev(struct inet6_dev *idev, int val)
                spin_lock(&ifa->lock);
                if (ifa->rt) {
                        struct rt6_info *rt = ifa->rt;
-                       struct fib6_table *table = rt->rt6i_table;
                        int cpu;
 
-                       read_lock(&table->tb6_lock);
+                       rcu_read_lock();
                        addrconf_set_nopolicy(ifa->rt, val);
                        if (rt->rt6i_pcpu) {
                                for_each_possible_cpu(cpu) {
@@ -5911,7 +5890,7 @@ void addrconf_disable_policy_idev(struct inet6_dev *idev, int val)
                                        addrconf_set_nopolicy(*rtp, val);
                                }
                        }
-                       read_unlock(&table->tb6_lock);
+                       rcu_read_unlock();
                }
                spin_unlock(&ifa->lock);
        }
index c6311d7..2606d2f 100644
@@ -18,7 +18,6 @@
 #include <linux/if_addrlabel.h>
 #include <linux/netlink.h>
 #include <linux/rtnetlink.h>
-#include <linux/refcount.h>
 
 #if 0
 #define ADDRLABEL(x...) printk(x)
@@ -36,7 +35,6 @@ struct ip6addrlbl_entry {
        int addrtype;
        u32 label;
        struct hlist_node list;
-       refcount_t refcnt;
        struct rcu_head rcu;
 };
 
@@ -111,28 +109,6 @@ static const __net_initconst struct ip6addrlbl_init_table
        }
 };
 
-/* Object management */
-static inline void ip6addrlbl_free(struct ip6addrlbl_entry *p)
-{
-       kfree(p);
-}
-
-static void ip6addrlbl_free_rcu(struct rcu_head *h)
-{
-       ip6addrlbl_free(container_of(h, struct ip6addrlbl_entry, rcu));
-}
-
-static bool ip6addrlbl_hold(struct ip6addrlbl_entry *p)
-{
-       return refcount_inc_not_zero(&p->refcnt);
-}
-
-static inline void ip6addrlbl_put(struct ip6addrlbl_entry *p)
-{
-       if (refcount_dec_and_test(&p->refcnt))
-               call_rcu(&p->rcu, ip6addrlbl_free_rcu);
-}
-
 /* Find label */
 static bool __ip6addrlbl_match(const struct ip6addrlbl_entry *p,
                               const struct in6_addr *addr,
@@ -219,7 +195,6 @@ static struct ip6addrlbl_entry *ip6addrlbl_alloc(const struct in6_addr *prefix,
        newp->addrtype = addrtype;
        newp->label = label;
        INIT_HLIST_NODE(&newp->list);
-       refcount_set(&newp->refcnt, 1);
        return newp;
 }
 
@@ -243,7 +218,7 @@ static int __ip6addrlbl_add(struct net *net, struct ip6addrlbl_entry *newp,
                                goto out;
                        }
                        hlist_replace_rcu(&p->list, &newp->list);
-                       ip6addrlbl_put(p);
+                       kfree_rcu(p, rcu);
                        goto out;
                } else if ((p->prefixlen == newp->prefixlen && !p->ifindex) ||
                           (p->prefixlen < newp->prefixlen)) {
@@ -281,7 +256,7 @@ static int ip6addrlbl_add(struct net *net,
        ret = __ip6addrlbl_add(net, newp, replace);
        spin_unlock(&net->ipv6.ip6addrlbl_table.lock);
        if (ret)
-               ip6addrlbl_free(newp);
+               kfree(newp);
        return ret;
 }
 
@@ -302,7 +277,7 @@ static int __ip6addrlbl_del(struct net *net,
                    p->ifindex == ifindex &&
                    ipv6_addr_equal(&p->prefix, prefix)) {
                        hlist_del_rcu(&p->list);
-                       ip6addrlbl_put(p);
+                       kfree_rcu(p, rcu);
                        ret = 0;
                        break;
                }
@@ -360,7 +335,7 @@ static void __net_exit ip6addrlbl_net_exit(struct net *net)
        spin_lock(&net->ipv6.ip6addrlbl_table.lock);
        hlist_for_each_entry_safe(p, n, &net->ipv6.ip6addrlbl_table.head, list) {
                hlist_del_rcu(&p->list);
-               ip6addrlbl_put(p);
+               kfree_rcu(p, rcu);
        }
        spin_unlock(&net->ipv6.ip6addrlbl_table.lock);
 }
@@ -546,38 +521,28 @@ static int ip6addrlbl_get(struct sk_buff *in_skb, struct nlmsghdr *nlh,
                return -EINVAL;
        addr = nla_data(tb[IFAL_ADDRESS]);
 
-       rcu_read_lock();
-       p = __ipv6_addr_label(net, addr, ipv6_addr_type(addr), ifal->ifal_index);
-       if (p && !ip6addrlbl_hold(p))
-               p = NULL;
-       lseq = net->ipv6.ip6addrlbl_table.seq;
-       rcu_read_unlock();
-
-       if (!p) {
-               err = -ESRCH;
-               goto out;
-       }
-
        skb = nlmsg_new(ip6addrlbl_msgsize(), GFP_KERNEL);
-       if (!skb) {
-               ip6addrlbl_put(p);
+       if (!skb)
                return -ENOBUFS;
-       }
 
-       err = ip6addrlbl_fill(skb, p, lseq,
-                             NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
-                             RTM_NEWADDRLABEL, 0);
+       err = -ESRCH;
 
-       ip6addrlbl_put(p);
+       rcu_read_lock();
+       p = __ipv6_addr_label(net, addr, ipv6_addr_type(addr), ifal->ifal_index);
+       lseq = net->ipv6.ip6addrlbl_table.seq;
+       if (p)
+               err = ip6addrlbl_fill(skb, p, lseq,
+                                     NETLINK_CB(in_skb).portid,
+                                     nlh->nlmsg_seq,
+                                     RTM_NEWADDRLABEL, 0);
+       rcu_read_unlock();
 
        if (err < 0) {
                WARN_ON(err == -EMSGSIZE);
                kfree_skb(skb);
-               goto out;
+       } else {
+               err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
        }
-
-       err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
-out:
        return err;
 }
 
index aeb49b4..4e52d52 100644
@@ -250,15 +250,15 @@ static bool opt_unrec(struct sk_buff *skb, __u32 offset)
        return (*op & 0xC0) == 0x80;
 }
 
-int icmpv6_push_pending_frames(struct sock *sk, struct flowi6 *fl6,
-                              struct icmp6hdr *thdr, int len)
+void icmpv6_push_pending_frames(struct sock *sk, struct flowi6 *fl6,
+                               struct icmp6hdr *thdr, int len)
 {
        struct sk_buff *skb;
        struct icmp6hdr *icmp6h;
 
        skb = skb_peek(&sk->sk_write_queue);
        if (!skb)
-               goto out;
+               return;
 
        icmp6h = icmp6_hdr(skb);
        memcpy(icmp6h, thdr, sizeof(struct icmp6hdr));
@@ -286,8 +286,6 @@ int icmpv6_push_pending_frames(struct sock *sk, struct flowi6 *fl6,
                                                      tmp_csum);
        }
        ip6_push_pending_frames(sk);
-out:
-       return 0;
 }
 
 struct icmpv6_msg {
@@ -437,7 +435,6 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info,
        int iif = 0;
        int addr_type = 0;
        int len;
-       int err = 0;
        u32 mark = IP6_REPLY_MARK(net, skb->mark);
 
        if ((u8 *)hdr < skb->head ||
@@ -574,17 +571,16 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info,
        rcu_read_lock();
        idev = __in6_dev_get(skb->dev);
 
-       err = ip6_append_data(sk, icmpv6_getfrag, &msg,
-                             len + sizeof(struct icmp6hdr),
-                             sizeof(struct icmp6hdr),
-                             &ipc6, &fl6, (struct rt6_info *)dst,
-                             MSG_DONTWAIT, &sockc_unused);
-       if (err) {
+       if (ip6_append_data(sk, icmpv6_getfrag, &msg,
+                           len + sizeof(struct icmp6hdr),
+                           sizeof(struct icmp6hdr),
+                           &ipc6, &fl6, (struct rt6_info *)dst,
+                           MSG_DONTWAIT, &sockc_unused)) {
                ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTERRORS);
                ip6_flush_pending_frames(sk);
        } else {
-               err = icmpv6_push_pending_frames(sk, &fl6, &tmp_hdr,
-                                                len + sizeof(struct icmp6hdr));
+               icmpv6_push_pending_frames(sk, &fl6, &tmp_hdr,
+                                          len + sizeof(struct icmp6hdr));
        }
        rcu_read_unlock();
 out_dst_release:
@@ -681,7 +677,6 @@ static void icmpv6_echo_reply(struct sk_buff *skb)
        struct icmpv6_msg msg;
        struct dst_entry *dst;
        struct ipcm6_cookie ipc6;
-       int err = 0;
        u32 mark = IP6_REPLY_MARK(net, skb->mark);
        struct sockcm_cookie sockc_unused = {0};
 
@@ -718,8 +713,7 @@ static void icmpv6_echo_reply(struct sk_buff *skb)
        else if (!fl6.flowi6_oif)
                fl6.flowi6_oif = np->ucast_oif;
 
-       err = ip6_dst_lookup(net, sk, &dst, &fl6);
-       if (err)
+       if (ip6_dst_lookup(net, sk, &dst, &fl6))
                goto out;
        dst = xfrm_lookup(net, dst, flowi6_to_flowi(&fl6), sk, 0);
        if (IS_ERR(dst))
@@ -736,17 +730,16 @@ static void icmpv6_echo_reply(struct sk_buff *skb)
        ipc6.dontfrag = np->dontfrag;
        ipc6.opt = NULL;
 
-       err = ip6_append_data(sk, icmpv6_getfrag, &msg, skb->len + sizeof(struct icmp6hdr),
-                               sizeof(struct icmp6hdr), &ipc6, &fl6,
-                               (struct rt6_info *)dst, MSG_DONTWAIT,
-                               &sockc_unused);
-
-       if (err) {
+       if (ip6_append_data(sk, icmpv6_getfrag, &msg,
+                           skb->len + sizeof(struct icmp6hdr),
+                           sizeof(struct icmp6hdr), &ipc6, &fl6,
+                           (struct rt6_info *)dst, MSG_DONTWAIT,
+                           &sockc_unused)) {
                __ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTERRORS);
                ip6_flush_pending_frames(sk);
        } else {
-               err = icmpv6_push_pending_frames(sk, &fl6, &tmp_hdr,
-                                                skb->len + sizeof(struct icmp6hdr));
+               icmpv6_push_pending_frames(sk, &fl6, &tmp_hdr,
+                                          skb->len + sizeof(struct icmp6hdr));
        }
        dst_release(dst);
 out:
index e5308d7..c2ecd5e 100644
 #include <net/ip6_fib.h>
 #include <net/ip6_route.h>
 
-#define RT6_DEBUG 2
-
-#if RT6_DEBUG >= 3
-#define RT6_TRACE(x...) pr_debug(x)
-#else
-#define RT6_TRACE(x...) do { ; } while (0)
-#endif
-
 static struct kmem_cache *fib6_node_kmem __read_mostly;
 
 struct fib6_cleaner {
@@ -62,9 +54,12 @@ struct fib6_cleaner {
 #define FWS_INIT FWS_L
 #endif
 
-static void fib6_prune_clones(struct net *net, struct fib6_node *fn);
-static struct rt6_info *fib6_find_prefix(struct net *net, struct fib6_node *fn);
-static struct fib6_node *fib6_repair_tree(struct net *net, struct fib6_node *fn);
+static struct rt6_info *fib6_find_prefix(struct net *net,
+                                        struct fib6_table *table,
+                                        struct fib6_node *fn);
+static struct fib6_node *fib6_repair_tree(struct net *net,
+                                         struct fib6_table *table,
+                                         struct fib6_node *fn);
 static int fib6_walk(struct net *net, struct fib6_walker *w);
 static int fib6_walk_continue(struct fib6_walker *w);
 
@@ -110,6 +105,20 @@ enum {
        FIB6_NO_SERNUM_CHANGE = 0,
 };
 
+void fib6_update_sernum(struct rt6_info *rt)
+{
+       struct fib6_table *table = rt->rt6i_table;
+       struct net *net = dev_net(rt->dst.dev);
+       struct fib6_node *fn;
+
+       spin_lock_bh(&table->tb6_lock);
+       fn = rcu_dereference_protected(rt->rt6i_node,
+                       lockdep_is_held(&table->tb6_lock));
+       if (fn)
+               fn->fn_sernum = fib6_new_sernum(net);
+       spin_unlock_bh(&table->tb6_lock);
+}
+
 /*
  *     Auxiliary address test functions for the radix tree.
  *
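fib6_update_sernum() above bumps the serial number on the node that owns a route, under tb6_lock, so that dst cookies taken against the old serial number fail validation. A hypothetical caller sketch (example_touch_route() and the use of RTF_MODIFIED are assumptions for illustration, not part of this change):

/* Hypothetical caller: after mutating an installed route outside of
 * fib6_add(), bump the node's serial number so cached dst entries are
 * re-validated on their next cookie check.
 */
static void example_touch_route(struct rt6_info *rt)
{
	rt->rt6i_flags |= RTF_MODIFIED;	/* assumed attribute change */
	fib6_update_sernum(rt);		/* takes tb6_lock internally, see above */
}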
@@ -140,18 +149,21 @@ static __be32 addr_bit_set(const void *token, int fn_bit)
               addr[fn_bit >> 5];
 }
 
-static struct fib6_node *node_alloc(void)
+static struct fib6_node *node_alloc(struct net *net)
 {
        struct fib6_node *fn;
 
        fn = kmem_cache_zalloc(fib6_node_kmem, GFP_ATOMIC);
+       if (fn)
+               net->ipv6.rt6_stats->fib_nodes++;
 
        return fn;
 }
 
-static void node_free_immediate(struct fib6_node *fn)
+static void node_free_immediate(struct net *net, struct fib6_node *fn)
 {
        kmem_cache_free(fib6_node_kmem, fn);
+       net->ipv6.rt6_stats->fib_nodes--;
 }
 
 static void node_free_rcu(struct rcu_head *head)
@@ -161,9 +173,10 @@ static void node_free_rcu(struct rcu_head *head)
        kmem_cache_free(fib6_node_kmem, fn);
 }
 
-static void node_free(struct fib6_node *fn)
+static void node_free(struct net *net, struct fib6_node *fn)
 {
        call_rcu(&fn->rcu, node_free_rcu);
+       net->ipv6.rt6_stats->fib_nodes--;
 }
 
 void rt6_free_pcpu(struct rt6_info *non_pcpu_rt)
@@ -185,9 +198,6 @@ void rt6_free_pcpu(struct rt6_info *non_pcpu_rt)
                        *ppcpu_rt = NULL;
                }
        }
-
-       free_percpu(non_pcpu_rt->rt6i_pcpu);
-       non_pcpu_rt->rt6i_pcpu = NULL;
 }
 EXPORT_SYMBOL_GPL(rt6_free_pcpu);
 
@@ -205,8 +215,7 @@ static void fib6_link_table(struct net *net, struct fib6_table *tb)
         * Initialize table lock at a single place to give lockdep a key,
         * tables aren't visible prior to being linked to the list.
         */
-       rwlock_init(&tb->tb6_lock);
-
+       spin_lock_init(&tb->tb6_lock);
        h = tb->tb6_id & (FIB6_TABLE_HASHSZ - 1);
 
        /*
@@ -225,7 +234,8 @@ static struct fib6_table *fib6_alloc_table(struct net *net, u32 id)
        table = kzalloc(sizeof(*table), GFP_ATOMIC);
        if (table) {
                table->tb6_id = id;
-               table->tb6_root.leaf = net->ipv6.ip6_null_entry;
+               rcu_assign_pointer(table->tb6_root.leaf,
+                                  net->ipv6.ip6_null_entry);
                table->tb6_root.fn_flags = RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO;
                inet_peer_base_init(&table->tb6_peers);
        }
@@ -322,11 +332,8 @@ unsigned int fib6_tables_seq_read(struct net *net)
                struct hlist_head *head = &net->ipv6.fib_table_hash[h];
                struct fib6_table *tb;
 
-               hlist_for_each_entry_rcu(tb, head, tb6_hlist) {
-                       read_lock_bh(&tb->tb6_lock);
+               hlist_for_each_entry_rcu(tb, head, tb6_hlist)
                        fib_seq += tb->fib_seq;
-                       read_unlock_bh(&tb->tb6_lock);
-               }
        }
        rcu_read_unlock();
 
@@ -372,7 +379,7 @@ static int fib6_node_dump(struct fib6_walker *w)
 {
        struct rt6_info *rt;
 
-       for (rt = w->leaf; rt; rt = rt->dst.rt6_next)
+       for_each_fib6_walker_rt(w)
                fib6_rt_dump(rt, w->args);
        w->leaf = NULL;
        return 0;
@@ -382,9 +389,9 @@ static void fib6_table_dump(struct net *net, struct fib6_table *tb,
                            struct fib6_walker *w)
 {
        w->root = &tb->tb6_root;
-       read_lock_bh(&tb->tb6_lock);
+       spin_lock_bh(&tb->tb6_lock);
        fib6_walk(net, w);
-       read_unlock_bh(&tb->tb6_lock);
+       spin_unlock_bh(&tb->tb6_lock);
 }
 
 /* Called with rcu_read_lock() */
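The conversion in this file replaces the tb6_lock rwlock with a spinlock and makes the tree pointers RCU-managed: readers walk the tree locklessly under rcu_read_lock(), while writers serialize on tb6_lock and publish with rcu_assign_pointer(). A minimal sketch of the two sides, assuming this file's context; the example_* helpers are illustrative only and reference counting is omitted:

/* Read side: lockless tree walk under RCU, as fib6_lookup() expects. */
static bool example_has_route(struct fib6_table *table,
			      const struct in6_addr *daddr)
{
	struct fib6_node *fn;
	bool found;

	rcu_read_lock();
	fn = fib6_lookup(&table->tb6_root, daddr, NULL);
	found = fn && (fn->fn_flags & RTN_RTINFO);
	rcu_read_unlock();
	return found;
}

/* Write side: serialize on the (now spinlock) tb6_lock and publish new
 * pointers with rcu_assign_pointer(); existing pointers are read with
 * rcu_dereference_protected() as in the hunks below. rt6i_ref handling
 * is omitted for brevity.
 */
static void example_set_root_leaf(struct fib6_table *table,
				  struct rt6_info *new_leaf)
{
	spin_lock_bh(&table->tb6_lock);
	rcu_assign_pointer(table->tb6_root.leaf, new_leaf);
	spin_unlock_bh(&table->tb6_lock);
}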
@@ -421,7 +428,7 @@ static int fib6_dump_node(struct fib6_walker *w)
        int res;
        struct rt6_info *rt;
 
-       for (rt = w->leaf; rt; rt = rt->dst.rt6_next) {
+       for_each_fib6_walker_rt(w) {
                res = rt6_dump_route(rt, w->args);
                if (res < 0) {
                        /* Frame is full, suspend walking */
@@ -480,9 +487,9 @@ static int fib6_dump_table(struct fib6_table *table, struct sk_buff *skb,
                w->count = 0;
                w->skip = 0;
 
-               read_lock_bh(&table->tb6_lock);
+               spin_lock_bh(&table->tb6_lock);
                res = fib6_walk(net, w);
-               read_unlock_bh(&table->tb6_lock);
+               spin_unlock_bh(&table->tb6_lock);
                if (res > 0) {
                        cb->args[4] = 1;
                        cb->args[5] = w->root->fn_sernum;
@@ -497,9 +504,9 @@ static int fib6_dump_table(struct fib6_table *table, struct sk_buff *skb,
                } else
                        w->skip = 0;
 
-               read_lock_bh(&table->tb6_lock);
+               spin_lock_bh(&table->tb6_lock);
                res = fib6_walk_continue(w);
-               read_unlock_bh(&table->tb6_lock);
+               spin_unlock_bh(&table->tb6_lock);
                if (res <= 0) {
                        fib6_walker_unlink(net, w);
                        cb->args[4] = 0;
@@ -580,11 +587,13 @@ out:
  *     node.
  */
 
-static struct fib6_node *fib6_add_1(struct fib6_node *root,
-                                    struct in6_addr *addr, int plen,
-                                    int offset, int allow_create,
-                                    int replace_required, int sernum,
-                                    struct netlink_ext_ack *extack)
+static struct fib6_node *fib6_add_1(struct net *net,
+                                   struct fib6_table *table,
+                                   struct fib6_node *root,
+                                   struct in6_addr *addr, int plen,
+                                   int offset, int allow_create,
+                                   int replace_required,
+                                   struct netlink_ext_ack *extack)
 {
        struct fib6_node *fn, *in, *ln;
        struct fib6_node *pn = NULL;
@@ -599,7 +608,9 @@ static struct fib6_node *fib6_add_1(struct fib6_node *root,
        fn = root;
 
        do {
-               key = (struct rt6key *)((u8 *)fn->leaf + offset);
+               struct rt6_info *leaf = rcu_dereference_protected(fn->leaf,
+                                           lockdep_is_held(&table->tb6_lock));
+               key = (struct rt6key *)((u8 *)leaf + offset);
 
                /*
                 *      Prefix match
@@ -625,12 +636,10 @@ static struct fib6_node *fib6_add_1(struct fib6_node *root,
                if (plen == fn->fn_bit) {
                        /* clean up an intermediate node */
                        if (!(fn->fn_flags & RTN_RTINFO)) {
-                               rt6_release(fn->leaf);
-                               fn->leaf = NULL;
+                               RCU_INIT_POINTER(fn->leaf, NULL);
+                               rt6_release(leaf);
                        }
 
-                       fn->fn_sernum = sernum;
-
                        return fn;
                }
 
@@ -639,10 +648,13 @@ static struct fib6_node *fib6_add_1(struct fib6_node *root,
                 */
 
                /* Try to walk down on tree. */
-               fn->fn_sernum = sernum;
                dir = addr_bit_set(addr, fn->fn_bit);
                pn = fn;
-               fn = dir ? fn->right : fn->left;
+               fn = dir ?
+                    rcu_dereference_protected(fn->right,
+                                       lockdep_is_held(&table->tb6_lock)) :
+                    rcu_dereference_protected(fn->left,
+                                       lockdep_is_held(&table->tb6_lock));
        } while (fn);
 
        if (!allow_create) {
@@ -668,19 +680,17 @@ static struct fib6_node *fib6_add_1(struct fib6_node *root,
         *      Create new leaf node without children.
         */
 
-       ln = node_alloc();
+       ln = node_alloc(net);
 
        if (!ln)
                return ERR_PTR(-ENOMEM);
        ln->fn_bit = plen;
-
-       ln->parent = pn;
-       ln->fn_sernum = sernum;
+       RCU_INIT_POINTER(ln->parent, pn);
 
        if (dir)
-               pn->right = ln;
+               rcu_assign_pointer(pn->right, ln);
        else
-               pn->left  = ln;
+               rcu_assign_pointer(pn->left, ln);
 
        return ln;
 
@@ -694,7 +704,8 @@ insert_above:
         * and the current
         */
 
-       pn = fn->parent;
+       pn = rcu_dereference_protected(fn->parent,
+                                      lockdep_is_held(&table->tb6_lock));
 
        /* find 1st bit in difference between the 2 addrs.
 
@@ -710,14 +721,14 @@ insert_above:
         *      (new leaf node)[ln] (old node)[fn]
         */
        if (plen > bit) {
-               in = node_alloc();
-               ln = node_alloc();
+               in = node_alloc(net);
+               ln = node_alloc(net);
 
                if (!in || !ln) {
                        if (in)
-                               node_free_immediate(in);
+                               node_free_immediate(net, in);
                        if (ln)
-                               node_free_immediate(ln);
+                               node_free_immediate(net, ln);
                        return ERR_PTR(-ENOMEM);
                }
 
@@ -731,31 +742,28 @@ insert_above:
 
                in->fn_bit = bit;
 
-               in->parent = pn;
+               RCU_INIT_POINTER(in->parent, pn);
                in->leaf = fn->leaf;
-               atomic_inc(&in->leaf->rt6i_ref);
-
-               in->fn_sernum = sernum;
+               atomic_inc(&rcu_dereference_protected(in->leaf,
+                               lockdep_is_held(&table->tb6_lock))->rt6i_ref);
 
                /* update parent pointer */
                if (dir)
-                       pn->right = in;
+                       rcu_assign_pointer(pn->right, in);
                else
-                       pn->left  = in;
+                       rcu_assign_pointer(pn->left, in);
 
                ln->fn_bit = plen;
 
-               ln->parent = in;
-               fn->parent = in;
-
-               ln->fn_sernum = sernum;
+               RCU_INIT_POINTER(ln->parent, in);
+               rcu_assign_pointer(fn->parent, in);
 
                if (addr_bit_set(addr, bit)) {
-                       in->right = ln;
-                       in->left  = fn;
+                       rcu_assign_pointer(in->right, ln);
+                       rcu_assign_pointer(in->left, fn);
                } else {
-                       in->left  = ln;
-                       in->right = fn;
+                       rcu_assign_pointer(in->left, ln);
+                       rcu_assign_pointer(in->right, fn);
                }
        } else { /* plen <= bit */
 
@@ -765,28 +773,26 @@ insert_above:
                 *           (old node)[fn] NULL
                 */
 
-               ln = node_alloc();
+               ln = node_alloc(net);
 
                if (!ln)
                        return ERR_PTR(-ENOMEM);
 
                ln->fn_bit = plen;
 
-               ln->parent = pn;
-
-               ln->fn_sernum = sernum;
-
-               if (dir)
-                       pn->right = ln;
-               else
-                       pn->left  = ln;
+               RCU_INIT_POINTER(ln->parent, pn);
 
                if (addr_bit_set(&key->addr, plen))
-                       ln->right = fn;
+                       RCU_INIT_POINTER(ln->right, fn);
                else
-                       ln->left  = fn;
+                       RCU_INIT_POINTER(ln->left, fn);
+
+               rcu_assign_pointer(fn->parent, ln);
 
-               fn->parent = ln;
+               if (dir)
+                       rcu_assign_pointer(pn->right, ln);
+               else
+                       rcu_assign_pointer(pn->left, ln);
        }
        return ln;
 }
@@ -832,6 +838,8 @@ static int fib6_commit_metrics(struct dst_entry *dst, struct mx6_config *mxc)
 static void fib6_purge_rt(struct rt6_info *rt, struct fib6_node *fn,
                          struct net *net)
 {
+       struct fib6_table *table = rt->rt6i_table;
+
        if (atomic_read(&rt->rt6i_ref) != 1) {
                /* This route is used as dummy address holder in some split
                 * nodes. It is not leaked, but it still holds other resources,
@@ -840,12 +848,17 @@ static void fib6_purge_rt(struct rt6_info *rt, struct fib6_node *fn,
                 * to still alive ones.
                 */
                while (fn) {
-                       if (!(fn->fn_flags & RTN_RTINFO) && fn->leaf == rt) {
-                               fn->leaf = fib6_find_prefix(net, fn);
-                               atomic_inc(&fn->leaf->rt6i_ref);
+                       struct rt6_info *leaf = rcu_dereference_protected(fn->leaf,
+                                           lockdep_is_held(&table->tb6_lock));
+                       struct rt6_info *new_leaf;
+                       if (!(fn->fn_flags & RTN_RTINFO) && leaf == rt) {
+                               new_leaf = fib6_find_prefix(net, table, fn);
+                               atomic_inc(&new_leaf->rt6i_ref);
+                               rcu_assign_pointer(fn->leaf, new_leaf);
                                rt6_release(rt);
                        }
-                       fn = fn->parent;
+                       fn = rcu_dereference_protected(fn->parent,
+                                   lockdep_is_held(&table->tb6_lock));
                }
        }
 }
@@ -857,9 +870,11 @@ static void fib6_purge_rt(struct rt6_info *rt, struct fib6_node *fn,
 static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt,
                            struct nl_info *info, struct mx6_config *mxc)
 {
+       struct rt6_info *leaf = rcu_dereference_protected(fn->leaf,
+                                   lockdep_is_held(&rt->rt6i_table->tb6_lock));
        struct rt6_info *iter = NULL;
-       struct rt6_info **ins;
-       struct rt6_info **fallback_ins = NULL;
+       struct rt6_info __rcu **ins;
+       struct rt6_info __rcu **fallback_ins = NULL;
        int replace = (info->nlh &&
                       (info->nlh->nlmsg_flags & NLM_F_REPLACE));
        int add = (!info->nlh ||
@@ -874,7 +889,9 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt,
 
        ins = &fn->leaf;
 
-       for (iter = fn->leaf; iter; iter = iter->dst.rt6_next) {
+       for (iter = leaf; iter;
+            iter = rcu_dereference_protected(iter->dst.rt6_next,
+                               lockdep_is_held(&rt->rt6i_table->tb6_lock))) {
                /*
                 *      Search for duplicates
                 */
@@ -936,7 +953,8 @@ next_iter:
        if (fallback_ins && !found) {
                /* No ECMP-able route found, replace first non-ECMP one */
                ins = fallback_ins;
-               iter = *ins;
+               iter = rcu_dereference_protected(*ins,
+                                   lockdep_is_held(&rt->rt6i_table->tb6_lock));
                found++;
        }
 
@@ -950,7 +968,7 @@ next_iter:
                struct rt6_info *sibling, *temp_sibling;
 
                /* Find the first route that have the same metric */
-               sibling = fn->leaf;
+               sibling = leaf;
                while (sibling) {
                        if (sibling->rt6i_metric == rt->rt6i_metric &&
                            rt6_qualify_for_ecmp(sibling)) {
@@ -958,7 +976,8 @@ next_iter:
                                              &sibling->rt6i_siblings);
                                break;
                        }
-                       sibling = sibling->dst.rt6_next;
+                       sibling = rcu_dereference_protected(sibling->dst.rt6_next,
+                                   lockdep_is_held(&rt->rt6i_table->tb6_lock));
                }
                /* For each sibling in the list, increment the counter of
                 * siblings. BUG() if counters does not match, list of siblings
@@ -987,10 +1006,10 @@ add:
                if (err)
                        return err;
 
-               rt->dst.rt6_next = iter;
-               *ins = rt;
-               rcu_assign_pointer(rt->rt6i_node, fn);
+               rcu_assign_pointer(rt->dst.rt6_next, iter);
                atomic_inc(&rt->rt6i_ref);
+               rcu_assign_pointer(rt->rt6i_node, fn);
+               rcu_assign_pointer(*ins, rt);
                call_fib6_entry_notifiers(info->nl_net, FIB_EVENT_ENTRY_ADD,
                                          rt);
                if (!info->skip_notify)
@@ -1016,10 +1035,10 @@ add:
                if (err)
                        return err;
 
-               *ins = rt;
+               atomic_inc(&rt->rt6i_ref);
                rcu_assign_pointer(rt->rt6i_node, fn);
                rt->dst.rt6_next = iter->dst.rt6_next;
-               atomic_inc(&rt->rt6i_ref);
+               rcu_assign_pointer(*ins, rt);
                call_fib6_entry_notifiers(info->nl_net, FIB_EVENT_ENTRY_REPLACE,
                                          rt);
                if (!info->skip_notify)
@@ -1031,14 +1050,15 @@ add:
                nsiblings = iter->rt6i_nsiblings;
                iter->rt6i_node = NULL;
                fib6_purge_rt(iter, fn, info->nl_net);
-               if (fn->rr_ptr == iter)
+               if (rcu_access_pointer(fn->rr_ptr) == iter)
                        fn->rr_ptr = NULL;
                rt6_release(iter);
 
                if (nsiblings) {
                        /* Replacing an ECMP route, remove all siblings */
                        ins = &rt->dst.rt6_next;
-                       iter = *ins;
+                       iter = rcu_dereference_protected(*ins,
+                                   lockdep_is_held(&rt->rt6i_table->tb6_lock));
                        while (iter) {
                                if (iter->rt6i_metric > rt->rt6i_metric)
                                        break;
@@ -1046,14 +1066,16 @@ add:
                                        *ins = iter->dst.rt6_next;
                                        iter->rt6i_node = NULL;
                                        fib6_purge_rt(iter, fn, info->nl_net);
-                                       if (fn->rr_ptr == iter)
+                                       if (rcu_access_pointer(fn->rr_ptr) == iter)
                                                fn->rr_ptr = NULL;
                                        rt6_release(iter);
                                        nsiblings--;
+                                       info->nl_net->ipv6.rt6_stats->fib_rt_entries--;
                                } else {
                                        ins = &iter->dst.rt6_next;
                                }
-                               iter = *ins;
+                               iter = rcu_dereference_protected(*ins,
+                                       lockdep_is_held(&rt->rt6i_table->tb6_lock));
                        }
                        WARN_ON(nsiblings != 0);
                }
@@ -1077,16 +1099,33 @@ void fib6_force_start_gc(struct net *net)
                          jiffies + net->ipv6.sysctl.ip6_rt_gc_interval);
 }
 
+static void fib6_update_sernum_upto_root(struct rt6_info *rt,
+                                        int sernum)
+{
+       struct fib6_node *fn = rcu_dereference_protected(rt->rt6i_node,
+                               lockdep_is_held(&rt->rt6i_table->tb6_lock));
+
+       /* paired with smp_rmb() in rt6_get_cookie_safe() */
+       smp_wmb();
+       while (fn) {
+               fn->fn_sernum = sernum;
+               fn = rcu_dereference_protected(fn->parent,
+                               lockdep_is_held(&rt->rt6i_table->tb6_lock));
+       }
+}
+
 /*
  *     Add routing information to the routing tree.
  *     <destination addr>/<source addr>
  *     with source addr info in sub-trees
+ *     Need to own table->tb6_lock
  */
 
 int fib6_add(struct fib6_node *root, struct rt6_info *rt,
             struct nl_info *info, struct mx6_config *mxc,
             struct netlink_ext_ack *extack)
 {
+       struct fib6_table *table = rt->rt6i_table;
        struct fib6_node *fn, *pn = NULL;
        int err = -ENOMEM;
        int allow_create = 1;
@@ -1095,6 +1134,8 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt,
 
        if (WARN_ON_ONCE(!atomic_read(&rt->dst.__refcnt)))
                return -EINVAL;
+       if (WARN_ON_ONCE(rt->rt6i_flags & RTF_CACHE))
+               return -EINVAL;
 
        if (info->nlh) {
                if (!(info->nlh->nlmsg_flags & NLM_F_CREATE))
@@ -1105,9 +1146,10 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt,
        if (!allow_create && !replace_required)
                pr_warn("RTM_NEWROUTE with no NLM_F_CREATE or NLM_F_REPLACE\n");
 
-       fn = fib6_add_1(root, &rt->rt6i_dst.addr, rt->rt6i_dst.plen,
+       fn = fib6_add_1(info->nl_net, table, root,
+                       &rt->rt6i_dst.addr, rt->rt6i_dst.plen,
                        offsetof(struct rt6_info, rt6i_dst), allow_create,
-                       replace_required, sernum, extack);
+                       replace_required, extack);
        if (IS_ERR(fn)) {
                err = PTR_ERR(fn);
                fn = NULL;
@@ -1120,7 +1162,7 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt,
        if (rt->rt6i_src.plen) {
                struct fib6_node *sn;
 
-               if (!fn->subtree) {
+               if (!rcu_access_pointer(fn->subtree)) {
                        struct fib6_node *sfn;
 
                        /*
@@ -1134,42 +1176,40 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt,
                         */
 
                        /* Create subtree root node */
-                       sfn = node_alloc();
+                       sfn = node_alloc(info->nl_net);
                        if (!sfn)
                                goto failure;
 
-                       sfn->leaf = info->nl_net->ipv6.ip6_null_entry;
                        atomic_inc(&info->nl_net->ipv6.ip6_null_entry->rt6i_ref);
+                       rcu_assign_pointer(sfn->leaf,
+                                          info->nl_net->ipv6.ip6_null_entry);
                        sfn->fn_flags = RTN_ROOT;
-                       sfn->fn_sernum = sernum;
 
                        /* Now add the first leaf node to new subtree */
 
-                       sn = fib6_add_1(sfn, &rt->rt6i_src.addr,
-                                       rt->rt6i_src.plen,
+                       sn = fib6_add_1(info->nl_net, table, sfn,
+                                       &rt->rt6i_src.addr, rt->rt6i_src.plen,
                                        offsetof(struct rt6_info, rt6i_src),
-                                       allow_create, replace_required, sernum,
-                                       extack);
+                                       allow_create, replace_required, extack);
 
                        if (IS_ERR(sn)) {
                                /* If it is failed, discard just allocated
                                   root, and then (in failure) stale node
                                   in main tree.
                                 */
-                               node_free_immediate(sfn);
+                               node_free_immediate(info->nl_net, sfn);
                                err = PTR_ERR(sn);
                                goto failure;
                        }
 
                        /* Now link new subtree to main tree */
-                       sfn->parent = fn;
-                       fn->subtree = sfn;
+                       rcu_assign_pointer(sfn->parent, fn);
+                       rcu_assign_pointer(fn->subtree, sfn);
                } else {
-                       sn = fib6_add_1(fn->subtree, &rt->rt6i_src.addr,
-                                       rt->rt6i_src.plen,
+                       sn = fib6_add_1(info->nl_net, table, FIB6_SUBTREE(fn),
+                                       &rt->rt6i_src.addr, rt->rt6i_src.plen,
                                        offsetof(struct rt6_info, rt6i_src),
-                                       allow_create, replace_required, sernum,
-                                       extack);
+                                       allow_create, replace_required, extack);
 
                        if (IS_ERR(sn)) {
                                err = PTR_ERR(sn);
@@ -1177,9 +1217,9 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt,
                        }
                }
 
-               if (!fn->leaf) {
-                       fn->leaf = rt;
+               if (!rcu_access_pointer(fn->leaf)) {
                        atomic_inc(&rt->rt6i_ref);
+                       rcu_assign_pointer(fn->leaf, rt);
                }
                fn = sn;
        }
@@ -1187,9 +1227,8 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt,
 
        err = fib6_add_rt2node(fn, rt, info, mxc);
        if (!err) {
+               fib6_update_sernum_upto_root(rt, sernum);
                fib6_start_gc(info->nl_net, rt);
-               if (!(rt->rt6i_flags & RTF_CACHE))
-                       fib6_prune_clones(info->nl_net, pn);
        }
 
 out:
@@ -1199,19 +1238,23 @@ out:
                 * If fib6_add_1 has cleared the old leaf pointer in the
                 * super-tree leaf node we have to find a new one for it.
                 */
-               if (pn != fn && pn->leaf == rt) {
-                       pn->leaf = NULL;
+               struct rt6_info *pn_leaf = rcu_dereference_protected(pn->leaf,
+                                           lockdep_is_held(&table->tb6_lock));
+               if (pn != fn && pn_leaf == rt) {
+                       pn_leaf = NULL;
+                       RCU_INIT_POINTER(pn->leaf, NULL);
                        atomic_dec(&rt->rt6i_ref);
                }
-               if (pn != fn && !pn->leaf && !(pn->fn_flags & RTN_RTINFO)) {
-                       pn->leaf = fib6_find_prefix(info->nl_net, pn);
+               if (pn != fn && !pn_leaf && !(pn->fn_flags & RTN_RTINFO)) {
+                       pn_leaf = fib6_find_prefix(info->nl_net, table, pn);
 #if RT6_DEBUG >= 2
-                       if (!pn->leaf) {
-                               WARN_ON(pn->leaf == NULL);
-                               pn->leaf = info->nl_net->ipv6.ip6_null_entry;
+                       if (!pn_leaf) {
+                               WARN_ON(!pn_leaf);
+                               pn_leaf = info->nl_net->ipv6.ip6_null_entry;
                        }
 #endif
-                       atomic_inc(&pn->leaf->rt6i_ref);
+                       atomic_inc(&pn_leaf->rt6i_ref);
+                       rcu_assign_pointer(pn->leaf, pn_leaf);
                }
 #endif
                goto failure;
@@ -1226,7 +1269,7 @@ failure:
         * fn->leaf.
         */
        if (fn && !(fn->fn_flags & (RTN_RTINFO|RTN_ROOT)))
-               fib6_repair_tree(info->nl_net, fn);
+               fib6_repair_tree(info->nl_net, table, fn);
        /* Always release dst as dst->__refcnt is guaranteed
         * to be taken before entering this function
         */
@@ -1264,7 +1307,8 @@ static struct fib6_node *fib6_lookup_1(struct fib6_node *root,
 
                dir = addr_bit_set(args->addr, fn->fn_bit);
 
-               next = dir ? fn->right : fn->left;
+               next = dir ? rcu_dereference(fn->right) :
+                            rcu_dereference(fn->left);
 
                if (next) {
                        fn = next;
@@ -1274,18 +1318,22 @@ static struct fib6_node *fib6_lookup_1(struct fib6_node *root,
        }
 
        while (fn) {
-               if (FIB6_SUBTREE(fn) || fn->fn_flags & RTN_RTINFO) {
+               struct fib6_node *subtree = FIB6_SUBTREE(fn);
+
+               if (subtree || fn->fn_flags & RTN_RTINFO) {
+                       struct rt6_info *leaf = rcu_dereference(fn->leaf);
                        struct rt6key *key;
 
-                       key = (struct rt6key *) ((u8 *) fn->leaf +
-                                                args->offset);
+                       if (!leaf)
+                               goto backtrack;
+
+                       key = (struct rt6key *) ((u8 *)leaf + args->offset);
 
                        if (ipv6_prefix_equal(&key->addr, args->addr, key->plen)) {
 #ifdef CONFIG_IPV6_SUBTREES
-                               if (fn->subtree) {
+                               if (subtree) {
                                        struct fib6_node *sfn;
-                                       sfn = fib6_lookup_1(fn->subtree,
-                                                           args + 1);
+                                       sfn = fib6_lookup_1(subtree, args + 1);
                                        if (!sfn)
                                                goto backtrack;
                                        fn = sfn;
@@ -1295,18 +1343,18 @@ static struct fib6_node *fib6_lookup_1(struct fib6_node *root,
                                        return fn;
                        }
                }
-#ifdef CONFIG_IPV6_SUBTREES
 backtrack:
-#endif
                if (fn->fn_flags & RTN_ROOT)
                        break;
 
-               fn = fn->parent;
+               fn = rcu_dereference(fn->parent);
        }
 
        return NULL;
 }
 
+/* called with rcu_read_lock() held
+ */
 struct fib6_node *fib6_lookup(struct fib6_node *root, const struct in6_addr *daddr,
                              const struct in6_addr *saddr)
 {
@@ -1337,54 +1385,84 @@ struct fib6_node *fib6_lookup(struct fib6_node *root, const struct in6_addr *dad
 /*
  *     Get node with specified destination prefix (and source prefix,
  *     if subtrees are used)
+ *     exact_match == true means we try to find fn with exact match of
+ *     the passed in prefix addr
+ *     exact_match == false means we try to find fn with longest prefix
+ *     match of the passed in prefix addr. This is useful for finding fn
+ *     for cached route as it will be stored in the exception table under
+ *     the node with longest prefix length.
  */
 
 
 static struct fib6_node *fib6_locate_1(struct fib6_node *root,
                                       const struct in6_addr *addr,
-                                      int plen, int offset)
+                                      int plen, int offset,
+                                      bool exact_match)
 {
-       struct fib6_node *fn;
+       struct fib6_node *fn, *prev = NULL;
 
        for (fn = root; fn ; ) {
-               struct rt6key *key = (struct rt6key *)((u8 *)fn->leaf + offset);
+               struct rt6_info *leaf = rcu_dereference(fn->leaf);
+               struct rt6key *key;
+
+               /* This node is being deleted */
+               if (!leaf) {
+                       if (plen <= fn->fn_bit)
+                               goto out;
+                       else
+                               goto next;
+               }
+
+               key = (struct rt6key *)((u8 *)leaf + offset);
 
                /*
                 *      Prefix match
                 */
                if (plen < fn->fn_bit ||
                    !ipv6_prefix_equal(&key->addr, addr, fn->fn_bit))
-                       return NULL;
+                       goto out;
 
                if (plen == fn->fn_bit)
                        return fn;
 
+               prev = fn;
+
+next:
                /*
                 *      We have more bits to go
                 */
                if (addr_bit_set(addr, fn->fn_bit))
-                       fn = fn->right;
+                       fn = rcu_dereference(fn->right);
                else
-                       fn = fn->left;
+                       fn = rcu_dereference(fn->left);
        }
-       return NULL;
+out:
+       if (exact_match)
+               return NULL;
+       else
+               return prev;
 }
 
 struct fib6_node *fib6_locate(struct fib6_node *root,
                              const struct in6_addr *daddr, int dst_len,
-                             const struct in6_addr *saddr, int src_len)
+                             const struct in6_addr *saddr, int src_len,
+                             bool exact_match)
 {
        struct fib6_node *fn;
 
        fn = fib6_locate_1(root, daddr, dst_len,
-                          offsetof(struct rt6_info, rt6i_dst));
+                          offsetof(struct rt6_info, rt6i_dst),
+                          exact_match);
 
 #ifdef CONFIG_IPV6_SUBTREES
        if (src_len) {
+               struct fib6_node *subtree = FIB6_SUBTREE(fn);
+
                WARN_ON(saddr == NULL);
-               if (fn && fn->subtree)
-                       fn = fib6_locate_1(fn->subtree, saddr, src_len,
-                                          offsetof(struct rt6_info, rt6i_src));
+               if (fn && subtree)
+                       fn = fib6_locate_1(subtree, saddr, src_len,
+                                          offsetof(struct rt6_info, rt6i_src),
+                                          exact_match);
        }
 #endif
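Per the comment above, fib6_locate() now has two modes. A short sketch of how a caller might use each, assuming rcu_read_lock() or tb6_lock is held; the wrapper names and prefix lengths are illustrative only:

/* Exact match: find the node for an installed /64 prefix, e.g. when
 * deleting that route.
 */
static struct fib6_node *example_find_exact(struct fib6_node *root,
					    const struct in6_addr *pfx)
{
	return fib6_locate(root, pfx, 64, NULL, 0, true);
}

/* Longest-prefix match: find the node whose route owns the exception
 * entry for a cached /128 destination.
 */
static struct fib6_node *example_find_for_cache(struct fib6_node *root,
						const struct in6_addr *daddr)
{
	return fib6_locate(root, daddr, 128, NULL, 0, false);
}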
 
@@ -1400,16 +1478,26 @@ struct fib6_node *fib6_locate(struct fib6_node *root,
  *
  */
 
-static struct rt6_info *fib6_find_prefix(struct net *net, struct fib6_node *fn)
+static struct rt6_info *fib6_find_prefix(struct net *net,
+                                        struct fib6_table *table,
+                                        struct fib6_node *fn)
 {
+       struct fib6_node *child_left, *child_right;
+
        if (fn->fn_flags & RTN_ROOT)
                return net->ipv6.ip6_null_entry;
 
        while (fn) {
-               if (fn->left)
-                       return fn->left->leaf;
-               if (fn->right)
-                       return fn->right->leaf;
+               child_left = rcu_dereference_protected(fn->left,
+                                   lockdep_is_held(&table->tb6_lock));
+               child_right = rcu_dereference_protected(fn->right,
+                                   lockdep_is_held(&table->tb6_lock));
+               if (child_left)
+                       return rcu_dereference_protected(child_left->leaf,
+                                       lockdep_is_held(&table->tb6_lock));
+               if (child_right)
+                       return rcu_dereference_protected(child_right->leaf,
+                                       lockdep_is_held(&table->tb6_lock));
 
                fn = FIB6_SUBTREE(fn);
        }
@@ -1419,31 +1507,49 @@ static struct rt6_info *fib6_find_prefix(struct net *net, struct fib6_node *fn)
 /*
  *     Called to trim the tree of intermediate nodes when possible. "fn"
  *     is the node we want to try and remove.
+ *     Need to own table->tb6_lock
  */
 
 static struct fib6_node *fib6_repair_tree(struct net *net,
-                                          struct fib6_node *fn)
+                                         struct fib6_table *table,
+                                         struct fib6_node *fn)
 {
        int children;
        int nstate;
-       struct fib6_node *child, *pn;
+       struct fib6_node *child;
        struct fib6_walker *w;
        int iter = 0;
 
        for (;;) {
+               struct fib6_node *fn_r = rcu_dereference_protected(fn->right,
+                                           lockdep_is_held(&table->tb6_lock));
+               struct fib6_node *fn_l = rcu_dereference_protected(fn->left,
+                                           lockdep_is_held(&table->tb6_lock));
+               struct fib6_node *pn = rcu_dereference_protected(fn->parent,
+                                           lockdep_is_held(&table->tb6_lock));
+               struct fib6_node *pn_r = rcu_dereference_protected(pn->right,
+                                           lockdep_is_held(&table->tb6_lock));
+               struct fib6_node *pn_l = rcu_dereference_protected(pn->left,
+                                           lockdep_is_held(&table->tb6_lock));
+               struct rt6_info *fn_leaf = rcu_dereference_protected(fn->leaf,
+                                           lockdep_is_held(&table->tb6_lock));
+               struct rt6_info *pn_leaf = rcu_dereference_protected(pn->leaf,
+                                           lockdep_is_held(&table->tb6_lock));
+               struct rt6_info *new_fn_leaf;
+
                RT6_TRACE("fixing tree: plen=%d iter=%d\n", fn->fn_bit, iter);
                iter++;
 
                WARN_ON(fn->fn_flags & RTN_RTINFO);
                WARN_ON(fn->fn_flags & RTN_TL_ROOT);
-               WARN_ON(fn->leaf);
+               WARN_ON(fn_leaf);
 
                children = 0;
                child = NULL;
-               if (fn->right)
-                       child = fn->right, children |= 1;
-               if (fn->left)
-                       child = fn->left, children |= 2;
+               if (fn_r)
+                       child = fn_r, children |= 1;
+               if (fn_l)
+                       child = fn_l, children |= 2;
 
                if (children == 3 || FIB6_SUBTREE(fn)
 #ifdef CONFIG_IPV6_SUBTREES
@@ -1451,36 +1557,36 @@ static struct fib6_node *fib6_repair_tree(struct net *net,
                    || (children && fn->fn_flags & RTN_ROOT)
 #endif
                    ) {
-                       fn->leaf = fib6_find_prefix(net, fn);
+                       new_fn_leaf = fib6_find_prefix(net, table, fn);
 #if RT6_DEBUG >= 2
-                       if (!fn->leaf) {
-                               WARN_ON(!fn->leaf);
-                               fn->leaf = net->ipv6.ip6_null_entry;
+                       if (!new_fn_leaf) {
+                               WARN_ON(!new_fn_leaf);
+                               new_fn_leaf = net->ipv6.ip6_null_entry;
                        }
 #endif
-                       atomic_inc(&fn->leaf->rt6i_ref);
-                       return fn->parent;
+                       atomic_inc(&new_fn_leaf->rt6i_ref);
+                       rcu_assign_pointer(fn->leaf, new_fn_leaf);
+                       return pn;
                }
 
-               pn = fn->parent;
 #ifdef CONFIG_IPV6_SUBTREES
                if (FIB6_SUBTREE(pn) == fn) {
                        WARN_ON(!(fn->fn_flags & RTN_ROOT));
-                       FIB6_SUBTREE(pn) = NULL;
+                       RCU_INIT_POINTER(pn->subtree, NULL);
                        nstate = FWS_L;
                } else {
                        WARN_ON(fn->fn_flags & RTN_ROOT);
 #endif
-                       if (pn->right == fn)
-                               pn->right = child;
-                       else if (pn->left == fn)
-                               pn->left = child;
+                       if (pn_r == fn)
+                               rcu_assign_pointer(pn->right, child);
+                       else if (pn_l == fn)
+                               rcu_assign_pointer(pn->left, child);
 #if RT6_DEBUG >= 2
                        else
                                WARN_ON(1);
 #endif
                        if (child)
-                               child->parent = pn;
+                               rcu_assign_pointer(child->parent, pn);
                        nstate = FWS_R;
 #ifdef CONFIG_IPV6_SUBTREES
                }
@@ -1489,19 +1595,12 @@ static struct fib6_node *fib6_repair_tree(struct net *net,
                read_lock(&net->ipv6.fib6_walker_lock);
                FOR_WALKERS(net, w) {
                        if (!child) {
-                               if (w->root == fn) {
-                                       w->root = w->node = NULL;
-                                       RT6_TRACE("W %p adjusted by delroot 1\n", w);
-                               } else if (w->node == fn) {
+                               if (w->node == fn) {
                                        RT6_TRACE("W %p adjusted by delnode 1, s=%d/%d\n", w, w->state, nstate);
                                        w->node = pn;
                                        w->state = nstate;
                                }
                        } else {
-                               if (w->root == fn) {
-                                       w->root = child;
-                                       RT6_TRACE("W %p adjusted by delroot 2\n", w);
-                               }
                                if (w->node == fn) {
                                        w->node = child;
                                        if (children&2) {
@@ -1516,33 +1615,39 @@ static struct fib6_node *fib6_repair_tree(struct net *net,
                }
                read_unlock(&net->ipv6.fib6_walker_lock);
 
-               node_free(fn);
+               node_free(net, fn);
                if (pn->fn_flags & RTN_RTINFO || FIB6_SUBTREE(pn))
                        return pn;
 
-               rt6_release(pn->leaf);
-               pn->leaf = NULL;
+               RCU_INIT_POINTER(pn->leaf, NULL);
+               rt6_release(pn_leaf);
                fn = pn;
        }
 }
 
-static void fib6_del_route(struct fib6_node *fn, struct rt6_info **rtp,
-                          struct nl_info *info)
+static void fib6_del_route(struct fib6_table *table, struct fib6_node *fn,
+                          struct rt6_info __rcu **rtp, struct nl_info *info)
 {
        struct fib6_walker *w;
-       struct rt6_info *rt = *rtp;
+       struct rt6_info *rt = rcu_dereference_protected(*rtp,
+                                   lockdep_is_held(&table->tb6_lock));
        struct net *net = info->nl_net;
 
        RT6_TRACE("fib6_del_route\n");
 
+       WARN_ON_ONCE(rt->rt6i_flags & RTF_CACHE);
+
        /* Unlink it */
        *rtp = rt->dst.rt6_next;
        rt->rt6i_node = NULL;
        net->ipv6.rt6_stats->fib_rt_entries--;
        net->ipv6.rt6_stats->fib_discarded_routes++;
 
+       /* Flush all cached dst in exception table */
+       rt6_flush_exceptions(rt);
+
        /* Reset round-robin state, if necessary */
-       if (fn->rr_ptr == rt)
+       if (rcu_access_pointer(fn->rr_ptr) == rt)
                fn->rr_ptr = NULL;
 
        /* Remove this entry from other siblings */
@@ -1561,20 +1666,19 @@ static void fib6_del_route(struct fib6_node *fn, struct rt6_info **rtp,
        FOR_WALKERS(net, w) {
                if (w->state == FWS_C && w->leaf == rt) {
                        RT6_TRACE("walker %p adjusted by delroute\n", w);
-                       w->leaf = rt->dst.rt6_next;
+                       w->leaf = rcu_dereference_protected(rt->dst.rt6_next,
+                                           lockdep_is_held(&table->tb6_lock));
                        if (!w->leaf)
                                w->state = FWS_U;
                }
        }
        read_unlock(&net->ipv6.fib6_walker_lock);
 
-       rt->dst.rt6_next = NULL;
-
        /* If it was last route, expunge its radix tree node */
-       if (!fn->leaf) {
+       if (!rcu_access_pointer(fn->leaf)) {
                fn->fn_flags &= ~RTN_RTINFO;
                net->ipv6.rt6_stats->fib_route_nodes--;
-               fn = fib6_repair_tree(net, fn);
+               fn = fib6_repair_tree(net, table, fn);
        }
 
        fib6_purge_rt(rt, fn, net);
@@ -1585,12 +1689,15 @@ static void fib6_del_route(struct fib6_node *fn, struct rt6_info **rtp,
        rt6_release(rt);
 }
 
+/* Need to own table->tb6_lock */
 int fib6_del(struct rt6_info *rt, struct nl_info *info)
 {
        struct fib6_node *fn = rcu_dereference_protected(rt->rt6i_node,
                                    lockdep_is_held(&rt->rt6i_table->tb6_lock));
+       struct fib6_table *table = rt->rt6i_table;
        struct net *net = info->nl_net;
-       struct rt6_info **rtp;
+       struct rt6_info __rcu **rtp;
+       struct rt6_info __rcu **rtp_next;
 
 #if RT6_DEBUG >= 2
        if (rt->dst.obsolete > 0) {
@@ -1603,28 +1710,22 @@ int fib6_del(struct rt6_info *rt, struct nl_info *info)
 
        WARN_ON(!(fn->fn_flags & RTN_RTINFO));
 
-       if (!(rt->rt6i_flags & RTF_CACHE)) {
-               struct fib6_node *pn = fn;
-#ifdef CONFIG_IPV6_SUBTREES
-               /* clones of this route might be in another subtree */
-               if (rt->rt6i_src.plen) {
-                       while (!(pn->fn_flags & RTN_ROOT))
-                               pn = pn->parent;
-                       pn = pn->parent;
-               }
-#endif
-               fib6_prune_clones(info->nl_net, pn);
-       }
+       /* remove cached dst from exception table */
+       if (rt->rt6i_flags & RTF_CACHE)
+               return rt6_remove_exception_rt(rt);
 
        /*
         *      Walk the leaf entries looking for ourself
         */
 
-       for (rtp = &fn->leaf; *rtp; rtp = &(*rtp)->dst.rt6_next) {
-               if (*rtp == rt) {
-                       fib6_del_route(fn, rtp, info);
+       for (rtp = &fn->leaf; *rtp; rtp = rtp_next) {
+               struct rt6_info *cur = rcu_dereference_protected(*rtp,
+                                       lockdep_is_held(&table->tb6_lock));
+               if (rt == cur) {
+                       fib6_del_route(table, fn, rtp, info);
                        return 0;
                }
+               rtp_next = &cur->dst.rt6_next;
        }
        return -ENOENT;
 }
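After this change RTF_CACHE clones are no longer tree leaves, so fib6_del() forwards them to rt6_remove_exception_rt() and only walks fn->leaf for regular entries. A minimal caller sketch under the lock noted above; example_route_del() is a hypothetical wrapper, not from this patch:

/* Hypothetical wrapper: fib6_del() must run with tb6_lock held and
 * handles both regular entries and RTF_CACHE exceptions itself.
 */
static int example_route_del(struct rt6_info *rt, struct nl_info *info)
{
	struct fib6_table *table = rt->rt6i_table;
	int err;

	spin_lock_bh(&table->tb6_lock);
	err = fib6_del(rt, info);
	spin_unlock_bh(&table->tb6_lock);

	return err;
}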
@@ -1651,22 +1752,22 @@ int fib6_del(struct rt6_info *rt, struct nl_info *info)
  *     0   -> walk is complete.
  *     >0  -> walk is incomplete (i.e. suspended)
  *     <0  -> walk is terminated by an error.
+ *
+ *     This function is called with tb6_lock held.
  */
 
 static int fib6_walk_continue(struct fib6_walker *w)
 {
-       struct fib6_node *fn, *pn;
+       struct fib6_node *fn, *pn, *left, *right;
+
+       /* w->root should always be table->tb6_root */
+       WARN_ON_ONCE(!(w->root->fn_flags & RTN_TL_ROOT));
 
        for (;;) {
                fn = w->node;
                if (!fn)
                        return 0;
 
-               if (w->prune && fn != w->root &&
-                   fn->fn_flags & RTN_RTINFO && w->state < FWS_C) {
-                       w->state = FWS_C;
-                       w->leaf = fn->leaf;
-               }
                switch (w->state) {
 #ifdef CONFIG_IPV6_SUBTREES
                case FWS_S:
@@ -1677,20 +1778,22 @@ static int fib6_walk_continue(struct fib6_walker *w)
                        w->state = FWS_L;
 #endif
                case FWS_L:
-                       if (fn->left) {
-                               w->node = fn->left;
+                       left = rcu_dereference_protected(fn->left, 1);
+                       if (left) {
+                               w->node = left;
                                w->state = FWS_INIT;
                                continue;
                        }
                        w->state = FWS_R;
                case FWS_R:
-                       if (fn->right) {
-                               w->node = fn->right;
+                       right = rcu_dereference_protected(fn->right, 1);
+                       if (right) {
+                               w->node = right;
                                w->state = FWS_INIT;
                                continue;
                        }
                        w->state = FWS_C;
-                       w->leaf = fn->leaf;
+                       w->leaf = rcu_dereference_protected(fn->leaf, 1);
                case FWS_C:
                        if (w->leaf && fn->fn_flags & RTN_RTINFO) {
                                int err;
@@ -1712,7 +1815,9 @@ skip:
                case FWS_U:
                        if (fn == w->root)
                                return 0;
-                       pn = fn->parent;
+                       pn = rcu_dereference_protected(fn->parent, 1);
+                       left = rcu_dereference_protected(pn->left, 1);
+                       right = rcu_dereference_protected(pn->right, 1);
                        w->node = pn;
 #ifdef CONFIG_IPV6_SUBTREES
                        if (FIB6_SUBTREE(pn) == fn) {
@@ -1721,13 +1826,13 @@ skip:
                                continue;
                        }
 #endif
-                       if (pn->left == fn) {
+                       if (left == fn) {
                                w->state = FWS_R;
                                continue;
                        }
-                       if (pn->right == fn) {
+                       if (right == fn) {
                                w->state = FWS_C;
-                               w->leaf = w->node->leaf;
+                               w->leaf = rcu_dereference_protected(w->node->leaf, 1);
                                continue;
                        }
 #if RT6_DEBUG >= 2
@@ -1770,7 +1875,7 @@ static int fib6_clean_node(struct fib6_walker *w)
                return 0;
        }
 
-       for (rt = w->leaf; rt; rt = rt->dst.rt6_next) {
+       for_each_fib6_walker_rt(w) {
                res = c->func(rt, c->arg);
                if (res < 0) {
                        w->leaf = rt;
@@ -1798,20 +1903,16 @@ static int fib6_clean_node(struct fib6_walker *w)
  *     func is called on each route.
  *             It may return -1 -> delete this route.
  *                           0  -> continue walking
- *
- *     prune==1 -> only immediate children of node (certainly,
- *     ignoring pure split nodes) will be scanned.
  */
 
 static void fib6_clean_tree(struct net *net, struct fib6_node *root,
                            int (*func)(struct rt6_info *, void *arg),
-                           bool prune, int sernum, void *arg)
+                           int sernum, void *arg)
 {
        struct fib6_cleaner c;
 
        c.w.root = root;
        c.w.func = fib6_clean_node;
-       c.w.prune = prune;
        c.w.count = 0;
        c.w.skip = 0;
        c.func = func;
@@ -1834,10 +1935,10 @@ static void __fib6_clean_all(struct net *net,
        for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
                head = &net->ipv6.fib_table_hash[h];
                hlist_for_each_entry_rcu(table, head, tb6_hlist) {
-                       write_lock_bh(&table->tb6_lock);
+                       spin_lock_bh(&table->tb6_lock);
                        fib6_clean_tree(net, &table->tb6_root,
-                                       func, false, sernum, arg);
-                       write_unlock_bh(&table->tb6_lock);
+                                       func, sernum, arg);
+                       spin_unlock_bh(&table->tb6_lock);
                }
        }
        rcu_read_unlock();
@@ -1849,22 +1950,6 @@ void fib6_clean_all(struct net *net, int (*func)(struct rt6_info *, void *),
        __fib6_clean_all(net, func, FIB6_NO_SERNUM_CHANGE, arg);
 }
 
-static int fib6_prune_clone(struct rt6_info *rt, void *arg)
-{
-       if (rt->rt6i_flags & RTF_CACHE) {
-               RT6_TRACE("pruning clone %p\n", rt);
-               return -1;
-       }
-
-       return 0;
-}
-
-static void fib6_prune_clones(struct net *net, struct fib6_node *fn)
-{
-       fib6_clean_tree(net, fn, fib6_prune_clone, true,
-                       FIB6_NO_SERNUM_CHANGE, NULL);
-}
-
 static void fib6_flush_trees(struct net *net)
 {
        int new_sernum = fib6_new_sernum(net);
@@ -1876,12 +1961,6 @@ static void fib6_flush_trees(struct net *net)
  *     Garbage collection
  */
 
-struct fib6_gc_args
-{
-       int                     timeout;
-       int                     more;
-};
-
 static int fib6_age(struct rt6_info *rt, void *arg)
 {
        struct fib6_gc_args *gc_args = arg;
@@ -1890,9 +1969,6 @@ static int fib6_age(struct rt6_info *rt, void *arg)
        /*
         *      check addrconf expiration here.
         *      Routes are expired even if they are in use.
-        *
-        *      Also age clones. Note, that clones are aged out
-        *      only if they are not in use now.
         */
 
        if (rt->rt6i_flags & RTF_EXPIRES && rt->dst.expires) {
@@ -1901,31 +1977,14 @@ static int fib6_age(struct rt6_info *rt, void *arg)
                        return -1;
                }
                gc_args->more++;
-       } else if (rt->rt6i_flags & RTF_CACHE) {
-               if (time_after_eq(now, rt->dst.lastuse + gc_args->timeout))
-                       rt->dst.obsolete = DST_OBSOLETE_KILL;
-               if (atomic_read(&rt->dst.__refcnt) == 1 &&
-                   rt->dst.obsolete == DST_OBSOLETE_KILL) {
-                       RT6_TRACE("aging clone %p\n", rt);
-                       return -1;
-               } else if (rt->rt6i_flags & RTF_GATEWAY) {
-                       struct neighbour *neigh;
-                       __u8 neigh_flags = 0;
-
-                       neigh = dst_neigh_lookup(&rt->dst, &rt->rt6i_gateway);
-                       if (neigh) {
-                               neigh_flags = neigh->flags;
-                               neigh_release(neigh);
-                       }
-                       if (!(neigh_flags & NTF_ROUTER)) {
-                               RT6_TRACE("purging route %p via non-router but gateway\n",
-                                         rt);
-                               return -1;
-                       }
-               }
-               gc_args->more++;
        }
 
+       /*      Also age clones in the exception table.
+        *      Note, that clones are aged out
+        *      only if they are not in use now.
+        */
+       rt6_age_exceptions(rt, gc_args, now);
+
        return 0;
 }
 
@@ -1993,7 +2052,8 @@ static int __net_init fib6_net_init(struct net *net)
                goto out_fib_table_hash;
 
        net->ipv6.fib6_main_tbl->tb6_id = RT6_TABLE_MAIN;
-       net->ipv6.fib6_main_tbl->tb6_root.leaf = net->ipv6.ip6_null_entry;
+       rcu_assign_pointer(net->ipv6.fib6_main_tbl->tb6_root.leaf,
+                          net->ipv6.ip6_null_entry);
        net->ipv6.fib6_main_tbl->tb6_root.fn_flags =
                RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO;
        inet_peer_base_init(&net->ipv6.fib6_main_tbl->tb6_peers);
@@ -2004,7 +2064,8 @@ static int __net_init fib6_net_init(struct net *net)
        if (!net->ipv6.fib6_local_tbl)
                goto out_fib6_main_tbl;
        net->ipv6.fib6_local_tbl->tb6_id = RT6_TABLE_LOCAL;
-       net->ipv6.fib6_local_tbl->tb6_root.leaf = net->ipv6.ip6_null_entry;
+       rcu_assign_pointer(net->ipv6.fib6_local_tbl->tb6_root.leaf,
+                          net->ipv6.ip6_null_entry);
        net->ipv6.fib6_local_tbl->tb6_root.fn_flags =
                RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO;
        inet_peer_base_init(&net->ipv6.fib6_local_tbl->tb6_peers);
@@ -2134,7 +2195,9 @@ static int ipv6_route_yield(struct fib6_walker *w)
                return 1;
 
        do {
-               iter->w.leaf = iter->w.leaf->dst.rt6_next;
+               iter->w.leaf = rcu_dereference_protected(
+                               iter->w.leaf->dst.rt6_next,
+                               lockdep_is_held(&iter->tbl->tb6_lock));
                iter->skip--;
                if (!iter->skip && iter->w.leaf)
                        return 1;
@@ -2199,7 +2262,7 @@ static void *ipv6_route_seq_next(struct seq_file *seq, void *v, loff_t *pos)
        if (!v)
                goto iter_table;
 
-       n = ((struct rt6_info *)v)->dst.rt6_next;
+       n = rcu_dereference_bh(((struct rt6_info *)v)->dst.rt6_next);
        if (n) {
                ++*pos;
                return n;
@@ -2207,9 +2270,9 @@ static void *ipv6_route_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 
 iter_table:
        ipv6_route_check_sernum(iter);
-       read_lock(&iter->tbl->tb6_lock);
+       spin_lock_bh(&iter->tbl->tb6_lock);
        r = fib6_walk_continue(&iter->w);
-       read_unlock(&iter->tbl->tb6_lock);
+       spin_unlock_bh(&iter->tbl->tb6_lock);
        if (r > 0) {
                if (v)
                        ++*pos;
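
The fib6 hunks above convert readers of the FIB6 tree from the tb6_lock rwlock to RCU: pointers such as fn->leaf and dst.rt6_next are now read with rcu_dereference() (or rcu_dereference_protected() under the new tb6_lock spinlock) and published with rcu_assign_pointer(). A minimal sketch of that pattern follows; the struct and function names are invented for illustration and are not taken from the patch.

/* Minimal sketch of the RCU conversion above: readers walk the list
 * under rcu_read_lock(); writers serialize on a spinlock (the role
 * the tb6_lock spinlock now plays) and publish with rcu_assign_pointer().
 */
#include <linux/rcupdate.h>
#include <linux/spinlock.h>

struct demo_node {
	struct demo_node __rcu *next;
	int val;
};

static struct demo_node __rcu *demo_head;
static DEFINE_SPINLOCK(demo_lock);

static bool demo_lookup(int val)
{
	struct demo_node *n;
	bool found = false;

	rcu_read_lock();
	for (n = rcu_dereference(demo_head); n;
	     n = rcu_dereference(n->next)) {
		if (n->val == val) {
			found = true;
			break;
		}
	}
	rcu_read_unlock();
	return found;
}

static void demo_insert(struct demo_node *n)
{
	spin_lock_bh(&demo_lock);
	RCU_INIT_POINTER(n->next,
			 rcu_dereference_protected(demo_head,
						   lockdep_is_held(&demo_lock)));
	rcu_assign_pointer(demo_head, n);	/* now visible to readers */
	spin_unlock_bh(&demo_lock);
}

Removal follows the same shape: unlink under the spinlock (as hlist_del_rcu() does in rt6_remove_exception() further down) and free with kfree_rcu() so that concurrent readers can finish safely.
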
index cdb3728..4a87f94 100644 (file)
@@ -105,7 +105,7 @@ static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb,
 
        for (skb = segs; skb; skb = skb->next) {
                ipv6h = (struct ipv6hdr *)(skb_mac_header(skb) + nhoff);
-               if (gso_partial)
+               if (gso_partial && skb_is_gso(skb))
                        payload_len = skb_shinfo(skb)->gso_size +
                                      SKB_GSO_CB(skb)->data_offset +
                                      skb->head - (unsigned char *)(ipv6h + 1);
index a5cd43d..437af8c 100644 (file)
@@ -353,7 +353,7 @@ static unsigned int ipv6_synproxy_hook(void *priv,
        nexthdr = ipv6_hdr(skb)->nexthdr;
        thoff = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &nexthdr,
                                 &frag_off);
-       if (thoff < 0)
+       if (thoff < 0 || nexthdr != IPPROTO_TCP)
                return NF_ACCEPT;
 
        th = skb_header_pointer(skb, thoff, sizeof(_th), &_th);
index ac826dd..d12c55d 100644 (file)
@@ -154,9 +154,8 @@ static int ping_v6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
                                ICMP6_MIB_OUTERRORS);
                ip6_flush_pending_frames(sk);
        } else {
-               err = icmpv6_push_pending_frames(sk, &fl6,
-                                                (struct icmp6hdr *) &pfh.icmph,
-                                                len);
+               icmpv6_push_pending_frames(sk, &fl6,
+                                          (struct icmp6hdr *)&pfh.icmph, len);
        }
        release_sock(sk);
 
index 26cc9f4..2e8842f 100644 (file)
@@ -44,6 +44,7 @@
 #include <linux/seq_file.h>
 #include <linux/nsproxy.h>
 #include <linux/slab.h>
+#include <linux/jhash.h>
 #include <net/net_namespace.h>
 #include <net/snmp.h>
 #include <net/ipv6.h>
@@ -104,6 +105,9 @@ static int rt6_fill_node(struct net *net,
                         struct in6_addr *dst, struct in6_addr *src,
                         int iif, int type, u32 portid, u32 seq,
                         unsigned int flags);
+static struct rt6_info *rt6_find_cached_rt(struct rt6_info *rt,
+                                          struct in6_addr *daddr,
+                                          struct in6_addr *saddr);
 
 #ifdef CONFIG_IPV6_ROUTE_INFO
 static struct rt6_info *rt6_add_route_info(struct net *net,
@@ -139,9 +143,11 @@ static void rt6_uncached_list_del(struct rt6_info *rt)
 {
        if (!list_empty(&rt->rt6i_uncached)) {
                struct uncached_list *ul = rt->rt6i_uncached_list;
+               struct net *net = dev_net(rt->dst.dev);
 
                spin_lock_bh(&ul->lock);
                list_del(&rt->rt6i_uncached);
+               atomic_dec(&net->ipv6.rt6_stats->fib_rt_uncache);
                spin_unlock_bh(&ul->lock);
        }
 }
@@ -355,8 +361,10 @@ static struct rt6_info *__ip6_dst_alloc(struct net *net,
        struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
                                        1, DST_OBSOLETE_FORCE_CHK, flags);
 
-       if (rt)
+       if (rt) {
                rt6_info_init(rt);
+               atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
+       }
 
        return rt;
 }
@@ -369,17 +377,7 @@ struct rt6_info *ip6_dst_alloc(struct net *net,
 
        if (rt) {
                rt->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, GFP_ATOMIC);
-               if (rt->rt6i_pcpu) {
-                       int cpu;
-
-                       for_each_possible_cpu(cpu) {
-                               struct rt6_info **p;
-
-                               p = per_cpu_ptr(rt->rt6i_pcpu, cpu);
-                               /* no one shares rt */
-                               *p =  NULL;
-                       }
-               } else {
+               if (!rt->rt6i_pcpu) {
                        dst_release_immediate(&rt->dst);
                        return NULL;
                }
@@ -392,6 +390,7 @@ EXPORT_SYMBOL(ip6_dst_alloc);
 static void ip6_dst_destroy(struct dst_entry *dst)
 {
        struct rt6_info *rt = (struct rt6_info *)dst;
+       struct rt6_exception_bucket *bucket;
        struct dst_entry *from = dst->from;
        struct inet6_dev *idev;
 
@@ -404,6 +403,11 @@ static void ip6_dst_destroy(struct dst_entry *dst)
                rt->rt6i_idev = NULL;
                in6_dev_put(idev);
        }
+       bucket = rcu_dereference_protected(rt->rt6i_exception_bucket, 1);
+       if (bucket) {
+               rt->rt6i_exception_bucket = NULL;
+               kfree(bucket);
+       }
 
        dst->from = NULL;
        dst_release(from);
@@ -478,7 +482,7 @@ static struct rt6_info *rt6_multipath_select(struct rt6_info *match,
 }
 
 /*
- *     Route lookup. Any table->tb6_lock is implied.
+ *     Route lookup. rcu_read_lock() should be held.
  */
 
 static inline struct rt6_info *rt6_device_match(struct net *net,
@@ -493,7 +497,7 @@ static inline struct rt6_info *rt6_device_match(struct net *net,
        if (!oif && ipv6_addr_any(saddr))
                goto out;
 
-       for (sprt = rt; sprt; sprt = sprt->dst.rt6_next) {
+       for (sprt = rt; sprt; sprt = rcu_dereference(sprt->dst.rt6_next)) {
                struct net_device *dev = sprt->dst.dev;
 
                if (oif) {
@@ -702,6 +706,7 @@ out:
 }
 
 static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
+                                    struct rt6_info *leaf,
                                     struct rt6_info *rr_head,
                                     u32 metric, int oif, int strict,
                                     bool *do_rr)
@@ -711,7 +716,7 @@ static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
 
        match = NULL;
        cont = NULL;
-       for (rt = rr_head; rt; rt = rt->dst.rt6_next) {
+       for (rt = rr_head; rt; rt = rcu_dereference(rt->dst.rt6_next)) {
                if (rt->rt6i_metric != metric) {
                        cont = rt;
                        break;
@@ -720,7 +725,8 @@ static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
                match = find_match(rt, oif, strict, &mpri, match, do_rr);
        }
 
-       for (rt = fn->leaf; rt && rt != rr_head; rt = rt->dst.rt6_next) {
+       for (rt = leaf; rt && rt != rr_head;
+            rt = rcu_dereference(rt->dst.rt6_next)) {
                if (rt->rt6i_metric != metric) {
                        cont = rt;
                        break;
@@ -732,37 +738,59 @@ static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
        if (match || !cont)
                return match;
 
-       for (rt = cont; rt; rt = rt->dst.rt6_next)
+       for (rt = cont; rt; rt = rcu_dereference(rt->dst.rt6_next))
                match = find_match(rt, oif, strict, &mpri, match, do_rr);
 
        return match;
 }
 
-static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
+static struct rt6_info *rt6_select(struct net *net, struct fib6_node *fn,
+                                  int oif, int strict)
 {
+       struct rt6_info *leaf = rcu_dereference(fn->leaf);
        struct rt6_info *match, *rt0;
-       struct net *net;
        bool do_rr = false;
+       int key_plen;
 
-       rt0 = fn->rr_ptr;
+       if (!leaf)
+               return net->ipv6.ip6_null_entry;
+
+       rt0 = rcu_dereference(fn->rr_ptr);
        if (!rt0)
-               fn->rr_ptr = rt0 = fn->leaf;
+               rt0 = leaf;
 
-       match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict,
+       /* Double check to make sure fn is not an intermediate node
+        * and fn->leaf does not point to its child's leaf
+        * (This might happen if all routes under fn are deleted from
+        * the tree and fib6_repair_tree() is called on the node.)
+        */
+       key_plen = rt0->rt6i_dst.plen;
+#ifdef CONFIG_IPV6_SUBTREES
+       if (rt0->rt6i_src.plen)
+               key_plen = rt0->rt6i_src.plen;
+#endif
+       if (fn->fn_bit != key_plen)
+               return net->ipv6.ip6_null_entry;
+
+       match = find_rr_leaf(fn, leaf, rt0, rt0->rt6i_metric, oif, strict,
                             &do_rr);
 
        if (do_rr) {
-               struct rt6_info *next = rt0->dst.rt6_next;
+               struct rt6_info *next = rcu_dereference(rt0->dst.rt6_next);
 
                /* no entries matched; do round-robin */
                if (!next || next->rt6i_metric != rt0->rt6i_metric)
-                       next = fn->leaf;
-
-               if (next != rt0)
-                       fn->rr_ptr = next;
+                       next = leaf;
+
+               if (next != rt0) {
+                       spin_lock_bh(&leaf->rt6i_table->tb6_lock);
+                       /* make sure next is not being deleted from the tree */
+                       if (next->rt6i_node)
+                               rcu_assign_pointer(fn->rr_ptr, next);
+                       spin_unlock_bh(&leaf->rt6i_table->tb6_lock);
+               }
        }
 
-       net = dev_net(rt0->dst.dev);
        return match ? match : net->ipv6.ip6_null_entry;
 }
 
@@ -850,13 +878,14 @@ int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
 static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
                                        struct in6_addr *saddr)
 {
-       struct fib6_node *pn;
+       struct fib6_node *pn, *sn;
        while (1) {
                if (fn->fn_flags & RTN_TL_ROOT)
                        return NULL;
-               pn = fn->parent;
-               if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn)
-                       fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, saddr);
+               pn = rcu_dereference(fn->parent);
+               sn = FIB6_SUBTREE(pn);
+               if (sn && sn != fn)
+                       fn = fib6_lookup(sn, NULL, saddr);
                else
                        fn = pn;
                if (fn->fn_flags & RTN_RTINFO)
@@ -864,27 +893,57 @@ static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
        }
 }
 
+static bool ip6_hold_safe(struct net *net, struct rt6_info **prt,
+                         bool null_fallback)
+{
+       struct rt6_info *rt = *prt;
+
+       if (dst_hold_safe(&rt->dst))
+               return true;
+       if (null_fallback) {
+               rt = net->ipv6.ip6_null_entry;
+               dst_hold(&rt->dst);
+       } else {
+               rt = NULL;
+       }
+       *prt = rt;
+       return false;
+}
+
 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
                                             struct fib6_table *table,
                                             struct flowi6 *fl6, int flags)
 {
+       struct rt6_info *rt, *rt_cache;
        struct fib6_node *fn;
-       struct rt6_info *rt;
 
-       read_lock_bh(&table->tb6_lock);
+       rcu_read_lock();
        fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
 restart:
-       rt = fn->leaf;
-       rt = rt6_device_match(net, rt, &fl6->saddr, fl6->flowi6_oif, flags);
-       if (rt->rt6i_nsiblings && fl6->flowi6_oif == 0)
-               rt = rt6_multipath_select(rt, fl6, fl6->flowi6_oif, flags);
+       rt = rcu_dereference(fn->leaf);
+       if (!rt) {
+               rt = net->ipv6.ip6_null_entry;
+       } else {
+               rt = rt6_device_match(net, rt, &fl6->saddr,
+                                     fl6->flowi6_oif, flags);
+               if (rt->rt6i_nsiblings && fl6->flowi6_oif == 0)
+                       rt = rt6_multipath_select(rt, fl6,
+                                                 fl6->flowi6_oif, flags);
+       }
        if (rt == net->ipv6.ip6_null_entry) {
                fn = fib6_backtrack(fn, &fl6->saddr);
                if (fn)
                        goto restart;
        }
-       dst_use(&rt->dst, jiffies);
-       read_unlock_bh(&table->tb6_lock);
+       /* Search through exception table */
+       rt_cache = rt6_find_cached_rt(rt, &fl6->daddr, &fl6->saddr);
+       if (rt_cache)
+               rt = rt_cache;
+
+       if (ip6_hold_safe(net, &rt, true))
+               dst_use_noref(&rt->dst, jiffies);
+
+       rcu_read_unlock();
 
        trace_fib6_table_lookup(net, rt, table->tb6_id, fl6);
 
@@ -938,9 +997,9 @@ static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info,
        struct fib6_table *table;
 
        table = rt->rt6i_table;
-       write_lock_bh(&table->tb6_lock);
+       spin_lock_bh(&table->tb6_lock);
        err = fib6_add(&table->tb6_root, rt, info, mxc, extack);
-       write_unlock_bh(&table->tb6_lock);
+       spin_unlock_bh(&table->tb6_lock);
 
        return err;
 }
@@ -1038,7 +1097,7 @@ static struct rt6_info *ip6_rt_pcpu_alloc(struct rt6_info *rt)
        return pcpu_rt;
 }
 
-/* It should be called with read_lock_bh(&tb6_lock) acquired */
+/* It should be called with rcu_read_lock() acquired */
 static struct rt6_info *rt6_get_pcpu_route(struct rt6_info *rt)
 {
        struct rt6_info *pcpu_rt, **p;
@@ -1046,16 +1105,14 @@ static struct rt6_info *rt6_get_pcpu_route(struct rt6_info *rt)
        p = this_cpu_ptr(rt->rt6i_pcpu);
        pcpu_rt = *p;
 
-       if (pcpu_rt) {
-               dst_hold(&pcpu_rt->dst);
+       if (pcpu_rt && ip6_hold_safe(NULL, &pcpu_rt, false))
                rt6_dst_from_metrics_check(pcpu_rt);
-       }
+
        return pcpu_rt;
 }
 
 static struct rt6_info *rt6_make_pcpu_route(struct rt6_info *rt)
 {
-       struct fib6_table *table = rt->rt6i_table;
        struct rt6_info *pcpu_rt, *prev, **p;
 
        pcpu_rt = ip6_rt_pcpu_alloc(rt);
@@ -1066,36 +1123,514 @@ static struct rt6_info *rt6_make_pcpu_route(struct rt6_info *rt)
                return net->ipv6.ip6_null_entry;
        }
 
-       read_lock_bh(&table->tb6_lock);
-       if (rt->rt6i_pcpu) {
-               p = this_cpu_ptr(rt->rt6i_pcpu);
-               prev = cmpxchg(p, NULL, pcpu_rt);
-               if (prev) {
-                       /* If someone did it before us, return prev instead */
-                       dst_release_immediate(&pcpu_rt->dst);
-                       pcpu_rt = prev;
-               }
-       } else {
-               /* rt has been removed from the fib6 tree
-                * before we have a chance to acquire the read_lock.
-                * In this case, don't brother to create a pcpu rt
-                * since rt is going away anyway.  The next
-                * dst_check() will trigger a re-lookup.
-                */
-               dst_release_immediate(&pcpu_rt->dst);
-               pcpu_rt = rt;
-       }
        dst_hold(&pcpu_rt->dst);
+       p = this_cpu_ptr(rt->rt6i_pcpu);
+       prev = cmpxchg(p, NULL, pcpu_rt);
+       BUG_ON(prev);
+
        rt6_dst_from_metrics_check(pcpu_rt);
-       read_unlock_bh(&table->tb6_lock);
        return pcpu_rt;
 }
 
+/* exception hash table implementation
+ */
+static DEFINE_SPINLOCK(rt6_exception_lock);
+
+/* Remove rt6_ex from hash table and free the memory
+ * Caller must hold rt6_exception_lock
+ */
+static void rt6_remove_exception(struct rt6_exception_bucket *bucket,
+                                struct rt6_exception *rt6_ex)
+{
+       struct net *net;
+
+       if (!bucket || !rt6_ex)
+               return;
+
+       net = dev_net(rt6_ex->rt6i->dst.dev);
+       rt6_ex->rt6i->rt6i_node = NULL;
+       hlist_del_rcu(&rt6_ex->hlist);
+       rt6_release(rt6_ex->rt6i);
+       kfree_rcu(rt6_ex, rcu);
+       WARN_ON_ONCE(!bucket->depth);
+       bucket->depth--;
+       net->ipv6.rt6_stats->fib_rt_cache--;
+}
+
+/* Remove oldest rt6_ex in bucket and free the memory
+ * Caller must hold rt6_exception_lock
+ */
+static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket)
+{
+       struct rt6_exception *rt6_ex, *oldest = NULL;
+
+       if (!bucket)
+               return;
+
+       hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
+               if (!oldest || time_before(rt6_ex->stamp, oldest->stamp))
+                       oldest = rt6_ex;
+       }
+       rt6_remove_exception(bucket, oldest);
+}
+
+static u32 rt6_exception_hash(const struct in6_addr *dst,
+                             const struct in6_addr *src)
+{
+       static u32 seed __read_mostly;
+       u32 val;
+
+       net_get_random_once(&seed, sizeof(seed));
+       val = jhash(dst, sizeof(*dst), seed);
+
+#ifdef CONFIG_IPV6_SUBTREES
+       if (src)
+               val = jhash(src, sizeof(*src), val);
+#endif
+       return hash_32(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT);
+}
+
+/* Helper function to find the cached rt in the hash table
+ * and update bucket pointer to point to the bucket for this
+ * (daddr, saddr) pair
+ * Caller must hold rt6_exception_lock
+ */
+static struct rt6_exception *
+__rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket,
+                             const struct in6_addr *daddr,
+                             const struct in6_addr *saddr)
+{
+       struct rt6_exception *rt6_ex;
+       u32 hval;
+
+       if (!(*bucket) || !daddr)
+               return NULL;
+
+       hval = rt6_exception_hash(daddr, saddr);
+       *bucket += hval;
+
+       hlist_for_each_entry(rt6_ex, &(*bucket)->chain, hlist) {
+               struct rt6_info *rt6 = rt6_ex->rt6i;
+               bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);
+
+#ifdef CONFIG_IPV6_SUBTREES
+               if (matched && saddr)
+                       matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
+#endif
+               if (matched)
+                       return rt6_ex;
+       }
+       return NULL;
+}
+
+/* Helper function to find the cached rt in the hash table
+ * and update bucket pointer to point to the bucket for this
+ * (daddr, saddr) pair
+ * Caller must hold rcu_read_lock()
+ */
+static struct rt6_exception *
+__rt6_find_exception_rcu(struct rt6_exception_bucket **bucket,
+                        const struct in6_addr *daddr,
+                        const struct in6_addr *saddr)
+{
+       struct rt6_exception *rt6_ex;
+       u32 hval;
+
+       WARN_ON_ONCE(!rcu_read_lock_held());
+
+       if (!(*bucket) || !daddr)
+               return NULL;
+
+       hval = rt6_exception_hash(daddr, saddr);
+       *bucket += hval;
+
+       hlist_for_each_entry_rcu(rt6_ex, &(*bucket)->chain, hlist) {
+               struct rt6_info *rt6 = rt6_ex->rt6i;
+               bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);
+
+#ifdef CONFIG_IPV6_SUBTREES
+               if (matched && saddr)
+                       matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
+#endif
+               if (matched)
+                       return rt6_ex;
+       }
+       return NULL;
+}
+
+static int rt6_insert_exception(struct rt6_info *nrt,
+                               struct rt6_info *ort)
+{
+       struct net *net = dev_net(ort->dst.dev);
+       struct rt6_exception_bucket *bucket;
+       struct in6_addr *src_key = NULL;
+       struct rt6_exception *rt6_ex;
+       int err = 0;
+
+       /* ort can't be a cache or pcpu route */
+       if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU))
+               ort = (struct rt6_info *)ort->dst.from;
+       WARN_ON_ONCE(ort->rt6i_flags & (RTF_CACHE | RTF_PCPU));
+
+       spin_lock_bh(&rt6_exception_lock);
+
+       if (ort->exception_bucket_flushed) {
+               err = -EINVAL;
+               goto out;
+       }
+
+       bucket = rcu_dereference_protected(ort->rt6i_exception_bucket,
+                                       lockdep_is_held(&rt6_exception_lock));
+       if (!bucket) {
+               bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket),
+                                GFP_ATOMIC);
+               if (!bucket) {
+                       err = -ENOMEM;
+                       goto out;
+               }
+               rcu_assign_pointer(ort->rt6i_exception_bucket, bucket);
+       }
+
+#ifdef CONFIG_IPV6_SUBTREES
+       /* rt6i_src.plen != 0 indicates ort is in subtree
+        * and exception table is indexed by a hash of
+        * both rt6i_dst and rt6i_src.
+        * Otherwise, the exception table is indexed by
+        * a hash of only rt6i_dst.
+        */
+       if (ort->rt6i_src.plen)
+               src_key = &nrt->rt6i_src.addr;
+#endif
+
+       /* Update rt6i_prefsrc as it could be changed
+        * in rt6_remove_prefsrc()
+        */
+       nrt->rt6i_prefsrc = ort->rt6i_prefsrc;
+       /* rt6_mtu_change() might lower mtu on ort.
+        * Only insert this exception route if its mtu
+        * is less than ort's mtu value.
+        */
+       if (nrt->rt6i_pmtu >= dst_mtu(&ort->dst)) {
+               err = -EINVAL;
+               goto out;
+       }
+
+       rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr,
+                                              src_key);
+       if (rt6_ex)
+               rt6_remove_exception(bucket, rt6_ex);
+
+       rt6_ex = kzalloc(sizeof(*rt6_ex), GFP_ATOMIC);
+       if (!rt6_ex) {
+               err = -ENOMEM;
+               goto out;
+       }
+       rt6_ex->rt6i = nrt;
+       rt6_ex->stamp = jiffies;
+       atomic_inc(&nrt->rt6i_ref);
+       nrt->rt6i_node = ort->rt6i_node;
+       hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain);
+       bucket->depth++;
+       net->ipv6.rt6_stats->fib_rt_cache++;
+
+       if (bucket->depth > FIB6_MAX_DEPTH)
+               rt6_exception_remove_oldest(bucket);
+
+out:
+       spin_unlock_bh(&rt6_exception_lock);
+
+       /* Update fn->fn_sernum to invalidate all cached dst */
+       if (!err)
+               fib6_update_sernum(ort);
+
+       return err;
+}
+
+void rt6_flush_exceptions(struct rt6_info *rt)
+{
+       struct rt6_exception_bucket *bucket;
+       struct rt6_exception *rt6_ex;
+       struct hlist_node *tmp;
+       int i;
+
+       spin_lock_bh(&rt6_exception_lock);
+       /* Prevent rt6_insert_exception() from recreating the bucket list */
+       rt->exception_bucket_flushed = 1;
+
+       bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
+                                   lockdep_is_held(&rt6_exception_lock));
+       if (!bucket)
+               goto out;
+
+       for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
+               hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist)
+                       rt6_remove_exception(bucket, rt6_ex);
+               WARN_ON_ONCE(bucket->depth);
+               bucket++;
+       }
+
+out:
+       spin_unlock_bh(&rt6_exception_lock);
+}
+
+/* Find the cached rt in the hash table inside the passed-in rt
+ * Caller has to hold rcu_read_lock()
+ */
+static struct rt6_info *rt6_find_cached_rt(struct rt6_info *rt,
+                                          struct in6_addr *daddr,
+                                          struct in6_addr *saddr)
+{
+       struct rt6_exception_bucket *bucket;
+       struct in6_addr *src_key = NULL;
+       struct rt6_exception *rt6_ex;
+       struct rt6_info *res = NULL;
+
+       bucket = rcu_dereference(rt->rt6i_exception_bucket);
+
+#ifdef CONFIG_IPV6_SUBTREES
+       /* rt6i_src.plen != 0 indicates rt is in subtree
+        * and exception table is indexed by a hash of
+        * both rt6i_dst and rt6i_src.
+        * Otherwise, the exception table is indexed by
+        * a hash of only rt6i_dst.
+        */
+       if (rt->rt6i_src.plen)
+               src_key = saddr;
+#endif
+       rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);
+
+       if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
+               res = rt6_ex->rt6i;
+
+       return res;
+}
+
+/* Remove the passed in cached rt from the hash table that contains it */
+int rt6_remove_exception_rt(struct rt6_info *rt)
+{
+       struct rt6_info *from = (struct rt6_info *)rt->dst.from;
+       struct rt6_exception_bucket *bucket;
+       struct in6_addr *src_key = NULL;
+       struct rt6_exception *rt6_ex;
+       int err;
+
+       if (!from ||
+           !(rt->rt6i_flags & RTF_CACHE))
+               return -EINVAL;
+
+       if (!rcu_access_pointer(from->rt6i_exception_bucket))
+               return -ENOENT;
+
+       spin_lock_bh(&rt6_exception_lock);
+       bucket = rcu_dereference_protected(from->rt6i_exception_bucket,
+                                   lockdep_is_held(&rt6_exception_lock));
+#ifdef CONFIG_IPV6_SUBTREES
+       /* rt6i_src.plen != 0 indicates 'from' is in subtree
+        * and exception table is indexed by a hash of
+        * both rt6i_dst and rt6i_src.
+        * Otherwise, the exception table is indexed by
+        * a hash of only rt6i_dst.
+        */
+       if (from->rt6i_src.plen)
+               src_key = &rt->rt6i_src.addr;
+#endif
+       rt6_ex = __rt6_find_exception_spinlock(&bucket,
+                                              &rt->rt6i_dst.addr,
+                                              src_key);
+       if (rt6_ex) {
+               rt6_remove_exception(bucket, rt6_ex);
+               err = 0;
+       } else {
+               err = -ENOENT;
+       }
+
+       spin_unlock_bh(&rt6_exception_lock);
+       return err;
+}
+
+/* Find rt6_ex which contains the passed in rt cache and
+ * refresh its stamp
+ */
+static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
+{
+       struct rt6_info *from = (struct rt6_info *)rt->dst.from;
+       struct rt6_exception_bucket *bucket;
+       struct in6_addr *src_key = NULL;
+       struct rt6_exception *rt6_ex;
+
+       if (!from ||
+           !(rt->rt6i_flags & RTF_CACHE))
+               return;
+
+       rcu_read_lock();
+       bucket = rcu_dereference(from->rt6i_exception_bucket);
+
+#ifdef CONFIG_IPV6_SUBTREES
+       /* rt6i_src.plen != 0 indicates 'from' is in subtree
+        * and exception table is indexed by a hash of
+        * both rt6i_dst and rt6i_src.
+        * Otherwise, the exception table is indexed by
+        * a hash of only rt6i_dst.
+        */
+       if (from->rt6i_src.plen)
+               src_key = &rt->rt6i_src.addr;
+#endif
+       rt6_ex = __rt6_find_exception_rcu(&bucket,
+                                         &rt->rt6i_dst.addr,
+                                         src_key);
+       if (rt6_ex)
+               rt6_ex->stamp = jiffies;
+
+       rcu_read_unlock();
+}
+
+static void rt6_exceptions_remove_prefsrc(struct rt6_info *rt)
+{
+       struct rt6_exception_bucket *bucket;
+       struct rt6_exception *rt6_ex;
+       int i;
+
+       bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
+                                       lockdep_is_held(&rt6_exception_lock));
+
+       if (bucket) {
+               for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
+                       hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
+                               rt6_ex->rt6i->rt6i_prefsrc.plen = 0;
+                       }
+                       bucket++;
+               }
+       }
+}
+
+static void rt6_exceptions_update_pmtu(struct rt6_info *rt, int mtu)
+{
+       struct rt6_exception_bucket *bucket;
+       struct rt6_exception *rt6_ex;
+       int i;
+
+       bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
+                                       lockdep_is_held(&rt6_exception_lock));
+
+       if (bucket) {
+               for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
+                       hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
+                               struct rt6_info *entry = rt6_ex->rt6i;
+                               /* For RTF_CACHE with rt6i_pmtu == 0
+                                * (i.e. a redirected route),
+                                * the metrics of its rt->dst.from has already
+                                * been updated.
+                                */
+                               if (entry->rt6i_pmtu && entry->rt6i_pmtu > mtu)
+                                       entry->rt6i_pmtu = mtu;
+                       }
+                       bucket++;
+               }
+       }
+}
+
+#define RTF_CACHE_GATEWAY      (RTF_GATEWAY | RTF_CACHE)
+
+static void rt6_exceptions_clean_tohost(struct rt6_info *rt,
+                                       struct in6_addr *gateway)
+{
+       struct rt6_exception_bucket *bucket;
+       struct rt6_exception *rt6_ex;
+       struct hlist_node *tmp;
+       int i;
+
+       if (!rcu_access_pointer(rt->rt6i_exception_bucket))
+               return;
+
+       spin_lock_bh(&rt6_exception_lock);
+       bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
+                                    lockdep_is_held(&rt6_exception_lock));
+
+       if (bucket) {
+               for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
+                       hlist_for_each_entry_safe(rt6_ex, tmp,
+                                                 &bucket->chain, hlist) {
+                               struct rt6_info *entry = rt6_ex->rt6i;
+
+                               if ((entry->rt6i_flags & RTF_CACHE_GATEWAY) ==
+                                   RTF_CACHE_GATEWAY &&
+                                   ipv6_addr_equal(gateway,
+                                                   &entry->rt6i_gateway)) {
+                                       rt6_remove_exception(bucket, rt6_ex);
+                               }
+                       }
+                       bucket++;
+               }
+       }
+
+       spin_unlock_bh(&rt6_exception_lock);
+}
+
+static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket,
+                                     struct rt6_exception *rt6_ex,
+                                     struct fib6_gc_args *gc_args,
+                                     unsigned long now)
+{
+       struct rt6_info *rt = rt6_ex->rt6i;
+
+       if (atomic_read(&rt->dst.__refcnt) == 1 &&
+           time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) {
+               RT6_TRACE("aging clone %p\n", rt);
+               rt6_remove_exception(bucket, rt6_ex);
+               return;
+       } else if (rt->rt6i_flags & RTF_GATEWAY) {
+               struct neighbour *neigh;
+               __u8 neigh_flags = 0;
+
+               neigh = dst_neigh_lookup(&rt->dst, &rt->rt6i_gateway);
+               if (neigh) {
+                       neigh_flags = neigh->flags;
+                       neigh_release(neigh);
+               }
+               if (!(neigh_flags & NTF_ROUTER)) {
+                       RT6_TRACE("purging route %p via non-router but gateway\n",
+                                 rt);
+                       rt6_remove_exception(bucket, rt6_ex);
+                       return;
+               }
+       }
+       gc_args->more++;
+}
+
+void rt6_age_exceptions(struct rt6_info *rt,
+                       struct fib6_gc_args *gc_args,
+                       unsigned long now)
+{
+       struct rt6_exception_bucket *bucket;
+       struct rt6_exception *rt6_ex;
+       struct hlist_node *tmp;
+       int i;
+
+       if (!rcu_access_pointer(rt->rt6i_exception_bucket))
+               return;
+
+       spin_lock_bh(&rt6_exception_lock);
+       bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
+                                   lockdep_is_held(&rt6_exception_lock));
+
+       if (bucket) {
+               for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
+                       hlist_for_each_entry_safe(rt6_ex, tmp,
+                                                 &bucket->chain, hlist) {
+                               rt6_age_examine_exception(bucket, rt6_ex,
+                                                         gc_args, now);
+                       }
+                       bucket++;
+               }
+       }
+       spin_unlock_bh(&rt6_exception_lock);
+}
+
 struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
                               int oif, struct flowi6 *fl6, int flags)
 {
        struct fib6_node *fn, *saved_fn;
-       struct rt6_info *rt;
+       struct rt6_info *rt, *rt_cache;
        int strict = 0;
 
        strict |= flags & RT6_LOOKUP_F_IFACE;
@@ -1103,7 +1638,7 @@ struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
        if (net->ipv6.devconf_all->forwarding == 0)
                strict |= RT6_LOOKUP_F_REACHABLE;
 
-       read_lock_bh(&table->tb6_lock);
+       rcu_read_lock();
 
        fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
        saved_fn = fn;
@@ -1112,7 +1647,7 @@ struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
                oif = 0;
 
 redo_rt6_select:
-       rt = rt6_select(fn, oif, strict);
+       rt = rt6_select(net, fn, oif, strict);
        if (rt->rt6i_nsiblings)
                rt = rt6_multipath_select(rt, fl6, oif, strict);
        if (rt == net->ipv6.ip6_null_entry) {
@@ -1127,13 +1662,22 @@ redo_rt6_select:
                }
        }
 
+       /* Search through exception table */
+       rt_cache = rt6_find_cached_rt(rt, &fl6->daddr, &fl6->saddr);
+       if (rt_cache)
+               rt = rt_cache;
 
-       if (rt == net->ipv6.ip6_null_entry || (rt->rt6i_flags & RTF_CACHE)) {
-               dst_use(&rt->dst, jiffies);
-               read_unlock_bh(&table->tb6_lock);
-
-               rt6_dst_from_metrics_check(rt);
-
+       if (rt == net->ipv6.ip6_null_entry) {
+               rcu_read_unlock();
+               dst_hold(&rt->dst);
+               trace_fib6_table_lookup(net, rt, table->tb6_id, fl6);
+               return rt;
+       } else if (rt->rt6i_flags & RTF_CACHE) {
+               if (ip6_hold_safe(net, &rt, true)) {
+                       dst_use_noref(&rt->dst, jiffies);
+                       rt6_dst_from_metrics_check(rt);
+               }
+               rcu_read_unlock();
                trace_fib6_table_lookup(net, rt, table->tb6_id, fl6);
                return rt;
        } else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
@@ -1146,8 +1690,14 @@ redo_rt6_select:
 
                struct rt6_info *uncached_rt;
 
-               dst_use(&rt->dst, jiffies);
-               read_unlock_bh(&table->tb6_lock);
+               if (ip6_hold_safe(net, &rt, true)) {
+                       dst_use_noref(&rt->dst, jiffies);
+               } else {
+                       rcu_read_unlock();
+                       uncached_rt = rt;
+                       goto uncached_rt_out;
+               }
+               rcu_read_unlock();
 
                uncached_rt = ip6_rt_cache_alloc(rt, &fl6->daddr, NULL);
                dst_release(&rt->dst);
@@ -1157,11 +1707,13 @@ redo_rt6_select:
                         * No need for another dst_hold()
                         */
                        rt6_uncached_list_add(uncached_rt);
+                       atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
                } else {
                        uncached_rt = net->ipv6.ip6_null_entry;
                        dst_hold(&uncached_rt->dst);
                }
 
+uncached_rt_out:
                trace_fib6_table_lookup(net, uncached_rt, table->tb6_id, fl6);
                return uncached_rt;
 
@@ -1170,26 +1722,28 @@ redo_rt6_select:
 
                struct rt6_info *pcpu_rt;
 
-               rt->dst.lastuse = jiffies;
-               rt->dst.__use++;
+               dst_use_noref(&rt->dst, jiffies);
+               local_bh_disable();
                pcpu_rt = rt6_get_pcpu_route(rt);
 
-               if (pcpu_rt) {
-                       read_unlock_bh(&table->tb6_lock);
-               } else {
-                       /* We have to do the read_unlock first
-                        * because rt6_make_pcpu_route() may trigger
-                        * ip6_dst_gc() which will take the write_lock.
-                        */
-                       dst_hold(&rt->dst);
-                       read_unlock_bh(&table->tb6_lock);
-                       pcpu_rt = rt6_make_pcpu_route(rt);
-                       dst_release(&rt->dst);
+               if (!pcpu_rt) {
+                       /* atomic_inc_not_zero() is needed when using rcu */
+                       if (atomic_inc_not_zero(&rt->rt6i_ref)) {
+                               /* No dst_hold() on rt is needed because grabbing
+                                * rt->rt6i_ref makes sure rt can't be released.
+                                */
+                               pcpu_rt = rt6_make_pcpu_route(rt);
+                               rt6_release(rt);
+                       } else {
+                               /* rt is already removed from tree */
+                               pcpu_rt = net->ipv6.ip6_null_entry;
+                               dst_hold(&pcpu_rt->dst);
+                       }
                }
-
+               local_bh_enable();
+               rcu_read_unlock();
                trace_fib6_table_lookup(net, pcpu_rt, table->tb6_id, fl6);
                return pcpu_rt;
-
        }
 }
 EXPORT_SYMBOL_GPL(ip6_pol_route);
@@ -1325,9 +1879,10 @@ struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_ori
        struct dst_entry *new = NULL;
 
        rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1,
-                      DST_OBSOLETE_NONE, 0);
+                      DST_OBSOLETE_DEAD, 0);
        if (rt) {
                rt6_info_init(rt);
+               atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
 
                new = &rt->dst;
                new->__use = 1;
@@ -1491,23 +2046,17 @@ static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
 
        if (!rt6_cache_allowed_for_pmtu(rt6)) {
                rt6_do_update_pmtu(rt6, mtu);
+               /* update rt6_ex->stamp for cache */
+               if (rt6->rt6i_flags & RTF_CACHE)
+                       rt6_update_exception_stamp_rt(rt6);
        } else if (daddr) {
                struct rt6_info *nrt6;
 
                nrt6 = ip6_rt_cache_alloc(rt6, daddr, saddr);
                if (nrt6) {
                        rt6_do_update_pmtu(nrt6, mtu);
-
-                       /* ip6_ins_rt(nrt6) will bump the
-                        * rt6->rt6i_node->fn_sernum
-                        * which will fail the next rt6_check() and
-                        * invalidate the sk->sk_dst_cache.
-                        */
-                       ip6_ins_rt(nrt6);
-                       /* Release the reference taken in
-                        * ip6_rt_cache_alloc()
-                        */
-                       dst_release(&nrt6->dst);
+                       if (rt6_insert_exception(nrt6, rt6))
+                               dst_release_immediate(&nrt6->dst);
                }
        }
 }
@@ -1571,7 +2120,7 @@ static struct rt6_info *__ip6_route_redirect(struct net *net,
                                             int flags)
 {
        struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
-       struct rt6_info *rt;
+       struct rt6_info *rt, *rt_cache;
        struct fib6_node *fn;
 
        /* Get the "current" route for this destination and
@@ -1584,10 +2133,10 @@ static struct rt6_info *__ip6_route_redirect(struct net *net,
         * routes.
         */
 
-       read_lock_bh(&table->tb6_lock);
+       rcu_read_lock();
        fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
 restart:
-       for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
+       for_each_fib6_node_rt_rcu(fn) {
                if (rt6_check_expired(rt))
                        continue;
                if (rt->dst.error)
@@ -1596,8 +2145,23 @@ restart:
                        continue;
                if (fl6->flowi6_oif != rt->dst.dev->ifindex)
                        continue;
-               if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway))
+               /* rt_cache's gateway might be different from its 'parent'
+                * in the case of an ip redirect.
+                * So we keep searching in the exception table if the gateway
+                * is different.
+                */
+               if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway)) {
+                       rt_cache = rt6_find_cached_rt(rt,
+                                                     &fl6->daddr,
+                                                     &fl6->saddr);
+                       if (rt_cache &&
+                           ipv6_addr_equal(&rdfl->gateway,
+                                           &rt_cache->rt6i_gateway)) {
+                               rt = rt_cache;
+                               break;
+                       }
                        continue;
+               }
                break;
        }
 
@@ -1615,9 +2179,9 @@ restart:
        }
 
 out:
-       dst_hold(&rt->dst);
+       ip6_hold_safe(net, &rt, true);
 
-       read_unlock_bh(&table->tb6_lock);
+       rcu_read_unlock();
 
        trace_fib6_table_lookup(net, rt, table->tb6_id, fl6);
        return rt;
@@ -1766,6 +2330,7 @@ struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
         * do proper release of the net_device
         */
        rt6_uncached_list_add(rt);
+       atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
 
        dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);
 
@@ -2216,9 +2781,9 @@ static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
        }
 
        table = rt->rt6i_table;
-       write_lock_bh(&table->tb6_lock);
+       spin_lock_bh(&table->tb6_lock);
        err = fib6_del(rt, info);
-       write_unlock_bh(&table->tb6_lock);
+       spin_unlock_bh(&table->tb6_lock);
 
 out:
        ip6_rt_put(rt);
@@ -2244,7 +2809,7 @@ static int __ip6_del_rt_siblings(struct rt6_info *rt, struct fib6_config *cfg)
        if (rt == net->ipv6.ip6_null_entry)
                goto out_put;
        table = rt->rt6i_table;
-       write_lock_bh(&table->tb6_lock);
+       spin_lock_bh(&table->tb6_lock);
 
        if (rt->rt6i_nsiblings && cfg->fc_delete_all_nh) {
                struct rt6_info *sibling, *next_sibling;
@@ -2274,7 +2839,7 @@ static int __ip6_del_rt_siblings(struct rt6_info *rt, struct fib6_config *cfg)
 
        err = fib6_del(rt, info);
 out_unlock:
-       write_unlock_bh(&table->tb6_lock);
+       spin_unlock_bh(&table->tb6_lock);
 out_put:
        ip6_rt_put(rt);
 
@@ -2288,9 +2853,9 @@ out_put:
 static int ip6_route_del(struct fib6_config *cfg,
                         struct netlink_ext_ack *extack)
 {
+       struct rt6_info *rt, *rt_cache;
        struct fib6_table *table;
        struct fib6_node *fn;
-       struct rt6_info *rt;
        int err = -ESRCH;
 
        table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
@@ -2299,17 +2864,22 @@ static int ip6_route_del(struct fib6_config *cfg,
                return err;
        }
 
-       read_lock_bh(&table->tb6_lock);
+       rcu_read_lock();
 
        fn = fib6_locate(&table->tb6_root,
                         &cfg->fc_dst, cfg->fc_dst_len,
-                        &cfg->fc_src, cfg->fc_src_len);
+                        &cfg->fc_src, cfg->fc_src_len,
+                        !(cfg->fc_flags & RTF_CACHE));
 
        if (fn) {
-               for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
-                       if ((rt->rt6i_flags & RTF_CACHE) &&
-                           !(cfg->fc_flags & RTF_CACHE))
-                               continue;
+               for_each_fib6_node_rt_rcu(fn) {
+                       if (cfg->fc_flags & RTF_CACHE) {
+                               rt_cache = rt6_find_cached_rt(rt, &cfg->fc_dst,
+                                                             &cfg->fc_src);
+                               if (!rt_cache)
+                                       continue;
+                               rt = rt_cache;
+                       }
                        if (cfg->fc_ifindex &&
                            (!rt->dst.dev ||
                             rt->dst.dev->ifindex != cfg->fc_ifindex))
@@ -2321,8 +2891,9 @@ static int ip6_route_del(struct fib6_config *cfg,
                                continue;
                        if (cfg->fc_protocol && cfg->fc_protocol != rt->rt6i_protocol)
                                continue;
-                       dst_hold(&rt->dst);
-                       read_unlock_bh(&table->tb6_lock);
+                       if (!dst_hold_safe(&rt->dst))
+                               break;
+                       rcu_read_unlock();
 
                        /* if gateway was specified only delete the one hop */
                        if (cfg->fc_flags & RTF_GATEWAY)
@@ -2331,7 +2902,7 @@ static int ip6_route_del(struct fib6_config *cfg,
                        return __ip6_del_rt_siblings(rt, cfg);
                }
        }
-       read_unlock_bh(&table->tb6_lock);
+       rcu_read_unlock();
 
        return err;
 }
@@ -2435,8 +3006,14 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_bu
        nrt->rt6i_protocol = RTPROT_REDIRECT;
        nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
 
-       if (ip6_ins_rt(nrt))
-               goto out_release;
+       /* No need to remove rt from the exception table if rt is
+        * a cached route because rt6_insert_exception() will
+        * take care of it
+        */
+       if (rt6_insert_exception(nrt, rt)) {
+               dst_release_immediate(&nrt->dst);
+               goto out;
+       }
 
        netevent.old = &rt->dst;
        netevent.new = &nrt->dst;
@@ -2444,17 +3021,6 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_bu
        netevent.neigh = neigh;
        call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
 
-       if (rt->rt6i_flags & RTF_CACHE) {
-               rt = (struct rt6_info *) dst_clone(&rt->dst);
-               ip6_del_rt(rt);
-       }
-
-out_release:
-       /* Release the reference taken in
-        * ip6_rt_cache_alloc()
-        */
-       dst_release(&nrt->dst);
-
 out:
        neigh_release(neigh);
 }
@@ -2511,23 +3077,23 @@ static struct rt6_info *rt6_get_route_info(struct net *net,
        if (!table)
                return NULL;
 
-       read_lock_bh(&table->tb6_lock);
-       fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0);
+       rcu_read_lock();
+       fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true);
        if (!fn)
                goto out;
 
-       for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
+       for_each_fib6_node_rt_rcu(fn) {
                if (rt->dst.dev->ifindex != ifindex)
                        continue;
                if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
                        continue;
                if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
                        continue;
-               dst_hold(&rt->dst);
+               ip6_hold_safe(NULL, &rt, false);
                break;
        }
 out:
-       read_unlock_bh(&table->tb6_lock);
+       rcu_read_unlock();
        return rt;
 }
 
@@ -2573,16 +3139,16 @@ struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_dev
        if (!table)
                return NULL;
 
-       read_lock_bh(&table->tb6_lock);
-       for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
+       rcu_read_lock();
+       for_each_fib6_node_rt_rcu(&table->tb6_root) {
                if (dev == rt->dst.dev &&
                    ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
                    ipv6_addr_equal(&rt->rt6i_gateway, addr))
                        break;
        }
        if (rt)
-               dst_hold(&rt->dst);
-       read_unlock_bh(&table->tb6_lock);
+               ip6_hold_safe(NULL, &rt, false);
+       rcu_read_unlock();
        return rt;
 }
 
@@ -2620,17 +3186,20 @@ static void __rt6_purge_dflt_routers(struct fib6_table *table)
        struct rt6_info *rt;
 
 restart:
-       read_lock_bh(&table->tb6_lock);
-       for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
+       rcu_read_lock();
+       for_each_fib6_node_rt_rcu(&table->tb6_root) {
                if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
                    (!rt->rt6i_idev || rt->rt6i_idev->cnf.accept_ra != 2)) {
-                       dst_hold(&rt->dst);
-                       read_unlock_bh(&table->tb6_lock);
-                       ip6_del_rt(rt);
+                       if (dst_hold_safe(&rt->dst)) {
+                               rcu_read_unlock();
+                               ip6_del_rt(rt);
+                       } else {
+                               rcu_read_unlock();
+                       }
                        goto restart;
                }
        }
-       read_unlock_bh(&table->tb6_lock);
+       rcu_read_unlock();
 
        table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER;
 }
@@ -2818,8 +3387,12 @@ static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg)
        if (((void *)rt->dst.dev == dev || !dev) &&
            rt != net->ipv6.ip6_null_entry &&
            ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) {
+               spin_lock_bh(&rt6_exception_lock);
                /* remove prefsrc entry */
                rt->rt6i_prefsrc.plen = 0;
+               /* need to update cache as well */
+               rt6_exceptions_remove_prefsrc(rt);
+               spin_unlock_bh(&rt6_exception_lock);
        }
        return 0;
 }
@@ -2836,18 +3409,23 @@ void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
 }
 
 #define RTF_RA_ROUTER          (RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY)
-#define RTF_CACHE_GATEWAY      (RTF_GATEWAY | RTF_CACHE)
 
 /* Remove routers and update dst entries when gateway turn into host. */
 static int fib6_clean_tohost(struct rt6_info *rt, void *arg)
 {
        struct in6_addr *gateway = (struct in6_addr *)arg;
 
-       if ((((rt->rt6i_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) ||
-            ((rt->rt6i_flags & RTF_CACHE_GATEWAY) == RTF_CACHE_GATEWAY)) &&
-            ipv6_addr_equal(gateway, &rt->rt6i_gateway)) {
+       if (((rt->rt6i_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) &&
+           ipv6_addr_equal(gateway, &rt->rt6i_gateway)) {
                return -1;
        }
+
+       /* Further clean up cached routes in exception table.
+        * This is needed because a cached route may have a different
+        * gateway than its 'parent' in the case of an ip redirect.
+        */
+       rt6_exceptions_clean_tohost(rt, gateway);
+
        return 0;
 }
 
@@ -2926,19 +3504,14 @@ static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
        if (rt->dst.dev == arg->dev &&
            dst_metric_raw(&rt->dst, RTAX_MTU) &&
            !dst_metric_locked(&rt->dst, RTAX_MTU)) {
-               if (rt->rt6i_flags & RTF_CACHE) {
-                       /* For RTF_CACHE with rt6i_pmtu == 0
-                        * (i.e. a redirected route),
-                        * the metrics of its rt->dst.from has already
-                        * been updated.
-                        */
-                       if (rt->rt6i_pmtu && rt->rt6i_pmtu > arg->mtu)
-                               rt->rt6i_pmtu = arg->mtu;
-               } else if (dst_mtu(&rt->dst) >= arg->mtu ||
-                          (dst_mtu(&rt->dst) < arg->mtu &&
-                           dst_mtu(&rt->dst) == idev->cnf.mtu6)) {
+               spin_lock_bh(&rt6_exception_lock);
+               if (dst_mtu(&rt->dst) >= arg->mtu ||
+                   (dst_mtu(&rt->dst) < arg->mtu &&
+                    dst_mtu(&rt->dst) == idev->cnf.mtu6)) {
                        dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
                }
+               rt6_exceptions_update_pmtu(rt, arg->mtu);
+               spin_unlock_bh(&rt6_exception_lock);
        }
        return 0;
 }
@@ -3839,7 +4412,7 @@ static int rt6_stats_seq_show(struct seq_file *seq, void *v)
        seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
                   net->ipv6.rt6_stats->fib_nodes,
                   net->ipv6.rt6_stats->fib_route_nodes,
-                  net->ipv6.rt6_stats->fib_rt_alloc,
+                  atomic_read(&net->ipv6.rt6_stats->fib_rt_alloc),
                   net->ipv6.rt6_stats->fib_rt_entries,
                   net->ipv6.rt6_stats->fib_rt_cache,
                   dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
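
A recurring detail in the route hunks above is how a reference is taken once a route has been found under rcu_read_lock(): because the writer may already have dropped its last reference, a plain dst_hold() is no longer safe, so the patch goes through ip6_hold_safe()/dst_hold_safe(), which only bump a refcount that is still non-zero and otherwise fall back to ip6_null_entry or NULL. The sketch below shows that idea in isolation; the names are invented for the example (the real code operates on dst.__refcnt).

#include <linux/atomic.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

struct demo_entry {
	atomic_t refcnt;
	struct rcu_head rcu;
};

/* Only succeed if the object still has at least one reference;
 * mirrors what dst_hold_safe() does for a dst found under RCU.
 */
static bool demo_hold_safe(struct demo_entry *e)
{
	return atomic_inc_not_zero(&e->refcnt);
}

static void demo_put(struct demo_entry *e)
{
	if (atomic_dec_and_test(&e->refcnt))
		kfree_rcu(e, rcu);	/* readers may still hold a pointer */
}

A caller that looked the entry up under rcu_read_lock() first calls demo_hold_safe(); only if that succeeds may it keep using the entry after rcu_read_unlock(), which is the shape of the ip6_hold_safe() calls in ip6_pol_route() and ip6_pol_route_lookup() above.
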
index c5b9ce4..9745e8f 100644 (file)
@@ -16,6 +16,7 @@
 #include <net/arp.h>
 #include <net/ip_fib.h>
 #include <net/netevent.h>
+#include <net/ip_tunnels.h>
 #include <net/netns/generic.h>
 #if IS_ENABLED(CONFIG_IPV6)
 #include <net/ipv6.h>
@@ -39,6 +40,36 @@ static int one = 1;
 static int label_limit = (1 << 20) - 1;
 static int ttl_max = 255;
 
+#if IS_ENABLED(CONFIG_NET_IP_TUNNEL)
+size_t ipgre_mpls_encap_hlen(struct ip_tunnel_encap *e)
+{
+       return sizeof(struct mpls_shim_hdr);
+}
+
+static const struct ip_tunnel_encap_ops mpls_iptun_ops = {
+       .encap_hlen     = ipgre_mpls_encap_hlen,
+};
+
+static int ipgre_tunnel_encap_add_mpls_ops(void)
+{
+       return ip_tunnel_encap_add_ops(&mpls_iptun_ops, TUNNEL_ENCAP_MPLS);
+}
+
+static void ipgre_tunnel_encap_del_mpls_ops(void)
+{
+       ip_tunnel_encap_del_ops(&mpls_iptun_ops, TUNNEL_ENCAP_MPLS);
+}
+#else
+static int ipgre_tunnel_encap_add_mpls_ops(void)
+{
+       return 0;
+}
+
+static void ipgre_tunnel_encap_del_mpls_ops(void)
+{
+}
+#endif
+
 static void rtmsg_lfib(int event, u32 label, struct mpls_route *rt,
                       struct nlmsghdr *nlh, struct net *net, u32 portid,
                       unsigned int nlm_flags);
@@ -2485,6 +2516,10 @@ static int __init mpls_init(void)
                      0);
        rtnl_register(PF_MPLS, RTM_GETNETCONF, mpls_netconf_get_devconf,
                      mpls_netconf_dump_devconf, 0);
+       err = ipgre_tunnel_encap_add_mpls_ops();
+       if (err)
+               pr_err("Can't add mpls over gre tunnel ops\n");
+
        err = 0;
 out:
        return err;
@@ -2502,6 +2537,7 @@ static void __exit mpls_exit(void)
        dev_remove_pack(&mpls_packet_type);
        unregister_netdevice_notifier(&mpls_dev_notifier);
        unregister_pernet_subsys(&mpls_net_ops);
+       ipgre_tunnel_encap_del_mpls_ops();
 }
 module_exit(mpls_exit);
 
index e495b5e..cf84f7b 100644 (file)
@@ -1191,14 +1191,17 @@ static int ip_set_swap(struct net *net, struct sock *ctnl, struct sk_buff *skb,
              from->family == to->family))
                return -IPSET_ERR_TYPE_MISMATCH;
 
-       if (from->ref_netlink || to->ref_netlink)
+       write_lock_bh(&ip_set_ref_lock);
+
+       if (from->ref_netlink || to->ref_netlink) {
+               write_unlock_bh(&ip_set_ref_lock);
                return -EBUSY;
+       }
 
        strncpy(from_name, from->name, IPSET_MAXNAMELEN);
        strncpy(from->name, to->name, IPSET_MAXNAMELEN);
        strncpy(to->name, from_name, IPSET_MAXNAMELEN);
 
-       write_lock_bh(&ip_set_ref_lock);
        swap(from->ref, to->ref);
        ip_set(inst, from_id) = to;
        ip_set(inst, to_id) = from;
@@ -2072,25 +2075,28 @@ static struct pernet_operations ip_set_net_ops = {
 static int __init
 ip_set_init(void)
 {
-       int ret = nfnetlink_subsys_register(&ip_set_netlink_subsys);
+       int ret = register_pernet_subsys(&ip_set_net_ops);
+
+       if (ret) {
+               pr_err("ip_set: cannot register pernet_subsys.\n");
+               return ret;
+       }
 
+       ret = nfnetlink_subsys_register(&ip_set_netlink_subsys);
        if (ret != 0) {
                pr_err("ip_set: cannot register with nfnetlink.\n");
+               unregister_pernet_subsys(&ip_set_net_ops);
                return ret;
        }
+
        ret = nf_register_sockopt(&so_set);
        if (ret != 0) {
                pr_err("SO_SET registry failed: %d\n", ret);
                nfnetlink_subsys_unregister(&ip_set_netlink_subsys);
+               unregister_pernet_subsys(&ip_set_net_ops);
                return ret;
        }
-       ret = register_pernet_subsys(&ip_set_net_ops);
-       if (ret) {
-               pr_err("ip_set: cannot register pernet_subsys.\n");
-               nf_unregister_sockopt(&so_set);
-               nfnetlink_subsys_unregister(&ip_set_netlink_subsys);
-               return ret;
-       }
+
        pr_info("ip_set: protocol %u\n", IPSET_PROTOCOL);
        return 0;
 }
@@ -2098,9 +2104,10 @@ ip_set_init(void)
 static void __exit
 ip_set_fini(void)
 {
-       unregister_pernet_subsys(&ip_set_net_ops);
        nf_unregister_sockopt(&so_set);
        nfnetlink_subsys_unregister(&ip_set_netlink_subsys);
+
+       unregister_pernet_subsys(&ip_set_net_ops);
        pr_debug("these are the famous last words\n");
 }
 
index 20bfbd3..613eb21 100644
@@ -123,13 +123,12 @@ hash_ip4_uadt(struct ip_set *set, struct nlattr *tb[],
                return ret;
 
        ip &= ip_set_hostmask(h->netmask);
+       e.ip = htonl(ip);
+       if (e.ip == 0)
+               return -IPSET_ERR_HASH_ELEM;
 
-       if (adt == IPSET_TEST) {
-               e.ip = htonl(ip);
-               if (e.ip == 0)
-                       return -IPSET_ERR_HASH_ELEM;
+       if (adt == IPSET_TEST)
                return adtfn(set, &e, &ext, &ext, flags);
-       }
 
        ip_to = ip;
        if (tb[IPSET_ATTR_IP_TO]) {
@@ -148,17 +147,20 @@ hash_ip4_uadt(struct ip_set *set, struct nlattr *tb[],
 
        hosts = h->netmask == 32 ? 1 : 2 << (32 - h->netmask - 1);
 
-       if (retried)
+       if (retried) {
                ip = ntohl(h->next.ip);
-       for (; !before(ip_to, ip); ip += hosts) {
                e.ip = htonl(ip);
-               if (e.ip == 0)
-                       return -IPSET_ERR_HASH_ELEM;
+       }
+       for (; ip <= ip_to;) {
                ret = adtfn(set, &e, &ext, &ext, flags);
-
                if (ret && !ip_set_eexist(ret, flags))
                        return ret;
 
+               ip += hosts;
+               e.ip = htonl(ip);
+               if (e.ip == 0)
+                       return 0;
+
                ret = 0;
        }
        return ret;
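
The loop rewrite above does two things: the signed before()/after() helpers are replaced by plain unsigned comparisons (the signed forms misbehave once a range spans more than 2^31 addresses), and since "ip <= ip_to" can never become false when ip_to is 255.255.255.255, the post-increment wraparound check is what terminates the scan. A minimal user-space sketch of that termination argument (illustrative only, not the ipset code itself):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t ip = 0xfffffffeu, ip_to = 0xffffffffu, hosts = 1;
	unsigned int visited = 0;

	for (;;) {
		visited++;		/* stands in for the adtfn() call */
		ip += hosts;
		if (ip == 0)		/* wrapped past 255.255.255.255 */
			break;
		if (ip > ip_to)
			break;
	}
	printf("visited %u addresses\n", visited);	/* prints 2 */
	return 0;
}
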
index b64cf14..f3ba834 100644
@@ -149,7 +149,7 @@ hash_ipmark4_uadt(struct ip_set *set, struct nlattr *tb[],
 
        if (retried)
                ip = ntohl(h->next.ip);
-       for (; !before(ip_to, ip); ip++) {
+       for (; ip <= ip_to; ip++) {
                e.ip = htonl(ip);
                ret = adtfn(set, &e, &ext, &ext, flags);
 
index f438740..ddb8039 100644
@@ -178,7 +178,7 @@ hash_ipport4_uadt(struct ip_set *set, struct nlattr *tb[],
 
        if (retried)
                ip = ntohl(h->next.ip);
-       for (; !before(ip_to, ip); ip++) {
+       for (; ip <= ip_to; ip++) {
                p = retried && ip == ntohl(h->next.ip) ? ntohs(h->next.port)
                                                       : port;
                for (; p <= port_to; p++) {
index 6215fb8..a7f4d7a 100644
@@ -185,7 +185,7 @@ hash_ipportip4_uadt(struct ip_set *set, struct nlattr *tb[],
 
        if (retried)
                ip = ntohl(h->next.ip);
-       for (; !before(ip_to, ip); ip++) {
+       for (; ip <= ip_to; ip++) {
                p = retried && ip == ntohl(h->next.ip) ? ntohs(h->next.port)
                                                       : port;
                for (; p <= port_to; p++) {
index 5ab1b99..a2f19b9 100644
@@ -271,7 +271,7 @@ hash_ipportnet4_uadt(struct ip_set *set, struct nlattr *tb[],
 
        if (retried)
                ip = ntohl(h->next.ip);
-       for (; !before(ip_to, ip); ip++) {
+       for (; ip <= ip_to; ip++) {
                e.ip = htonl(ip);
                p = retried && ip == ntohl(h->next.ip) ? ntohs(h->next.port)
                                                       : port;
@@ -281,7 +281,7 @@ hash_ipportnet4_uadt(struct ip_set *set, struct nlattr *tb[],
                              ip == ntohl(h->next.ip) &&
                              p == ntohs(h->next.port)
                                ? ntohl(h->next.ip2) : ip2_from;
-                       while (!after(ip2, ip2_to)) {
+                       while (ip2 <= ip2_to) {
                                e.ip2 = htonl(ip2);
                                ip2_last = ip_set_range_to_cidr(ip2, ip2_to,
                                                                &cidr);
index 5d9e895..1c67a17 100644
@@ -193,7 +193,7 @@ hash_net4_uadt(struct ip_set *set, struct nlattr *tb[],
        }
        if (retried)
                ip = ntohl(h->next.ip);
-       while (!after(ip, ip_to)) {
+       while (ip <= ip_to) {
                e.ip = htonl(ip);
                last = ip_set_range_to_cidr(ip, ip_to, &e.cidr);
                ret = adtfn(set, &e, &ext, &ext, flags);
index 44cf119..d417074 100644
@@ -255,7 +255,7 @@ hash_netiface4_uadt(struct ip_set *set, struct nlattr *tb[],
 
        if (retried)
                ip = ntohl(h->next.ip);
-       while (!after(ip, ip_to)) {
+       while (ip <= ip_to) {
                e.ip = htonl(ip);
                last = ip_set_range_to_cidr(ip, ip_to, &e.cidr);
                ret = adtfn(set, &e, &ext, &ext, flags);
index db614e1..7f9ae2e 100644
@@ -250,13 +250,13 @@ hash_netnet4_uadt(struct ip_set *set, struct nlattr *tb[],
        if (retried)
                ip = ntohl(h->next.ip[0]);
 
-       while (!after(ip, ip_to)) {
+       while (ip <= ip_to) {
                e.ip[0] = htonl(ip);
                last = ip_set_range_to_cidr(ip, ip_to, &e.cidr[0]);
                ip2 = (retried &&
                       ip == ntohl(h->next.ip[0])) ? ntohl(h->next.ip[1])
                                                   : ip2_from;
-               while (!after(ip2, ip2_to)) {
+               while (ip2 <= ip2_to) {
                        e.ip[1] = htonl(ip2);
                        last2 = ip_set_range_to_cidr(ip2, ip2_to, &e.cidr[1]);
                        ret = adtfn(set, &e, &ext, &ext, flags);
index 54b64b6..e6ef382 100644
@@ -241,7 +241,7 @@ hash_netport4_uadt(struct ip_set *set, struct nlattr *tb[],
 
        if (retried)
                ip = ntohl(h->next.ip);
-       while (!after(ip, ip_to)) {
+       while (ip <= ip_to) {
                e.ip = htonl(ip);
                last = ip_set_range_to_cidr(ip, ip_to, &cidr);
                e.cidr = cidr - 1;
index aff8469..8602f25 100644
@@ -291,7 +291,7 @@ hash_netportnet4_uadt(struct ip_set *set, struct nlattr *tb[],
        if (retried)
                ip = ntohl(h->next.ip[0]);
 
-       while (!after(ip, ip_to)) {
+       while (ip <= ip_to) {
                e.ip[0] = htonl(ip);
                ip_last = ip_set_range_to_cidr(ip, ip_to, &e.cidr[0]);
                p = retried && ip == ntohl(h->next.ip[0]) ? ntohs(h->next.port)
@@ -301,7 +301,7 @@ hash_netportnet4_uadt(struct ip_set *set, struct nlattr *tb[],
                        ip2 = (retried && ip == ntohl(h->next.ip[0]) &&
                               p == ntohs(h->next.port)) ? ntohl(h->next.ip[1])
                                                         : ip2_from;
-                       while (!after(ip2, ip2_to)) {
+                       while (ip2 <= ip2_to) {
                                e.ip[1] = htonl(ip2);
                                ip2_last = ip_set_range_to_cidr(ip2, ip2_to,
                                                                &e.cidr[1]);
index 90d3968..4527921 100644
@@ -921,6 +921,7 @@ ip_vs_prepare_tunneled_skb(struct sk_buff *skb, int skb_af,
 {
        struct sk_buff *new_skb = NULL;
        struct iphdr *old_iph = NULL;
+       __u8 old_dsfield;
 #ifdef CONFIG_IP_VS_IPV6
        struct ipv6hdr *old_ipv6h = NULL;
 #endif
@@ -945,7 +946,7 @@ ip_vs_prepare_tunneled_skb(struct sk_buff *skb, int skb_af,
                        *payload_len =
                                ntohs(old_ipv6h->payload_len) +
                                sizeof(*old_ipv6h);
-               *dsfield = ipv6_get_dsfield(old_ipv6h);
+               old_dsfield = ipv6_get_dsfield(old_ipv6h);
                *ttl = old_ipv6h->hop_limit;
                if (df)
                        *df = 0;
@@ -960,12 +961,15 @@ ip_vs_prepare_tunneled_skb(struct sk_buff *skb, int skb_af,
 
                /* fix old IP header checksum */
                ip_send_check(old_iph);
-               *dsfield = ipv4_get_dsfield(old_iph);
+               old_dsfield = ipv4_get_dsfield(old_iph);
                *ttl = old_iph->ttl;
                if (payload_len)
                        *payload_len = ntohs(old_iph->tot_len);
        }
 
+       /* Implement full-functionality option for ECN encapsulation */
+       *dsfield = INET_ECN_encapsulate(old_dsfield, old_dsfield);
+
        return skb;
 error:
        kfree_skb(skb);
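
The *dsfield assignment added above applies the "full-functionality" ECN treatment named in the comment: the outer tunnel header keeps the inner DSCP and copies the inner ECN codepoint, except that CE is re-marked as ECT(0). A simplified user-space sketch of that mapping (not the kernel's INET_ECN_encapsulate() itself):

#include <stdint.h>
#include <stdio.h>

#define ECN_MASK 0x03
#define ECN_ECT0 0x02
#define ECN_CE   0x03

static uint8_t ecn_encapsulate(uint8_t outer, uint8_t inner)
{
	outer &= ~ECN_MASK;			/* keep the outer DSCP bits */
	outer |= ((inner & ECN_MASK) == ECN_CE) ? ECN_ECT0
						: (inner & ECN_MASK);
	return outer;
}

int main(void)
{
	uint8_t inner = 0x2b;	/* some DSCP value with the CE codepoint set */

	/* prints 0x2a: DSCP preserved, CE encoded as ECT(0) in the outer header */
	printf("outer dsfield: 0x%02x\n", ecn_encapsulate(inner, inner));
	return 0;
}
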
index 9299271..64e1ee0 100644
@@ -1048,7 +1048,7 @@ static int nf_tables_fill_chain_info(struct sk_buff *skb, struct net *net,
                if (nla_put_string(skb, NFTA_CHAIN_TYPE, basechain->type->name))
                        goto nla_put_failure;
 
-               if (nft_dump_stats(skb, nft_base_chain(chain)->stats))
+               if (basechain->stats && nft_dump_stats(skb, basechain->stats))
                        goto nla_put_failure;
        }
 
@@ -1487,8 +1487,8 @@ static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy,
 
                chain2 = nf_tables_chain_lookup(table, nla[NFTA_CHAIN_NAME],
                                                genmask);
-               if (IS_ERR(chain2))
-                       return PTR_ERR(chain2);
+               if (!IS_ERR(chain2))
+                       return -EEXIST;
        }
 
        if (nla[NFTA_CHAIN_COUNTERS]) {
@@ -2741,8 +2741,10 @@ cont:
        list_for_each_entry(i, &ctx->table->sets, list) {
                if (!nft_is_active_next(ctx->net, i))
                        continue;
-               if (!strcmp(set->name, i->name))
+               if (!strcmp(set->name, i->name)) {
+                       kfree(set->name);
                        return -ENFILE;
+               }
        }
        return 0;
 }
index c83a3b5..d8571f4 100644
@@ -892,7 +892,7 @@ void *xt_copy_counters_from_user(const void __user *user, unsigned int len,
                if (copy_from_user(&compat_tmp, user, sizeof(compat_tmp)) != 0)
                        return ERR_PTR(-EFAULT);
 
-               strlcpy(info->name, compat_tmp.name, sizeof(info->name));
+               memcpy(info->name, compat_tmp.name, sizeof(info->name) - 1);
                info->num_counters = compat_tmp.num_counters;
                user += sizeof(compat_tmp);
        } else
@@ -905,9 +905,9 @@ void *xt_copy_counters_from_user(const void __user *user, unsigned int len,
                if (copy_from_user(info, user, sizeof(*info)) != 0)
                        return ERR_PTR(-EFAULT);
 
-               info->name[sizeof(info->name) - 1] = '\0';
                user += sizeof(*info);
        }
+       info->name[sizeof(info->name) - 1] = '\0';
 
        size = sizeof(struct xt_counters);
        size *= info->num_counters;
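
The strlcpy() above is replaced by a bounded memcpy() and the NUL termination is hoisted so it covers both the compat and native branches: the name originates in user space and may not be terminated, so nothing should take its string length before copying. A small illustration of the resulting copy pattern (user-space sketch, not the netfilter code):

#include <stdio.h>
#include <string.h>

int main(void)
{
	char src[8];
	char dst[8];

	memset(src, 'A', sizeof(src));		/* no terminating NUL */
	memcpy(dst, src, sizeof(dst) - 1);	/* bounded copy, no strlen(src) */
	dst[sizeof(dst) - 1] = '\0';		/* terminate exactly once */

	printf("copied name: %s\n", dst);	/* prints AAAAAAA */
	return 0;
}
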
index 38986a9..2912393 100644
@@ -8,6 +8,7 @@
  */
 
 #include <linux/module.h>
+#include <linux/syscalls.h>
 #include <linux/skbuff.h>
 #include <linux/filter.h>
 #include <linux/bpf.h>
@@ -49,6 +50,22 @@ static int __bpf_mt_check_fd(int fd, struct bpf_prog **ret)
        return 0;
 }
 
+static int __bpf_mt_check_path(const char *path, struct bpf_prog **ret)
+{
+       mm_segment_t oldfs = get_fs();
+       int retval, fd;
+
+       set_fs(KERNEL_DS);
+       fd = bpf_obj_get_user(path);
+       set_fs(oldfs);
+       if (fd < 0)
+               return fd;
+
+       retval = __bpf_mt_check_fd(fd, ret);
+       sys_close(fd);
+       return retval;
+}
+
 static int bpf_mt_check(const struct xt_mtchk_param *par)
 {
        struct xt_bpf_info *info = par->matchinfo;
@@ -66,9 +83,10 @@ static int bpf_mt_check_v1(const struct xt_mtchk_param *par)
                return __bpf_mt_check_bytecode(info->bpf_program,
                                               info->bpf_program_num_elem,
                                               &info->filter);
-       else if (info->mode == XT_BPF_MODE_FD_PINNED ||
-                info->mode == XT_BPF_MODE_FD_ELF)
+       else if (info->mode == XT_BPF_MODE_FD_ELF)
                return __bpf_mt_check_fd(info->fd, &info->filter);
+       else if (info->mode == XT_BPF_MODE_PATH_PINNED)
+               return __bpf_mt_check_path(info->path, &info->filter);
        else
                return -EINVAL;
 }
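
The new XT_BPF_MODE_PATH_PINNED mode looks a program up by its bpffs pin path rather than by an already-open fd. A hedged user-space sketch of how such a pin is created in the first place, using the raw bpf(2) BPF_OBJ_PIN command (the pin path and the prog_fd value are illustrative; assumes bpffs is mounted at /sys/fs/bpf):

#include <linux/bpf.h>
#include <stdio.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

static int bpf_obj_pin_fd(int prog_fd, const char *path)
{
	union bpf_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.bpf_fd = prog_fd;
	attr.pathname = (__u64)(unsigned long)path;

	return syscall(__NR_bpf, BPF_OBJ_PIN, &attr, sizeof(attr));
}

int main(void)
{
	/* With a real loaded program fd this would create the pin; -1 here
	 * only demonstrates the call shape and fails with EBADF. */
	if (bpf_obj_pin_fd(-1, "/sys/fs/bpf/xt_bpf_prog") < 0)
		perror("BPF_OBJ_PIN");
	return 0;
}
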
index e75ef39..575d215 100644
@@ -76,7 +76,7 @@ socket_match(const struct sk_buff *skb, struct xt_action_param *par,
                        transparent = nf_sk_is_transparent(sk);
 
                if (info->flags & XT_SOCKET_RESTORESKMARK && !wildcard &&
-                   transparent)
+                   transparent && sk_fullsock(sk))
                        pskb->mark = sk->sk_mark;
 
                if (sk != skb->sk)
@@ -133,7 +133,7 @@ socket_mt6_v1_v2_v3(const struct sk_buff *skb, struct xt_action_param *par)
                        transparent = nf_sk_is_transparent(sk);
 
                if (info->flags & XT_SOCKET_RESTORESKMARK && !wildcard &&
-                   transparent)
+                   transparent && sk_fullsock(sk))
                        pskb->mark = sk->sk_mark;
 
                if (sk != skb->sk)
index 94c11cf..f347506 100644
@@ -2266,16 +2266,17 @@ int __netlink_dump_start(struct sock *ssk, struct sk_buff *skb,
        cb->min_dump_alloc = control->min_dump_alloc;
        cb->skb = skb;
 
+       if (cb->start) {
+               ret = cb->start(cb);
+               if (ret)
+                       goto error_unlock;
+       }
+
        nlk->cb_running = true;
 
        mutex_unlock(nlk->cb_mutex);
 
-       ret = 0;
-       if (cb->start)
-               ret = cb->start(cb);
-
-       if (!ret)
-               ret = netlink_dump(sk);
+       ret = netlink_dump(sk);
 
        sock_put(sk);
 
index a54a556..a551232 100644
@@ -1203,6 +1203,10 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb,
                                return err == -EINPROGRESS ? 0 : err;
                        break;
 
+               case OVS_ACTION_ATTR_CT_CLEAR:
+                       err = ovs_ct_clear(skb, key);
+                       break;
+
                case OVS_ACTION_ATTR_PUSH_ETH:
                        err = push_eth(skb, key, nla_data(a));
                        break;
index d558e88..fe861e2 100644
@@ -1129,6 +1129,17 @@ int ovs_ct_execute(struct net *net, struct sk_buff *skb,
        return err;
 }
 
+int ovs_ct_clear(struct sk_buff *skb, struct sw_flow_key *key)
+{
+       if (skb_nfct(skb)) {
+               nf_conntrack_put(skb_nfct(skb));
+               nf_ct_set(skb, NULL, IP_CT_UNTRACKED);
+               ovs_ct_fill_key(skb, key);
+       }
+
+       return 0;
+}
+
 static int ovs_ct_add_helper(struct ovs_conntrack_info *info, const char *name,
                             const struct sw_flow_key *key, bool log)
 {
index bc7efd1..399dfdd 100644
@@ -30,6 +30,7 @@ int ovs_ct_action_to_attr(const struct ovs_conntrack_info *, struct sk_buff *);
 
 int ovs_ct_execute(struct net *, struct sk_buff *, struct sw_flow_key *,
                   const struct ovs_conntrack_info *);
+int ovs_ct_clear(struct sk_buff *skb, struct sw_flow_key *key);
 
 void ovs_ct_fill_key(const struct sk_buff *skb, struct sw_flow_key *key);
 int ovs_ct_put_key(const struct sw_flow_key *swkey,
@@ -73,6 +74,12 @@ static inline int ovs_ct_execute(struct net *net, struct sk_buff *skb,
        return -ENOTSUPP;
 }
 
+static inline int ovs_ct_clear(struct sk_buff *skb,
+                              struct sw_flow_key *key)
+{
+       return -ENOTSUPP;
+}
+
 static inline void ovs_ct_fill_key(const struct sk_buff *skb,
                                   struct sw_flow_key *key)
 {
index e8eb427..dc0d790 100644
@@ -48,6 +48,7 @@
 #include <net/ndisc.h>
 #include <net/mpls.h>
 #include <net/vxlan.h>
+#include <net/erspan.h>
 
 #include "flow_netlink.h"
 
@@ -75,6 +76,7 @@ static bool actions_may_change_flow(const struct nlattr *actions)
                        break;
 
                case OVS_ACTION_ATTR_CT:
+               case OVS_ACTION_ATTR_CT_CLEAR:
                case OVS_ACTION_ATTR_HASH:
                case OVS_ACTION_ATTR_POP_ETH:
                case OVS_ACTION_ATTR_POP_MPLS:
@@ -319,7 +321,8 @@ size_t ovs_tun_key_attr_size(void)
                 * OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS and covered by it.
                 */
                + nla_total_size(2)    /* OVS_TUNNEL_KEY_ATTR_TP_SRC */
-               + nla_total_size(2);   /* OVS_TUNNEL_KEY_ATTR_TP_DST */
+               + nla_total_size(2)    /* OVS_TUNNEL_KEY_ATTR_TP_DST */
+               + nla_total_size(4);   /* OVS_TUNNEL_KEY_ATTR_ERSPAN_OPTS */
 }
 
 size_t ovs_key_attr_size(void)
@@ -371,6 +374,7 @@ static const struct ovs_len_tbl ovs_tunnel_key_lens[OVS_TUNNEL_KEY_ATTR_MAX + 1]
                                                .next = ovs_vxlan_ext_key_lens },
        [OVS_TUNNEL_KEY_ATTR_IPV6_SRC]      = { .len = sizeof(struct in6_addr) },
        [OVS_TUNNEL_KEY_ATTR_IPV6_DST]      = { .len = sizeof(struct in6_addr) },
+       [OVS_TUNNEL_KEY_ATTR_ERSPAN_OPTS]   = { .len = sizeof(u32) },
 };
 
 /* The size of the argument for each %OVS_KEY_ATTR_* Netlink attribute.  */
@@ -593,6 +597,33 @@ static int vxlan_tun_opt_from_nlattr(const struct nlattr *attr,
        return 0;
 }
 
+static int erspan_tun_opt_from_nlattr(const struct nlattr *attr,
+                                     struct sw_flow_match *match, bool is_mask,
+                                     bool log)
+{
+       unsigned long opt_key_offset;
+       struct erspan_metadata opts;
+
+       BUILD_BUG_ON(sizeof(opts) > sizeof(match->key->tun_opts));
+
+       memset(&opts, 0, sizeof(opts));
+       opts.index = nla_get_be32(attr);
+
+       /* Index has only 20 bits */
+       if (ntohl(opts.index) & ~INDEX_MASK) {
+               OVS_NLERR(log, "ERSPAN index number %x too large.",
+                         ntohl(opts.index));
+               return -EINVAL;
+       }
+
+       SW_FLOW_KEY_PUT(match, tun_opts_len, sizeof(opts), is_mask);
+       opt_key_offset = TUN_METADATA_OFFSET(sizeof(opts));
+       SW_FLOW_KEY_MEMCPY_OFFSET(match, opt_key_offset, &opts, sizeof(opts),
+                                 is_mask);
+
+       return 0;
+}
+
 static int ip_tun_from_nlattr(const struct nlattr *attr,
                              struct sw_flow_match *match, bool is_mask,
                              bool log)
@@ -700,6 +731,19 @@ static int ip_tun_from_nlattr(const struct nlattr *attr,
                        break;
                case OVS_TUNNEL_KEY_ATTR_PAD:
                        break;
+               case OVS_TUNNEL_KEY_ATTR_ERSPAN_OPTS:
+                       if (opts_type) {
+                               OVS_NLERR(log, "Multiple metadata blocks provided");
+                               return -EINVAL;
+                       }
+
+                       err = erspan_tun_opt_from_nlattr(a, match, is_mask, log);
+                       if (err)
+                               return err;
+
+                       tun_flags |= TUNNEL_ERSPAN_OPT;
+                       opts_type = type;
+                       break;
                default:
                        OVS_NLERR(log, "Unknown IP tunnel attribute %d",
                                  type);
@@ -824,6 +868,10 @@ static int __ip_tun_to_nlattr(struct sk_buff *skb,
                else if (output->tun_flags & TUNNEL_VXLAN_OPT &&
                         vxlan_opt_to_nlattr(skb, tun_opts, swkey_tun_opts_len))
                        return -EMSGSIZE;
+               else if (output->tun_flags & TUNNEL_ERSPAN_OPT &&
+                        nla_put_be32(skb, OVS_TUNNEL_KEY_ATTR_ERSPAN_OPTS,
+                                     ((struct erspan_metadata *)tun_opts)->index))
+                       return -EMSGSIZE;
        }
 
        return 0;
@@ -2195,6 +2243,8 @@ static int validate_and_copy_set_tun(const struct nlattr *attr,
                        break;
                case OVS_TUNNEL_KEY_ATTR_VXLAN_OPTS:
                        break;
+               case OVS_TUNNEL_KEY_ATTR_ERSPAN_OPTS:
+                       break;
                }
        };
 
@@ -2479,6 +2529,7 @@ static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr,
                        [OVS_ACTION_ATTR_SAMPLE] = (u32)-1,
                        [OVS_ACTION_ATTR_HASH] = sizeof(struct ovs_action_hash),
                        [OVS_ACTION_ATTR_CT] = (u32)-1,
+                       [OVS_ACTION_ATTR_CT_CLEAR] = 0,
                        [OVS_ACTION_ATTR_TRUNC] = sizeof(struct ovs_action_trunc),
                        [OVS_ACTION_ATTR_PUSH_ETH] = sizeof(struct ovs_action_push_eth),
                        [OVS_ACTION_ATTR_POP_ETH] = 0,
@@ -2620,6 +2671,9 @@ static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr,
                        skip_copy = true;
                        break;
 
+               case OVS_ACTION_ATTR_CT_CLEAR:
+                       break;
+
                case OVS_ACTION_ATTR_PUSH_ETH:
                        /* Disallow pushing an Ethernet header if one
                         * is already present */
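
erspan_tun_opt_from_nlattr() above rejects any session index with bits set outside the low 20. A small host-order sketch of that bound check (INDEX_MASK is assumed to be the 20-bit mask 0x000fffff, matching the "only 20 bits" comment):

#include <stdint.h>
#include <stdio.h>

#define INDEX_MASK 0x000fffffu		/* assumed 20-bit ERSPAN index mask */

int main(void)
{
	uint32_t ok  = 0x000fffffu;	/* largest index that still fits */
	uint32_t bad = 0x00100000u;	/* one past the 20-bit range */

	printf("0x%06x: %s\n", ok,  (ok  & ~INDEX_MASK) ? "too large" : "ok");
	printf("0x%06x: %s\n", bad, (bad & ~INDEX_MASK) ? "too large" : "ok");
	return 0;
}
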
index f925753..3b0ef69 100644
 #include <net/phonet/pn_dev.h>
 
 /* Transport protocol registration */
-static struct phonet_protocol *proto_tab[PHONET_NPROTO] __read_mostly;
+static const struct phonet_protocol *proto_tab[PHONET_NPROTO] __read_mostly;
 
-static struct phonet_protocol *phonet_proto_get(unsigned int protocol)
+static const struct phonet_protocol *phonet_proto_get(unsigned int protocol)
 {
-       struct phonet_protocol *pp;
+       const struct phonet_protocol *pp;
 
        if (protocol >= PHONET_NPROTO)
                return NULL;
@@ -53,7 +53,7 @@ static struct phonet_protocol *phonet_proto_get(unsigned int protocol)
        return pp;
 }
 
-static inline void phonet_proto_put(struct phonet_protocol *pp)
+static inline void phonet_proto_put(const struct phonet_protocol *pp)
 {
        module_put(pp->prot->owner);
 }
@@ -65,7 +65,7 @@ static int pn_socket_create(struct net *net, struct socket *sock, int protocol,
 {
        struct sock *sk;
        struct pn_sock *pn;
-       struct phonet_protocol *pnp;
+       const struct phonet_protocol *pnp;
        int err;
 
        if (!capable(CAP_SYS_ADMIN))
@@ -149,7 +149,7 @@ static int pn_header_parse(const struct sk_buff *skb, unsigned char *haddr)
        return 1;
 }
 
-struct header_ops phonet_header_ops = {
+const struct header_ops phonet_header_ops = {
        .create = pn_header_create,
        .parse = pn_header_parse,
 };
@@ -470,7 +470,7 @@ static struct packet_type phonet_packet_type __read_mostly = {
 static DEFINE_MUTEX(proto_tab_lock);
 
 int __init_or_module phonet_proto_register(unsigned int protocol,
-                                               struct phonet_protocol *pp)
+                               const struct phonet_protocol *pp)
 {
        int err = 0;
 
@@ -492,7 +492,8 @@ int __init_or_module phonet_proto_register(unsigned int protocol,
 }
 EXPORT_SYMBOL(phonet_proto_register);
 
-void phonet_proto_unregister(unsigned int protocol, struct phonet_protocol *pp)
+void phonet_proto_unregister(unsigned int protocol,
+                       const struct phonet_protocol *pp)
 {
        mutex_lock(&proto_tab_lock);
        BUG_ON(proto_tab[protocol] != pp);
index 5e71043..b44fb90 100644
@@ -195,7 +195,7 @@ static struct proto pn_proto = {
        .name           = "PHONET",
 };
 
-static struct phonet_protocol pn_dgram_proto = {
+static const struct phonet_protocol pn_dgram_proto = {
        .ops            = &phonet_dgram_ops,
        .prot           = &pn_proto,
        .sock_type      = SOCK_DGRAM,
index e815379..9fc76b1 100644
@@ -1351,7 +1351,7 @@ static struct proto pep_proto = {
        .name           = "PNPIPE",
 };
 
-static struct phonet_protocol pep_pn_proto = {
+static const struct phonet_protocol pep_pn_proto = {
        .ops            = &phonet_stream_ops,
        .prot           = &pep_proto,
        .sock_type      = SOCK_SEQPACKET,
index 5a4f100..db0228a 100644
@@ -148,12 +148,6 @@ struct netem_skb_cb {
        psched_time_t   time_to_send;
 };
 
-
-static struct sk_buff *netem_rb_to_skb(struct rb_node *rb)
-{
-       return rb_entry(rb, struct sk_buff, rbnode);
-}
-
 static inline struct netem_skb_cb *netem_skb_cb(struct sk_buff *skb)
 {
        /* we assume we can use skb next/prev/tstamp as storage for rb_node */
@@ -364,7 +358,7 @@ static void tfifo_reset(struct Qdisc *sch)
        struct rb_node *p = rb_first(&q->t_root);
 
        while (p) {
-               struct sk_buff *skb = netem_rb_to_skb(p);
+               struct sk_buff *skb = rb_to_skb(p);
 
                p = rb_next(p);
                rb_erase(&skb->rbnode, &q->t_root);
@@ -382,7 +376,7 @@ static void tfifo_enqueue(struct sk_buff *nskb, struct Qdisc *sch)
                struct sk_buff *skb;
 
                parent = *p;
-               skb = netem_rb_to_skb(parent);
+               skb = rb_to_skb(parent);
                if (tnext >= netem_skb_cb(skb)->time_to_send)
                        p = &parent->rb_right;
                else
@@ -538,7 +532,7 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch,
                                struct sk_buff *t_skb;
                                struct netem_skb_cb *t_last;
 
-                               t_skb = netem_rb_to_skb(rb_last(&q->t_root));
+                               t_skb = skb_rb_last(&q->t_root);
                                t_last = netem_skb_cb(t_skb);
                                if (!last ||
                                    t_last->time_to_send > last->time_to_send) {
@@ -617,7 +611,7 @@ deliver:
        if (p) {
                psched_time_t time_to_send;
 
-               skb = netem_rb_to_skb(p);
+               skb = rb_to_skb(p);
 
                /* if more time remaining? */
                time_to_send = netem_skb_cb(skb)->time_to_send;
index 9b5de31..c1841f2 100644
@@ -2203,7 +2203,7 @@ static void xs_udp_setup_socket(struct work_struct *work)
        struct sock_xprt *transport =
                container_of(work, struct sock_xprt, connect_worker.work);
        struct rpc_xprt *xprt = &transport->xprt;
-       struct socket *sock = transport->sock;
+       struct socket *sock;
        int status = -EIO;
 
        sock = xs_create_sock(xprt, transport,
index 7d99029..a140dd4 100644
@@ -233,7 +233,7 @@ static int tipc_bcast_xmit(struct net *net, struct sk_buff_head *pkts,
        struct sk_buff_head xmitq;
        int rc = 0;
 
-       __skb_queue_head_init(&xmitq);
+       skb_queue_head_init(&xmitq);
        tipc_bcast_lock(net);
        if (tipc_link_bc_peers(l))
                rc = tipc_link_xmit(l, pkts, &xmitq);
@@ -263,7 +263,7 @@ static int tipc_rcast_xmit(struct net *net, struct sk_buff_head *pkts,
        u32 dst, selector;
 
        selector = msg_link_selector(buf_msg(skb_peek(pkts)));
-       __skb_queue_head_init(&_pkts);
+       skb_queue_head_init(&_pkts);
 
        list_for_each_entry_safe(n, tmp, &dests->list, list) {
                dst = n->value;
index 121e59a..17146c1 100644
@@ -568,6 +568,14 @@ bool tipc_msg_lookup_dest(struct net *net, struct sk_buff *skb, int *err)
        msg_set_destnode(msg, dnode);
        msg_set_destport(msg, dport);
        *err = TIPC_OK;
+
+       if (!skb_cloned(skb))
+               return true;
+
+       /* Unclone buffer in case it was bundled */
+       if (pskb_expand_head(skb, BUF_HEADROOM, BUF_TAILROOM, GFP_ATOMIC))
+               return false;
+
        return true;
 }
 
index 67a03f2..fce2cbe 100644
@@ -549,6 +549,14 @@ nl80211_nan_srf_policy[NL80211_NAN_SRF_ATTR_MAX + 1] = {
        [NL80211_NAN_SRF_MAC_ADDRS] = { .type = NLA_NESTED },
 };
 
+/* policy for packet pattern attributes */
+static const struct nla_policy
+nl80211_packet_pattern_policy[MAX_NL80211_PKTPAT + 1] = {
+       [NL80211_PKTPAT_MASK] = { .type = NLA_BINARY, },
+       [NL80211_PKTPAT_PATTERN] = { .type = NLA_BINARY, },
+       [NL80211_PKTPAT_OFFSET] = { .type = NLA_U32 },
+};
+
 static int nl80211_prepare_wdev_dump(struct sk_buff *skb,
                                     struct netlink_callback *cb,
                                     struct cfg80211_registered_device **rdev,
@@ -10571,7 +10579,8 @@ static int nl80211_set_wowlan(struct sk_buff *skb, struct genl_info *info)
                        u8 *mask_pat;
 
                        nla_parse_nested(pat_tb, MAX_NL80211_PKTPAT, pat,
-                                        NULL, info->extack);
+                                        nl80211_packet_pattern_policy,
+                                        info->extack);
                        err = -EINVAL;
                        if (!pat_tb[NL80211_PKTPAT_MASK] ||
                            !pat_tb[NL80211_PKTPAT_PATTERN])
@@ -10820,7 +10829,8 @@ static int nl80211_parse_coalesce_rule(struct cfg80211_registered_device *rdev,
                            rem) {
                u8 *mask_pat;
 
-               nla_parse_nested(pat_tb, MAX_NL80211_PKTPAT, pat, NULL, NULL);
+               nla_parse_nested(pat_tb, MAX_NL80211_PKTPAT, pat,
+                                nl80211_packet_pattern_policy, NULL);
                if (!pat_tb[NL80211_PKTPAT_MASK] ||
                    !pat_tb[NL80211_PKTPAT_PATTERN])
                        return -EINVAL;
index acf0010..30e5746 100644
@@ -91,6 +91,7 @@ int xfrm_dev_state_add(struct net *net, struct xfrm_state *x,
        }
 
        if (!dev->xfrmdev_ops || !dev->xfrmdev_ops->xdo_dev_state_add) {
+               xso->dev = NULL;
                dev_put(dev);
                return 0;
        }
index 2515cd2..8ac9d32 100644
@@ -429,7 +429,8 @@ resume:
        nf_reset(skb);
 
        if (decaps) {
-               skb->sp->olen = 0;
+               if (skb->sp)
+                       skb->sp->olen = 0;
                skb_dst_drop(skb);
                gro_cells_receive(&gro_cells, skb);
                return 0;
@@ -440,7 +441,8 @@ resume:
 
                err = x->inner_mode->afinfo->transport_finish(skb, xfrm_gro || async);
                if (xfrm_gro) {
-                       skb->sp->olen = 0;
+                       if (skb->sp)
+                               skb->sp->olen = 0;
                        skb_dst_drop(skb);
                        gro_cells_receive(&gro_cells, skb);
                        return err;
index 0dab1cd..1221347 100644
@@ -732,12 +732,12 @@ restart:
                        }
                }
        }
+out:
+       spin_unlock_bh(&net->xfrm.xfrm_state_lock);
        if (cnt) {
                err = 0;
                xfrm_policy_cache_flush();
        }
-out:
-       spin_unlock_bh(&net->xfrm.xfrm_state_lock);
        return err;
 }
 EXPORT_SYMBOL(xfrm_state_flush);
index 2bfbd91..b997f13 100644
@@ -657,6 +657,7 @@ static int xfrm_add_sa(struct sk_buff *skb, struct nlmsghdr *nlh,
 
        if (err < 0) {
                x->km.state = XFRM_STATE_DEAD;
+               xfrm_dev_state_delete(x);
                __xfrm_state_put(x);
                goto out;
        }
index 41b6115..a77a583 100644
@@ -37,10 +37,14 @@ struct bpf_map_def SEC("maps") stackmap = {
 SEC("perf_event")
 int bpf_prog1(struct bpf_perf_event_data *ctx)
 {
+       char time_fmt1[] = "Time Enabled: %llu, Time Running: %llu";
+       char time_fmt2[] = "Get Time Failed, ErrCode: %d";
        char fmt[] = "CPU-%d period %lld ip %llx";
        u32 cpu = bpf_get_smp_processor_id();
+       struct bpf_perf_event_value value_buf;
        struct key_t key;
        u64 *val, one = 1;
+       int ret;
 
        if (ctx->sample_period < 10000)
                /* ignore warmup */
@@ -54,6 +58,12 @@ int bpf_prog1(struct bpf_perf_event_data *ctx)
                return 0;
        }
 
+       ret = bpf_perf_prog_read_value(ctx, (void *)&value_buf, sizeof(struct bpf_perf_event_value));
+       if (!ret)
+               bpf_trace_printk(time_fmt1, sizeof(time_fmt1), value_buf.enabled, value_buf.running);
+       else
+               bpf_trace_printk(time_fmt2, sizeof(time_fmt2), ret);
+
        val = bpf_map_lookup_elem(&counts, &key);
        if (val)
                (*val)++;
index 7bd827b..bf4f1b6 100644
@@ -127,6 +127,9 @@ static void test_perf_event_all_cpu(struct perf_event_attr *attr)
        int *pmu_fd = malloc(nr_cpus * sizeof(int));
        int i, error = 0;
 
+       /* system wide perf event, no need to inherit */
+       attr->inherit = 0;
+
        /* open perf_event on all cpus */
        for (i = 0; i < nr_cpus; i++) {
                pmu_fd[i] = sys_perf_event_open(attr, -1, i, -1, 0);
@@ -154,6 +157,11 @@ static void test_perf_event_task(struct perf_event_attr *attr)
 {
        int pmu_fd;
 
+       /* per-task perf event, enable inherit so the "dd ..." command can be traced properly.
+        * Enabling inherit will cause the bpf_perf_prog_read_value helper to fail.
+        */
+       attr->inherit = 1;
+
        /* open task bound event */
        pmu_fd = sys_perf_event_open(attr, 0, -1, -1, 0);
        if (pmu_fd < 0) {
@@ -175,14 +183,12 @@ static void test_bpf_perf_event(void)
                .freq = 1,
                .type = PERF_TYPE_HARDWARE,
                .config = PERF_COUNT_HW_CPU_CYCLES,
-               .inherit = 1,
        };
        struct perf_event_attr attr_type_sw = {
                .sample_freq = SAMPLE_FREQ,
                .freq = 1,
                .type = PERF_TYPE_SOFTWARE,
                .config = PERF_COUNT_SW_CPU_CLOCK,
-               .inherit = 1,
        };
        struct perf_event_attr attr_hw_cache_l1d = {
                .sample_freq = SAMPLE_FREQ,
@@ -192,7 +198,6 @@ static void test_bpf_perf_event(void)
                        PERF_COUNT_HW_CACHE_L1D |
                        (PERF_COUNT_HW_CACHE_OP_READ << 8) |
                        (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16),
-               .inherit = 1,
        };
        struct perf_event_attr attr_hw_cache_branch_miss = {
                .sample_freq = SAMPLE_FREQ,
@@ -202,7 +207,6 @@ static void test_bpf_perf_event(void)
                        PERF_COUNT_HW_CACHE_BPU |
                        (PERF_COUNT_HW_CACHE_OP_READ << 8) |
                        (PERF_COUNT_HW_CACHE_RESULT_MISS << 16),
-               .inherit = 1,
        };
        struct perf_event_attr attr_type_raw = {
                .sample_freq = SAMPLE_FREQ,
@@ -210,7 +214,6 @@ static void test_bpf_perf_event(void)
                .type = PERF_TYPE_RAW,
                /* Intel Instruction Retired */
                .config = 0xc0,
-               .inherit = 1,
        };
 
        printf("Test HW_CPU_CYCLES\n");
index e7d1803..46c557a 100644
@@ -15,6 +15,12 @@ struct bpf_map_def SEC("maps") values = {
        .value_size = sizeof(u64),
        .max_entries = 64,
 };
+struct bpf_map_def SEC("maps") values2 = {
+       .type = BPF_MAP_TYPE_HASH,
+       .key_size = sizeof(int),
+       .value_size = sizeof(struct bpf_perf_event_value),
+       .max_entries = 64,
+};
 
 SEC("kprobe/htab_map_get_next_key")
 int bpf_prog1(struct pt_regs *ctx)
@@ -37,5 +43,25 @@ int bpf_prog1(struct pt_regs *ctx)
        return 0;
 }
 
+SEC("kprobe/htab_map_lookup_elem")
+int bpf_prog2(struct pt_regs *ctx)
+{
+       u32 key = bpf_get_smp_processor_id();
+       struct bpf_perf_event_value *val, buf;
+       int error;
+
+       error = bpf_perf_event_read_value(&counters, key, &buf, sizeof(buf));
+       if (error)
+               return 0;
+
+       val = bpf_map_lookup_elem(&values2, &key);
+       if (val)
+               *val = buf;
+       else
+               bpf_map_update_elem(&values2, &key, &buf, BPF_NOEXIST);
+
+       return 0;
+}
+
 char _license[] SEC("license") = "GPL";
 u32 _version SEC("version") = LINUX_VERSION_CODE;
index a05a99a..3341a96 100644
@@ -22,6 +22,7 @@
 
 static void check_on_cpu(int cpu, struct perf_event_attr *attr)
 {
+       struct bpf_perf_event_value value2;
        int pmu_fd, error = 0;
        cpu_set_t set;
        __u64 value;
@@ -46,8 +47,18 @@ static void check_on_cpu(int cpu, struct perf_event_attr *attr)
                fprintf(stderr, "Value missing for CPU %d\n", cpu);
                error = 1;
                goto on_exit;
+       } else {
+               fprintf(stderr, "CPU %d: %llu\n", cpu, value);
+       }
+       /* The above bpf_map_lookup_elem should trigger the second kprobe */
+       if (bpf_map_lookup_elem(map_fd[2], &cpu, &value2)) {
+               fprintf(stderr, "Value2 missing for CPU %d\n", cpu);
+               error = 1;
+               goto on_exit;
+       } else {
+               fprintf(stderr, "CPU %d: counter: %llu, enabled: %llu, running: %llu\n", cpu,
+                       value2.counter, value2.enabled, value2.running);
        }
-       fprintf(stderr, "CPU %d: %llu\n", cpu, value);
 
 on_exit:
        assert(bpf_map_delete_elem(map_fd[0], &cpu) == 0 || error);
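
The counter/enabled/running triple that bpf_perf_event_read_value() exposes lets a reader scale a multiplexed counter the way perf does. A worked user-space example of that arithmetic (the numbers are made up; the struct simply mirrors the three bpf_perf_event_value fields):

#include <stdint.h>
#include <stdio.h>

struct value {			/* same three fields as bpf_perf_event_value */
	uint64_t counter;
	uint64_t enabled;
	uint64_t running;
};

int main(void)
{
	struct value v = { .counter = 1000, .enabled = 400, .running = 100 };
	double scaled = v.running ? (double)v.counter * v.enabled / v.running : 0;
	double on_pmu = v.enabled ? 100.0 * v.running / v.enabled : 0;

	/* prints: scaled estimate 4000, counted 25% of the enabled time */
	printf("scaled estimate %.0f, counted %.0f%% of the enabled time\n",
	       scaled, on_pmu);
	return 0;
}
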
index 74f3fd8..2fe2f76 100644
@@ -13,23 +13,27 @@ struct bpf_map_def SEC("maps") redirect_err_cnt = {
        /* TODO: have entries for all possible errno's */
 };
 
+#define XDP_UNKNOWN    XDP_REDIRECT + 1
+struct bpf_map_def SEC("maps") exception_cnt = {
+       .type           = BPF_MAP_TYPE_PERCPU_ARRAY,
+       .key_size       = sizeof(u32),
+       .value_size     = sizeof(u64),
+       .max_entries    = XDP_UNKNOWN + 1,
+};
+
 /* Tracepoint format: /sys/kernel/debug/tracing/events/xdp/xdp_redirect/format
  * Code in:                kernel/include/trace/events/xdp.h
  */
 struct xdp_redirect_ctx {
-       unsigned short common_type;     //      offset:0;  size:2; signed:0;
-       unsigned char common_flags;     //      offset:2;  size:1; signed:0;
-       unsigned char common_preempt_count;//   offset:3;  size:1; signed:0;
-       int common_pid;                 //      offset:4;  size:4; signed:1;
-
-       int prog_id;                    //      offset:8;  size:4; signed:1;
-       u32 act;                        //      offset:12  size:4; signed:0;
-       int ifindex;                    //      offset:16  size:4; signed:1;
-       int err;                        //      offset:20  size:4; signed:1;
-       int to_ifindex;                 //      offset:24  size:4; signed:1;
-       u32 map_id;                     //      offset:28  size:4; signed:0;
-       int map_index;                  //      offset:32  size:4; signed:1;
-};                                     //      offset:36
+       u64 __pad;              // First 8 bytes are not accessible by bpf code
+       int prog_id;            //      offset:8;  size:4; signed:1;
+       u32 act;                //      offset:12  size:4; signed:0;
+       int ifindex;            //      offset:16  size:4; signed:1;
+       int err;                //      offset:20  size:4; signed:1;
+       int to_ifindex;         //      offset:24  size:4; signed:1;
+       u32 map_id;             //      offset:28  size:4; signed:0;
+       int map_index;          //      offset:32  size:4; signed:1;
+};                             //      offset:36
 
 enum {
        XDP_REDIRECT_SUCCESS = 0,
@@ -48,7 +52,7 @@ int xdp_redirect_collect_stat(struct xdp_redirect_ctx *ctx)
 
        cnt  = bpf_map_lookup_elem(&redirect_err_cnt, &key);
        if (!cnt)
-               return 0;
+               return 1;
        *cnt += 1;
 
        return 0; /* Indicate event was filtered (no further processing)*/
@@ -86,3 +90,31 @@ int trace_xdp_redirect_map(struct xdp_redirect_ctx *ctx)
 {
        return xdp_redirect_collect_stat(ctx);
 }
+
+/* Tracepoint format: /sys/kernel/debug/tracing/events/xdp/xdp_exception/format
+ * Code in:                kernel/include/trace/events/xdp.h
+ */
+struct xdp_exception_ctx {
+       u64 __pad;      // First 8 bytes are not accessible by bpf code
+       int prog_id;    //      offset:8;  size:4; signed:1;
+       u32 act;        //      offset:12; size:4; signed:0;
+       int ifindex;    //      offset:16; size:4; signed:1;
+};
+
+SEC("tracepoint/xdp/xdp_exception")
+int trace_xdp_exception(struct xdp_exception_ctx *ctx)
+{
+       u64 *cnt;
+       u32 key;
+
+       key = ctx->act;
+       if (key > XDP_REDIRECT)
+               key = XDP_UNKNOWN;
+
+       cnt = bpf_map_lookup_elem(&exception_cnt, &key);
+       if (!cnt)
+               return 1;
+       *cnt += 1;
+
+       return 0;
+}
index c5ab8b7..eaba165 100644
@@ -20,6 +20,7 @@ static const char *__doc_err_only__=
 #include <unistd.h>
 #include <locale.h>
 
+#include <sys/resource.h>
 #include <getopt.h>
 #include <net/if.h>
 #include <time.h>
@@ -89,6 +90,23 @@ static const char *err2str(int err)
                return redir_names[err];
        return NULL;
 }
+/* enum xdp_action */
+#define XDP_UNKNOWN    XDP_REDIRECT + 1
+#define XDP_ACTION_MAX (XDP_UNKNOWN + 1)
+static const char *xdp_action_names[XDP_ACTION_MAX] = {
+       [XDP_ABORTED]   = "XDP_ABORTED",
+       [XDP_DROP]      = "XDP_DROP",
+       [XDP_PASS]      = "XDP_PASS",
+       [XDP_TX]        = "XDP_TX",
+       [XDP_REDIRECT]  = "XDP_REDIRECT",
+       [XDP_UNKNOWN]   = "XDP_UNKNOWN",
+};
+static const char *action2str(int action)
+{
+       if (action < XDP_ACTION_MAX)
+               return xdp_action_names[action];
+       return NULL;
+}
 
 struct record {
        __u64 counter;
@@ -97,6 +115,7 @@ struct record {
 
 struct stats_record {
        struct record xdp_redir[REDIR_RES_MAX];
+       struct record xdp_exception[XDP_ACTION_MAX];
 };
 
 static void stats_print_headers(bool err_only)
@@ -104,39 +123,72 @@ static void stats_print_headers(bool err_only)
        if (err_only)
                printf("\n%s\n", __doc_err_only__);
 
-       printf("%-14s %-10s %-18s %-9s\n",
-              "XDP_REDIRECT", "pps ", "pps-human-readable", "measure-period");
+       printf("%-14s %-11s %-10s %-18s %-9s\n",
+              "ACTION", "result", "pps ", "pps-human-readable", "measure-period");
+}
+
+static double calc_period(struct record *r, struct record *p)
+{
+       double period_ = 0;
+       __u64 period = 0;
+
+       period = r->timestamp - p->timestamp;
+       if (period > 0)
+               period_ = ((double) period / NANOSEC_PER_SEC);
+
+       return period_;
+}
+
+static double calc_pps(struct record *r, struct record *p, double period)
+{
+       __u64 packets = 0;
+       double pps = 0;
+
+       if (period > 0) {
+               packets = r->counter - p->counter;
+               pps = packets / period;
+       }
+       return pps;
 }
 
 static void stats_print(struct stats_record *rec,
                        struct stats_record *prev,
                        bool err_only)
 {
+       double period = 0, pps = 0;
+       struct record *r, *p;
        int i = 0;
 
+       char *fmt = "%-14s %-11s %-10.0f %'-18.0f %f\n";
+
+       /* tracepoint: xdp:xdp_redirect_* */
        if (err_only)
                i = REDIR_ERROR;
 
        for (; i < REDIR_RES_MAX; i++) {
-               struct record *r = &rec->xdp_redir[i];
-               struct record *p = &prev->xdp_redir[i];
-               __u64 period  = 0;
-               __u64 packets = 0;
-               double pps = 0;
-               double period_ = 0;
+               r = &rec->xdp_redir[i];
+               p = &prev->xdp_redir[i];
 
                if (p->timestamp) {
-                       packets = r->counter - p->counter;
-                       period  = r->timestamp - p->timestamp;
-                       if (period > 0) {
-                               period_ = ((double) period / NANOSEC_PER_SEC);
-                               pps = packets / period_;
-                       }
+                       period = calc_period(r, p);
+                       pps = calc_pps(r, p, period);
                }
+               printf(fmt, "XDP_REDIRECT", err2str(i), pps, pps, period);
+       }
 
-               printf("%-14s %-10.0f %'-18.0f %f\n",
-                      err2str(i), pps, pps, period_);
+       /* tracepoint: xdp:xdp_exception */
+       for (i = 0; i < XDP_ACTION_MAX; i++) {
+               r = &rec->xdp_exception[i];
+               p = &prev->xdp_exception[i];
+               if (p->timestamp) {
+                       period = calc_period(r, p);
+                       pps = calc_pps(r, p, period);
+               }
+               if (pps > 0)
+                       printf(fmt, action2str(i), "Exception",
+                              pps, pps, period);
        }
+       printf("\n");
 }
 
 static __u64 get_key32_value64_percpu(int fd, __u32 key)
@@ -160,25 +212,33 @@ static __u64 get_key32_value64_percpu(int fd, __u32 key)
        return sum;
 }
 
-static bool stats_collect(int fd, struct stats_record *rec)
+static bool stats_collect(struct stats_record *rec)
 {
+       int fd;
        int i;
 
        /* TODO: Detect if someone unloaded the perf event_fd's, as
         * this can happen by someone running perf-record -e
         */
 
+       fd = map_data[0].fd; /* map0: redirect_err_cnt */
        for (i = 0; i < REDIR_RES_MAX; i++) {
                rec->xdp_redir[i].timestamp = gettime();
                rec->xdp_redir[i].counter = get_key32_value64_percpu(fd, i);
        }
+
+       fd = map_data[1].fd; /* map1: exception_cnt */
+       for (i = 0; i < XDP_ACTION_MAX; i++) {
+               rec->xdp_exception[i].timestamp = gettime();
+               rec->xdp_exception[i].counter = get_key32_value64_percpu(fd, i);
+       }
+
        return true;
 }
 
 static void stats_poll(int interval, bool err_only)
 {
        struct stats_record rec, prev;
-       int map_fd;
 
        memset(&rec, 0, sizeof(rec));
 
@@ -190,16 +250,17 @@ static void stats_poll(int interval, bool err_only)
                printf("\n%s", __doc__);
 
        /* TODO Need more advanced stats on error types */
-       if (verbose)
-               printf(" - Stats map: %s\n", map_data[0].name);
-       map_fd = map_data[0].fd;
-
-       stats_print_headers(err_only);
+       if (verbose) {
+               printf(" - Stats map0: %s\n", map_data[0].name);
+               printf(" - Stats map1: %s\n", map_data[1].name);
+               printf("\n");
+       }
        fflush(stdout);
 
        while (1) {
                memcpy(&prev, &rec, sizeof(rec));
-               stats_collect(map_fd, &rec);
+               stats_collect(&rec);
+               stats_print_headers(err_only);
                stats_print(&rec, &prev, err_only);
                fflush(stdout);
                sleep(interval);
@@ -235,6 +296,7 @@ static void print_bpf_prog_info(void)
 
 int main(int argc, char **argv)
 {
+       struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY};
        int longindex = 0, opt;
        int ret = EXIT_SUCCESS;
        char bpf_obj_file[256];
@@ -265,13 +327,18 @@ int main(int argc, char **argv)
                }
        }
 
+       if (setrlimit(RLIMIT_MEMLOCK, &r)) {
+               perror("setrlimit(RLIMIT_MEMLOCK)");
+               return EXIT_FAILURE;
+       }
+
        if (load_bpf_file(bpf_obj_file)) {
                printf("ERROR - bpf_log_buf: %s", bpf_log_buf);
-               return 1;
+               return EXIT_FAILURE;
        }
        if (!prog_fd[0]) {
                printf("ERROR - load_bpf_file: %s\n", strerror(errno));
-               return 1;
+               return EXIT_FAILURE;
        }
 
        if (debug) {
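
calc_period() and calc_pps() above factor out the rate arithmetic now shared by the redirect and exception tables: pps = (counter_now - counter_prev) / ((ts_now - ts_prev) / NANOSEC_PER_SEC). A worked example with made-up numbers:

#include <stdint.h>
#include <stdio.h>

#define NANOSEC_PER_SEC 1000000000ULL

int main(void)
{
	uint64_t c_prev = 1000, c_now = 61000;			/* packet counters */
	uint64_t t_prev = 0, t_now = 2 * NANOSEC_PER_SEC;	/* timestamps */
	double period = (double)(t_now - t_prev) / NANOSEC_PER_SEC;
	double pps = period > 0 ? (double)(c_now - c_prev) / period : 0;

	printf("period %.2fs, %.0f pps\n", period, pps);	/* 2.00s, 30000 pps */
	return 0;
}
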
index 57fc4b9..04d12f7 100644
@@ -11,7 +11,7 @@ SYNOPSIS
 ========
 
 |      **bpftool** prog show [*PROG*]
-|      **bpftool** prog dump xlated *PROG*  file *FILE*
+|      **bpftool** prog dump xlated *PROG* [file *FILE*] [opcodes]
 |      **bpftool** prog dump jited  *PROG* [file *FILE*] [opcodes]
 |      **bpftool** prog pin *PROG* *FILE*
 |      **bpftool** prog help
@@ -28,9 +28,12 @@ DESCRIPTION
                  Output will start with program ID followed by program type and
                  zero or more named attributes (depending on kernel version).
 
-       **bpftool prog dump xlated** *PROG*  **file** *FILE*
-                 Dump eBPF instructions of the program from the kernel to a
-                 file.
+       **bpftool prog dump xlated** *PROG* [**file** *FILE*] [**opcodes**]
+                 Dump eBPF instructions of the program from the kernel.
+                 If *FILE* is specified image will be written to a file,
+                 If *FILE* is specified, the image will be written to a file;
+                 otherwise it is disassembled and printed to stdout.
+
+                 **opcodes** controls whether raw opcodes will be printed.
        **bpftool prog dump jited**  *PROG* [**file** *FILE*] [**opcodes**]
                  Dump jited image (host machine code) of the program.
index 8705ee4..4f33982 100644
@@ -51,7 +51,7 @@ CC = gcc
 
 CFLAGS += -O2
 CFLAGS += -W -Wall -Wextra -Wno-unused-parameter -Wshadow
-CFLAGS += -D__EXPORTED_HEADERS__ -I$(srctree)/tools/include/uapi -I$(srctree)/tools/include -I$(srctree)/tools/lib/bpf
+CFLAGS += -D__EXPORTED_HEADERS__ -I$(srctree)/tools/include/uapi -I$(srctree)/tools/include -I$(srctree)/tools/lib/bpf -I$(srctree)/kernel/bpf/
 LIBS = -lelf -lbfd -lopcodes $(LIBBPF)
 
 include $(wildcard *.d)
@@ -59,7 +59,10 @@ include $(wildcard *.d)
 all: $(OUTPUT)bpftool
 
 SRCS=$(wildcard *.c)
-OBJS=$(patsubst %.c,$(OUTPUT)%.o,$(SRCS))
+OBJS=$(patsubst %.c,$(OUTPUT)%.o,$(SRCS)) $(OUTPUT)disasm.o
+
+$(OUTPUT)disasm.o: $(srctree)/kernel/bpf/disasm.c
+       $(QUIET_CC)$(COMPILE.c) -MMD -o $@ $<
 
 $(OUTPUT)bpftool: $(OBJS) $(LIBBPF)
        $(QUIET_LINK)$(CC) $(CFLAGS) -o $@ $^ $(LIBS)
index 85d2d78..8e809b2 100644
 #ifndef __BPF_TOOL_H
 #define __BPF_TOOL_H
 
+/* BFD and kernel.h both define GCC_VERSION, differently */
+#undef GCC_VERSION
 #include <stdbool.h>
 #include <stdio.h>
 #include <linux/bpf.h>
-
-#define ARRAY_SIZE(a)  (sizeof(a) / sizeof(a[0]))
+#include <linux/kernel.h>
 
 #define err(msg...)    fprintf(stderr, "Error: " msg)
 #define warn(msg...)   fprintf(stderr, "Warning: " msg)
 
 #define ptr_to_u64(ptr)        ((__u64)(unsigned long)(ptr))
 
-#define min(a, b)                                                      \
-       ({ typeof(a) _a = (a); typeof(b) _b = (b); _a > _b ? _b : _a; })
-#define max(a, b)                                                      \
-       ({ typeof(a) _a = (a); typeof(b) _b = (b); _a < _b ? _b : _a; })
-
 #define NEXT_ARG()     ({ argc--; argv++; if (argc < 0) usage(); })
 #define NEXT_ARGP()    ({ (*argc)--; (*argv)++; if (*argc < 0) usage(); })
 #define BAD_ARG()      ({ err("what is '%s'?\n", *argv); -1; })
index 421ba89..9e2681c 100644
@@ -35,6 +35,7 @@
 
 #include <errno.h>
 #include <fcntl.h>
+#include <stdarg.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
@@ -46,6 +47,7 @@
 #include <bpf.h>
 
 #include "main.h"
+#include "disasm.h"
 
 static const char * const prog_type_name[] = {
        [BPF_PROG_TYPE_UNSPEC]          = "unspec",
@@ -297,11 +299,39 @@ static int do_show(int argc, char **argv)
        return 0;
 }
 
+static void print_insn(struct bpf_verifier_env *env, const char *fmt, ...)
+{
+       va_list args;
+
+       va_start(args, fmt);
+       vprintf(fmt, args);
+       va_end(args);
+}
+
+static void dump_xlated(void *buf, unsigned int len, bool opcodes)
+{
+       struct bpf_insn *insn = buf;
+       unsigned int i;
+
+       for (i = 0; i < len / sizeof(*insn); i++) {
+               printf("% 4d: ", i);
+               print_bpf_insn(print_insn, NULL, insn + i, true);
+
+               if (opcodes) {
+                       printf("       ");
+                       print_hex(insn + i, 8, " ");
+                       printf("\n");
+               }
+
+               if (insn[i].code == (BPF_LD | BPF_IMM | BPF_DW))
+                       i++;
+       }
+}
+
 static int do_dump(int argc, char **argv)
 {
        struct bpf_prog_info info = {};
        __u32 len = sizeof(info);
-       bool can_disasm = false;
        unsigned int buf_size;
        char *filepath = NULL;
        bool opcodes = false;
@@ -315,7 +345,6 @@ static int do_dump(int argc, char **argv)
        if (is_prefix(*argv, "jited")) {
                member_len = &info.jited_prog_len;
                member_ptr = &info.jited_prog_insns;
-               can_disasm = true;
        } else if (is_prefix(*argv, "xlated")) {
                member_len = &info.xlated_prog_len;
                member_ptr = &info.xlated_prog_insns;
@@ -346,10 +375,6 @@ static int do_dump(int argc, char **argv)
                NEXT_ARG();
        }
 
-       if (!filepath && !can_disasm) {
-               err("expected 'file' got %s\n", *argv);
-               return -1;
-       }
        if (argc) {
                usage();
                return -1;
@@ -409,7 +434,10 @@ static int do_dump(int argc, char **argv)
                        goto err_free;
                }
        } else {
-               disasm_print_insn(buf, *member_len, opcodes);
+               if (member_len == &info.jited_prog_len)
+                       disasm_print_insn(buf, *member_len, opcodes);
+               else
+                       dump_xlated(buf, *member_len, opcodes);
        }
 
        free(buf);
@@ -430,7 +458,7 @@ static int do_help(int argc, char **argv)
 {
        fprintf(stderr,
                "Usage: %s %s show [PROG]\n"
-               "       %s %s dump xlated PROG  file FILE\n"
+               "       %s %s dump xlated PROG [file FILE] [opcodes]\n"
                "       %s %s dump jited  PROG [file FILE] [opcodes]\n"
                "       %s %s pin   PROG FILE\n"
                "       %s %s help\n"
index cb2b9f9..fb4fb81 100644
@@ -230,7 +230,7 @@ union bpf_attr {
                __u32   numa_node;      /* numa node (effective only if
                                         * BPF_F_NUMA_NODE is set).
                                         */
-               __u8    map_name[BPF_OBJ_NAME_LEN];
+               char    map_name[BPF_OBJ_NAME_LEN];
        };
 
        struct { /* anonymous struct used by BPF_MAP_*_ELEM commands */
@@ -253,7 +253,7 @@ union bpf_attr {
                __aligned_u64   log_buf;        /* user supplied buffer */
                __u32           kern_version;   /* checked when prog_type=kprobe */
                __u32           prog_flags;
-               __u8            prog_name[BPF_OBJ_NAME_LEN];
+               char            prog_name[BPF_OBJ_NAME_LEN];
        };
 
        struct { /* anonymous struct used by BPF_OBJ_* commands */
@@ -697,7 +697,9 @@ union bpf_attr {
        FN(redirect_map),               \
        FN(sk_redirect_map),            \
        FN(sock_map_update),            \
-       FN(xdp_adjust_meta),
+       FN(xdp_adjust_meta),            \
+       FN(perf_event_read_value),      \
+       FN(perf_prog_read_value),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
@@ -869,7 +871,7 @@ struct bpf_prog_info {
        __u32 created_by_uid;
        __u32 nr_map_ids;
        __aligned_u64 map_ids;
-       __u8  name[BPF_OBJ_NAME_LEN];
+       char  name[BPF_OBJ_NAME_LEN];
 } __attribute__((aligned(8)));
 
 struct bpf_map_info {
@@ -879,7 +881,7 @@ struct bpf_map_info {
        __u32 value_size;
        __u32 max_entries;
        __u32 map_flags;
-       __u8  name[BPF_OBJ_NAME_LEN];
+       char  name[BPF_OBJ_NAME_LEN];
 } __attribute__((aligned(8)));
 
 /* User bpf_sock_ops struct to access socket values and specify request ops
index 924af8d..2e7880e 100644
@@ -12,7 +12,7 @@ CFLAGS += -Wall -O2 -I$(APIDIR) -I$(LIBDIR) -I$(GENDIR) $(GENFLAGS) -I../../../i
 LDLIBS += -lcap -lelf
 
 TEST_GEN_PROGS = test_verifier test_tag test_maps test_lru_map test_lpm_map test_progs \
-       test_align
+       test_align test_verifier_log
 
 TEST_GEN_FILES = test_pkt_access.o test_xdp.o test_l4lb.o test_tcp_estats.o test_obj_id.o \
        test_pkt_md_access.o test_xdp_redirect.o test_xdp_meta.o sockmap_parse_prog.o     \
index a56053d..e25dbf6 100644
@@ -72,6 +72,12 @@ static int (*bpf_sk_redirect_map)(void *map, int key, int flags) =
 static int (*bpf_sock_map_update)(void *map, void *key, void *value,
                                  unsigned long long flags) =
        (void *) BPF_FUNC_sock_map_update;
+static int (*bpf_perf_event_read_value)(void *map, unsigned long long flags,
+                                       void *buf, unsigned int buf_size) =
+       (void *) BPF_FUNC_perf_event_read_value;
+static int (*bpf_perf_prog_read_value)(void *ctx, void *buf,
+                                      unsigned int buf_size) =
+       (void *) BPF_FUNC_perf_prog_read_value;
 
 
 /* llvm builtin functions that eBPF C program may use to
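(For the two helpers just declared, here is an illustrative sketch of how a BPF C program might call bpf_perf_event_read_value against a perf-event array, assuming the struct bpf_perf_event_value layout (counter/enabled/running) introduced by the companion uapi change. The map name "counters", its sizing, and the kprobe attach point are illustrative only; SEC() and struct bpf_map_def come from this bpf_helpers.h.)

    #include <linux/bpf.h>
    #include "bpf_helpers.h"

    struct bpf_map_def SEC("maps") counters = {
            .type = BPF_MAP_TYPE_PERF_EVENT_ARRAY,
            .key_size = sizeof(int),
            .value_size = sizeof(__u32),
            .max_entries = 64,
    };

    SEC("kprobe/sys_write")
    int read_counter(void *ctx)
    {
            struct bpf_perf_event_value val = {};

            /* read the counter bound to the current CPU, plus enabled/running times */
            if (bpf_perf_event_read_value(&counters, BPF_F_CURRENT_CPU,
                                          &val, sizeof(val)))
                    return 0;

            /* val.counter, val.enabled and val.running are now usable */
            return 0;
    }

    char _license[] SEC("license") = "GPL";
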
diff --git a/tools/testing/selftests/bpf/test_verifier_log.c b/tools/testing/selftests/bpf/test_verifier_log.c
new file mode 100644 (file)
index 0000000..3cc0b56
--- /dev/null
@@ -0,0 +1,171 @@
+#include <errno.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <unistd.h>
+
+#include <linux/bpf.h>
+#include <linux/filter.h>
+#include <linux/unistd.h>
+
+#include <bpf/bpf.h>
+
+#define LOG_SIZE (1 << 20)
+
+#define err(str...)    printf("ERROR: " str)
+
+static const struct bpf_insn code_sample[] = {
+       /* We need a few instructions to pass the min log length */
+       BPF_MOV64_IMM(BPF_REG_0, 0),
+       BPF_MOV64_IMM(BPF_REG_0, 0),
+       BPF_MOV64_IMM(BPF_REG_0, 0),
+       BPF_MOV64_IMM(BPF_REG_0, 0),
+       BPF_MOV64_IMM(BPF_REG_0, 0),
+       BPF_MOV64_IMM(BPF_REG_0, 0),
+       BPF_MOV64_IMM(BPF_REG_0, 0),
+       BPF_MOV64_IMM(BPF_REG_0, 0),
+       BPF_MOV64_IMM(BPF_REG_0, 0),
+       BPF_MOV64_IMM(BPF_REG_0, 0),
+       BPF_MOV64_IMM(BPF_REG_0, 0),
+       BPF_MOV64_IMM(BPF_REG_0, 0),
+       BPF_MOV64_IMM(BPF_REG_0, 0),
+       BPF_MOV64_IMM(BPF_REG_0, 0),
+       BPF_MOV64_IMM(BPF_REG_0, 0),
+       BPF_MOV64_IMM(BPF_REG_0, 0),
+       BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
+                    BPF_FUNC_map_lookup_elem),
+       BPF_EXIT_INSN(),
+};
+
+static inline __u64 ptr_to_u64(const void *ptr)
+{
+       return (__u64) (unsigned long) ptr;
+}
+
+static int load(char *log, size_t log_len, int log_level)
+{
+       union bpf_attr attr;
+
+       bzero(&attr, sizeof(attr));
+       attr.prog_type = BPF_PROG_TYPE_SOCKET_FILTER;
+       attr.insn_cnt = (__u32)(sizeof(code_sample) / sizeof(struct bpf_insn));
+       attr.insns = ptr_to_u64(code_sample);
+       attr.license = ptr_to_u64("GPL");
+       attr.log_buf = ptr_to_u64(log);
+       attr.log_size = log_len;
+       attr.log_level = log_level;
+
+       return syscall(__NR_bpf, BPF_PROG_LOAD, &attr, sizeof(attr));
+}
+
+static void check_ret(int ret, int exp_errno)
+{
+       if (ret > 0) {
+               close(ret);
+               err("broken sample loaded successfully!?\n");
+               exit(1);
+       }
+
+       if (!ret || errno != exp_errno) {
+               err("Program load returned: ret:%d/errno:%d, expected ret:%d/errno:%d\n",
+                   ret, errno, -1, exp_errno);
+               exit(1);
+       }
+}
+
+static void check_ones(const char *buf, size_t len, const char *msg)
+{
+       while (len--)
+               if (buf[len] != 1) {
+                       err("%s", msg);
+                       exit(1);
+               }
+}
+
+static void test_log_good(char *log, size_t buf_len, size_t log_len,
+                         size_t exp_len, int exp_errno, const char *full_log)
+{
+       size_t len;
+       int ret;
+
+       memset(log, 1, buf_len);
+
+       ret = load(log, log_len, 1);
+       check_ret(ret, exp_errno);
+
+       len = strnlen(log, buf_len);
+       if (len == buf_len) {
+               err("verifier did not NULL terminate the log\n");
+               exit(1);
+       }
+       if (exp_len && len != exp_len) {
+               err("incorrect log length expected:%zd have:%zd\n",
+                   exp_len, len);
+               exit(1);
+       }
+
+       if (strchr(log, 1)) {
+               err("verifier leaked a byte through\n");
+               exit(1);
+       }
+
+       check_ones(log + len + 1, buf_len - len - 1,
+                  "verifier wrote bytes past NULL termination\n");
+
+       if (memcmp(full_log, log, LOG_SIZE)) {
+               err("log did not match expected output\n");
+               exit(1);
+       }
+}
+
+static void test_log_bad(char *log, size_t log_len, int log_level)
+{
+       int ret;
+
+       ret = load(log, log_len, log_level);
+       check_ret(ret, EINVAL);
+       if (log)
+               check_ones(log, LOG_SIZE,
+                          "verifier touched log with bad parameters\n");
+}
+
+int main(int argc, char **argv)
+{
+       char full_log[LOG_SIZE];
+       char log[LOG_SIZE];
+       size_t want_len;
+       int i;
+
+       memset(log, 1, LOG_SIZE);
+
+       /* Test incorrect attr */
+       printf("Test log_level 0...\n");
+       test_log_bad(log, LOG_SIZE, 0);
+
+       printf("Test log_size < 128...\n");
+       test_log_bad(log, 15, 1);
+
+       printf("Test log_buff = NULL...\n");
+       test_log_bad(NULL, LOG_SIZE, 1);
+
+       /* Test with log big enough */
+       printf("Test oversized buffer...\n");
+       test_log_good(full_log, LOG_SIZE, LOG_SIZE, 0, EACCES, full_log);
+
+       want_len = strlen(full_log);
+
+       printf("Test exact buffer...\n");
+       test_log_good(log, LOG_SIZE, want_len + 2, want_len, EACCES, full_log);
+
+       printf("Test undersized buffers...\n");
+       for (i = 0; i < 64; i++) {
+               full_log[want_len - i + 1] = 1;
+               full_log[want_len - i] = 0;
+
+               test_log_good(log, LOG_SIZE, want_len + 1 - i, want_len - i,
+                             ENOSPC, full_log);
+       }
+
+       printf("test_verifier_log: OK\n");
+       return 0;
+}
index e8c86c4..a8a8cdf 100755 (executable)
@@ -37,6 +37,26 @@ kci_del_dummy()
        check_err $?
 }
 
+kci_test_netconf()
+{
+       dev="$1"
+       r=$ret
+
+       ip netconf show dev "$dev" > /dev/null
+       check_err $?
+
+       for f in 4 6; do
+               ip -$f netconf show dev "$dev" > /dev/null
+               check_err $?
+       done
+
+       if [ $ret -ne 0 ] ;then
+               echo "FAIL: ip netconf show $dev"
+               test $r -eq 0 && ret=0
+               return 1
+       fi
+}
+
 # add a bridge with vlans on top
 kci_test_bridge()
 {
@@ -63,6 +83,11 @@ kci_test_bridge()
        check_err $?
        ip r s t all > /dev/null
        check_err $?
+
+       for name in "$devbr" "$vlandev" "$devdummy" ; do
+               kci_test_netconf "$name"
+       done
+
        ip -6 addr del dev "$vlandev" dead:42::1234/64
        check_err $?
 
@@ -100,6 +125,9 @@ kci_test_gre()
        check_err $?
        ip addr > /dev/null
        check_err $?
+
+       kci_test_netconf "$gredev"
+
        ip addr del dev "$devdummy" 10.23.7.11/24
        check_err $?
 
index 00f2866..dd4162f 100644 (file)
@@ -341,7 +341,7 @@ int main(int argc, char **argv)
                        return 0;
                case 'n':
                        t = atoi(optarg);
-                       if (t > ARRAY_SIZE(test_cases))
+                       if (t >= ARRAY_SIZE(test_cases))
                                error(1, 0, "Invalid test case: %d", t);
                        all_tests = false;
                        test_cases[t].enabled = true;